In [8]:
import requests
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import tqdm
import asyncio
import json

In [97]:
# Load the existing posts table from the parquet file under app_files/.
old_posts = pd.read_parquet("app_files/lw_data.parquet")

In [98]:
# Inspect the schema of the loaded posts table.
old_posts.columns

Index(['_id', 'Unnamed: 0', 'title', 'authors', 'score', 'karma', 'body',
       'postedAt', 'tags', 'commentCount', 'upvoteCount', 'url',
       'contains_wanted', 'skip_authors', 'year', 'month', 'quarter',
       'alignment_forum', 'ids', 'links', 'references', 'pingback_posts'],
      dtype='object')

In [99]:
# Combine every JSON dump in the posts folder into one DataFrame, keeping a
# single row per post `_id`.
folder_path = "data/posts/"
post_frames = []
# os.listdir order is arbitrary; sorting makes the "first duplicate wins"
# rule of drop_duplicates reproducible across runs and machines.
for file in sorted(os.listdir(folder_path)):
    with open(os.path.join(folder_path, file), encoding="utf-8") as f:
        data = json.load(f)
    post_frames.append(pd.DataFrame(data))
# Concatenate once: growing a DataFrame inside the loop copies all previous
# rows on every iteration and starts from an empty, dtype-less frame.
posts = pd.concat(post_frames).drop_duplicates(subset=['_id'])

In [100]:
# GraphQL endpoint that the request helpers in this notebook post to.
url = "https://www.lesswrong.com/graphql"
# Headers sent with every request; the user-agent string names the project
# and carries a contact address.
default_headers = {
  'accept': '*/*',
  'accept-language': 'en-US,en;q=0.9',
  'content-type': 'application/json',
  'user-agent': 'AE Studio-Neglected Means Researcher tristan.tran@ae.studio',
}

In [112]:
def get_posts(start_date:str,
              end_date: str,
              offset: int,
              timeout: float = 30):
    """
    Get one page (up to 100 posts) from the LessWrong GraphQL API.

    The window runs from ``start_date`` at 07:00:00.000Z to ``end_date`` at
    06:59:59.999Z, as hard-coded in the query below.
    NOTE(review): the 07:00 UTC boundary presumably aligns days to US Pacific
    time -- confirm.

    :param start_date: str, start date in format 'YYYY-MM-DD'
    :param end_date: str, end date in format 'YYYY-MM-DD'
    :param offset: int, offset for pagination (number of posts to skip)
    :param timeout: float, seconds to wait for the server before giving up;
        without it a stalled connection would block the notebook indefinitely
    :return: dict, the decoded JSON response; the posts are found under
        ``['data']['posts']['results']``
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.Timeout: if the server does not answer within ``timeout``
    """
    post_query = """
{
      posts(input: {
        terms: {
          limit:100
          view:"timeframe"
          sortedBy:"magic"
          after:"%sT07:00:00.000Z"
          before:"%sT06:59:59.999Z"
          offset: %s
          meta: null  # this seems to get both meta and non-meta posts
        }
      }
      ) {
        results {
          _id
          title
          slug
          pageUrl
          postedAt
          baseScore
          voteCount
          score
          commentCount
          meta
          question
          url
          author
          pingbacks
          coauthors {
            username
            _id
            slug
          }
          user {
            username
            _id
            slug
          }
        }
      }
    }
    """%(start_date, end_date, offset)
    response = requests.post(
        url,
        headers=default_headers,
        json={'query': post_query},
        timeout=timeout,
    )
    # Surface HTTP-level failures instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()
def get_all_posts(start_date, end_date):
    """
    Collect every post in the date window by walking the paginated API.

    Pages are requested through ``get_posts`` with the offset advanced by 100
    each time; the first empty page ends the walk.

    :param start_date: str, start date in format 'YYYY-MM-DD'
    :param end_date: str, end date in format 'YYYY-MM-DD'
    :return: list of post dicts, in the order the pages were received
    """
    collected = []
    page_start = 0
    page = get_posts(start_date, end_date, page_start)['data']['posts']['results']
    while page:
        collected.extend(page)
        page_start += 100
        page = get_posts(start_date, end_date, page_start)['data']['posts']['results']
    return collected

In [113]:
# Sanity check: fetch the first page for a one-day window and count the posts returned.
r = get_posts("2021-01-01", "2021-01-02", 0)
len(r["data"]["posts"]["results"])

9

In [114]:
# Offset check: the same window returned 9 posts above, so skipping 8 should leave one.
get_posts("2021-01-01", "2021-01-02", 8)

{'data': {'posts': {'results': [{'_id': '6kAtxZyiTdY34JWak',
     'title': 'Is Free Will A Myth?',
     'slug': 'is-free-will-a-myth',
     'pageUrl': 'https://www.lesswrong.com/posts/6kAtxZyiTdY34JWak/is-free-will-a-myth',
     'postedAt': '2021-01-01T20:56:50.221Z',
     'baseScore': 0,
     'voteCount': 0,
     'score': 0,
     'commentCount': 4,
     'meta': False,
     'question': False,
     'url': None,
     'author': None,
     'pingbacks': {},
     'coauthors': [],
     'user': {'username': 'Precious Oluwatobi Emmanuel',
      '_id': 'otR3PZ9KMsM2JyCC5',
      'slug': 'precious-oluwatobi-emmanuel'}}]}},
 'extensions': {'cacheControl': {'version': 1,
   'hints': [{'path': ['posts'], 'maxAge': 0},
    {'path': ['posts', 'results'], 'maxAge': 0},
    {'path': ['posts', 'results', 0, 'coauthors'], 'maxAge': 0},
    {'path': ['posts', 'results', 0, 'user'], 'maxAge': 0}]}}}

In [117]:
# Fetch every post in the date window, paging until the API returns an empty page.
start_date = "2023-01-01"
end_date = "2023-01-02"
get_all_posts(start_date, end_date)
# Each returned record carries the post's `pingbacks` field alongside its metadata.

[{'_id': 'rTJrqtDLxAPxiW3sk',
  'title': 'My first year in AI alignment',
  'slug': 'my-first-year-in-ai-alignment',
  'pageUrl': 'https://www.lesswrong.com/posts/rTJrqtDLxAPxiW3sk/my-first-year-in-ai-alignment',
  'postedAt': '2023-01-02T01:28:03.470Z',
  'baseScore': 61,
  'voteCount': 33,
  'score': 0.0010619011009112,
  'commentCount': 10,
  'meta': False,
  'question': False,
  'url': None,
  'author': None,
  'pingbacks': {'Posts': ['PRMJCbBhsGgu5A6Ty']},
  'coauthors': [],
  'user': {'username': 'Alex_Altair',
   '_id': '5wu9jG4pm9q6xjZ9R',
   'slug': 'alex_altair'}},
 {'_id': 'dTWevKRiMM4ptcjjg',
  'title': 'Would it be good or bad for the US military to get involved in AI risk?',
  'slug': 'would-it-be-good-or-bad-for-the-us-military-to-get-involved',
  'pageUrl': 'https://www.lesswrong.com/posts/dTWevKRiMM4ptcjjg/would-it-be-good-or-bad-for-the-us-military-to-get-involved',
  'postedAt': '2023-01-01T19:02:30.892Z',
  'baseScore': 50,
  'voteCount': 23,
  'score': 0.0009097390

In [70]:
# We can use this to add pingbacks to posts that don't already have them.
from requests.exceptions import HTTPError
def get_all_pingback(post_id, page_size=15, timeout=30):
    """
    Fetch every post returned by the "pingbackPosts" view for ``post_id``.

    The view is paginated: pages of ``page_size`` posts are requested with an
    increasing offset until a short page (or a page with nothing new) comes
    back. A single request with ``limit: 15`` would silently drop everything
    past the first 15 pingbacks.

    :param post_id: str, ``_id`` of the post whose pingbacks are wanted
    :param page_size: int, number of posts requested per page
    :param timeout: float, seconds to wait for each request before giving up
    :return: list of dicts, one per pingback post, with the fields listed in
        the query below
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.Timeout: if a request exceeds ``timeout``
    """
    pingback_query ="""{
    posts(input: {terms: {view: "pingbackPosts", postId: "%s", limit: %d, offset: %d}}) {
        results {
        _id
        extendedScore
        baseScore
        score
        voteCount
        userId
        reviewVotesHighKarma
        commentCount
        htmlBody
        }
    }
    }"""
    pingbacks = []
    seen_ids = set()
    offset = 0
    while True:
        query = pingback_query % (post_id, page_size, offset)
        response = requests.post(
            url,
            headers=default_headers,
            json={'query': query},
            timeout=timeout,
        )
        response.raise_for_status()
        page = response.json()["data"]["posts"]["results"]
        fresh = [post for post in page if post["_id"] not in seen_ids]
        pingbacks.extend(fresh)
        seen_ids.update(post["_id"] for post in fresh)
        # A short page means the end was reached. The "nothing new" check
        # stops the loop if the server ever repeats a page instead of
        # honouring the offset.
        if len(page) < page_size or not fresh:
            break
        offset += page_size
    return pingbacks

def get_pingback_ids(post_id):
    """
    Return the ``_id`` of every pingback post for ``post_id``.

    This is a best-effort helper: any failure is reported on stdout and an
    empty list is returned, so one bad post does not abort a bulk run.

    :param post_id: str, ``_id`` of the post to look up
    :return: list of str, pingback post ids (empty on failure)
    """
    try:
        pingbacks = get_all_pingback(post_id)
        return [pingback["_id"] for pingback in pingbacks]
    except HTTPError as e:
        print(post_id, "returns http error")
        print(e)
        return []
    except Exception as e:
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running loop.
        # Print the error so the cause is not lost.
        print(post_id, "returns unknown error")
        print(e)
        return []
def get_refs_from_pingback(df):
    """
    Invert the pingback lists of ``df`` into a ``refs`` column.

    For every row, each entry of its ``pingback`` value that is also an
    ``_id`` present in ``df`` gets this row's ``_id`` appended to its list.
    Entries that do not match any ``_id`` in the frame are ignored.

    The frame is modified in place (a ``refs`` column is added or overwritten)
    and also returned.

    :param df: DataFrame with an ``_id`` column and a ``pingback`` column
        holding an iterable of ids per row; missing values (None/NaN) are
        treated as "no pingbacks"
    :return: the same DataFrame, with the ``refs`` column set
    """
    post_ids = df["_id"].tolist()
    refs = {post_id: [] for post_id in post_ids}
    for post_id, pingback in zip(post_ids, df["pingback"].tolist()):
        # A missing cell shows up as None or float NaN; neither is iterable.
        if pingback is None or isinstance(pingback, float):
            continue
        for target in pingback:
            if target in refs:
                refs[target].append(post_id)

    df["refs"] = df["_id"].apply(lambda post_id: refs[post_id])
    return df

In [89]:
# Persist the posts table to parquet; a later cell reloads it from this path.
posts.to_parquet("data/posts.parquet")

In [76]:
# Load the existing comments table from the parquet file under app_files/.
comments = pd.read_parquet("app_files/lw_comments.parquet")

In [77]:
# Inspect the schema of the loaded comments table.
comments.columns

Index(['htmlBody', '_id', 'postId', 'parentCommentId', 'answer', 'postedAt',
       'author_id', 'username'],
      dtype='object')

In [5]:
# Combine every JSON dump in the comments folder into one DataFrame, keeping a
# single row per comment `_id`. Note that this rebinds `comments`.
folder_path = "data/comments/"
comment_frames = []
# os.listdir order is arbitrary; sorting makes the "first duplicate wins"
# rule of drop_duplicates reproducible across runs and machines.
for file in sorted(os.listdir(folder_path)):
    with open(os.path.join(folder_path, file), encoding="utf-8") as f:
        data = json.load(f)
    comment_frames.append(pd.DataFrame(data))
# Concatenate once: growing a DataFrame inside the loop copies all previous
# rows on every iteration and starts from an empty, dtype-less frame.
comments = pd.concat(comment_frames).drop_duplicates(subset=['_id'])
comments.columns

Index(['_id', 'user', 'htmlBody', 'postId', 'parentCommentId', 'baseScore',
       'answer', 'postedAt'],
      dtype='object')

In [6]:
# Persist the combined comments table to parquet.
comments.to_parquet("data/comments.parquet")

In [13]:
# Reload the posts table from data/posts.parquet.
posts = pd.read_parquet("data/posts.parquet")

In [None]:
from sentence_transformers import SentenceTransformer, util
# Load the pretrained style-embedding model by name.
model = SentenceTransformer('AnnaWegmann/Style-Embedding')
# Blank out missing text only in the two columns concatenated below. Filling
# the whole frame with "" would also put strings into numeric columns that
# contain NaN, leaving mixed-type columns in the table that is written to
# parquet afterwards.
posts[["title", "body"]] = posts[["title", "body"]].fillna("")
# posts["_id"] = posts.index
posts["full_text"] = posts["title"] + "\n" + posts["body"]
texts = posts["full_text"].to_list()
# Encode one embedding per post text (title and body joined by a newline).
style_embeddings = model.encode(texts)

In [15]:
# Convert the embeddings with .tolist() so each row holds a plain Python list
# (presumably one vector per post -- confirm), then overwrite the posts
# parquet file with the extra column.
posts["style_embedding"] = style_embeddings.tolist()
posts.to_parquet("data/posts.parquet")