In [1]:
import requests
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import tqdm
import asyncio
import json

In [2]:
folder_path = "data/posts/"
posts = pd.DataFrame()
for file in os.listdir(folder_path):
    with open(folder_path + file) as f:
        data = json.load(f)
        posts = pd.concat([posts, pd.DataFrame(data)])
posts = posts.drop_duplicates(subset=['_id'])

In [3]:
posts.shape

(40220, 11)

In [20]:
url = "https://www.lesswrong.com/graphql"
default_headers = {
  'accept': '*/*',
  'accept-language': 'en-US,en;q=0.9',
  'content-type': 'application/json',
  'user-agent': 'AE Studio-Neglected Means Researcher tristan.tran@ae.studio',
}

# Loading posts from LessWrong

In [40]:
def get_posts(start_date:str,
              end_date: str,
              offset: int):
    """
    Get posts from LessWrong API
    :param start_date: str, start date in format'YYYY-MM-DD'
    :param end_date: str, end date in format 'YYYY-MM-DD'
    :param offset: int, offset for pagination
    """
    post_query = """
{
      posts(input: {
        terms: {
          limit:100
          view:"timeframe"
          sortedBy:"magic"
          after:"%sT07:00:00.000Z"
          before:"%sT06:59:59.999Z"
          offset: %s
          meta: null  # this seems to get both meta and non-meta posts
        }
      }
      ) {
        results {
          _id
          title
          slug
          pageUrl
          postedAt
          baseScore
          voteCount
          score
          commentCount
          meta
          question
          url
          author
          pingbacks
          coauthors {
            username
            _id
            slug
          }
          user {
            username
            _id
            slug
          }
        }
      }
    }
    """%(start_date, end_date, offset)
    response = requests.post(url, headers=default_headers, json={'query': post_query})
    # TODO implement a backoff strategy for ratelimits
    response.raise_for_status()
    return response.json()

def get_all_posts(start_date, end_date):
  """ Gets all posts from lightcone within_summary_

  Args:
      start_date (_type_): _description_
      end_date (_type_): _description_

  Returns:
      _type_: _description_
  """
  offset = 0
  results = []
  while True:
      response = get_posts(start_date, end_date, offset)
      if not response['data']['posts']['results']:
          break
      results.extend(response['data']['posts']['results'])
      offset += 100
  return results

In [41]:
r = get_posts("2021-01-01", "2021-01-02", 0)
len(r["data"]["posts"]["results"])

9

In [42]:
get_posts("2021-01-01", "2021-01-02", 8)

{'data': {'posts': {'results': [{'_id': '6kAtxZyiTdY34JWak',
     'title': 'Is Free Will A Myth?',
     'slug': 'is-free-will-a-myth',
     'pageUrl': 'https://www.lesswrong.com/posts/6kAtxZyiTdY34JWak/is-free-will-a-myth',
     'postedAt': '2021-01-01T20:56:50.221Z',
     'baseScore': 0,
     'voteCount': 0,
     'score': 0,
     'commentCount': 4,
     'meta': False,
     'question': False,
     'url': None,
     'author': None,
     'pingbacks': {},
     'coauthors': [],
     'user': {'username': 'Precious Oluwatobi Emmanuel',
      '_id': 'otR3PZ9KMsM2JyCC5',
      'slug': 'precious-oluwatobi-emmanuel'}}]}},
 'extensions': {'cacheControl': {'version': 1,
   'hints': [{'path': ['posts'], 'maxAge': 0},
    {'path': ['posts', 'results'], 'maxAge': 0},
    {'path': ['posts', 'results', 0, 'coauthors'], 'maxAge': 0},
    {'path': ['posts', 'results', 0, 'user'], 'maxAge': 0}]}}}

In [43]:
# loop that gets posts for a date range until the offset is empty
start_date = "2023-01-01"
end_date = "2023-01-02"
get_all_posts(start_date, end_date)
# this should get all posts and pingbacks

[{'_id': 'rTJrqtDLxAPxiW3sk',
  'title': 'My first year in AI alignment',
  'slug': 'my-first-year-in-ai-alignment',
  'pageUrl': 'https://www.lesswrong.com/posts/rTJrqtDLxAPxiW3sk/my-first-year-in-ai-alignment',
  'postedAt': '2023-01-02T01:28:03.470Z',
  'baseScore': 61,
  'voteCount': 33,
  'score': 0.0010619011009112,
  'commentCount': 10,
  'meta': False,
  'question': False,
  'url': None,
  'author': None,
  'pingbacks': {'Posts': ['PRMJCbBhsGgu5A6Ty']},
  'coauthors': [],
  'user': {'username': 'Alex_Altair',
   '_id': '5wu9jG4pm9q6xjZ9R',
   'slug': 'alex_altair'}},
 {'_id': 'dTWevKRiMM4ptcjjg',
  'title': 'Would it be good or bad for the US military to get involved in AI risk?',
  'slug': 'would-it-be-good-or-bad-for-the-us-military-to-get-involved',
  'pageUrl': 'https://www.lesswrong.com/posts/dTWevKRiMM4ptcjjg/would-it-be-good-or-bad-for-the-us-military-to-get-involved',
  'postedAt': '2023-01-01T19:02:30.892Z',
  'baseScore': 50,
  'voteCount': 23,
  'score': 0.0008938161

# Add Pingback if missing

In [70]:
# we can use this to add pingbacks to the posts. if they don't have them
from requests.exceptions import HTTPError
def get_all_pingback(post_id):
    """
    Worst case scenario, lightcone sends us files. They sometimes miss pingbacks or other columns
    This function gets the pingbacks for a post. If it fails, it returns an empty list.
    
    """
    pingback_query ="""{
    posts(input: {terms: {view: "pingbackPosts", postId: "%s", limit: 15, offset: 0}}) {
        results {
        _id
        extendedScore
        baseScore
        score
        voteCount
        userId
        reviewVotesHighKarma
        commentCount
        htmlBody
        }
    }
    }"""%post_id
    response = requests.post(url, headers=default_headers, json={'query': pingback_query})
    response.raise_for_status()
    return response.json()["data"]["posts"]["results"]

def get_pingback_ids(post_id):
    try:
        pingbacks = get_all_pingback(post_id)
        return [pingback["_id"] for pingback in pingbacks]
    except HTTPError as e:
        print(post_id, "returns http error")
        print(e)
        return []
    except:
        print(post_id, "returns unknown error")
        return []
def get_refs_from_pingback(df):
    ids, pingbacks = zip(*df[["_id","pingback"]].values)
    refs = {id: [] for id in ids}
    for id, pingback in zip(ids, pingbacks):
        for pb in pingback:
            if pb in refs:
                refs[pb].append(id)

    df["refs"] = df["_id"].apply(lambda x: refs[x])
    return df

In [81]:
posts = pd.read_parquet("app_files/lw_data.parquet")

In [4]:
with open("tag_whitelist.json","r") as f:
    whitelist = json.loads(f.read())

In [5]:
_not_whitelisted = posts['tags'].apply(set).apply(lambda x: x.intersection(set(whitelist))) == set()
not_whitelisted = posts.loc[_not_whitelisted]
not_whitelisted.shape[0]

26640

In [6]:
white_listed = posts.loc[~_not_whitelisted]

In [7]:
white_listed.shape

(13580, 11)

In [89]:
white_listed.to_parquet("app_files/lw_data.parquet")

In [89]:
posts.to_parquet("data/posts.parquet")

In [2]:
posts = pd.read_parquet("app_files/lw_data.parquet")

In [3]:
with open("app_files/titles.json", "w") as f:
    json.dump(posts["title"].tolist(), f)

# Load comments

In [13]:
comments = pd.read_parquet("app_files/lw_comments.parquet")

In [93]:
# only use comments that are in white_listed
comments = comments.loc[comments["postId"].isin(white_listed["_id"])]

In [77]:
comments.columns

Index(['htmlBody', '_id', 'postId', 'parentCommentId', 'answer', 'postedAt',
       'author_id', 'username'],
      dtype='object')

In [14]:
folder_path = "data/comments/"
comments = pd.DataFrame()
for file in os.listdir(folder_path):
    with open(folder_path + file) as f:
        data = json.load(f)
        comments = pd.concat([comments, pd.DataFrame(data)])
comments = comments.drop_duplicates(subset=['_id'])
comments.columns

Index(['_id', 'user', 'htmlBody', 'postId', 'parentCommentId', 'baseScore',
       'answer', 'postedAt'],
      dtype='object')

In [58]:
import requests
import json

url = "https://www.lesswrong.com/graphql"
comments_query = """
query multiCommentQuery($input: MultiCommentInput) {
  comments(input: $input) {
    results {
      _id
      htmlBody
      postId
      answer
      postedAt
      parentCommentId
      user {
        _id
        slug
        username
        displayName
        karma
      }
    }
  }
}
"""

def get_comments(post_id):
    payload = json.dumps({
        "operationName": "multiCommentQuery",
        "variables": {
            "input": {
                "terms": {
                    "view": "postCommentsTop",
                    "limit": 1000,
                    "postId": post_id
                },
                "enableCache": False,
                "enableTotal": True
            }
        },
        "query": comments_query
    })
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'origin': 'https://www.lesswrong.com',
        'user-agent': 'AE studio negelcted approaches Research'
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    return response

response = get_comments("xgrvmaLFvkFr4hKjz")

{'data': {'comments': {'results': [{'_id': 'PMtnsnkd38QNwherK', 'htmlBody': '<p>Useful post. I can expand on one point and make a minor correction. Single Particle Cryo-EM is indeed a new(ish) powerful method of protein structure elucidation starting to make an impact in drug design. It is especially useful when a protein cannot easily be crystallised to allow more straightforward X-Ray structure determination. This is usually the case with transmembrane proteins for example. However it is actually best if the protein molecules are completely unaligned in any preferred direction as the simplest application of the refinement software assumes a perfectly random 3D orientation of the many thousands of protein copies imaged on the grid. In practice this is not so easy to achieve and corrections for unwanted preferred orientation need to be made.</p>', 'postId': 'xgrvmaLFvkFr4hKjz', 'answer': False, 'postedAt': '2024-05-05T17:42:39.534Z', 'parentCommentId': None, 'user': {'_id': 'savsErdiFT

In [59]:
# call the get_comments function for each post_id and create a dataframe
import time
import tqdm
comments = pd.DataFrame()
for post_id in tqdm.tqdm(posts["_id"].values):
    response = get_comments(post_id)
    comments = pd.concat([comments, pd.DataFrame(response.json()["data"]["comments"]["results"])])
    time.sleep(0.5)
comments = comments.drop_duplicates(subset=['_id'])
comments.to_parquet("data/comments.parquet")


100%|██████████| 40220/40220 [16:29:12<00:00,  1.48s/it]   


In [None]:
Index(['htmlBody', '_id', 'postId', 'parentCommentId', 'answer', 'postedAt',
       'author_id', 'username'],
      dtype='object')

In [75]:
comments = comments[["htmlBody", "_id", "postId", "parentCommentId", "answer", "postedAt", "user"]]
user_df =  pd.json_normalize(comments['user'])
user_df.rename(columns={"_id": "author_id"}, inplace=True)
comments = comments.join(user_df).drop(columns=["user"])

In [95]:
comments.to_parquet("app_files/lw_comments.parquet")

In [14]:
comments

Unnamed: 0,htmlBody,_id,postId,parentCommentId,answer,postedAt,author_id,slug,username,displayName,karma
0,<p>I've read the logs of the SoundLogic vs Tux...,tqHogpJv74eqbBvTD,dop3rLwFhW5gtpEgz,,False,2013-09-05T04:54:09.295Z,589WwYp3jytZqATFL,beoshaffer,beoShaffer,beoShaffer,2582.0
1,<p>I think not understanding how this happen m...,e9GJSAiFtqAZu6E2t,dop3rLwFhW5gtpEgz,2EtHAXb9uM8w5B9Xo,False,2013-09-05T16:54:15.982Z,4SuPdAqJpj7TzsaqG,luminosity,luminosity,luminosity,909.0
2,<p>Okay this is weak sauce. I really don't get...,2EtHAXb9uM8w5B9Xo,dop3rLwFhW5gtpEgz,,False,2013-09-05T08:58:44.182Z,qxJ28GN72aiJu96iF,kaj_sotala,Kaj_Sotala,Kaj_Sotala,47424.0
3,<p>Does SoundLogic endorse their decision to l...,PAKdFFZ6rXv5NSQJF,dop3rLwFhW5gtpEgz,,False,2013-09-05T06:07:37.650Z,n83meJ5yG2WQzygvw,sixes_and_sevens,sixes_and_sevens,sixes_and_sevens,6802.0
4,<p>This is actually a good analogy. A 2-year-o...,a5xiHpBiFv6rDNnYC,dop3rLwFhW5gtpEgz,4PGGAWfvHJQ9SZMcQ,False,2013-09-08T07:09:52.552Z,baGAQoNAH4hXaC6qf,ciphergoth,ciphergoth,Paul Crowley,15192.0
...,...,...,...,...,...,...,...,...,...,...,...
14,"<p>Yeah for humans in particular, I think the ...",w9YBArKaDkniNSgFj,PtEPqonFDv7ueYYpu,nvy5kn7D6eXatfvHX,False,2023-04-19T07:59:58.520Z,oxTHYnSBbLZP9F25d,nancylebovitz,NancyLebovitz,NancyLebovitz,32983.0
15,<p>I could change that. I was thinking of work...,2Kdv2P7k87FKtTvpg,PtEPqonFDv7ueYYpu,ExfaQxiFeNcrmm979,False,2023-04-19T07:41:53.386Z,W7ETRtvRMqYetyQE9,pablo_stafforini,Pablo_Stafforini,Pablo,3445.0
16,"<p>Currently using ""task specific""/""total"".</p>",hKYcdrL2nokyuHvwr,PtEPqonFDv7ueYYpu,Qp5X5SkcWny6Y72RW,False,2023-04-18T23:39:59.162Z,u7QCL4caxyyQGc7Tt,dougclow,dougclow,dougclow,607.0
17,"<p>Yeah, I'm aware.</p><p>I would edit the pos...",u6x3Etu3jmJESiCRL,PtEPqonFDv7ueYYpu,Gmsuf4YyAcY2qr3i9,False,2023-04-18T23:23:36.089Z,4SuPdAqJpj7TzsaqG,luminosity,luminosity,luminosity,909.0


# create user dataframe

In [80]:
users_df = users_df.drop_duplicates(subset=["user_id"])
users_df.to_parquet("app_files/users.parquet")

# Running Similarity Score Embeddings

In [96]:
posts = pd.read_parquet("app_files/lw_data.parquet")
posts.columns

Index(['_id', 'title', 'authors', 'score', 'karma', 'body', 'postedAt', 'tags',
       'commentCount', 'upvoteCount', 'url', 'pingback', 'refs'],
      dtype='object')

In [98]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('AnnaWegmann/Style-Embedding')
posts.fillna("", inplace=True)
# posts["_id"] = posts.index
posts["full_text"] = posts["title"] + "\n" + posts["body"]
texts = posts["full_text"].to_list()
style_embeddings = model.encode(texts)
import torch
torch.save(style_embeddings, "app_files/style_embeddings.pt")
# each article has a style embedding
# posts["style_embedding"] = style_embeddings.tolist()



In [12]:
with open("app_files/authors.json") as f:
    json.dump(posts.explode("authors")["authors"].unique(),f)

True