# Instructions

This notebook borrows heavily from the tutorial notebook for the code used in Gerlach et al. (2018), which is available at https://github.com/martingerlach/hSBM_Topicmodel. Please ensure that their module sbmtm.py is downloaded to the same folder as this notebook and that the graph-tool package (https://graph-tool.skewed.de/) is installed in your kernel's environment. 

In [1]:
#Import required packages
from bs4 import BeautifulSoup
import numpy as np
import requests
import os
import re
import sys 
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize

import os
import pylab as plt
%matplotlib inline  

import graph_tool.all as gt
from sbmtm import sbmtm

## Import the Data

The code below scrapes and imports the data from PhilPapers.org. To re-import in a way that collects all of the most recent abstracts in Formal Epistemology, change "limit=2803" in the URL below to "limit=x", where x is the total number of formal epistemology papers listed on PhilPapers.

In [2]:
#Import the data.
#The "limit" query parameter in the URL caps how many entries the listing page returns.
URL = "https://philpapers.org/browse/formal-epistemology?cn=formal-epistemology&freeOnly=&proOnly=on&cId=5467&langFilter=&hideAbstracts=&showCategories=off&filterByAreas=&categorizerOn=&new=1&limit=2803&start=0&sort=cat&onlineOnly=&publishedOnly=&sqc=&newWindow=&format=html&jlist=&ap_c1=&ap_c2="
r = requests.get(URL)
#Stop on an HTTP error status instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

entries = soup.find_all("li", class_='entry')

#n is the number of entries actually found on the page. Deriving it from the
#parsed result (rather than hard-coding it) avoids an IndexError whenever the
#page returns fewer entries than the "limit" requested in the URL.
n = len(entries)
if n == 0:
    raise ValueError("No <li class='entry'> elements were found in the PhilPapers response; "
                     "check the URL and the page markup.")

#Separate the abstracts, titles, authors, years, and publications for each paper.
#Each list holds one find_all() result per entry, in the order the entries appear on the page.
abstracts = [entry.find_all("div", class_="abstract") for entry in entries]
titles = [entry.find_all("span", class_='articleTitle recTitle') for entry in entries]
authors = [entry.find_all("span", class_='name') for entry in entries]
years = [entry.find_all("span", class_='pubYear') for entry in entries]
publications = [entry.find_all("em", class_='pubName') for entry in entries]

Clean the Data
------------------

The code below cleans the data. After this step, I also manually removed brackets and quotation marks from the .txt file containing the abstracts. This created a number of instances of the text token ' s ', which I also removed manually.

In [3]:
#Convert every scraped field into an object-dtype NumPy array with a single column,
#so the fields can later be stacked side by side.
titles, authors, years, publications, abstracts = (
    np.array(field, dtype=object).reshape(-1, 1)
    for field in (titles, authors, years, publications, abstracts)
)

#Flag rows for removal by overwriting them with 0.
#First pass: rows whose abstract entry has length zero.
for row in range(len(abstracts)):
    if len(abstracts[row][0]) == 0:
        abstracts[row] = 0

#Second pass: rows whose title already occurs among the preceding rows.
for row in range(len(abstracts)):
    if titles[row] in titles[:row]:
        titles[row] = 0
        
        
#Stack the five field columns into one table (column order: titles, authors, years,
#publications, abstracts) and drop every row that was flagged with 0, checking the
#abstracts column (4) first and the titles column (0) second.
full_array = np.hstack((titles, authors, years, publications, abstracts))
for column in (4, 0):
    flagged_rows = np.where(full_array[:, column] == 0)
    full_array = np.delete(full_array, flagged_rows, 0)


#Split the filtered table back into one 1-D array per field.
titles, authors, years, publications, abstracts = (
    full_array[:, column] for column in range(5)
)


#Remove stopwords and HTML tags from abstracts.
#all_stopwords combines NLTK's English stopword list with the corpus-specific removals below.
all_stopwords = stopwords.words('english')
other_removals = ['epistemology','formal','paper','article','chapter',',','.','...','shrink',':',';','!','(',')','"','“','”','``','1','2','3','4','5','6','7','8','9','0','sections','`','dropbox','google','cambridge','kindle', '’', 'argue','argued','argument','argues','new','first','also','must','propose','proposal','claim','claimed','claims','work','framework','-','--','---','oxford','doogle','https','doi:','{','}','$','?','-/-','two','three','four','five','six','seven','eight','nine','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','\\\\usepackage','@','``','“','”','‘','–','>','<','\\\\cS','arguments','argument','accounts','account','problem','theory','theories','one','show','reasoning','way','may','given','case','analysis','study','whether','cases','make','result','recent','would','upon','debate','key','describe','since','use','used','using','suggest','either','problem','different','approach','based','problems','part','various','proposed','shown','concern','concerned','particular','present','general','provide','many','literature','us','several','sense','provides','ways','process','approaches','within','discuss','main','best','therefore','review','proposes','incorporating','famous','considerable','please','send','drive','university','students']
all_stopwords = np.append(all_stopwords,other_removals)
#Set copy used only for lookups: testing `in` against the NumPy array compares the
#token with every stopword each time, whereas a set lookup is constant time.
stopword_set = set(all_stopwords.tolist())
for i in range(0,len(abstracts)):
    #Strip the HTML markup, split into tokens, drop stopwords (case-insensitively),
    #and store the remaining tokens back as one space-separated string.
    plain_text = BeautifulSoup(str(abstracts[i][0]), "lxml").text
    tokens = word_tokenize(str(plain_text))
    kept_tokens = [word for word in tokens if word.lower() not in stopword_set]
    abstracts[i][0] = (" ").join(kept_tokens)
    
    
    
#Save abstracts as a .txt file, then re-parse the file to strip any remaining markup
#and overwrite it with the plain text.
#Every write and read names UTF-8 explicitly: the file is later loaded with
#encoding='utf8', so relying on the platform default encoding here could produce a
#file that cannot be decoded (or is decoded incorrectly) on non-UTF-8 systems.
np.savetxt('abstracts.txt',abstracts,fmt='%s',encoding='utf-8')
with open('abstracts.txt', encoding='utf-8') as f:
    html_cleaned_abstracts = BeautifulSoup(f, "lxml").text
with open('abstracts.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(html_cleaned_abstracts)
    
    
    
#Build the two inputs handed to the sbmtm model: `texts` holds one whitespace-split
#token list per line of abstracts.txt, `documents` holds one title string per paper.
fname_data = 'abstracts.txt'
filename = os.path.join(fname_data)
with open(filename, mode='r', encoding='utf8') as f:
    x = f.readlines()
texts = list(map(str.split, x))
documents = [str(title) for title in titles]

Run the model
-------------

The code below infers the hierarchical stochastic block model for the abstracts, producing partition(s) of word and document nodes at multiple levels of abstraction. For the sake of the analysis in my paper, I focus solely on the partitions of the word nodes. For n=2809 inference typically takes less than five minutes, but may take longer depending on your machine, and can take much longer for larger n.

In [4]:
## instantiate the sbmtm class and build the word-document network from the corpus
model = sbmtm()
model.make_graph(texts, documents)

## alternatively, the graph can be saved once and loaded later instead of being rebuilt
# model.save_graph(filename = 'graph.xml.gz')
# model.load_graph(filename = 'graph.xml.gz')

## seed graph-tool's random number generator (--> same results), then fit the model
rng_seed = 32
gt.seed_rng(rng_seed)
model.fit()

Save the results
-------------

The code below extracts the data used in the paper and saves them as .txt files.

In [5]:
#Save the topics at levels 1 and 2 as .txt files (one file per level).
#n=20 is forwarded to model.topics -- presumably the number of words listed per topic.
for level, out_path in ((1, 'level1_topics.txt'), (2, 'level2_topics.txt')):
    with open(out_path, 'w') as f:
        print(model.topics(l=level, n=20), file=f)
    
    
#Fetch the 'p_tw_d' mixtures for levels 1 and 2, regroup them per document, and save them.
doc_topic_mixture = model.get_groups(l=1)['p_tw_d']
doc_topic_mixture_L2 = model.get_groups(l=2)['p_tw_d']


#Each mixture is read as mixture[group][document]; the nested comprehensions swap the
#two indices so that every inner list collects the values belonging to one document.
topic_dists_by_doc = [
    [group_row[doc] for group_row in doc_topic_mixture]
    for doc in range(len(titles))
]

topic_dists_by_doc_L2 = [
    [group_row[doc] for group_row in doc_topic_mixture_L2]
    for doc in range(len(titles))
]
    
    
np.savetxt('topic_dists_by_doc.txt', topic_dists_by_doc, fmt='%s')
np.savetxt('topic_dists_by_doc_L2.txt', topic_dists_by_doc_L2, fmt='%s')


#Obtain and save the year associated with each document, and the unique years within the corpus.
#Convert every scraped year entry to plain text first (in place, so `years` ends up
#holding strings).
for i in range(0,len(years)):
    years[i] = BeautifulSoup(str(years[i]), "lxml").text

#Deduplicate on the cleaned text rather than on the raw scraped objects: the raw
#entries may be unhashable (list-like result sets make set() raise TypeError), and
#two distinct raw objects can render to the same year. Sorting makes the contents of
#unique_years.txt identical from run to run instead of following set iteration order.
unique_years = sorted({str(year) for year in years})
np.savetxt('years.txt',years,fmt='%s')      
np.savetxt('unique_years.txt',unique_years,fmt='%s')