# Topic Modeling of Ruhlman Titles

Megan O, 4/25

Using this tutorial: https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

(We learned about Latent Dirichlet Allocation (LDA) in NLP on 4/25.)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import gensim
from gensim import corpora, models
%matplotlib inline 

In [2]:
# Load the cleaned dataset, using the first CSV column as the row index.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv` with `index_col=0, parse_dates=True` reproduces its defaults.
df = pd.read_csv('aclean.csv', index_col=0, parse_dates=True)
df.head()

Unnamed: 0,Year,Title,Students,Advisor,Abstract,Student_Year,Student_Major,Advisor_Major
0,1997,"""King Lear"" Through Film: Brook and Kozintsev",Alexandra Parsons,Yu Jin Ko,"The final scene of Shakespeare's ""King Lear"" p...",1997,Unspecified,English
1,1997,The Grotesque Nature of Paradise in Milton's P...,Elizabeth Reich,Jody Mikalachki,In The Interpretacion of Dreams Freud writes t...,1997,Unspecified,English
2,1997,"Eliminating Stereotypes, Identifying Confoundi...",Aimee Jabro-Young,Paul Wink,This study compared differences in political b...,1998,Unspecified,Psychology
3,1997,The Art of Hatred: The Representation of Jewis...,Inna Kantor,Frances Malino,During much of the nineteenth century. Europea...,1997,Unspecified,Jewish Studies
4,1997,"The Problems of, and Possibilities for, Norweg...",Ingrid Moen,Barbara Geller,The goal of my talk is to outline the ways in ...,1997,Unspecified,Religion


In [10]:
# Pull the Title column out as a list of plain strings, one per row.
titles = list(map(str, df['Title'].tolist()))

In [11]:
# Porter stemmer instance used to stem each token
p_stemmer = PorterStemmer()

# English stop words list
en_stop = get_stop_words('en')

# tokenizer built from the regex \w+ (runs of word characters)
tokenizer = RegexpTokenizer(r'\w+')

# each title is treated as one document
doc_set = titles

# will hold one list of tokens per document
texts = []


In [12]:
# Membership tests against a set are O(1); the stop-word list would
# otherwise be rescanned for every single token.
stop_word_set = set(en_stop)

# Rebuild `texts` from scratch so that re-running this cell does not append
# a second copy of every document.
texts = []

# loop through document list
for title in doc_set:

    # clean and tokenize document string
    tokens = tokenizer.tokenize(title.lower())

    # remove stop words, then stem the remaining tokens; distinct names are
    # used so the comprehension variable never overwrites the loop variable
    stemmed_tokens = [
        p_stemmer.stem(token)
        for token in tokens
        if token not in stop_word_set
    ]

    # add tokens to list
    texts.append(stemmed_tokens)


In [30]:

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
# (one bag-of-words vector per document)
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model; a fixed random_state makes the learned topics
# reproducible, otherwise every re-run of this cell yields different topics
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=50,
    id2word=dictionary,
    passes=20,
    random_state=0,
)

In [32]:
# show all 50 topics, each with its 10 highest-weighted (stemmed) terms
ldamodel.print_topics(num_words=10, num_topics=50)

[(0,
  u'0.059*model + 0.047*brain + 0.032*reson + 0.030*magnet + 0.021*mous + 0.020*receptor + 0.018*build + 0.016*organ + 0.016*rett + 0.015*use'),
 (1,
  u'0.030*poetri + 0.023*s + 0.022*west + 0.020*centuri + 0.020*hall + 0.017*icon + 0.017*pendleton + 0.017*object + 0.015*bilingu + 0.013*address'),
 (2,
  u'0.036*human + 0.023*social + 0.022*titl + 0.017*help + 0.017*collect + 0.015*find + 0.014*engin + 0.014*antimicrobi + 0.013*right + 0.013*s'),
 (3,
  u'0.065*learn + 0.058*interact + 0.044*teach + 0.025*present + 0.025*program + 0.024*physic + 0.020*signal + 0.017*s + 0.017*lab + 0.014*type'),
 (4,
  u'0.067*develop + 0.029*popul + 0.029*search + 0.028*year + 0.021*liber + 0.019*genet + 0.018*problem + 0.017*role + 0.016*archipelago + 0.014*gal'),
 (5,
  u'0.062*east + 0.053*pendleton + 0.050*panel + 0.029*approach + 0.020*re + 0.019*west + 0.018*treatment + 0.017*global + 0.017*139 + 0.017*new'),
 (6,
  u'0.041*s + 0.040*histori + 0.032*polici + 0.026*china + 0.024*relationshi