In [1]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git


import sklearn
import sklearn.feature_extraction.text

#import scipy.cluster.hierarchy
import gensim#For topic modeling
import requests #For downloading our datasets
import numpy as np #for arrays
import pandas #gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import matplotlib.cm #Still for graphics

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning it may print
%matplotlib inline

In [2]:
# Load the per-decade CSV tables from the 't' folder; the first CSV column is
# used as the row index. Forward slashes are used on purpose: they work on
# every OS, whereas a backslash inside a normal string literal is an escape
# character and would only resolve to the folder on Windows.
df70 = pandas.read_csv('t/df_70.csv', index_col=0)
df80 = pandas.read_csv('t/df_80.csv', index_col=0)
df90 = pandas.read_csv('t/df_90.csv', index_col=0)
df00 = pandas.read_csv('t/df_00.csv', index_col=0)
df10 = pandas.read_csv('t/df_10.csv', index_col=0)

# Working frame for the rest of the notebook. It is a separate read of the
# same file as df80, so columns added to df later do not show up on df80.
df = pandas.read_csv('t/df_80.csv', index_col=0)

# All five tables stacked row-wise in the order 70, 80, 90, 00, 10.
dd = pandas.concat([df70, df80, df90, df00, df10])

In [3]:
# Tokenize the text in column 'AB' (presumably the abstracts), then normalize
# the resulting tokens, using the helper functions from lucem_illud_2020.
# The helpers are passed to apply() directly; each is called once per row.
df['tokenized_text'] = df['AB'].apply(lucem_illud_2020.word_tokenize)
df['normalized_tokens'] = df['tokenized_text'].apply(lucem_illud_2020.normalizeTokens)

In [4]:
# TF-IDF vectorizer fitted on the raw 'AB' text. Its learned vocabulary is
# what the notebook later uses to filter the normalized tokens.
ngTFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    max_df=0.5,            # drop terms found in more than half of the documents
    min_df=3,              # drop terms found in fewer than 3 documents
    max_features=3000,     # cap on the vocabulary size
    stop_words='english',
    norm='l2',
)
groupsTFVects = ngTFVectorizer.fit_transform(df['AB'])

In [5]:
def dropMissing(wordLst, vocab):
    """Return the words of ``wordLst`` that are members of ``vocab``.

    Order and duplicates in ``wordLst`` are preserved; ``vocab`` can be any
    container that supports the ``in`` operator.
    """
    kept = []
    for word in wordLst:
        if word in vocab:
            kept.append(word)
    return kept

# Keep only the normalized tokens that are in the TF-IDF vocabulary; the
# vocabulary keys are handed to dropMissing as its second argument.
df['reduced_tokens'] = df['normalized_tokens'].apply(dropMissing, args=(ngTFVectorizer.vocabulary_.keys(),))

In [6]:
# gensim dictionary built from the filtered token lists, one list per row of df.
dictionary = gensim.corpora.Dictionary(documents=df['reduced_tokens'])

In [7]:
## convert every document's reduced tokens into a bag-of-words via
## dictionary.doc2bow; all rows of df are used
corpus = list(map(dictionary.doc2bow, df['reduced_tokens']))

In [8]:
# write the bag-of-words corpus to 'abstract.mm' and read it back as an
# MmCorpus object
gensim.corpora.MmCorpus.serialize(fname='abstract.mm', corpus=corpus)
abmm = gensim.corpora.MmCorpus(fname='abstract.mm')

In [9]:
# Fit a 20-topic LDA model on the serialized corpus, with alpha and eta both
# set to 'auto'. LDA training is stochastic, so random_state pins the seed:
# without it the topics change on every Restart & Run All.
senlda = gensim.models.ldamodel.LdaModel(
    corpus=abmm,
    id2word=dictionary,
    num_topics=20,
    alpha='auto',
    eta='auto',
    random_state=42,
)

In [10]:
# One row per document: its title (column 'TI') and whatever the model returns
# for that document's bag-of-words (a sequence of (topic, probability) pairs).
ldaDF = pandas.DataFrame({
    'titles': df['TI'],
    'topics': list(map(lambda tokens: senlda[dictionary.doc2bow(tokens)], df['reduced_tokens'])),
})

In [11]:
# Spread the per-document topic lists into one column per topic. Every
# document starts at 0 for every topic; only the (topic, probability) pairs
# present in its 'topics' entry overwrite that default.
numDocs = len(ldaDF)
topicsProbDict = {}
for topicNum in range(senlda.num_topics):
    topicsProbDict[topicNum] = [0] * numDocs

for docPos, docTopics in enumerate(ldaDF['topics']):
    for topicNum, weight in docTopics:
        topicsProbDict[topicNum][docPos] = weight

# Columns are added in topic order: topic_0, topic_1, ...
for topicNum, topicColumn in topicsProbDict.items():
    ldaDF['topic_{}'.format(topicNum)] = topicColumn

In [12]:
K = senlda.num_topics  # number of topics in the fitted model
topic_labels = list(map('Topic #{}'.format, range(K)))

# Column 'Topic_k' holds the words returned by show_topic(k), in the order
# show_topic returns them; the accompanying weights are discarded.
topicsDict = {
    'Topic_{}'.format(topicNum): [word for word, _ in senlda.show_topic(topicNum)]
    for topicNum in range(senlda.num_topics)
}

wordRanksDF = pandas.DataFrame(topicsDict)
wordRanksDF

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19
0,patient,woman,health,health,determinant,study,family,woman,age,salt,factor,behavior,determinant,health,factor,age,patient,health,determinant,child
1,family,health,determinant,determinant,health,male,study,determinant,determinant,patient,determinant,work,support,determinant,family,dental,level,patient,factor,determinant
2,study,determinant,study,study,model,female,child,client,study,determinant,model,health,study,male,aggression,determinant,determinant,study,child,study
3,factor,age,group,self,contraceptive,determinant,factor,variable,child,low,age,determinant,mother,factor,determinant,research,school,determinant,level,patient
4,behavior,rate,care,level,care,person,determinant,behavior,population,development,level,factor,health,study,health,health,age,effect,health,age
5,size,family,change,patient,use,factor,age,analysis,health,infant,variable,age,child,level,effect,behavior,mortality,behavior,individual,high
6,important,child,patient,result,change,support,relate,sex,result,intake,significant,group,model,behavior,individual,group,self,factor,effect,health
7,predictor,use,analysis,factor,factor,infant,high,report,year,study,child,population,female,female,study,study,abortion,support,model,effect
8,relate,group,class,psychological,child,child,mother,age,status,change,suggest,life,adolescent,result,group,level,finding,high,support,rate
9,health,variable,factor,control,sexual,effect,level,measure,patient,suggest,study,level,male,care,present,child,analysis,present,high,behavior


In [13]:
# Interactive topic visualisation with pyLDAvis.
import pyLDAvis
try:
    # pyLDAvis 3.x renamed its gensim adapter module to gensim_models.
    import pyLDAvis.gensim_models as pyLDAvis_gensim
except ImportError:
    # Older pyLDAvis releases still ship the adapter as pyLDAvis.gensim.
    import pyLDAvis.gensim as pyLDAvis_gensim

lda_model = senlda
pyLDAvis.enable_notebook()  # render the visualisation inline in the notebook
vis = pyLDAvis_gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis