In [1]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git


import sklearn
import sklearn.feature_extraction.text

#import scipy.cluster.hierarchy
import gensim#For topic modeling
import requests #For downloading our datasets
import numpy as np #for arrays
import pandas #gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import matplotlib.cm #Still for graphics

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning, it
%matplotlib inline

In [2]:
df = pandas.read_csv('t\df_70.csv', index_col=0)

In [3]:
#Apply our functions
df['tokenized_text'] = df['AB'].apply(lambda x: lucem_illud_2020.word_tokenize(x))
df['normalized_tokens'] = df['tokenized_text'].apply(lambda x: lucem_illud_2020.normalizeTokens(x))

In [4]:
ngTFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(max_df=0.5, 
                                                                 max_features=1000, min_df=3, 
                                                                 stop_words='english', 
                                                                 norm='l2')
groupsTFVects = ngTFVectorizer.fit_transform(df['AB'])

In [5]:
def dropMissing(wordLst, vocab):
    return [w for w in wordLst if w in vocab]

df['reduced_tokens'] = df['normalized_tokens'].apply(lambda x: dropMissing(x, ngTFVectorizer.vocabulary_.keys()))

In [6]:
dictionary = gensim.corpora.Dictionary(df['reduced_tokens'])

In [7]:
## create a list of tuples containing each token and its count 
## use the first half and save the remainder for testing 
corpus = [dictionary.doc2bow(text) for text in df['reduced_tokens']]

In [8]:
# serialize the corpus as a file and load it 
gensim.corpora.MmCorpus.serialize('abstract.mm', corpus)
abmm = gensim.corpora.MmCorpus('abstract.mm')

In [9]:
# correctly formatted corpus that can be used for topic modeling and induction 
senlda = gensim.models.ldamodel.LdaModel(corpus=abmm, id2word=dictionary, num_topics=20, alpha='auto', eta='auto')

In [10]:
# sen1Bow = dictionary.doc2bow(df['reduced_tokens'][0])
# sen1lda = senlda[sen1Bow]
# print("The topics of the text: {}".format(df['TI'][0]))
# print("are: {}".format(sen1lda))

In [11]:
ldaDF = pandas.DataFrame({
        'titles' : df['TI'],
        'topics' : [senlda[dictionary.doc2bow(l)] for l in df['reduced_tokens']]
    })

In [12]:
topicsProbDict = {i : [0] * len(ldaDF) for i in range(senlda.num_topics)}

for index, topicTuples in enumerate(ldaDF['topics']):
    for topicNum, prob in topicTuples:
        topicsProbDict[topicNum][index] = prob

for topicNum in range(senlda.num_topics):
    ldaDF['topic_{}'.format(topicNum)] = topicsProbDict[topicNum]

In [13]:
K = senlda.num_topics  # N documents, K topics
topic_labels = ['Topic #{}'.format(k) for k in range(K)]

topicsDict = {}
for topicNum in range(senlda.num_topics):
    topicWords = [w for w, p in senlda.show_topic(topicNum)]
    topicsDict['Topic_{}'.format(topicNum)] = topicWords

wordRanksDF = pandas.DataFrame(topicsDict)
wordRanksDF

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19
0,concept,variable,health,change,health,performance,affect,behavior,group,health,fertility,study,determinant,patient,environment,study,class,female,group,group
1,sex,determinant,family,factor,patient,use,difference,group,life,determinant,perspective,treatment,effect,treatment,behavior,determinant,experience,male,child,health
2,self,human,determinant,marital,effect,therapy,examine,sex,variable,group,family,information,man,determinant,factor,family,temporal,infant,age,year
3,distance,analysis,study,determinant,determinant,female,research,difference,old,life,suicide,activity,study,complete,variable,use,variable,day,control,study
4,group,influence,increase,year,study,self,control,drug,determinant,factor,marital,vaccination,result,female,determinant,health,attitude,test,infant,individual
5,determinant,fertility,consumer,life,mental,achievement,factor,research,person,patient,size,result,life,factor,sex,status,study,behavior,reaction,medical
6,male,political,mental,approach,survey,study,health,use,age,planning,difference,practice,variable,student,report,research,function,interaction,suggest,drug
7,study,behavior,group,follow,work,experience,child,individual,different,study,equation,reaction,role,economic,patient,preventive,child,determinant,determinant,experience
8,child,measure,process,sex,area,old,problem,high,experience,behavior,model,health,difference,general,health,work,sex,mother,behavior,patient
9,factor,economic,self,increase,major,political,study,finding,compliance,experience,drug,vary,economic,rate,sexual,report,relationship,factor,response,member


In [21]:
import pyLDAvis.gensim
lda_model = senlda
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis