In [1]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git


import sklearn
import sklearn.feature_extraction.text

#import scipy.cluster.hierarchy
import gensim#For topic modeling
import requests #For downloading our datasets
import numpy as np #for arrays
import pandas #gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import matplotlib.cm #Still for graphics

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also, you can ignore any warning it prints.
%matplotlib inline

In [2]:
# Load the five df_XX tables from the `t` directory, using the first CSV column
# as the index. Forward slashes replace the original backslashes: '\d' is an
# invalid escape sequence in an ordinary string literal (Python warns about it),
# and a backslash is only a path separator on Windows, whereas '/' is accepted
# on every platform.
df70 = pandas.read_csv('t/df_70.csv', index_col=0)
df80 = pandas.read_csv('t/df_80.csv', index_col=0)
df90 = pandas.read_csv('t/df_90.csv', index_col=0)
df00 = pandas.read_csv('t/df_00.csv', index_col=0)
df10 = pandas.read_csv('t/df_10.csv', index_col=0)

In [3]:
# Draw 100 rows from each table. A fixed random_state makes the draw identical
# on every fresh run, so the same documents are analysed each time.
# NOTE(review): replace=True samples with replacement, so a row can be drawn
# more than once -- presumably kept so a table with fewer than 100 rows still
# works; confirm that duplicates are acceptable.
# Each name is overwritten by its own sample, so re-run the loading cell before
# re-running this one.
SAMPLE_SEED = 2020
df70 = df70.sample(100, replace=True, random_state=SAMPLE_SEED)
df80 = df80.sample(100, replace=True, random_state=SAMPLE_SEED)
df90 = df90.sample(100, replace=True, random_state=SAMPLE_SEED)
df00 = df00.sample(100, replace=True, random_state=SAMPLE_SEED)
df10 = df10.sample(100, replace=True, random_state=SAMPLE_SEED)

In [4]:
# Stack the five samples into one frame, keeping the 70 -> 80 -> 90 -> 00 -> 10
# order (the original row labels are kept as-is).
sampled_frames = [df70, df80, df90, df00, df10]
df = pandas.concat(sampled_frames)

In [5]:
# Tokenize every abstract in 'AB', then normalize each resulting token list,
# using the helpers from the course module.
df['tokenized_text'] = df['AB'].apply(lucem_illud_2020.word_tokenize)
df['normalized_tokens'] = df['tokenized_text'].apply(lucem_illud_2020.normalizeTokens)

In [6]:
# TF-IDF vectorizer fitted on the raw abstracts. Its vocabulary_ is used further
# down to filter the normalized tokens.
ngTFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    norm='l2',
    min_df=3,
    max_df=0.5,
    max_features=3000,
)
groupsTFVects = ngTFVectorizer.fit_transform(df['AB'])

In [7]:
def dropMissing(wordLst, vocab):
    """Return the words of wordLst that are contained in vocab.

    Order and duplicates are preserved; membership is tested with `in`,
    so vocab can be any container supporting that operator.
    """
    kept = []
    for word in wordLst:
        if word in vocab:
            kept.append(word)
    return kept

# Keep only the normalized tokens that appear in the fitted TF-IDF vocabulary.
tfidf_vocab = ngTFVectorizer.vocabulary_.keys()
df['reduced_tokens'] = df['normalized_tokens'].apply(
    lambda tokens: dropMissing(tokens, tfidf_vocab)
)

In [8]:
# Build the gensim Dictionary from the filtered token lists.
reduced_docs = df['reduced_tokens']
dictionary = gensim.corpora.Dictionary(reduced_docs)

In [9]:
## convert every document's token list to its bag-of-words form via doc2bow
## (all documents are included; nothing is held out here)
corpus = list(map(dictionary.doc2bow, df['reduced_tokens']))

In [10]:
# write the bag-of-words corpus to disk in Matrix Market format, then read it back
mm_path = 'abstract.mm'
gensim.corpora.MmCorpus.serialize(mm_path, corpus)
abmm = gensim.corpora.MmCorpus(mm_path)

In [11]:
# Fit a 20-topic LDA model on the serialized corpus, letting gensim learn the
# alpha and eta priors ('auto').
# random_state is fixed because LDA training is stochastic: without a seed the
# topics (and every table derived from them) change on each run.
senlda = gensim.models.ldamodel.LdaModel(
    corpus=abmm,
    id2word=dictionary,
    num_topics=20,
    alpha='auto',
    eta='auto',
    random_state=2020,
)

In [12]:
# For each document, ask the model for its (topic, probability) pairs and
# store them next to the document title.
doc_topics = [senlda[dictionary.doc2bow(tokens)] for tokens in df['reduced_tokens']]
ldaDF = pandas.DataFrame({'titles': df['TI'], 'topics': doc_topics})

In [13]:
# Start every topic column at 0 for every document; topics missing from a
# document's list of pairs keep that 0.
n_docs = len(ldaDF)
topicsProbDict = {}
for topicNum in range(senlda.num_topics):
    topicsProbDict[topicNum] = [0] * n_docs

# Fill in the probabilities that were reported for each document.
for docPos, docTopics in enumerate(ldaDF['topics']):
    for topicNum, prob in docTopics:
        topicsProbDict[topicNum][docPos] = prob

# Attach one 'topic_<n>' column per topic, in topic-number order.
for topicNum, probs in topicsProbDict.items():
    ldaDF['topic_{}'.format(topicNum)] = probs

In [14]:
# K is the number of topics in the fitted model; one display label per topic.
K = senlda.num_topics
topic_labels = list(map('Topic #{}'.format, range(K)))

# One column per topic, holding the words returned by show_topic in the order
# the model returns them (the weights are discarded).
topicsDict = {
    'Topic_{}'.format(topicNum): [word for word, weight in senlda.show_topic(topicNum)]
    for topicNum in range(K)
}

wordRanksDF = pandas.DataFrame(topicsDict)
wordRanksDF

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19
0,health,health,health,health,health,health,health,study,child,year,child,health,child,health,study,determinant,health,health,child,health
1,determinant,study,determinant,mental,determinant,group,age,health,determinant,behavior,patient,support,study,study,determinant,level,factor,factor,factor,child
2,factor,self,study,determinant,patient,study,determinant,patient,woman,health,health,determinant,factor,determinant,health,individual,study,risk,woman,family
3,age,determinant,child,model,high,determinant,study,problem,class,determinant,activity,woman,mortality,care,stress,study,patient,determinant,group,factor
4,study,report,birth,economic,factor,use,patient,determinant,study,school,age,study,fertility,intervention,female,education,life,level,age,level
5,report,care,method,result,study,research,result,age,health,size,physical,education,analysis,research,result,occupational,determinant,result,health,year
6,woman,result,family,stress,family,factor,analysis,year,research,work,study,variable,age,behavior,factor,increase,result,high,study,patient
7,disease,status,age,support,model,woman,care,cost,grade,study,school,level,woman,model,male,behaviour,status,study,determinant,high
8,use,child,low,self,policy,female,population,factor,effect,result,determinant,age,determinant,area,function,relate,analysis,ci,intervention,effect
9,analysis,factor,change,study,research,result,year,high,sex,factor,result,result,significant,child,effect,year,variable,relate,effect,determinant


In [15]:
from gensim.models import ldaseqmodel

In [16]:
def year(row):
    """Return the decade label (as a string) for a row's 'YY' value.

    Anything before 1980 is labelled "1970" (this includes the single 1968
    article); anything that is not below 2020 is labelled "2020".
    """
    published = row['YY']
    # (exclusive upper bound, label) pairs, checked in ascending order
    decade_bounds = (
        (1980, "1970"),
        (1990, "1980"),
        (2000, "1990"),
        (2010, "2000"),
        (2020, "2010"),
    )
    for upper_bound, label in decade_bounds:
        if published < upper_bound:
            return label
    return "2020"

In [17]:
# Label every row with its decade by passing the row to year().
df['year'] = df.apply(year, axis=1)

In [18]:
# Number of non-missing abstracts per decade label.
df.groupby('year')['AB'].count()

year
1970    100
1980    100
1990    100
2000    100
2010    100
Name: AB, dtype: int64

In [19]:
# Number of documents in each of the five time slices passed to the dynamic
# topic model; every slice holds 100 documents, matching the per-decade counts.
# NOTE(review): the list is written p5..p1; harmless while all sizes are equal,
# but confirm the intended order if the slice sizes ever differ.
p1 = p2 = p3 = p4 = p5 = 100
time_slice = [p5, p4, p3, p2, p1]

In [21]:
# Fit a 5-topic dynamic (sequential) LDA model over the bag-of-words corpus,
# with documents grouped into consecutive slices of the sizes in time_slice.
# random_state is fixed because the fit is stochastic: without a seed the
# topics differ between runs.
lda_dyn = ldaseqmodel.LdaSeqModel(
    corpus=corpus,
    id2word=dictionary,
    time_slice=time_slice,
    num_topics=5,
    random_state=2020,
)

  convergence = np.fabs((bound - old_bound) / old_bound)
