In [1]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git


import sklearn
import sklearn.feature_extraction.text

#import scipy.cluster.hierarchy
import gensim#For topic modeling
import requests #For downloading our datasets
import numpy as np #for arrays
import pandas #gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import matplotlib.cm #Still for graphics

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning it may produce
%matplotlib inline

In [2]:
# Load the document table; index_col=0 uses the first CSV column as the row index.
# The path uses a forward slash, which works on Windows, macOS and Linux alike.
# The previous literal 't\df_70.csv' depended on the invalid escape sequence
# '\d' and only pointed at a file inside the 't' directory on Windows.
df = pandas.read_csv('t/df_70.csv', index_col=0)

In [3]:
# Tokenize the 'AB' text of every row, then normalize the resulting token lists.
# The helper functions are passed to apply directly; no wrapper is needed
# because each one is called with exactly the cell value.
df['tokenized_text'] = df['AB'].apply(lucem_illud_2020.word_tokenize)
df['normalized_tokens'] = df['tokenized_text'].apply(lucem_illud_2020.normalizeTokens)

### Exercise 1

Construct cells immediately below this that construct features and cluster your documents using K-means and a variety of cluster numbers. Interrogate the cluster contents in terms of both documents and features. Identify the "optimal" cluster number with Silhouette analysis. Plot clusters and features after reducing with PCA. What does this cluster structure reveal about the organization of documents in your corpora?

In [4]:
# Fit a TF-IDF vectorizer on the 'AB' text column. Its learned vocabulary is
# what later cells use to decide which tokens to keep.
ngTFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english',  # drop common English function words
    min_df=3,              # ignore terms found in fewer than 3 documents
    max_df=0.5,            # ignore terms found in more than half of the documents
    max_features=1000,     # cap the vocabulary at 1000 terms
    norm='l2',             # scale every document vector to unit Euclidean length
)
groupsTFVects = ngTFVectorizer.fit_transform(df['AB'])

In [5]:
def dropMissing(wordLst, vocab):
    """Return the words of ``wordLst`` that are members of ``vocab``.

    Order and duplicates of ``wordLst`` are preserved. ``vocab`` can be any
    container that supports the ``in`` operator.
    """
    kept = []
    for word in wordLst:
        if word in vocab:
            kept.append(word)
    return kept

# Keep only the normalized tokens that are part of the fitted TF-IDF vocabulary
df['reduced_tokens'] = df['normalized_tokens'].apply(
    lambda tokens: dropMissing(tokens, ngTFVectorizer.vocabulary_.keys())
)

In [6]:
# Build the gensim Dictionary (the token <-> integer id mapping) from the
# reduced token list of every document
dictionary = gensim.corpora.Dictionary(df['reduced_tokens'])

In [7]:
## Convert every document to its bag-of-words form: a list of
## (token id, count) tuples. All documents are converted here; no part of
## the data is held out for testing.
corpus = [dictionary.doc2bow(text) for text in df['reduced_tokens']]

In [8]:
# Write the bag-of-words corpus to 'abstract.mm' in the working directory
# (Matrix Market format), then reopen that file as an MmCorpus
gensim.corpora.MmCorpus.serialize('abstract.mm', corpus)
abmm = gensim.corpora.MmCorpus('abstract.mm')

In [9]:
# Fit a 20-topic LDA model on the serialized bag-of-words corpus.
# alpha='auto' and eta='auto' let gensim learn the document-topic and
# topic-word priors from the data instead of using fixed values.
# random_state pins the seed so that Restart & Run All reproduces the same
# topics; without it every run produces a different model.
senlda = gensim.models.ldamodel.LdaModel(corpus=abmm, id2word=dictionary, num_topics=20,
                                         alpha='auto', eta='auto', random_state=42)

In [10]:
# sen1Bow = dictionary.doc2bow(df['reduced_tokens'][0])
# sen1lda = senlda[sen1Bow]
# print("The topics of the text: {}".format(df['TI'][0]))
# print("are: {}".format(sen1lda))

In [11]:
# One row per document: its 'TI' value and the topic mixture the LDA model
# returns for the document's bag-of-words vector.
ldaDF = pandas.DataFrame({
    'titles': df['TI'],
    'topics': [senlda[dictionary.doc2bow(tokens)] for tokens in df['reduced_tokens']],
})

In [12]:
# Start with one list per topic holding a 0 for every document.
topicsProbDict = {}
for topicNum in range(senlda.num_topics):
    topicsProbDict[topicNum] = [0] * len(ldaDF)

# Fill in the probabilities the model reported. A topic that is absent from a
# document's (topic, probability) tuples keeps its initial 0.
for index, topicTuples in enumerate(ldaDF['topics']):
    for topicNum, prob in topicTuples:
        topicsProbDict[topicNum][index] = prob

# Add one 'topic_<n>' column per topic, in ascending topic order.
for topicNum in topicsProbDict:
    ldaDF['topic_{}'.format(topicNum)] = topicsProbDict[topicNum]

In [13]:
K = senlda.num_topics  # number of topics in the fitted model
topic_labels = ['Topic #{}'.format(k) for k in range(K)]

# For every topic, collect the words returned by show_topic (dropping their
# weights) so that each DataFrame column lists one topic's words in the order
# the model returned them.
topicsDict = {}
for topicNum in range(K):
    topicWords = [word for word, _weight in senlda.show_topic(topicNum)]
    topicsDict['Topic_{}'.format(topicNum)] = topicWords

wordRanksDF = pandas.DataFrame(topicsDict)
wordRanksDF

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19
0,psychological,factor,control,consumer,sample,factor,marital,economic,determinant,group,group,health,study,concept,environment,patient,female,family,health,rate
1,attitude,life,locus,study,child,therapy,determinant,determinant,increase,study,experience,determinant,determinant,self,work,therapist,group,determinant,group,high
2,work,medicine,study,participation,age,adult,relation,political,self,behavior,variable,variable,state,treatment,measure,experience,male,economic,child,effect
3,research,fertility,problem,member,performance,personality,group,class,environmental,sex,determinant,study,factor,determinant,determinant,treatment,test,mobility,study,determinant
4,positive,determinant,internal,health,similar,psychosomatic,age,population,concept,patient,result,planning,effect,factor,process,factor,child,process,sex,migration
5,determinant,time,health,fertility,suggest,cultural,male,individual,analysis,difference,behavior,effect,view,family,changing,change,study,low,determinant,function
6,experience,disease,difference,group,treatment,population,effect,increase,life,activity,effect,country,policy,case,information,determinant,behavior,age,patient,fertility
7,human,year,finding,attitude,development,therapeutic,general,factor,scale,outcome,study,major,economic,study,time,year,interaction,status,behavior,level
8,use,condition,factor,determinant,report,family,examine,problem,nature,result,role,analysis,symptom,characteristic,include,approach,day,suicide,mental,suicide
9,smoking,experimental,design,infant,year,education,tend,study,factor,significantly,sexual,behavior,need,vaccination,relation,present,variable,intervention,control,family
