In [1]:
!pip install lda
!conda install gensim --yes

#using the tutorial from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [167]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
#nltk.download('wordnet')


import lda
import pandas as pd

In [168]:
## pre-processing 
stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [169]:
##reading data
chain = pd.read_csv('data/pilot1x10_sept_27/info_cleaned.csv')[["generation","response"]]
chain["condition"] = "chain"
network = pd.read_csv('data/pilot3x5x5_sept_30/info_cleaned.csv')[["generation","response"]]
network["condition"] = "network"

data = pd.concat([chain,network])
data.head()

Unnamed: 0,generation,response,condition
0,0,Most people in the past didn't die from heart ...,chain
1,0,Early on many people did not die of heart dise...,chain
2,0,"In the old days, people didn't die of cancer o...",chain
3,0,In the past people didn't die of the diseases ...,chain
4,0,While antibiotics have been used to fight bact...,chain


In [170]:
##
processed_docs = data['response'].map(preprocess)
processed_docs[:10]

0    [peopl, past, heart, diseas, live, long, devel...
1    [earli, peopl, heart, diseas, stroke, die, inf...
2    [day, peopl, cancer, heart, diseas, modern, il...
3    [past, peopl, diseas, common, today, live, lon...
4    [antibiot, fight, bacteri, infect, year, effec...
5    [peopl, live, long, experi, heart, problem, pe...
6    [peopl, past, cancer, live, long, antibiot, pe...
7    [decad, peopl, die, reason, cancer, heart, dis...
8    [histori, peopl, cancer, hear, diseas, lifesty...
9    [recent, histori, human, death, weren, caus, c...
Name: response, dtype: object

In [171]:
#Create dictionary of words
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 alexand
1 antibiot
2 caus
3 citi
4 claim
5 come
6 develop
7 diseas
8 fleme
9 golden
10 heart


In [172]:
len(dictionary)

584

In [173]:
#Delete extreme words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500)

In [174]:
#Create the corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_doc_4310 = bow_corpus[10]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 3 ("come") appears 1 time.
Word 4 ("develop") appears 1 time.
Word 5 ("diseas") appears 2 time.
Word 7 ("heart") appears 1 time.
Word 8 ("live") appears 1 time.
Word 9 ("long") appears 1 time.
Word 12 ("penicillin") appears 3 time.
Word 15 ("save") appears 1 time.
Word 16 ("superbug") appears 3 time.
Word 33 ("stronger") appears 1 time.
Word 35 ("accid") appears 1 time.
Word 36 ("cancer") appears 1 time.
Word 38 ("cure") appears 1 time.
Word 39 ("day") appears 1 time.
Word 49 ("time") appears 1 time.
Word 50 ("chang") appears 1 time.
Word 52 ("death") appears 1 time.
Word 55 ("injuri") appears 1 time.
Word 58 ("today") appears 1 time.
Word 59 ("treat") appears 1 time.
Word 61 ("like") appears 1 time.
Word 67 ("alli") appears 1 time.
Word 81 ("make") appears 1 time.
Word 86 ("work") appears 1 time.
Word 87 ("abl") appears 1 time.
Word 88 ("estim") appears 1 time.
Word 89 ("introduc") appears 1 time.


In [175]:
# TF-IDF
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.2038942972252099),
 (1, 0.18194116380617711),
 (2, 0.2699678674937903),
 (3, 0.2590218480054993),
 (4, 0.36388232761235423),
 (5, 0.10992934325564296),
 (6, 0.21019265565416922),
 (7, 0.14488581282424157),
 (8, 0.30828479458055397),
 (9, 0.18701002745496514),
 (10, 0.11499820690443105),
 (11, 0.17710814837761873),
 (12, 0.14147924218060778),
 (13, 0.28220447620470335),
 (14, 0.33103366855603344),
 (15, 0.16806844102344565),
 (16, 0.1923389440465296),
 (17, 0.23176453691546034),
 (18, 0.2699678674937903)]


In [176]:
# #Bag of words
# lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=dictionary, passes=2, workers=6)

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [177]:
num_topics = 3
data = pd.concat([chain,network])

In [180]:
#Fit model: tF-idf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=20, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.035*"superbug" + 0.034*"creat" + 0.032*"cancer" + 0.032*"know" + 0.026*"cure" + 0.025*"mutat" + 0.024*"kill" + 0.021*"reason" + 0.021*"livestock" + 0.020*"immun"
Topic: 1 Word: 0.028*"develop" + 0.028*"year" + 0.022*"diseas" + 0.022*"live" + 0.020*"heart" + 0.020*"long" + 0.020*"die" + 0.019*"fight" + 0.019*"save" + 0.018*"like"
Topic: 2 Word: 0.026*"overus" + 0.024*"death" + 0.024*"super" + 0.021*"bug" + 0.020*"million" + 0.019*"increas" + 0.018*"live" + 0.018*"effect" + 0.017*"medicin" + 0.017*"longer"


In [179]:
#Check one of the stories
for index, score in sorted(lda_model_tfidf[bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.733862042427063	 
Topic: 0.025*"develop" + 0.024*"year" + 0.020*"medicin" + 0.019*"save" + 0.019*"like" + 0.018*"die" + 0.017*"chang" + 0.017*"diseas" + 0.017*"current" + 0.017*"drug"

Score: 0.25483423471450806	 
Topic: 0.028*"diseas" + 0.028*"need" + 0.027*"bug" + 0.027*"super" + 0.025*"penicillin" + 0.023*"longer" + 0.021*"live" + 0.020*"invent" + 0.017*"creat" + 0.017*"begin"

Score: 0.01130377221852541	 
Topic: 0.032*"caus" + 0.027*"overus" + 0.027*"reduc" + 0.026*"rate" + 0.026*"death" + 0.024*"million" + 0.022*"cancer" + 0.022*"know" + 0.020*"go" + 0.020*"problem"


In [161]:
lda_model_tfidf[bow_corpus[1]]

[(0, 0.45748794), (3, 0.52122015)]

In [181]:
#Create the probability of being in each topic
topics = np.zeros((num_topics,len(data)))
for j,response in enumerate(bow_corpus):
    values = dict(lda_model_tfidf[response])
    for i in range(num_topics):
        if values.get(i):
            topics[i,j] =values[i]
        else:
            topics[i,j] = 0

In [182]:
#save the data
for i in range(num_topics):
    data[i] = topics[i,:]
    
data.to_csv("data/lda_quick_test.tsv",sep="\t")
data.head()

Unnamed: 0,generation,response,condition,0,1,2
0,0,Most people in the past didn't die from heart ...,chain,0.016391,0.968033,0.015576
1,0,Early on many people did not die of heart dise...,chain,0.476841,0.344351,0.178808
2,0,"In the old days, people didn't die of cancer o...",chain,0.148689,0.565656,0.285656
3,0,In the past people didn't die of the diseases ...,chain,0.014095,0.969729,0.016176
4,0,While antibiotics have been used to fight bact...,chain,0.033822,0.933603,0.032574


In [183]:
data.groupby("condition").mean()

Unnamed: 0_level_0,generation,0,1,2
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chain,1.87234,0.283555,0.440313,0.275763
network,1.972973,0.142015,0.269246,0.588271


In [184]:
data.groupby("condition").std()

Unnamed: 0_level_0,generation,0,1,2
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chain,1.377096,0.354534,0.386173,0.333615
network,1.413952,0.227173,0.297278,0.33754


In [185]:
data.groupby(["generation"]).mean()

Unnamed: 0_level_0,0,1,2
generation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.107524,0.642495,0.247902
1,0.142999,0.331886,0.525115
2,0.218678,0.304552,0.47677
3,0.295754,0.225272,0.478974
4,0.224397,0.143514,0.632089


In [186]:
data.groupby(["condition","generation"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
condition,generation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chain,0,0.144554,0.781544,0.072165
chain,1,0.291436,0.44356,0.265004
chain,2,0.247567,0.419737,0.332697
chain,3,0.390013,0.260152,0.349836
chain,4,0.370199,0.234969,0.394832
network,0,0.082837,0.549796,0.36506
network,1,0.044041,0.257437,0.698522
network,2,0.199419,0.227762,0.572819
network,3,0.232915,0.20202,0.565066
network,4,0.151496,0.097786,0.750718
