In [1]:
!pip install lda
!conda install gensim --yes

#using the tutorial from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [20]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
#nltk.download('wordnet')

np.random.seed(64)

import lda
import pandas as pd

In [21]:
## pre-processing 
stemmer = SnowballStemmer("english")
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [22]:
##reading data
chain = pd.read_csv('data/pilot1x10_sept_27/info_cleaned.csv')[["generation","response"]]
chain["condition"] = "chain"
network = pd.read_csv('data/pilot3x5x5_sept_30/info_cleaned.csv')[["generation","response"]]
network["condition"] = "network"

data = pd.concat([chain,network])
data.head()

Unnamed: 0,generation,response,condition
0,0,Most people in the past didn't die from heart ...,chain
1,0,Early on many people did not die of heart dise...,chain
2,0,"In the old days, people didn't die of cancer o...",chain
3,0,In the past people didn't die of the diseases ...,chain
4,0,While antibiotics have been used to fight bact...,chain


In [23]:
##
processed_docs = data['response'].map(preprocess)
processed_docs[:10]

0    [peopl, past, heart, diseas, live, long, devel...
1    [earli, peopl, heart, diseas, stroke, die, inf...
2    [day, peopl, cancer, heart, diseas, modern, il...
3    [past, peopl, diseas, common, today, live, lon...
4    [antibiot, fight, bacteri, infect, year, effec...
5    [peopl, live, long, experi, heart, problem, pe...
6    [peopl, past, cancer, live, long, antibiot, pe...
7    [decad, peopl, die, reason, cancer, heart, dis...
8    [histori, peopl, cancer, hear, diseas, lifesty...
9    [recent, histori, human, death, weren, caus, c...
Name: response, dtype: object

In [24]:
#Create dictionary of words
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 alexand
1 antibiot
2 caus
3 citi
4 claim
5 come
6 develop
7 diseas
8 fleme
9 golden
10 heart


In [25]:
len(dictionary)

584

In [26]:
#Delete extreme words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500)

In [27]:
#Create the corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_doc_4310 = bow_corpus[10]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 3 ("come") appears 1 time.
Word 4 ("develop") appears 1 time.
Word 5 ("diseas") appears 2 time.
Word 7 ("heart") appears 1 time.
Word 8 ("live") appears 1 time.
Word 9 ("long") appears 1 time.
Word 12 ("penicillin") appears 3 time.
Word 15 ("save") appears 1 time.
Word 16 ("superbug") appears 3 time.
Word 33 ("stronger") appears 1 time.
Word 35 ("accid") appears 1 time.
Word 36 ("cancer") appears 1 time.
Word 38 ("cure") appears 1 time.
Word 39 ("day") appears 1 time.
Word 49 ("time") appears 1 time.
Word 50 ("chang") appears 1 time.
Word 52 ("death") appears 1 time.
Word 55 ("injuri") appears 1 time.
Word 58 ("today") appears 1 time.
Word 59 ("treat") appears 1 time.
Word 61 ("like") appears 1 time.
Word 67 ("alli") appears 1 time.
Word 81 ("make") appears 1 time.
Word 86 ("work") appears 1 time.
Word 87 ("abl") appears 1 time.
Word 88 ("estim") appears 1 time.
Word 89 ("introduc") appears 1 time.


In [28]:
# TF-IDF
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.2038942972252099),
 (1, 0.18194116380617711),
 (2, 0.2699678674937903),
 (3, 0.2590218480054993),
 (4, 0.36388232761235423),
 (5, 0.10992934325564296),
 (6, 0.21019265565416922),
 (7, 0.14488581282424157),
 (8, 0.30828479458055397),
 (9, 0.18701002745496514),
 (10, 0.11499820690443105),
 (11, 0.17710814837761873),
 (12, 0.14147924218060778),
 (13, 0.28220447620470335),
 (14, 0.33103366855603344),
 (15, 0.16806844102344565),
 (16, 0.1923389440465296),
 (17, 0.23176453691546034),
 (18, 0.2699678674937903)]


In [29]:
# #Bag of words
# lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=dictionary, passes=2, workers=6)

# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [73]:
num_topics = 4
data = pd.concat([chain,network])

In [74]:
#Fit model: tF-idf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.030*"death" + 0.025*"increas" + 0.023*"need" + 0.023*"reduc" + 0.021*"overus" + 0.021*"effect" + 0.020*"super" + 0.018*"superbug" + 0.018*"rise" + 0.017*"discov"
Topic: 1 Word: 0.023*"medicin" + 0.020*"year" + 0.018*"live" + 0.018*"develop" + 0.017*"problem" + 0.017*"million" + 0.017*"caus" + 0.016*"diseas" + 0.016*"chang" + 0.016*"ill"
Topic: 2 Word: 0.030*"diseas" + 0.029*"potenti" + 0.022*"longer" + 0.022*"prescrib" + 0.021*"bodi" + 0.021*"immun" + 0.020*"help" + 0.019*"make" + 0.018*"fight" + 0.016*"live"
Topic: 3 Word: 0.031*"cancer" + 0.022*"know" + 0.022*"invent" + 0.021*"save" + 0.020*"live" + 0.018*"diseas" + 0.017*"long" + 0.016*"heart" + 0.016*"penicillin" + 0.016*"bug"


In [75]:
#Check one of the stories
for index, score in sorted(lda_model_tfidf[bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9755772352218628	 
Topic: 0.023*"medicin" + 0.020*"year" + 0.018*"live" + 0.018*"develop" + 0.017*"problem" + 0.017*"million" + 0.017*"caus" + 0.016*"diseas" + 0.016*"chang" + 0.016*"ill"


In [76]:
lda_model_tfidf[bow_corpus[1]]

[(0, 0.17406012), (1, 0.5276338), (3, 0.28929508)]

In [77]:
#Create the probability of being in each topic
topics = np.zeros((num_topics,len(data)))
for j,response in enumerate(bow_corpus):
    values = dict(lda_model_tfidf[response])
    for i in range(num_topics):
        if values.get(i):
            topics[i,j] =values[i]
        else:
            topics[i,j] = 0

In [78]:
#save the data
for i in range(num_topics):
    data[i] = topics[i,:]
    
data.to_csv("data/lda_quick_test.tsv",sep="\t")
data.head()

Unnamed: 0,generation,response,condition,0,1,2,3
0,0,Most people in the past didn't die from heart ...,chain,0.01155,0.824665,0.011374,0.152411
1,0,Early on many people did not die of heart dise...,chain,0.174057,0.527655,0.0,0.289278
2,0,"In the old days, people didn't die of cancer o...",chain,0.0,0.971687,0.0,0.0
3,0,In the past people didn't die of the diseases ...,chain,0.010607,0.96793,0.010736,0.010727
4,0,While antibiotics have been used to fight bact...,chain,0.319894,0.636012,0.021799,0.022295


In [79]:
data.groupby("condition").mean()

Unnamed: 0_level_0,generation,0,1,2,3
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chain,1.87234,0.151665,0.416267,0.081544,0.34756
network,1.972973,0.266504,0.409893,0.171839,0.149246


In [80]:
data.groupby("condition").std()

Unnamed: 0_level_0,generation,0,1,2,3
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chain,1.377096,0.275122,0.422409,0.200332,0.407104
network,1.413952,0.358676,0.398983,0.292185,0.262359


In [82]:
data.groupby(["generation"]).mean()

Unnamed: 0_level_0,0,1,2,3
generation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.075049,0.734054,0.025648,0.157578
1,0.221046,0.4874,0.077442,0.210286
2,0.260946,0.175748,0.194736,0.367797
3,0.228058,0.343946,0.226046,0.201193
4,0.343907,0.303233,0.164375,0.188486
