In [1]:
import importlib

import gensim
from gensim import matutils, corpora
from gensim.models.ldamodel import LdaModel
import pandas as pd
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.gensim

import snowball

In [2]:
# Load the tab-separated tweet export into a DataFrame.
tweet_data = pd.read_csv("CancerReport-clean-en.txt", sep="\t")

In [3]:
# Sanity check: (rows, columns) of the loaded table.
tweet_data.shape

(72662, 5)

In [4]:
# Preview the first rows to confirm the columns were parsed as expected.
# NOTE(review): the rendered preview shows real account handles and display
# names -- consider clearing this output before sharing the notebook.
tweet_data.head()

Unnamed: 0,id,tweet,date,tweeter_name,tweeter_info
0,185077342425714688,RT @jockosports: 2 more days until my #colonos...,2012-03-28 14:54:11,ATaylor4029,Angela Taylor
1,185081180222263297,My arm is so sore :( had a HPV injection at sc...,2012-03-28 15:09:26,XxShopoholicxX,Sarah
2,185096288109461507,MsRhea Mike HEALTHY Wednesday Awareness mak...,2012-03-28 16:09:28,brendasunshine,Brenda Williams
3,185108334121193472,@xomorgyyxo I asked him three times if it was ...,2012-03-28 16:57:20,RanaNoah,Rana
4,185117800669523968,I have been poked and prodded today.and my boo...,2012-03-28 17:34:57,cinnamom201351,Sandy Cox


In [5]:
# Pull the raw tweet text out of the "tweet" column as a plain Python list;
# this list is what gets handed to snowball.build_gensim_corpus below.
tweets = tweet_data["tweet"].tolist()

In [6]:
# Spot-check the first raw (untokenized) tweet.
tweets[0]

'RT @jockosports: 2 more days until my #colonoscopy. I hope to raise awareness of colon cancer and prevention.  Follow my journey on #402 ...'

In [7]:
# Tokenize *all* tweets into a single corpus, independent of the hashtag /
# search term they carry (a per-tag split is built further down with
# split_up_by_tag=True).
toked_tweets = snowball.build_gensim_corpus(tweets)

In [8]:
# Inspect the tokens produced for the first tweet (compare with tweets[0]).
print(list(toked_tweets[0]))

['rt', 'days', 'colonoscopy', 'hope', 'raise', 'awareness', 'colon', 'cancer', 'prevention', 'follow', 'journey']


In [9]:
# Build the gensim Dictionary (the vocabulary of unique tokens) from the
# tokenized tweets.
dictionary = corpora.Dictionary(toked_tweets)

In [10]:
# Persist the vocabulary to disk (path is relative to the working directory).
dictionary.save("snowball.tweets.dict")

In [11]:
# Summarise the vocabulary: number of unique tokens plus a few samples.
print(dictionary)

Dictionary(11977 unique tokens: ['haber', 'ruiz', 'days', 'buzz', 'andyrichter']...)


In [12]:
# Convert every tokenized tweet to its bag-of-words representation via the
# shared vocabulary (dictionary.doc2bow), one entry per tweet.
gensim_corpus = list(map(dictionary.doc2bow, toked_tweets))

In [13]:
# Fit a 20-topic LDA model on the bag-of-words corpus, making 10 passes over
# the data.
# random_state pins the model's random initialisation so that re-running the
# notebook reproduces the same topics (and the same saved model) instead of a
# different result on every run.
# NOTE(review): alpha=0.001 is a very small value, presumably chosen so each
# short tweet concentrates on few topics -- confirm this is intentional.
lda = LdaModel(gensim_corpus, num_topics=20,
               passes=10, alpha=0.001, id2word=dictionary,
               random_state=42)

In [14]:
# Persist the trained model to disk (path is relative to the working directory).
lda.save("snowball-LDA-20-topics.model")

In [15]:
# Display the highest-weighted words of the learned topics.
# NOTE(review): called without arguments this presumably lists only a subset
# of the 20 topics -- confirm against the gensim docs if all are wanted.
lda.show_topics()

['0.054*rt + 0.050*bless + 0.023*also + 0.020*control + 0.019*birth + 0.019*many + 0.018*october + 0.018*important + 0.017*match + 0.015*amp',
 '0.103*http + 0.102*co + 0.084*cancer + 0.040*screening + 0.039*breast + 0.036*rt + 0.034*mammograms + 0.031*women + 0.030*screenings + 0.025*mammogram',
 '0.087*planned + 0.063*parenthood + 0.052*mammograms + 0.029*rt + 0.028*pa + 0.019*eng + 0.019*imusicrcti + 0.017*defundpp + 0.017*plannedparenthood + 0.017*moh',
 '0.298*crclub + 0.077*rt + 0.057*gurmeetramrahim + 0.038*msgthefilm + 0.024*guruji + 0.022*god + 0.018*saloniradhainsa + 0.018*amp + 0.017*usa + 0.009*insan_divya',
 '0.282*msg + 0.166*gurmeetramrahim + 0.113*rt + 0.016*arg + 0.014*ned + 0.013*ger + 0.012*wid + 0.012*mex + 0.011*success + 0.010*chi',
 '0.084*world + 0.073*celebrations + 0.042*cup + 0.033*rejoicing + 0.031*party + 0.031*zenith + 0.029*guruharsahai + 0.024*aus + 0.015*superb + 0.013*single',
 '0.140*https + 0.045*co + 0.037*rt + 0.024*care + 0.024*periscope + 0.022*p

In [16]:
# Prepare the pyLDAvis visualisation data from the all-tweets model, its
# bag-of-words corpus and its vocabulary.
# NOTE(review): the lines printed below this cell look like fragments of
# pandas deprecation warnings (.order()/.sort()) raised inside pyLDAvis --
# confirm, and consider upgrading the library.
for_viz = pyLDAvis.gensim.prepare(lda, gensim_corpus, dictionary)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])


In [17]:
# Render the prepared all-tweets topic visualisation in the notebook output.
pyLDAvis.display(for_viz)

In [18]:
# Tokenize the tweets again, this time grouped by tag; the result is a
# mapping whose keys are the hashtags / search terms listed in the next cell.
toked_tweets_by_tag = snowball.build_gensim_corpus(tweets, split_up_by_tag=True)

In [19]:
# List the tags for which a separate tokenized corpus is available.
toked_tweets_by_tag.keys()

dict_keys(['mammogram', '#vaccinated', '#crc', '#WomensHealth', 'HPV vaccination', 'cancer prevention', 'vaxx', 'HPV', '#stopcancerb4itstarts', 'pap test', 'Gardasil', 'cancer screening', '#GoingToTheDoctor', 'other', 'cervical cancer', 'colonoscopy', '#fightcancer', 'human papillomavirus', '#screened', 'pap smear'])

In [20]:
# Reload the local snowball module so that edits made to snowball.py after
# the first import take effect in the running kernel.  importlib is imported
# in the notebook's top import cell rather than here.
# NOTE(review): on a fresh Restart & Run All this reload is redundant; it is
# only needed while snowball.py is being edited interactively.
importlib.reload(snowball)

<module 'snowball' from '/Users/byron/dev/snowball/data/snowball.py'>

In [21]:
# Fit a separate topic model on only the tweets tagged 'cervical cancer'.
# gen_lda_model hands back three values, used below as the model, its corpus
# and its dictionary (presumably in that order -- confirm in snowball.py).
lda_cervical, corpus_cervical, dict_cervical = snowball.gen_lda_model(
    toked_tweets_by_tag['cervical cancer']
)

In [22]:
# Prepare the pyLDAvis visualisation data for the 'cervical cancer' model,
# using that model's own corpus and dictionary.
for_viz_cervical = pyLDAvis.gensim.prepare(lda_cervical, corpus_cervical, dict_cervical)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])


In [23]:
# Render the 'cervical cancer' topic visualisation in the notebook output.
pyLDAvis.display(for_viz_cervical)