In [8]:
import gensim
from gensim import matutils, corpora
from gensim.models.ldamodel import LdaModel
import pandas as pd
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.gensim

import snowball

In [9]:
# Load the tab-separated tweet export into a DataFrame.
# The relative path resolves against the kernel's working directory.
tweet_data = pd.read_csv("CancerReport-clean-en.txt", sep="\t")

In [10]:
# Sanity check: (rows, columns) of the frame that was just loaded.
tweet_data.shape

(72662, 5)

In [11]:
# Peek at the first rows to confirm the columns parsed as expected.
tweet_data.head()

Unnamed: 0,id,tweet,date,tweeter_name,tweeter_info
0,185077342425714688,RT @jockosports: 2 more days until my #colonos...,2012-03-28 14:54:11,ATaylor4029,Angela Taylor
1,185081180222263297,My arm is so sore :( had a HPV injection at sc...,2012-03-28 15:09:26,XxShopoholicxX,Sarah
2,185096288109461507,MsRhea Mike HEALTHY Wednesday Awareness mak...,2012-03-28 16:09:28,brendasunshine,Brenda Williams
3,185108334121193472,@xomorgyyxo I asked him three times if it was ...,2012-03-28 16:57:20,RanaNoah,Rana
4,185117800669523968,I have been poked and prodded today.and my boo...,2012-03-28 17:34:57,cinnamom201351,Sandy Cox


In [12]:
# Pull the "tweet" column out of the frame as a plain Python list.
tweets = tweet_data["tweet"].tolist()

In [13]:
# Spot-check the first raw tweet before any tokenization is applied.
tweets[0]

'RT @jockosports: 2 more days until my #colonoscopy. I hope to raise awareness of colon cancer and prevention.  Follow my journey on #402 ...'

In [30]:
# Tokenize the complete tweet list in one pass, i.e. *all* tweets pooled
# together regardless of hashtag (compare the split_up_by_tag=True call later).
# Judging by the sample output in the next cell, tokens come back lower-cased
# with stop words and numbers dropped — TODO confirm against snowball.py.
# NOTE(review): this cell's execution count (30) is out of sequence with its
# neighbours, so it was re-run after later cells; verify the notebook still
# reproduces under Restart Kernel -> Run All.
toked_tweets = snowball.build_gensim_corpus(tweets)

In [15]:
# Show the tokens produced for the first tweet to eyeball the preprocessing.
print(list(toked_tweets[0]))

['rt', 'days', 'colonoscopy', 'hope', 'raise', 'awareness', 'colon', 'cancer', 'prevention', 'follow', 'journey']


In [16]:
# Build the gensim Dictionary (the vocabulary of unique tokens) from the
# tokenized tweets; it is used below for doc2bow encoding and as id2word.
dictionary = corpora.Dictionary(toked_tweets)

In [17]:
# Persist the dictionary to disk; the relative file name resolves against the
# kernel's working directory.
dictionary.save("snowball.tweets.dict")

In [18]:
# Print the dictionary summary (unique-token count plus a few sample tokens).
print(dictionary)

Dictionary(11977 unique tokens: ['shitting', 'succe', 'civilians', 'discovered', 'promo']...)


In [19]:
# Encode every tokenized tweet as a bag-of-words vector via the dictionary.
gensim_corpus = [
    dictionary.doc2bow(tokens)
    for tokens in toked_tweets
]

In [20]:
# Fit a 20-topic LDA model over the bag-of-words corpus, making 10 passes.
# LDA training is stochastic: without a fixed seed every fresh run produces
# different topics, so random_state is pinned to make the fit repeatable.
# (Outputs recorded before the seed was added will not match a re-run.)
# alpha=0.001 is a very small document-topic prior — presumably chosen so that
# each short tweet concentrates on only a few topics; TODO confirm the intent.
lda = LdaModel(gensim_corpus, num_topics=20,
               passes=10, alpha=0.001, id2word=dictionary,
               random_state=42)

In [21]:
# Save the fitted model to disk (relative to the kernel's working directory).
lda.save("snowball-LDA-20-topics.model")

In [22]:
# Inspect the learned topics as weighted term lists.
# NOTE(review): in the recorded output, tokens such as 'http', 'co' and 'rt'
# carry large weights in several topics — these look like URL fragments and
# retweet markers that survived tokenization; consider filtering them upstream.
lda.show_topics()

['0.133*cancer + 0.115*http + 0.114*co + 0.041*rt + 0.039*cervical + 0.038*breast + 0.036*screening + 0.027*women + 0.025*prevention + 0.023*screenings',
 '0.102*craze + 0.055*rt + 0.046*crc + 0.035*gre + 0.033*mex + 0.028*usavcrc + 0.021*nga + 0.019*ussoccer_wnt + 0.018*pp + 0.015*child',
 '0.172*fans + 0.073*http + 0.073*co + 0.044*col + 0.018*rejoicing + 0.016*zenith + 0.015*guruharsahai + 0.013*jpi + 0.013*nlmg + 0.012*colonoscopy',
 '0.110*http + 0.097*co + 0.034*rt + 0.027*high + 0.022*moh + 0.021*wide + 0.013*fifawwc + 0.011*seen + 0.010*top + 0.010*lot',
 '0.054*celebrations + 0.026*rt + 0.024*really + 0.023*party + 0.021*tapping + 0.020*wave + 0.019*running + 0.018*fill + 0.018*quarter + 0.018*game',
 '0.051*ht + 0.042*pink + 0.031*chi + 0.023*imusicrcti + 0.021*divine + 0.020*imusic_rcti + 0.019*prostate + 0.019*pm + 0.019*magic + 0.017*kotakmisteriimusic',
 '0.177*crc + 0.078*ned + 0.078*rt + 0.045*worldcup + 0.040*world + 0.035*celebration + 0.034*arg + 0.027*ger + 0.027*fr

In [23]:
# Build the pyLDAvis data structure from the fitted model, its corpus and the
# dictionary. The text recorded under this cell looks like pandas deprecation
# warnings (`.order` / `.sort`) raised inside pyLDAvis — TODO confirm versions.
for_viz = pyLDAvis.gensim.prepare(lda, gensim_corpus, dictionary)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])


In [24]:
# Render the prepared topic visualization for the all-tweets model.
pyLDAvis.display(for_viz)

In [31]:
# Tokenize again, this time grouped by tag: as the keys() output in the next
# cell shows, the result is a dict keyed by hashtag / search term.
toked_tweets_by_tag = snowball.build_gensim_corpus(tweets, split_up_by_tag=True)

In [33]:
# List the tags the tweets were bucketed under.
toked_tweets_by_tag.keys()

dict_keys(['cervical cancer', '#GoingToTheDoctor', 'mammogram', '#crc', 'HPV vaccination', '#vaccinated', 'cancer prevention', 'pap test', '#WomensHealth', 'vaxx', 'colonoscopy', 'HPV', 'cancer screening', 'human papillomavirus', '#screened', '#stopcancerb4itstarts', 'pap smear', 'Gardasil', '#fightcancer', 'other'])

In [42]:
# Dev-time hot reload so edits to snowball.py are picked up without a kernel
# restart. NOTE(review): leftover from iteration; drop before sharing.
import importlib
importlib.reload(snowball)

<module 'snowball' from '/Users/byron/dev/snowball/data/snowball.py'>

In [43]:
# Fit a topic model on the 'cervical cancer' bucket only. The helper's result
# is unpacked as (model, corpus, dictionary), matching how it is used below.
lda_cervical, corpus_cervical, dict_cervical = snowball.gen_lda_model(
    toked_tweets_by_tag['cervical cancer']
)

In [44]:
# Build the pyLDAvis data structure for the cervical-cancer model, using the
# corpus and dictionary returned alongside it.
for_viz_cervical = pyLDAvis.gensim.prepare(lda_cervical, corpus_cervical, dict_cervical)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])


In [45]:
# Render the prepared topic visualization for the cervical-cancer model.
pyLDAvis.display(for_viz_cervical)