In [2]:
import gensim
from gensim import matutils, corpora
from gensim.models.ldamodel import LdaModel
import pandas as pd
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.gensim
import numpy as np 
import snowball

In [3]:
###
# note! as of 1/15, using *only* tweets that contain terms drawn from a whitelist.
# this is a `high precision' set, but greatly reduces the # of tweets (to 13345; 
# ignoring 100177 tweets)
tweet_data = pd.read_csv("CancerReport-clean-whitelisted-en.txt", delimiter="\t")

In [4]:
tweet_data.shape

(13345, 40)

In [5]:
tweet_data.head()

Unnamed: 0,tweet_id,tweet_text,tweet_created_at,in_reply_to_status_id_str,in_reply_to_screen_name,retweet_count,favorite_count,machine_translated_language,geo_lat,geo_long,...,retweet_created_at,retweet_screen_name,retweet_user_created_at,retweet_person_name,retweet_statuses_count,retweet_friends_count,retweet_followers_count,retweet_urls,retweet_hashtag_text,retweet_usermentions_screen_name
0,185081180222263297,My arm is so sore :( had a HPV injection at sc...,2012-03-28 15:09:26,,,,,,0,0,...,,,,,,,,,,
1,185096288109461507,MsRhea Mike HEALTHY Wednesday Awareness mak...,2012-03-28 16:09:28,,,,,,0,0,...,,,,,,,,,,
2,185108334121193472,@xomorgyyxo I asked him three times if it was ...,2012-03-28 16:57:20,,,,,,0,0,...,,,,,,,,,,
3,185070816092880896,Just informed by a 26 year old Human papillom...,2012-03-28 14:28:15,,,,,,0,0,...,,,,,,,,,,
4,185173819793735680,"If someone tells you that ""everybody"" has HPV ...",2012-03-28 21:17:33,,,,,,0,0,...,,,,,,,,,,


In [6]:
primary_tweets = tweet_data[tweet_data["retweet"] == False]

In [7]:
tweets = tweet_data["tweet_text"].tolist()

In [8]:
tweets[0]

"My arm is so sore :( had a HPV injection at school today :'( xxx"

In [9]:
# this will be *all* tweets; independent of the hashtag
toked_tweets, kept_indices = snowball.build_gensim_corpus(tweets)

In [10]:
print(list(toked_tweets[0]))

['arm', 'sore', 'hpv', 'injection', 'school', 'today', 'xxx']


In [11]:
retweet_counts = np.nan_to_num(tweet_data["retweet_count"].tolist()) # note that I am assuming NaN == 0

In [12]:
retweet_counts[:10]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [13]:
dictionary = corpora.Dictionary(toked_tweets)

In [14]:
dictionary.save("snowball.tweets.dict")

In [15]:
print(dictionary)

Dictionary(3529 unique tokens: ['katiecouric', 'days', 'ow', 'diagnosing', 'prevention']...)


In [16]:
gensim_corpus = [dictionary.doc2bow(tweet) for tweet in toked_tweets]#[[dictionary.token2id[t] for t in tweet] for tweet in toked_tweets]

In [17]:
lda = LdaModel(gensim_corpus, num_topics=20,
                    passes=10, alpha=0.001, id2word=dictionary)

In [18]:
lda.save("snowball-LDA-20-topics.model")

In [19]:
lda.show_topics()

['0.080*co + 0.078*http + 0.045*hpv + 0.019*adverse + 0.018*patients + 0.018*thought + 0.018*diet + 0.016*star + 0.015*agency + 0.014*reactions',
 '0.130*hpv + 0.115*http + 0.114*co + 0.090*vaccine + 0.045*girls + 0.033*rt + 0.018*boys + 0.017*young + 0.014*getting + 0.014*human',
 '0.088*https + 0.055*cancer + 0.051*rt + 0.047*cervical + 0.027*think + 0.018*give + 0.018*spread + 0.017*aren + 0.014*comes + 0.013*enough',
 '0.106*cancer + 0.097*cervical + 0.080*http + 0.072*co + 0.050*rt + 0.042*awareness + 0.030*prevention + 0.022*warts + 0.020*dies + 0.018*genital',
 '0.099*cancer + 0.088*http + 0.087*cervical + 0.081*co + 0.049*prevent + 0.039*rt + 0.031*screening + 0.021*women + 0.020*help + 0.019*tests',
 '0.121*hpv + 0.095*http + 0.092*co + 0.046*vaccine + 0.038*rt + 0.031*vaccination + 0.025*study + 0.020*cancers + 0.014*girls + 0.014*health',
 '0.095*cancer + 0.080*cervical + 0.068*co + 0.066*http + 0.038*rt + 0.037*know + 0.015*women + 0.015*causes + 0.013*sb + 0.013*common',
 

In [20]:
for_viz = pyLDAvis.gensim.prepare(lda, gensim_corpus, dictionary)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])


In [21]:
pyLDAvis.display(for_viz)

In [22]:
toked_tweets_by_tag, tags_to_raw_tweets = snowball.build_gensim_corpus(tweets, split_up_by_tag=True)

In [23]:
toked_tweets_by_tag.keys()

dict_keys(['cancer prevention', 'HPV vaccination', '#fightcancer', '#WomensHealth', 'cervical cancer', 'vaxx', 'colonoscopy', '#screened', 'mammogram', 'HPV', 'pap smear', '#vaccinated', 'Gardasil'])

In [24]:
import importlib; importlib.reload(snowball)

<module 'snowball' from '/Users/byron/dev/snowball/data/snowball.py'>

In [25]:
lda_cervical, corpus_cervical, dict_cervical= snowball.gen_lda_model(toked_tweets_by_tag['cervical cancer'])

In [26]:
for_viz_cervical = pyLDAvis.gensim.prepare(lda_cervical, corpus_cervical, dict_cervical)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])


In [27]:
pyLDAvis.display(for_viz_cervical)

In [28]:
### 
# split by arm
###
cervical_arm_tags = ["pap smear", "pap test", "HPV", "human papillomavirus","HPV vaccination","Gardasil","cervical cancer"]
comparison_study_arm_tags = ["#GoingToTheDoctor", "#WomensHealth", "colonoscopy","cancer prevention","cancer screening","mammogram","vaxx","#fightcancer","#stopcancerb4itstarts","#screened","#vaccinated"]

# get tweets corresponding to each 
cervical_arm_tweets = []
for tag in cervical_arm_tags: 
    cervical_arm_tweets.extend(toked_tweets_by_tag[tag])

comparison_arm_tweets = []
for tag in comparison_study_arm_tags:
    comparison_arm_tweets.extend(toked_tweets_by_tag[tag])

print("num cervical arm: %s; num comparison arm: %s" % (len(cervical_arm_tweets), len(comparison_arm_tweets)))



num cervical arm: 14826; num comparison arm: 715


In [29]:
# stratified LDA
lda_cervical_arm, corpus_cervical_arm, dict_cervical_arm = snowball.gen_lda_model(cervical_arm_tweets)
for_viz_cervical = pyLDAvis.gensim.prepare(lda_cervical, corpus_cervical, dict_cervical)
pyLDAvis.display(for_viz_cervical)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])


In [30]:
lda_comp_arm, corpus_comp_arm, dict_comp_arm = snowball.gen_lda_model(comparison_arm_tweets)
for_viz_comp = pyLDAvis.gensim.prepare(lda_comp_arm, corpus_comp_arm, dict_comp_arm)
pyLDAvis.display(for_viz_comp)

  topic_proportion = (topic_freq / topic_freq.sum()).order(ascending=False)
  sort('saliency', ascending=False). \
  return token_table.sort(['Term', 'Topic'])
