In [1]:
from collections import defaultdict
import csv 
import pdb

import gensim
from gensim import matutils, corpora
from gensim.models.ldamodel import LdaModel
import pandas as pd
import nltk
import statsmodels 
import statsmodels.api as sm
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

import snowball

In [2]:
###
# 1/15 -- switching to whitelisted tweets *only*!
# Load the tab-delimited tweet dump into a DataFrame. The path is relative, so
# the file has to sit in the notebook's working directory.
# NOTE(review): low_memory=False presumably makes pandas infer column dtypes
# from the whole file rather than chunk by chunk -- confirm in the pandas docs.
tweet_data = pd.read_csv("CancerReport-clean-whitelisted-en.txt", delimiter="\t", low_memory=False)

In [3]:
###
# now to analyze the retweet counts; first pull out just the retweets
# (rows whose "retweet" column equals True)
retweets = tweet_data[tweet_data["retweet"] == True]
# distinct values of "retweet_id_str" among the retweets
# NOTE(review): this list is not referenced again in the cells visible here --
# confirm it is still needed.
unique_retweet_id_list = list(set(retweets["retweet_id_str"].tolist())) 
# group the retweets by the tweet they are retweeting
grouped_retweets = retweets.groupby("retweet_id_str")
# then count up retweets and extract original (retweeted) text
# see code in snowball.py for doing this
# NOTE(review): the two results are later filtered with the same indices and
# zipped together, so they are presumably index-aligned -- verify in snowball.py
orig_tweet_texts, retweet_counts = snowball._count_up_retweets(grouped_retweets)

In [4]:
###
# topic modeling
# kept_indices are the set of indices corresponding to tweets not discarded as noise 
toked_tweets, kept_indices = snowball.build_gensim_corpus(orig_tweet_texts, split_up_by_tag=False)
lda, gensim_corpus, dictionary = snowball.gen_lda_model(toked_tweets)
# keep only the first element of what lda.inference returns; it is used below
# as a (documents x topics) weight matrix
inferred_topic_matrix = lda.inference(gensim_corpus)[0]
# renormalize so that every row (one per document) sums to 1
# NOTE(review): the raw values are presumably gensim's unnormalized variational
# parameters (gamma), which would explain why this step is needed -- confirm
# against the gensim documentation.
row_sums = inferred_topic_matrix.sum(axis=1)
inferred_topic_matrix = inferred_topic_matrix / row_sums[:, np.newaxis]
# remove the tweets that were cleaned/not included in gensim corpus
# Both names are overwritten in place: from here on retweet_counts and
# orig_tweet_texts hold only the kept tweets, and re-running this cell starts
# from the already-filtered lists.
retweet_counts = [retweet_counts[idx] for idx in kept_indices]
orig_tweet_texts = [orig_tweet_texts[idx] for idx in kept_indices]

In [5]:
# estimate the topical composition of the original tweets:
# turn each tokenized tweet into a bag-of-words with the dictionary, then ask
# the LDA model for that document's topics
# NOTE(review): assumes toked_tweets has one entry per kept tweet, in the same
# order as the filtered retweet_counts / orig_tweet_texts -- verify in snowball.py
orig_tweets_bow = [dictionary.doc2bow(tweet) for tweet in toked_tweets]
orig_tweets_inferred_topics = [lda.get_document_topics(doc) for doc in orig_tweets_bow]

# note: this is ascending order, so most re-tweeted are last.
# The (count, text, topics) tuples are compared element by element, so tweets
# with equal retweet counts are ordered by text and then by their topic list.
sorted_tweets = sorted(zip(retweet_counts, orig_tweet_texts, orig_tweets_inferred_topics))

In [6]:
# this is just ordinary least squares (OLS): retweet count is the response and
# the per-tweet topic proportions are the regressors.
# No intercept column is added here (there is no sm.add_constant call).
# NOTE(review): each row of inferred_topic_matrix was normalized to sum to 1
# earlier, so the columns jointly span a constant -- confirm that this is the
# intended stand-in for an explicit intercept.
regression_results = sm.OLS(retweet_counts, inferred_topic_matrix).fit()
print(regression_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     2.962
Date:                Fri, 15 Jan 2016   Prob (F-statistic):            0.00164
Time:                        08:42:47   Log-Likelihood:                -16533.
No. Observations:                3468   AIC:                         3.309e+04
Df Residuals:                    3458   BIC:                         3.315e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.9947      2.724      0.365      0.7

In [7]:
# crude look at what these topics look like (top words in each)
# Labels are printed 1-based (topic_idx+1), whereas the functions further down
# take zero-based topic indices.
for topic_idx, topic in enumerate(lda.print_topics()):
    print("topic %s:\n %s\n" % (topic_idx+1, topic))

topic 1:
 0.064*cancer + 0.058*cervical + 0.038*co + 0.036*http + 0.029*hpv + 0.017*women + 0.016*one + 0.014*th + 0.013*amp + 0.012*ve

topic 2:
 0.078*hpv + 0.041*vax + 0.038*tx + 0.034*vaxchoice + 0.031*get + 0.024*nohb + 0.019*think + 0.018*human + 0.017*pap + 0.016*anal

topic 3:
 0.137*co + 0.136*http + 0.102*hpv + 0.069*vaccine + 0.048*gardasil + 0.023*vaccines + 0.016*via + 0.012*health + 0.012*girls + 0.011*side

topic 4:
 0.100*co + 0.097*http + 0.069*hpv + 0.049*cancer + 0.039*cervical + 0.032*vaccine + 0.015*gardasil + 0.014*amp + 0.011*cancers + 0.010*risk

topic 5:
 0.076*pap + 0.039*cancer + 0.039*smears + 0.030*women + 0.027*amp + 0.027*smear + 0.027*cervical + 0.017*breast + 0.017*co + 0.015*http

topic 6:
 0.035*pap + 0.030*smear + 0.027*year + 0.026*new + 0.023*stop + 0.021*dr + 0.021*truth + 0.020*every + 0.019*gardasil + 0.018*co

topic 7:
 0.129*cancer + 0.116*cervical + 0.098*co + 0.093*http + 0.020*screening + 0.017*awareness + 0.013*amp + 0.013*prevention + 0.0

In [8]:
def _to_file(path, tweets, headers=["retweet count", "tweet", "topics"]):
    with open(path, 'w') as outf: 
        csv_w = csv.writer(outf)
        csv_w.writerow(headers)
        csv_w.writerows(tweets)

In [9]:
# sorted_tweets is in ascending retweet-count order, so the last 50 entries are
# the most retweeted (still listed ascending) and the first 50 the least.
_to_file("50-most-retweeted.csv", sorted_tweets[-50:])
_to_file("50-least-retweeted.csv", sorted_tweets[:50])

In [10]:
def get_tweet_probs_for_topic(t_idx, tweets, tweet_topic_tuples, k=20):
    """Return, for every tweet, the probability it assigns to topic `t_idx`.

    Parameters
    ----------
    t_idx : hashable (an int in this notebook)
        Topic id to look up. t_idx should be zero-indexed!!!
    tweets :
        Not used here; kept so existing callers keep working.
    tweet_topic_tuples : iterable
        One entry per tweet, each an iterable of (topic id, probability) pairs.
    k :
        Not used here; kept so existing callers keep working.

    Returns
    -------
    list
        One value per entry of `tweet_topic_tuples`, in input order; 0 for a
        tweet whose pairs do not mention `t_idx`.
    """
    # dict.get supplies the 0 fallback for a missing topic. The previous bare
    # `except:` did the same but also silenced every unrelated failure
    # (for example an unhashable t_idx, or a KeyboardInterrupt).
    return [dict(topic_pairs).get(t_idx, 0) for topic_pairs in tweet_topic_tuples]

def top_tweets_for_topics(t_idx, tweets, tweet_topic_tuples, k=20, write_out=True):
    """Rank tweets by their probability under topic `t_idx`; return the top `k`.

    Parameters
    ----------
    t_idx : int
        Zero-indexed topic id.
    tweets : sequence
        Tweet texts, paired positionally with `tweet_topic_tuples`.
    tweet_topic_tuples : iterable
        One entry per tweet, each an iterable of (topic id, probability) pairs.
    k : int
        Number of (probability, tweet) pairs to return.
    write_out : bool
        When true, also write the ranking to
        "top-<k>-tweets-topic-<t_idx+1>.csv" in the working directory.

    Returns
    -------
    list of (probability, tweet) tuples
        The first `k` pairs in descending order; ties on probability are
        ordered by the tweet value itself.
    """
    topic_probs = get_tweet_probs_for_topic(t_idx, tweets, tweet_topic_tuples, k=k)
    sorted_by_t = sorted(zip(topic_probs, tweets), reverse=True)
    if write_out:
        # The file name uses the 1-based topic number (t_idx+1).
        # NOTE(review): despite the "top-<k>" name, the file receives the whole
        # ranking, not just the first k rows -- confirm this is intended.
        # newline='' leaves line endings to the csv module, so rows are not
        # followed by blank lines on platforms that translate "\n".
        with open("top-%s-tweets-topic-%s.csv" % (k, t_idx+1), 'w', newline='') as outf:
            csv_w = csv.writer(outf)
            csv_w.writerows(sorted_by_t)
    return sorted_by_t[:k]

In [11]:
# Sample representative tweets for the top topics (per our regression).
# The indices passed in are zero-indexed: 4, 7, 9 correspond to the topics
# labelled 5, 8, 10 in the printout above.
topic5_tweets, topic8_tweets, topic10_tweets = (
    top_tweets_for_topics(zero_based_idx, orig_tweet_texts, orig_tweets_inferred_topics, k=50)
    for zero_based_idx in (4, 7, 9)
)

# For reference/contrast: topic 2 (zero-indexed 1), which is totally uncorrelated.
topic2_tweets = top_tweets_for_topics(
    1,
    orig_tweet_texts,
    orig_tweets_inferred_topics,
    k=50,
)