In [1]:
from collections import defaultdict
import csv 
import pdb

import gensim
from gensim import matutils, corpora
from gensim.models.ldamodel import LdaModel
import pandas as pd
import nltk
import statsmodels 
import statsmodels.api as sm
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

import snowball

In [2]:
# Load the full tweet dataset from its tab-delimited text export.
# low_memory=False is passed so pandas does not parse the file in chunks.
tweet_data = pd.read_csv(
    "CancerReport-clean-all-data-en.txt",
    sep="\t",
    low_memory=False,
)

In [3]:
###
# Retweet analysis, step 1: keep only the rows whose "retweet" flag equals True.
retweets = tweet_data[tweet_data["retweet"] == True]
# Distinct ids of the retweeted tweets; built through a set, so the order is arbitrary.
# NOTE(review): not referenced again in the visible cells -- confirm it is still needed.
unique_retweet_id_list = list(set(retweets["retweet_id_str"].tolist())) 
# Group the retweets by the id of the tweet they are retweeting.
grouped_retweets = retweets.groupby("retweet_id_str")
# Count up the retweets and extract the original (retweeted) text;
# the implementation lives in snowball.py (_count_up_retweets).
# NOTE(review): presumably the two returned sequences are index-aligned (one entry
# per original tweet) -- later cells index both with the same positions; verify.
orig_tweet_texts, retweet_counts = snowball._count_up_retweets(grouped_retweets)

In [4]:
###
# Topic modeling.
# Build the tokenised corpus from the original tweet texts. kept_indices are the
# indices (into orig_tweet_texts) of the tweets that were not discarded as noise.
toked_tweets, kept_indices = snowball.build_gensim_corpus(orig_tweet_texts, split_up_by_tag=False)
# Obtain the LDA model together with its gensim corpus and dictionary
# (presumably the model is fitted inside snowball.gen_lda_model -- see snowball.py).
lda, gensim_corpus, dictionary = snowball.gen_lda_model(toked_tweets)
# Keep the first element returned by lda.inference as the document-by-topic matrix.
# NOTE(review): presumably this is gensim's unnormalised variational gamma, which
# would explain why the rows do not sum to 1 -- confirm against the gensim docs.
inferred_topic_matrix = lda.inference(gensim_corpus)[0]
# Renormalise so that every row sums to 1.
row_sums = inferred_topic_matrix.sum(axis=1)
inferred_topic_matrix = inferred_topic_matrix / row_sums[:, np.newaxis]
# Remove the tweets that were cleaned out / not included in the gensim corpus.
# NOTE(review): both names are overwritten in place, so re-running this cell without
# first re-running the retweet-counting cell indexes the already-filtered lists.
retweet_counts = [retweet_counts[idx] for idx in kept_indices]
orig_tweet_texts = [orig_tweet_texts[idx] for idx in kept_indices]

In [6]:
# Topical composition of the original tweets: encode each tokenised tweet as a
# bag of words, then ask the model for that document's topic distribution.
orig_tweets_bow = list(map(dictionary.doc2bow, toked_tweets))
orig_tweets_inferred_topics = list(map(lda.get_document_topics, orig_tweets_bow))

# Ascending sort of (retweet count, text, topics) triples, so the most
# retweeted tweets end up at the tail of the list.
sorted_tweets = sorted(
    zip(retweet_counts, orig_tweet_texts, orig_tweets_inferred_topics)
)

In [7]:
# Plain ordinary least squares (OLS): retweet counts are the response and the
# row-normalised topic matrix supplies the regressors.
regression_results = sm.OLS(
    endog=retweet_counts,
    exog=inferred_topic_matrix,
).fit()
print(regression_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     6.179
Date:                Tue, 12 Jan 2016   Prob (F-statistic):           9.80e-09
Time:                        14:42:38   Log-Likelihood:                -58731.
No. Observations:               11556   AIC:                         1.175e+05
Df Residuals:                   11546   BIC:                         1.176e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1            -0.5462      2.410     -0.227      0.8

In [5]:
# Rough view of the topics: print the top words of each one, numbered from 1.
for topic_number, topic in enumerate(lda.print_topics(), 1):
    print("topic %s:\n %s\n" % (topic_number, topic))

topic 1:
 0.058*co + 0.056*http + 0.050*de + 0.029*la + 0.021*gt + 0.018*du + 0.011*pm + 0.011*epcrchampionscup + 0.009*movie + 0.009*standwithpp

topic 2:
 0.046*co + 0.045*http + 0.035*le + 0.031*mcrcomeback + 0.025*loscrcl + 0.025*les + 0.021*est + 0.020*pour + 0.019*et + 0.018*vafcrcl

topic 3:
 0.054*msg + 0.039*crclub + 0.029*mammogram + 0.027*gurmeetramrahim + 0.026*co + 0.024*http + 0.017*amp + 0.014*msgthefilm + 0.013*get + 0.011*plannedparenthood

topic 4:
 0.114*co + 0.097*http + 0.029*https + 0.024*fightcancer + 0.020*gardasil + 0.018*help + 0.014*rt + 0.013*please + 0.013*hpv + 0.013*amp

topic 5:
 0.129*co + 0.123*http + 0.050*cancer + 0.031*prevention + 0.017*screening + 0.011*breast + 0.011*mammograms + 0.010*amp + 0.009*study + 0.009*breastcancer

topic 6:
 0.064*co + 0.060*http + 0.058*planned + 0.057*mammograms + 0.057*parenthood + 0.018*defundpp + 0.015*mammogram + 0.010*doesn + 0.010*women + 0.010*ji

topic 7:
 0.105*co + 0.101*http + 0.093*hpv + 0.050*vaccine + 0.

In [8]:
def _to_file(path, tweets, headers=("retweet count", "tweet", "topics")):
    """Write a header row followed by one CSV row per entry of `tweets`.

    Parameters
    ----------
    path : str
        Destination file; it is created, or truncated if it already exists.
    tweets : iterable of sequences
        Rows to write, passed unchanged to ``csv.writer.writerows``.
    headers : sequence of str, optional
        Column names written as the first row. The default is an immutable
        tuple so it cannot be modified between calls.
    """
    # The csv module asks for newline='' so that it controls line endings itself
    # (otherwise every row is followed by a blank line on Windows). UTF-8 is set
    # explicitly so non-ASCII tweet text does not depend on the locale encoding.
    with open(path, 'w', newline='', encoding='utf-8') as outf:
        csv_w = csv.writer(outf)
        csv_w.writerow(headers)
        csv_w.writerows(tweets)

In [9]:
# sorted_tweets is in ascending order of retweet count, so its last 50 rows are
# the most retweeted tweets and its first 50 rows the least retweeted ones.
for out_path, rows in (
    ("50-most-retweeted.csv", sorted_tweets[-50:]),
    ("50-least-retweeted.csv", sorted_tweets[:50]),
):
    _to_file(out_path, rows)

In [29]:
# now we sample representative tweets from top topics (per our regression)
# these are 2,3,8 (when zero-indexed -- corresponding to 3,4,9 above)
def get_tweet_probs_for_topic(t_idx, tweets, tweet_topic_tuples, k=20):
    """Return, for every tweet, the probability it assigns to topic `t_idx`.

    Parameters
    ----------
    t_idx : int
        Topic id; it must be zero-indexed.
    tweets : object
        Unused; accepted only so existing callers keep working.
    tweet_topic_tuples : iterable
        One entry per tweet, each an iterable of (topic id, probability) pairs.
    k : int, optional
        Unused; accepted only so existing callers keep working.

    Returns
    -------
    list
        One value per entry of `tweet_topic_tuples`, in the same order: the
        probability paired with `t_idx`, or 0 when the topic is not listed.
    """
    # dict.get replaces the former bare `except:`, so only a missing topic is
    # defaulted to 0; any other error (e.g. an unhashable t_idx) now propagates.
    return [dict(topic_pairs).get(t_idx, 0) for topic_pairs in tweet_topic_tuples]

def top_tweets_for_topics(t_idx, tweets, tweet_topic_tuples, k=20, write_out=True):
    """Rank tweets by their probability under topic `t_idx` and return the top `k`.

    Parameters
    ----------
    t_idx : int
        Zero-indexed topic id; the output file name uses ``t_idx + 1``.
    tweets : sequence
        Tweet texts, positionally aligned with `tweet_topic_tuples`.
    tweet_topic_tuples : iterable
        One entry per tweet, each an iterable of (topic id, probability) pairs.
    k : int, optional
        Number of top-ranked tweets to return (and to write out).
    write_out : bool, optional
        When true, also save the returned rows to
        ``top-<k>-tweets-topic-<t_idx+1>.csv`` in the working directory.

    Returns
    -------
    list of tuple
        The `k` highest ``(probability, tweet)`` pairs, in descending order.
    """
    topic_probs = get_tweet_probs_for_topic(t_idx, tweets, tweet_topic_tuples, k=k)
    # Pairs are compared as tuples, so ties on probability fall back to the tweet.
    sorted_by_t = sorted(zip(topic_probs, tweets), reverse=True)
    top_k = sorted_by_t[:k]
    if write_out:
        # The file is named after k, so write exactly the k rows that are
        # returned (previously the entire ranking was dumped into it).
        # newline='' lets the csv module control line endings; UTF-8 keeps
        # non-ASCII tweet text independent of the locale encoding.
        out_name = "top-%s-tweets-topic-%s.csv" % (k, t_idx+1)
        with open(out_name, 'w', newline='', encoding='utf-8') as outf:
            csv_w = csv.writer(outf)
            csv_w.writerows(top_k)
    return top_k

In [32]:
# Top-50 tweets for each topic of interest. Topic ids are zero-indexed, so the
# result for id n is stored under the name topic<n+1>_tweets.
(topic3_tweets,
 topic4_tweets,
 topic6_tweets,
 topic9_tweets) = [
    top_tweets_for_topics(topic_id, orig_tweet_texts, orig_tweets_inferred_topics, k=50)
    for topic_id in (2, 3, 5, 8)
]

# Kept for reference/contrast; this topic is totally uncorrelated.
topic2_tweets = top_tweets_for_topics(1, orig_tweet_texts, orig_tweets_inferred_topics, k=50)

In [31]:
# Display the ranked (probability, tweet text) pairs kept for topic 6 (zero-indexed id 5).
topic6_tweets

[(0.99918256130790206,
  "Thousands call Planned Parenthood for mammograms after Obama's false claim... http://t.co/Nsx0W5gc"),
 (0.99918256130790206,
  "President's claim that Planned Parenthood provides mammograms debunked (again) http://t.co/F9Uvp6ji via @TwitchyTeam"),
 (0.99918256130790206,
  'President Obama Lies to the Country Again About Non-Existent Planned Parenthood Mammograms - Katie Pavlich http://t.co/228hE84T'),
 (0.99910089910089928,
  "Planned Parenthood: We don't do mammograms, but Obama said they did! http://t.co/AqaKDRdHrv via @waynedupreeshow #tcot #DefundPP"),
 (0.99900110987791357,
  "President's claim that Planned Parenthood provides mammograms debunked (again) http://t.co/CcUx5f3J"),
 (0.99900110987791357,
  "Planned Parenthood: uh, no, the President was wrong, we don't provide mammograms - http://t.co/xWiHNbGn"),
 (0.99900110987791357,
  "Planned Parenthood: Obama Wrong, 'We Don't Provide Mammograms' http://t.co/t0tDGMdz #tcot"),
 (0.99900110987791357,
  "Plan