In [1]:
import tweepy
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
import gzip
import pickle
import datetime

In [2]:
class State(object):
    """
    Plain container that bundles everything needed to restore a fitted
    SKLEARN LDA model: the processed posts, the model and its vectorizer.

    The attributes are written to / read from a gzip-compressed pickle
    via `save` and `load`.
    """
    def __init__(self, posts=None, lda=None, vectorizer=None):
        """
        inputs
        -----
        posts - dataset used AFTER processing (pandas)
        lda - fitted LDA model
        vectorizer - vectorizer used
        note: the original author's docs refer to a helper
                get_dtm(posts, 'cleaned', vectorizer)
              for rebuilding the document-term matrix; that helper is not
              defined in this notebook.
        """
        self.posts = posts
        self.lda = lda
        self.vectorizer = vectorizer

    def save(self, fname, protocol=-1):
        """
        Pickle this object's attributes into the gzip-compressed file `fname`.

        protocol - pickle protocol; the default -1 selects the highest
                   protocol available.
        """
        # The `with` block closes the file; no explicit close() is needed.
        with gzip.open(fname, 'wb') as f:
            pickle.dump(self.__dict__, f, protocol)

    def load(self, fname):
        """
        Restore attributes from a gzip-compressed pickle written by `save`.

        Returns `self` so that `State().load(fname)` can be used directly.

        SECURITY: unpickling executes arbitrary code contained in the file,
        so only load files that you created yourself or otherwise trust.
        """
        with gzip.open(fname, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)
        return self

    def params(self):
        """
        Return the stored pieces as a tuple for destructuring.

        usage:
        posts, lda, vectorizer = blah.params()
        """
        return (self.posts, self.lda, self.vectorizer)

# You don't need to run the next cell. It was used once to download the tweets, fit the LDA model, and save both to disk.

In [2]:
# consumer_key = <>
# consumer_secret = <>
# access_key = <>
# access_secret = <>

# authorization = tweepy.OAuthHandler(consumer_key, consumer_secret)
# authorization.set_access_token(access_key, access_secret)

# client = tweepy.API(authorization)

# tweets = []
# for tweet in tweepy.Cursor(client.user_timeline, screen_name = 'bhorowitz', include_rts = False).items():
#     if(tweet.created_at <= datetime.datetime(2017, 6, 30)):
#         tweets.append([tweet.created_at, tweet.text])
    
# tweets = tweets[::-1]
# tweets = pd.DataFrame(tweets)
# tweets.columns = ['created_at', 'text']

# K = 5
# num_top_words_to_see = 10
# V = 1000

# vectorizer = CountVectorizer(stop_words = "english",
#                              max_features = V)
# X = vectorizer.fit_transform(tweets['text'])

# lda_model = LDA(n_topics = K, 
#             learning_method = "online")
# lda_model.fit(X)

# document_topic_dist = lda_model.transform(X)
# topic_word_dist = lda_model.components_
# volcabulary_indices_words_dict = dict((v,k) for k,v in vectorizer.vocabulary_.items())

# state = State(posts = tweets, lda = lda_model, vectorizer = vectorizer)
# state.save(fname = "saved_lda_model")

# Run this: load the saved model and rebuild the inputs used by the rest of the notebook

In [3]:
# Restore the posts, the fitted LDA model and the fitted vectorizer from disk.
# NOTE: the file is a pickle -- only load a "saved_lda_model" you trust.
load_state = State()
load_state.load("saved_lda_model")

tweets, lda_model, vectorizer = load_state.params()

# The vectorizer was already fitted before it was saved, so only transform
# here. Re-fitting would rebuild the vocabulary, and the LDA model's columns
# are tied to the vocabulary it was trained with.
X = vectorizer.transform(tweets['text'])

document_topic_dist = lda_model.transform(X)
topic_word_dist = lda_model.components_
# Invert word -> column index into column index -> word.
volcabulary_indices_words_dict = {index: word for word, index in vectorizer.vocabulary_.items()}

# One row of components_ per topic. Reading the shape avoids depending on the
# `n_topics` attribute, which newer scikit-learn releases renamed to
# `n_components`.
K = topic_word_dist.shape[0]

In [4]:
def get_top_words_for_topics_sideways(volcabulary_indices_words_dict, topic_word_dist, num_top_words=10):
    """
    Build a table with one row per topic and its highest-weighted words
    laid out left to right.

    inputs
    -----
    volcabulary_indices_words_dict - mapping from vocabulary column index to word
    topic_word_dist - 2-D array of word weights, one row per topic and one
                      column per vocabulary entry
    num_top_words - how many words to list per topic (default 10); clipped
                    to the vocabulary size

    returns
    -----
    pandas DataFrame indexed by topic number (index name 'Topic #') with
    columns 'Word 1' ... 'Word N', ordered from heaviest to lightest word.
    """
    topic_word_dist = np.asarray(topic_word_dist)
    # Never ask for more words than the vocabulary holds.
    num_top_words = min(num_top_words, topic_word_dist.shape[1])
    # argsort is ascending, so reverse each row and keep the leading columns.
    top_indices_of_topics = np.argsort(topic_word_dist, axis=1)[:, ::-1][:, :num_top_words]
    top_words_of_topics = pd.DataFrame(
        [[volcabulary_indices_words_dict[index] for index in topic_row]
         for topic_row in top_indices_of_topics],
        columns=["Word %d" % rank for rank in range(1, num_top_words + 1)],
    )
    top_words_of_topics.index.name = 'Topic #'
    return top_words_of_topics
# Display the top words of every topic as the cell output.
get_top_words_for_topics_sideways(
    volcabulary_indices_words_dict=volcabulary_indices_words_dict,
    topic_word_dist=topic_word_dist,
)

Unnamed: 0_level_0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,http,genius,new,post,thanks,chapter,annotated,story,wrote,9a25ur4jsp
1,think,know,good,doing,need,pmarca,just,quote,way,used
2,https,great,naithanjones,yes,people,work,hard,pmarca,agree,true
3,like,jamarlinmartin,right,just,did,trying,talk,make,think,blahblahblah9tn
4,don,pmarca,super,appreciate,said,a16z,sure,black,thank,nntaleb


In [5]:
def view_most_representative_posts_for_topic(topic, num_posts=40, posts=None, document_topic_matrix=None):
    """
    Return the texts of the posts that load most heavily on one topic.

    inputs
    -----
    topic - column index of the topic in the document-topic matrix
    num_posts - maximum number of posts to return (default 40)
    posts - DataFrame with a 'text' column; defaults to the notebook-level
            `tweets`
    document_topic_matrix - 2-D array, one row per post and one column per
            topic; defaults to the notebook-level `document_topic_dist`

    returns
    -----
    list of post texts, strongest topic weight first
    """
    if posts is None:
        posts = tweets
    if document_topic_matrix is None:
        document_topic_matrix = document_topic_dist
    topic_weights = np.asarray(document_topic_matrix)[:, topic]
    # argsort is ascending; reverse it to rank the strongest posts first.
    ranked_positions = np.argsort(topic_weights)[::-1][:num_posts]
    # argsort yields row positions, so select with iloc: label-based loc
    # only happens to work when the frame has the default 0..n-1 index.
    return list(posts['text'].iloc[ranked_positions])
# Display the posts that best represent the first topic.
view_most_representative_posts_for_topic(topic=0)

['@martina_skelly Thanks! I annotated Chapter 1 on @Genius if you want the back story: http://t.co/MyZtcRLVcf',
 "@EmmanuelAmber in case you haven't seen it, I annotated chapter 1 on @Genius http://t.co/9a25ur4JSP",
 '@NiamhBushnell Thanks! I annotated chapter 1 if you are interested in the backstory http://t.co/9a25ur4JSP @Genius',
 '@ebolarin Thanks! I annotated chapter 1 on @Genius if you are interested in the back story: http://t.co/9a25ur4JSP',
 '@DanielPearson Thanks! I annotated chapter 1 with photos and stuff on @genius http://t.co/t1hddcNMhX',
 '@WillGrannis Thanks! I annotated chapter 1 on @genius with photos and other stuff: http://t.co/fEutBEfm00',
 '@krmayank Thanks! I annotated chapter 1 on @genius with photos and other stuff: http://t.co/fEutBEfm00',
 '@dror_sharon Great to hear it! I annotated chapter 1 of the book at @genius http://t.co/9a25ur4JSP',
 '@genevievebos Thank you! I also told the backstory of Chapter 1 on @genius\nif you are interested http://t.co/t1hddcNMh

In [6]:
# Hand-assigned labels, one per topic, listed in topic-number order.
topic_names = [
    '@Genius / @NewsGenius',
    'Startup advice',
    'Community / Startup advice',
    'Community / Toussaint L\'Ouverture',
    'Community / Startup advice',
]

# For monthly aggregation

In [7]:
# Bucket every tweet into its calendar month by truncating the timestamp to
# the first day of that month.
separated_years = [ts.year for ts in tweets['created_at']]
separated_months = [ts.month for ts in tweets['created_at']]
tweets['ym_timestamp'] = [datetime.datetime(year, month, 1)
                          for year, month in zip(separated_years, separated_months)]

# The boundaries computed below are row positions, so each month must occupy
# one contiguous run of rows. Fail loudly instead of producing wrong slices.
if not tweets['ym_timestamp'].is_monotonic_increasing:
    raise ValueError("tweets must be sorted by 'created_at' before monthly aggregation")

data_split_by_month = [month_posts for _, month_posts in tweets.groupby("ym_timestamp")]
num_months = len(data_split_by_month)
# Number of posts in each month, in chronological order.
month_time_seq = [len(month_posts) for month_posts in data_split_by_month]
# Row boundaries: month i covers rows month_indices[i]:month_indices[i+1].
month_indices = np.insert(np.cumsum(month_time_seq), 0, 0)
month_timestamps = [month_posts.iloc[0]['created_at'] for month_posts in data_split_by_month]
month_strings = [ts.strftime('%Y') + '-' + ts.strftime('%m') for ts in month_timestamps]

In [8]:
def plot_sklearn_pot_select_topics_monthly(data, document_topic_matrix, topics=None,
                                           period_indices=None, period_labels=None):
    """
    Compute the popularity of the selected topics for every time period.

    For each period, every document-topic weight below 1/(number of topics)
    is set to 0, and the remaining weights are averaged over the documents
    of that period.

    inputs
    -----
    data - unused; kept so existing calls keep working
    document_topic_matrix - 2-D array, one row per document and one column
                            per topic
    topics - topic column indices to include; None (default) means all
    period_indices - row boundaries; period i covers rows
                     period_indices[i]:period_indices[i+1]. Defaults to the
                     notebook-level `month_indices`.
    period_labels - one label per period for the "date" column. Defaults to
                    the notebook-level `month_strings`.

    returns
    -----
    pandas DataFrame with a leading "date" column followed by one column
    per selected topic, named after the topic number.
    """
    document_topic_matrix = np.asarray(document_topic_matrix)
    num_topics = document_topic_matrix.shape[1]
    topics = list(range(num_topics)) if topics is None else list(topics)
    if period_indices is None:
        period_indices = month_indices
    if period_labels is None:
        period_labels = month_strings

    # Float literal keeps this a true division on Python 2 as well.
    threshold = 1.0 / num_topics
    num_periods = len(period_indices) - 1
    pot_selected_topics = np.zeros((num_periods, len(topics)))
    for i in range(num_periods):
        # Documents of this period, restricted to the selected topics.
        documents_in_period = document_topic_matrix[period_indices[i]:period_indices[i + 1]][:, topics]
        if documents_in_period.shape[0] == 0:
            # No documents: leave the row at 0 rather than dividing by zero.
            continue
        # np.where builds a new array, so the caller's matrix is untouched.
        documents_in_period = np.where(documents_in_period < threshold, 0.0, documents_in_period)
        pot_selected_topics[i, :] = documents_in_period.sum(axis=0) / documents_in_period.shape[0]

    # Name columns after the topics actually selected, not after range(K).
    pot_selected_topics_df = pd.DataFrame(pot_selected_topics,
                                          columns=["%d" % topic for topic in topics])
    pot_selected_topics_df.insert(0, "date", list(period_labels))
    return pot_selected_topics_df

In [9]:
# Topic popularity per month: a "date" column plus one column per topic.
res_monthly = plot_sklearn_pot_select_topics_monthly(tweets, document_topic_dist)

# Reshape to one row per (date, topic) pair and write it out.
topic_columns = ["%d"%(i) for i in range(K)]
res_monthly_melted = pd.melt(res_monthly, id_vars=['date'], value_vars=topic_columns)
res_monthly_melted.to_csv("pot_lda_monthly_melted.csv")

# Number of posts in each month.
month_time_seq_df = pd.DataFrame(month_time_seq, columns=["num_posts_in_month"])
month_time_seq_df.to_csv("num_posts_in_month.csv")

# Row boundaries of each month.
month_indices_df = pd.DataFrame(month_indices, columns=["month_indices"])
month_indices_df.to_csv("month_indices.csv")

# "YYYY-MM" label of each month.
month_strings_df = pd.DataFrame(month_strings, columns=["date"])
month_strings_df.to_csv("month_strings.csv")

# For quarterly aggregation

In [25]:
# separated_years = [int(i.strftime('%Y')) for i in tweets['created_at']]
# separated_end_of_quarter_months = [(((int(i.strftime('%m'))-1)//3)+1)*3 for i in tweets['created_at']]
# tweets['yq_timestamp'] = [datetime.datetime.strptime((str(separated_years[t]) + str(separated_end_of_quarter_months[t])), "%Y%m")
#                                                                               for t in range(len(separated_years))]
# data_split_by_quarter = [group[1] for group in tweets.groupby("yq_timestamp")]
# num_quarters = len(data_split_by_quarter)
# quarter_time_seq = [len(i) for i in data_split_by_quarter]
# quarter_indices = np.insert(values = 0, obj = 0, arr = np.cumsum(quarter_time_seq))
# quarter_timestamps = [data_split_by_quarter[i].iloc[0]['created_at'] for i in range(num_quarters)]
# quarter_strings = [i.strftime('%Y') + '-' + str((((int(i.strftime('%m'))-1)//3)+1)*3) for i in quarter_timestamps]

In [38]:
# def plot_sklearn_pot_select_topics_quarterly(data, document_topic_matrix, topics = [i for i in range(K)]):
#     num_selected_topics = len(topics)
#     pot_selected_topics = np.zeros((num_quarters, num_selected_topics))
#     # for all quarters,
#     for i in range(num_quarters):
#         # get documents for that quarter
#         documents_in_timestamp_range = document_topic_matrix[quarter_indices[i]:quarter_indices[i+1], topics]
#         ## set topic prop to 0 for a document if it is below 1/K
#         documents_in_timestamp_range[documents_in_timestamp_range < 1/K] = 0
#         # get doc_topic distribution, averaged over all the documents in that interval
#         pot_for_timestamp_interval = np.sum(documents_in_timestamp_range, axis = 0)/documents_in_timestamp_range.shape[0]
#         # update popularity_of_topic_over_time
#         pot_selected_topics[i,:] = pot_for_timestamp_interval
#     pot_selected_topics_df = pd.DataFrame(pot_selected_topics)
#     pot_selected_topics_df.columns = ["%d"%(i) for i in range(K)]
#     pot_selected_topics_df.insert(0, "date", quarter_strings)
#     return pot_selected_topics_df

In [29]:
# res = plot_sklearn_pot_select_topics_quarterly(tweets, document_topic_dist)

# res_melted = pd.melt(res, id_vars=['date'], value_vars=["%d"%(i) for i in range(K)])
# res_melted.to_csv("pot_lda_quarterly_melted.csv")

# quarter_time_seq_df = pd.DataFrame(quarter_time_seq)
# quarter_time_seq_df.columns = ["num_posts_in_quarter"]
# quarter_time_seq_df.to_csv("num_posts_in_quarter.csv")

# quarter_indices_df = pd.DataFrame(quarter_indices)
# quarter_indices_df.columns = ["quarter_indices"]
# quarter_indices_df.to_csv("quarter_indices.csv")

# quarter_strings_df = pd.DataFrame(quarter_strings)
# quarter_strings_df.columns = ["date"]
# quarter_strings_df.to_csv("quarter_strings.csv")

## Do this for both monthly and quarterly aggregation

In [10]:
# Write the topic labels, one row per topic.
topic_names_df = pd.DataFrame(topic_names, columns=["name"])
topic_names_df.to_csv("topic_names.csv")

# Write only the tweet texts.
tweets[['text']].to_csv("tweets.csv")