In [62]:
import os

import pandas as pd
import pickle
import string
from string import digits
import re
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

### reading in the aggregated data

In [63]:
directory = '/Users/andreacsorcinelli/Documents/01_Personal/02_Job Stuff/01_Metis/02_Metis_GH/03_Projects/04_Fletcher/01_Notebooks/'

In [64]:
#agregating data
agg_data = []

for file in os.listdir(directory):
    if file.endswith('.pkl'):
        with open(file , 'rb') as picklefile:
            ind_data = pickle.load(picklefile)
            agg_data.append(ind_data)

df = pd.concat(agg_data)

In [65]:
df.shape

(392487, 4)

In [66]:
for tweet in df['tweet'][:3]:
    print(tweet, '\n')

@1BR4HlM How can i forget that anthem man 😂 

RT @PettyNegr0: Mary J. Blige did not go through her divoreces, give us Not Gon Cry, Just Fine, Family Affair, and No More Drama to be disr… 

RT @Luvnediting: @KrisBordessa @DesignationSix @FoxNews @Purina @Starbucks @SuperBetaPro @PetSmart @NFL @GoldBondInc @Terminix @sleepnumber… 



### exploratory analysis

In [67]:
len(df)

392487

In [68]:
tweet_lengths = df.tweet.str.len()
tweet_lengths.head()

0     43.0
1    140.0
2    140.0
3    140.0
4    140.0
Name: tweet, dtype: float64

In [69]:
df.dtypes

Unnamed: 0    object
_id           object
datetime      object
tweet         object
dtype: object

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392487 entries, 0 to 10207
Data columns (total 4 columns):
Unnamed: 0    392487 non-null object
_id           392476 non-null object
datetime      392476 non-null object
tweet         392476 non-null object
dtypes: object(4)
memory usage: 15.0+ MB


In [71]:
df.fillna('null', inplace = True)

### preprocessing

In [72]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,datetime,tweet
0,0,5b75de5782816f1bcaa7aaba,2018-08-16 20:28:07,@1BR4HlM How can i forget that anthem man 😂
1,1,5b75de5882816f1bcaa7aabb,2018-08-16 20:28:07,RT @PettyNegr0: Mary J. Blige did not go throu...
2,2,5b75de5882816f1bcaa7aabc,2018-08-16 20:28:08,RT @Luvnediting: @KrisBordessa @DesignationSix...
3,3,5b75de5882816f1bcaa7aabd,2018-08-16 20:28:08,RT @mflynnJR: My thoughts are we shouldn’t tak...
4,4,5b75de5882816f1bcaa7aabe,2018-08-16 20:28:08,RT @DesignationSix: I have some @FoxNews spons...


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392487 entries, 0 to 10207
Data columns (total 4 columns):
Unnamed: 0    392487 non-null object
_id           392487 non-null object
datetime      392487 non-null object
tweet         392487 non-null object
dtypes: object(4)
memory usage: 15.0+ MB


In [74]:
# removing user mentions
df['tweet'].replace(r"(?:\@|https?\://)\S+",'', regex=True, inplace=True)
#print (df['tweet'])
df.head()

Unnamed: 0.1,Unnamed: 0,_id,datetime,tweet
0,0,5b75de5782816f1bcaa7aaba,2018-08-16 20:28:07,How can i forget that anthem man 😂
1,1,5b75de5882816f1bcaa7aabb,2018-08-16 20:28:07,RT Mary J. Blige did not go through her divor...
2,2,5b75de5882816f1bcaa7aabc,2018-08-16 20:28:08,RT
3,3,5b75de5882816f1bcaa7aabd,2018-08-16 20:28:08,RT My thoughts are we shouldn’t take any poli...
4,4,5b75de5882816f1bcaa7aabe,2018-08-16 20:28:08,"RT I have some sponsors tagged here,so they ..."


In [75]:
# create a new pandas series that stores all of the tweets with punctuation removed 

remove_punct = str.maketrans('', '', string.punctuation)
tweets_clean_1 = df['tweet'].str.translate(remove_punct)
tweets_clean_1.head()

0                   How can i forget that anthem man 😂
1    RT  Mary J Blige did not go through her divore...
2                                       RT            
3    RT  My thoughts are we shouldn’t take any poli...
4    RT  I have some  sponsors tagged hereso they w...
Name: tweet, dtype: object

In [76]:
# convert the tweets to lower case and remove numbers

In [77]:
tweets_clean_2 = tweets_clean_1.str.lower()

remove_digits = str.maketrans('', '', string.digits)
tweets_clean_2 = tweets_clean_2.str.translate(remove_digits)
tweets_clean_2.head()

0                   how can i forget that anthem man 😂
1    rt  mary j blige did not go through her divore...
2                                       rt            
3    rt  my thoughts are we shouldn’t take any poli...
4    rt  i have some  sponsors tagged hereso they w...
Name: tweet, dtype: object

In [78]:
# remove links
tweets_clean_3 = tweets_clean_2.str.replace('https','')
tweets_clean_3.head()

0                   how can i forget that anthem man 😂
1    rt  mary j blige did not go through her divore...
2                                       rt            
3    rt  my thoughts are we shouldn’t take any poli...
4    rt  i have some  sponsors tagged hereso they w...
Name: tweet, dtype: object

In [79]:
# remove retweet
tweets_clean_4 = tweets_clean_3.str.replace('rt','')
tweets_clean_4.head()

0                   how can i forget that anthem man 😂
1      mary j blige did not go through her divorece...
2                                                     
3      my thoughts are we shouldn’t take any politi...
4      i have some  sponsors tagged hereso they wil...
Name: tweet, dtype: object

In [24]:
# lemmatize or stem?

### NMF & LDA

In [23]:
# 20 topics

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
playe explains nowthisnews american brilliantly orourke ted taking think man
Topic 1:
denlesks idiots cop tcodqquziinup idea doesn sign team week nfl
Topic 2:
jasonoverstreet violent bit minutes little best video ve watch just
Topic 3:
theme drake singlehandedly imsadbro raise josh amp written racist overplayed
Topic 4:
nfl preseason game new days kickoff players amp la rule
Topic 5:
meet theellenshow tcorzxakbhw betoorourke like thank looks thoughtful leadership stevekerr
Topic 6:
stop wish player morningmoneyben lazy protested saying people national anthem
Topic 7:
example political minute leader divisive conviction address steveschmidtses issue pe
Topic 8:
time understand politician consider taken jemelehill cho assess sounds like
Topic 9:
espn football night monday air dont breaking apparently want expose
Topic 10:
cruz texas rourke govhowarddean outclasses votes deserve guy beto ted
Topic 11:
catchy fucking baby shark bbiss written racist shit overplayed way
Topic 12:
kno

In [26]:
# try 10 topics

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
playe explains nowthisnews american brilliantly orourke ted taking cruz think
Topic 1:
denlesks idiots cop tcodqquziinup idea doesn sign team week nfl
Topic 2:
jasonoverstreet violent bit minutes little best video ve just watch
Topic 3:
written racist overplayed shit old way fuckin song national anthem
Topic 4:
nfl season players years preseason new game victor days kofieyeboah
Topic 5:
meet theellenshow tcorzxakbhw betoorourke like vote perfectly officialjld tcofugbraakl aiculated
Topic 6:
stop wish morningmoneyben player protested lazy saying people national anthem
Topic 7:
example political minute leader divisive conviction address steveschmidtses issue pe
Topic 8:
time players understand politician consider taken jemelehill cho assess sounds
Topic 9:
anthem national espn football monday night air want rise dont
Topic 0:
nfl bit kneeling integrity minute jemelehill listen votes thoughtful football
Topic 1:
explains playe just nfl ve jasonoverstreet violent time people seaso

In [20]:
# NMF 5 topics

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

no_topics = 5

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

no_top_words = 5
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
playe explains brilliantly american orourke
Topic 1:
cop idea idiots doesn sign
Topic 2:
like violent bit minutes little
Topic 3:
anthem national written racist overplayed
Topic 4:
salute words candid thoughtful watch


In [21]:
# LDA with 5 topics

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 5

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 5
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
nfl man cruz ted think
Topic 1:
anthem national thoughtful words written
Topic 2:
like players nfl watch just
Topic 3:
beto taking nfl knee texas
Topic 4:
watch nfl candid salute vote


In [19]:
# NMF 5 topics; max_df = .50

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.50, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

no_topics = 5

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

no_top_words = 5
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
playe explains nowthisnews brilliantly american
Topic 1:
denlesks cop idiots tcodqquziinup idea
Topic 2:
jasonoverstreet violent bit minutes little
Topic 3:
anthem national written racist overplayed
Topic 4:
betoorourke salute kingjames words tcoeyrbqgciz


In [18]:
# LDA with 5 topics; max_df = .50

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.50, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 5

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 5
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
beto cruz ted orourke brilliantly
Topic 1:
watch just thoughtful words candid
Topic 2:
man taking explains betoorourke trump
Topic 3:
like players brown jim week
Topic 4:
think american nowthisnews anthem national


In [48]:
# NMF 5 topics; max_df = .75

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

no_topics = 5

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

no_top_words = 5
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
playe explains brilliantly american orourke
Topic 1:
cop idea idiots doesn sign
Topic 2:
like violent bit minutes little
Topic 3:
anthem national written racist overplayed
Topic 4:
salute words candid thoughtful watch


In [57]:
# NMF 5 topics; max_df = .75

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

no_topics = 2

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

no_top_words = 5
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
playe explains brilliantly american orourke
Topic 1:
cop idea idiots sign doesn


In [58]:
# NMF 2 topics; max_df = .75

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

no_topics = 2

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

no_top_words = 7
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
playe explains brilliantly american orourke ted taking
Topic 1:
cop idea idiots sign doesn team week


In [80]:
# NMF 4 topics; max_df = .75

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=2, max_features=no_features, stop_words='english',token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

no_topics = 4

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

no_top_words = 7
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
playe explains brilliantly american taking orourke cruz
Topic 1:
cop idea idiots sign doesn team week
Topic 2:
like watch violent bit minutes little video
Topic 3:
anthem national written racist overplayed old shit


In [83]:
# LDA with 4 topics; max_df = .75

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = tweets_clean_4

no_features = 1000

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.75, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 4

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 7
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
beto nfl man cruz ted orourke think
Topic 1:
anthem national written old racist shit overplayed
Topic 2:
nfl players words football candid salute vote
Topic 3:
watch nfl like just best video ve
