### Project 4
### Brendon Happ
### NLP

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import re
from smart_open import smart_open
from time import time

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer 
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF, TruncatedSVD

import gensim
from gensim import corpora, models, similarities, matutils
from gensim.models.phrases import Phraser, Phrases
from gensim.models.ldamulticore import LdaMulticore

import matplotlib.pyplot as plt

#plotly
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.grid_objs import Grid, Column #sliders
import plotly.graph_objs as go #horizontal bar charts
import plotly.plotly as py
from plotly import tools #side by side
init_notebook_mode(connected=True)

In [2]:
full_df_list = []

for (dirname, dirs, files) in os.walk('clean_pickles'):
    for filename in files:
        with open(os.path.join('clean_pickles', filename), 'rb') as f:
            full_df_list.append(pd.read_pickle(f))

In [3]:
full_df = pd.concat(full_df_list, axis=0, ignore_index=True)

In [4]:
with open('bi_tweets.pickle', 'rb') as f:
    bi_tweets = pd.read_pickle(f)

In [5]:
with open('hashtag_mention_df.pickle', 'rb') as f:
    hashtag_mention_df = pd.read_pickle(f)

In [6]:
full_df = full_df.merge(hashtag_mention_df, on='tweet_id')

In [7]:
hashtags = full_df.hashtags.tolist()

In [8]:
hashtags = [list(x) for x in hashtags]

In [14]:
hashtags2 = []
for lst in hashtags:
    hashtags2.append(['#'+x for x in lst])

### Apply Hashtags

In [18]:
hashtag_bi_tweets = [a + b for a, b in zip(bi_tweets, hashtags2)]

In [21]:
string_tweets = []
for tweet in hashtag_bi_tweets:
    string_tweets.append(' '.join(word.strip() for word in tweet))

In [22]:
string_tweets = pd.Series(string_tweets)

In [23]:
additional_stop_word_list = ['need', 'get', 'go', 'time', 'done', 'got', 'pre', 'lol', 'wanna', 'actually',
                             'im', 'just', 'like', 'want', 'gotta', 'dont', 'rt', 'retweet', 'amp', 'rr',
                             'mar', 'america', 'work', 'going', 'gonna', 'buddy', 'did', 'ready', 'true', 
                             'fall', 'year', 'thing', 'world', 'new', 'video', 'make', 'look', 'sure', 
                             'sense', 'new', 'happen', 'difference', 'let', 'come', 'say', 'tweet', 'medium', 
                             'think', 'way', 'know', 'friend', 'tell', 'try', 'wait', 'start', 'count', 'lot',
                             'hear', 'ill', 'home', 'house', 'bout', 'lil', 'yes', 'help', 'talk', 'guess',
                             #Descriptive
                             'feeling', 'good', 'great', 'best', 'really', 'nice', 'feel', 'thank', 'hate', 
                             'love', 'beautiful', 'bad', 'hard',
                             #Day Times
                             'today', 'morning', 'day', 'tomorrow', 'early', 'tonight', 'night', 'hour', 
                             'daily', 'late', 'week']
                             
                             #Gym Stop Words
                             #'workout', 'morning', 'feel', 'day', 'gym', 'tomorrow', 'partner', 'let', 
                             #'sleep', 'start', 'tonight', 'leg', 'eat', 'early', 'come', 'sore', 'hour', 
                             #'clothe', 'exercise', 'lost_weight', 'lose', 'play', 'sport', 'know', 'people' 
                             #'thing', 'think', 'right', 'thank', 'say', 'way', 'try', 'tell', 'lose_weight', 
                             #'help', 'fat', 'diet', 'food', 'motivation']

In [24]:
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_word_list)

In [25]:
num_topics = 3

In [26]:
vectorizer = CountVectorizer(analyzer='word', max_features=100000, stop_words=stop_words)
tweet_counts = vectorizer.fit_transform(string_tweets)

In [27]:
transformer = TfidfTransformer(smooth_idf=False);
tweet_tfidf = transformer.fit_transform(tweet_counts)

In [28]:
tweet_tfidf_norm = normalize(tweet_tfidf, norm='l1', axis=1)

In [29]:
nmf = NMF(n_components=num_topics, init='nndsvd')

In [30]:
nmf.fit(tweet_tfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=3, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [31]:
top_words = 20
feature_names = vectorizer.get_feature_names()
topic_dict = {}
for i in range(num_topics):
    word_idx = nmf.components_[i].argsort()[:-top_words-1:-1] 
    words = [feature_names[idx] for idx in word_idx]
    topic_dict['Topic # ' + '{:02d}'.format(i+1)] = words

**Topic 1 and 2 are garbage. These topics are covering the tweets that accounts put out to appear real**

In [32]:
topic_names = ['exercise', 'exercise', 'political/news']

In [33]:
pd.DataFrame(topic_dict)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03
0,workout,news,exercise
1,gym,trump,eat
2,partner,local,right
3,kill,man,diet
4,sleep,police,run
5,leg,kill,walk
6,eat,politic,people
7,sore,state,ball
8,ass,shoot,lose_weight
9,clothe,politics,fat


In [34]:
t0 = time()
docweights = nmf.transform(tweet_tfidf)
print("done in %0.3fs." % (time() - t0))

done in 0.872s.


In [35]:
topic_df = pd.DataFrame({'topic':docweights.argmax(axis=1),
                         'tweet': string_tweets},
                  columns=['topic', 'tweet'])

In [36]:
topic_df[topic_df.topic==0].head()

Unnamed: 0,topic,tweet
3012,0,morning mariner musing manager scott_servai ex...
3400,0,mariner morning report the first day full squa...
6180,0,oso year later
8247,0,mayorsign
12147,0,sthelenclimber


In [37]:
topic_df[topic_df.topic==1].head()

Unnamed: 0,topic,tweet
0,1,israeli police say palestinian home west bank ...
1,1,former iaaf chief lamine diack face new corrup...
2,1,spacex_launch rocket month after accident then...
3,1,the late fatality fear crash arkansa news #news
4,1,danish graceland museum change name after laws...


In [38]:
true_topic_df = topic_df[topic_df.topic==1]

In [39]:
true_tweet_counts = vectorizer.fit_transform(true_topic_df.tweet)
true_tweet_tfidf = transformer.fit_transform(true_tweet_counts)

In [40]:
true_tweet_tfidf_norm = normalize(true_tweet_tfidf, norm='l1', axis=1)

In [41]:
num_topics = 5
nmf = NMF(n_components=num_topics, init='nndsvd')
nmf.fit(true_tweet_tfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=5, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [42]:
top_words = 20
feature_names = vectorizer.get_feature_names()
topic_dict = {}
for i in range(num_topics):
    word_idx = nmf.components_[i].argsort()[:-top_words-1:-1] 
    words = [feature_names[idx] for idx in word_idx]
    topic_dict['Topic # ' + '{:02d}'.format(i+1)] = words

In [43]:
topic_names = ['news', 'right', 'left', 'police/news', 'right']

In [44]:
pd.DataFrame(topic_dict)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05
0,news,trump,blacklivesmatter,man,people
1,state,president,black,local,black
2,topnews,donald,blacktwitter,police,white
3,topnew,politic,blm,sports,right
4,fake,break,staywoke,woman,god
5,kill,obama,blackskinisnotacrime,shoot,obama
6,syria,hillary,racism,kill,music
7,attack,clinton,policebrutality,sport,life
8,local,maga,cop,politics,vote
9,china,vote,support,arrest,american


In [45]:
t0 = time()
docweights = nmf.transform(true_tweet_tfidf)
print("done in %0.3fs." % (time() - t0))

done in 1.620s.


In [46]:
true_topic_df = pd.DataFrame({'topic':docweights.argmax(axis=1),
                              'tweet': true_topic_df.tweet},
                               columns=['topic', 'tweet'])

In [50]:
true_topic_df.tweet[0]


'israeli police say palestinian home west bank attack news #news'

In [51]:
true_df = full_df.merge(true_topic_df, how='right', right_index=True, left_index=True)

In [53]:
true_df.head()

Unnamed: 0,author_x,content_x,region,language,publish_date,following,followers,updates,post_type,account_type,retweet,account_category,new_june_2018,tweet_id,author_y,content_y,hashtags,mentions,topic,tweet
0,SEATTLE_POST,israeli police say palestinian home in west ba...,United States,English,2015-12-22 10:49:00,4887,12659,15470,,local,0,NewsFeed,0,679252328986238977,SEATTLE_POST,Israeli police say Palestinian home in West Ba...,{news},[],0,israeli police say palestinian home west bank ...
1,SEATTLE_POST,former iaaf chief lamine diack faces new corru...,United States,English,2015-12-22 11:16:00,4887,12659,15471,,local,0,NewsFeed,0,679259242541395972,SEATTLE_POST,Former IAAF chief Lamine Diack faces new corru...,{sports},[],3,former iaaf chief lamine diack face new corrup...
2,SEATTLE_POST,spacex launches rocket months after accident ...,United States,English,2015-12-22 11:40:00,4887,12659,15472,,local,0,NewsFeed,0,679265189372407808,SEATTLE_POST,SpaceX launches rocket 6 months after accident...,{local},[],3,spacex_launch rocket month after accident then...
3,SEATTLE_POST,the latest fatalities feared in i crash in ark...,United States,English,2015-12-22 12:20:00,4887,12659,15473,,local,0,NewsFeed,0,679275250069078016,SEATTLE_POST,The Latest: Fatalities feared in I-49 crash in...,{news},[],0,the late fatality fear crash arkansa news #news
4,SEATTLE_POST,danish graceland museum changes name after law...,United States,English,2015-12-22 13:07:00,4887,12657,15474,,local,0,NewsFeed,0,679287094682902528,SEATTLE_POST,Danish “Graceland” museum changes name after l...,{entertainment},[],3,danish graceland museum change name after laws...


In [52]:
analyser = sia()
sentiment = []
for sentence in true_df.content:
    ps = analyser.polarity_scores(sentence)
    sentiment.append(ps['compound'])
    

AttributeError: 'DataFrame' object has no attribute 'content'

In [220]:
true_df['sentiment'] = sentiment

In [224]:
#with open('true_df.pickle', 'wb') as file:
#    pickle.dump(true_df, file, protocol=pickle.HIGHEST_PROTOCOL)

In [222]:
true_df.head()

Unnamed: 0,author,content,region,language,publish_date,following,followers,updates,post_type,account_type,retweet,account_category,new_june_2018,tweet_id,topic,tweet,sentiment
0,SEATTLE_POST,israeli police say palestinian home in west ba...,United States,English,2015-12-22 10:49:00,4887,12659,15470,,local,0,NewsFeed,0,679252328986238977,1,israeli police say palestinian home west bank ...,-0.4588
1,SEATTLE_POST,former iaaf chief lamine diack faces new corru...,United States,English,2015-12-22 11:16:00,4887,12659,15471,,local,0,NewsFeed,0,679259242541395972,4,former iaaf chief lamine diack face new corrup...,-0.2732
2,SEATTLE_POST,spacex launches rocket months after accident ...,United States,English,2015-12-22 11:40:00,4887,12659,15472,,local,0,NewsFeed,0,679265189372407808,4,spacex_launch rocket month after accident then...,-0.4767
3,SEATTLE_POST,the latest fatalities feared in i crash in ark...,United States,English,2015-12-22 12:20:00,4887,12659,15473,,local,0,NewsFeed,0,679275250069078016,1,the late fatality fear crash arkansa news,-0.8689
4,SEATTLE_POST,danish graceland museum changes name after law...,United States,English,2015-12-22 13:07:00,4887,12657,15474,,local,0,NewsFeed,0,679287094682902528,2,danish graceland museum change name after laws...,0.2263
