In [38]:
import json
import pickle
import string
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# sklearn
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# Symbols stripped in addition to string.punctuation.
_EXTRA_PUNCTUATION = '’‘“”.–…�🇺🇸★➠'

# Single translation table that deletes punctuation, the extra symbols above
# and ASCII digits in one pass.
_STRIP_TABLE = str.maketrans(
    '', '', _EXTRA_PUNCTUATION + string.punctuation + string.digits
)

# Words dropped on top of NLTK's English stop-word list.
_CUSTOM_STOP_WORDS = (
    'rt', 'retweet', 'get', 'one', 'im', 'thing', 'dont', 'wow',
    'lol', 'amp', 'n', 'didnt', 'people', 'like', 'want', 'know', 'go',
    'think', 'need', 'right', 'good', 'would', 'going', 'never', 'see',
    'time', 'call', 'said', 'got', 'us', 'p', 'look', 'mr',
)


def _stop_word_set():
    """Return the combined stop-word set, building it on first use only."""
    cached = getattr(_stop_word_set, "_cache", None)
    if cached is None:
        # stopwords.words() is only consulted once; a frozenset gives O(1)
        # membership tests instead of scanning a list for every token.
        cached = frozenset(_CUSTOM_STOP_WORDS).union(stopwords.words('english'))
        _stop_word_set._cache = cached
    return cached


def custom_tokenizer(text):
    """Tokenize ``text`` into lower-case words with noise removed.

    Steps: delete punctuation (``string.punctuation`` plus a few extra
    symbols) and digits, lower-case the result, split it with
    ``word_tokenize``, then drop the custom and NLTK English stop words.
    No stemming is applied.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    cleaned = text.translate(_STRIP_TABLE).lower()
    stop_words = _stop_word_set()
    return [token for token in word_tokenize(cleaned) if token not in stop_words]

In [40]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load rtrolls_df.pkl when it comes from a trusted source.
with open("rtrolls_df.pkl", 'rb') as picklefile:
    df_rtrolls = pickle.load(picklefile)

# presumably maps topics to their words (per the filename) -- verify against
# whatever produced topics2words.json. Encoding is pinned so the read does not
# depend on the platform default.
with open('topics2words.json', 'r', encoding='utf-8') as fp:
    topic_dict = json.load(fp)

In [41]:
# Preview the first five rows of the loaded frame.
df_rtrolls.head(n=5)

Unnamed: 0,author,content,region,language,following,followers,updates,retweet,account_category,date,hour,day,topicnumber,strengthoftopic,week
0,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,1052,9636,253,0,RightTroll,2017-10-01,19,6,9,0.020203,143
1,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,1054,9637,254,0,RightTroll,2017-10-01,22,6,7,0.002535,143
2,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,1054,9637,255,1,RightTroll,2017-10-01,22,6,12,0.000442,143
3,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,1062,9642,256,0,RightTroll,2017-10-01,23,6,1,0.014205,143
4,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,1050,9645,246,1,RightTroll,2017-10-01,2,6,3,0.00102,143


In [42]:
# Number of rows with non-null `content` for every (week, topicnumber) pair.
# Only `content` is counted, instead of counting every column and then
# discarding all but one.
topic_weeks_df = (
    df_rtrolls
    .groupby(["week", "topicnumber"])["content"]
    .count()
    .reset_index()
)

In [43]:
# Topic numbers retained for the weekly export; edit this list to change the
# selection.
SELECTED_TOPICS = [0, 15, 2, 4, 19, 11, 16, 7, 5, 13]

# Keep only the rows whose topic is in the selection.
temp_df = topic_weeks_df[topic_weeks_df['topicnumber'].isin(SELECTED_TOPICS)]


In [44]:
# Spread the counts into a week x topicnumber table, fill the combinations
# that never occurred with 0, then flatten back to one row per
# (topicnumber, week); the value column comes out labelled 0.
weekly_topic_matrix = temp_df.pivot_table(
    values='content', index='week', columns='topicnumber'
)
data_fillna = weekly_topic_matrix.fillna(0).unstack().reset_index()

In [45]:
# Preview the flattened table (the count column is still labelled 0 here).
data_fillna.head(n=5)

Unnamed: 0,topicnumber,week,0
0,0,8,23.0
1,0,10,0.0
2,0,11,1.0
3,0,12,7.0
4,0,14,3.0


In [46]:
# The unstack/reset_index step leaves the count column labelled 0, so give it
# its name back and put the columns in the order used by the viz csv.
# Renaming by label (rather than assigning a positional list of names) keeps
# this cell safe to re-run: a second run is a no-op instead of swapping the
# `week` and `topicnumber` labels.
data_fillna = data_fillna.rename(columns={0: "content"})[
    ["week", "topicnumber", "content"]
]
data_fillna.head()

Unnamed: 0,week,topicnumber,content
0,8,0,23.0
1,10,0,0.0
2,11,0,1.0
3,12,0,7.0
4,14,0,3.0


In [47]:
# Order rows by week. Reassigning (rather than inplace=True) avoids mutating a
# frame that was produced by column selection, which can trigger pandas'
# SettingWithCopyWarning.
data_fillna = data_fillna.sort_values('week')


In [48]:
# Write the table to topicsbyweek.csv as a backup, leaving out the row index.
data_fillna.to_csv("topicsbyweek.csv", index=False)

In [62]:
# Show every row that falls in week 143.
df_rtrolls.loc[df_rtrolls['week'] == 143]

Unnamed: 0,author,content,region,language,following,followers,updates,retweet,account_category,date,hour,day,topicnumber,strengthoftopic,week
0,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,1052,9636,253,0,RightTroll,2017-10-01,19,6,9,0.020203,143
1,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,1054,9637,254,0,RightTroll,2017-10-01,22,6,7,0.002535,143
2,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,1054,9637,255,1,RightTroll,2017-10-01,22,6,12,0.000442,143
3,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,1062,9642,256,0,RightTroll,2017-10-01,23,6,1,0.014205,143
4,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,1050,9645,246,1,RightTroll,2017-10-01,2,6,3,0.001020,143
5,10_GOP,"Dan Bongino: ""Nobody trolls liberals better th...",Unknown,English,1050,9644,247,0,RightTroll,2017-10-01,2,6,19,0.023520,143
6,10_GOP,'@SenatorMenendez @CarmenYulinCruz Doesn't mat...,Unknown,English,1050,9644,249,0,RightTroll,2017-10-01,2,6,8,0.005730,143
7,10_GOP,"As much as I hate promoting CNN article, here ...",Unknown,English,1050,9646,250,0,RightTroll,2017-10-01,3,6,1,0.010097,143
8,10_GOP,After the 'genocide' remark from San Juan Mayo...,Unknown,English,1050,9646,251,0,RightTroll,2017-10-01,3,6,10,0.003204,143
9,10_GOP,After the 'genocide' remark from San Juan Mayo...,Unknown,English,1050,9646,251,0,RightTroll,2017-10-01,3,6,10,0.003204,143


In [55]:
# Summary statistics of the `week` column.
df_rtrolls['week'].describe()

count    70706.000000
mean       128.653891
std         26.521060
min          8.000000
25%        136.000000
50%        137.000000
75%        137.000000
max        168.000000
Name: week, dtype: float64