# Vocabulary analysis notebook

In [2]:
import pandas as pd
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

import ast
import nltk, re, pprint
from nltk import word_tokenize

# !pip install wordcloud
from wordcloud import WordCloud

import sys

Define header for the raw csv data:

In [4]:
# Column names of the raw CSV, in file order.
header_list = [
    "tweetid",
    "userid",
    "user_display_name",
    "user_screen_name",
    "user_reported_location",
    "user_profile_description",
    "user_profile_url",
    "follower_count",
    "following_count",
    "account_creation_date",
    "account_language",
    "tweet_language",
    "tweet_text",
    "tweet_time",
    "tweet_client_name",
    "in_reply_to_tweetid",
    "in_reply_to_userid",
    "quoted_tweet_tweetid",
    "is_retweet",
    "retweet_userid",
    "retweet_tweetid",
    "latitude",
    "longitude",
    "quote_count",
    "reply_count",
    "like_count",
    "retweet_count",
    "hashtags",
    "urls",
    "user_mentions",
    "poll_choices",
]
# Reverse lookup: column name -> position in header_list.
header_dict = dict(zip(header_list, range(len(header_list))))

Parsing function that loads data into pandas in chunks

In [5]:
def parse(f,chunksize = 100000, cols = None, test_bool = False, dtype = None,
          path = "twitter/csv/ira_tweets_csv_hashed.csv", total_lines = 9041*1000):
    """Stream a CSV file through ``f`` one pandas chunk at a time.

    Parameters
    ----------
    f : callable
        Called once per chunk with the chunk DataFrame; its return value is ignored.
    chunksize : int
        Rows per chunk, forwarded to ``pd.read_csv``.
    cols : list or None
        Forwarded to ``pd.read_csv`` as ``usecols``.
    test_bool : bool
        When true, only the first chunk is processed, 'Done' is printed and
        reading stops.
    dtype : forwarded to ``pd.read_csv`` unchanged.
    path : str
        CSV file to read. Defaults to the file this function originally hard-coded.
    total_lines : int
        Row-count estimate used only to scale the progress bar.
    """
    reader = pd.read_csv(path, chunksize = chunksize, usecols = cols, dtype = dtype)
    try:
        for i, gm_chunk in enumerate(reader):
            f(gm_chunk)
            if test_bool:
                print('Done')
                break
            # Fraction of the expected rows handled so far. Clamped to 1 so a low
            # row estimate cannot push the bar past 20 characters or above 100%.
            done = min(1.0, (i + 1) * chunksize / max(total_lines, 1))
            sys.stdout.write('\r')
            sys.stdout.write("[%-20s] %d%%" % ('=' * int(done * 20), int(done * 100)))
            sys.stdout.flush()
    finally:
        # Release the file handle even when stopping early or when f raises.
        reader.close()

# Vocabulary - words over time (per day)

### Part 1 - Preprocessing

In [6]:
cols = ["tweet_language", "tweet_text", "tweet_time"]
# NOTE(review): follower_dict is not referenced anywhere else in the visible notebook;
# kept only in case another cell relies on it.
follower_dict = defaultdict()

# English-only slices of every chunk, gathered here and concatenated once at the end.
# Concatenating inside the callback copies the growing frame on every chunk.
_en_chunks = []

def f(chunk):
    """Keep the rows of `chunk` whose tweet_language is 'en'."""
    _en_chunks.append(chunk[chunk.tweet_language == 'en'])

# set test_bool to True to use only a testing subset of data
parse(f, 10000, cols = cols, test_bool = False)

# Fall back to an empty frame with the expected columns when nothing was collected.
df_tweets_en = pd.concat(_en_chunks) if _en_chunks else pd.DataFrame(columns=cols)



In [6]:
# Display the collected English-language tweets.
df_tweets_en

Unnamed: 0,tweet_language,tweet_text,tweet_time
8,en,"As sun and cloud give way to moon and shadow, ...",2015-02-16 16:19
10,en,"Down in the comfort of strangers, I...",2014-07-28 23:02
11,en,Im laughing more than i should #USA,2014-07-28 09:24
12,en,"No, I'm not saying I'm sorry",2014-08-08 00:43
32,en,Laugh it all off in your face,2014-08-17 10:46
33,en,It takes courage to make a fool of yourself. –...,2014-12-27 08:54
34,en,I want to be normal and live with a boyfriend,2015-03-16 14:33
41,en,« Reverie is when ideas float in our mind with...,2015-01-08 07:45
42,en,Meet The Democrat’s Next “OBAMA” https://t.co/...,2017-06-09 15:39
43,en,Kellyanne Conway denies reports of leaking con...,2017-06-12 12:39


#### Tweet cleaning

In [7]:
# Matches any single code point from U+10000 to U+10FFFF.
EMOJIS = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)


def remove_emojis(text):
    """Return `text` with every character matched by EMOJIS deleted."""
    stripped = EMOJIS.sub(r'', text)
    return stripped

# remove_emojis('RT @Steven31015146: 🚨💥🚨💥🚨💥🚨💥BREAKING: NBC blabla RT bla #MAGA https://t.co/90ElTWQ2XA jep')

In [8]:
# Patterns shared by both cleaners, compiled once. Raw strings are used because
# '\w' and '\s' inside ordinary string literals are invalid escape sequences.
_RETWEET_PREFIX = re.compile(r'RT @\w*: ')
_HASHTAG = re.compile(r'#\w*')
_MENTION = re.compile(r'@\w*')
_OTHER_HANDLE = re.compile(r'[&$]\w*')
# NOTE(review): the two cleaners have always used slightly different URL patterns
# (the second also matches ' http' with no colon and nothing after it). Both are
# kept as they were so neither function changes behavior. Confirm whether they
# should be unified.
_URL_FOR_LIST = re.compile(r' http[s]?:[^\s]+')
_URL_FOR_SINGLE = re.compile(r' http[s]?:?([^\s]*)')


def _strip_twitter_markup(tweet, url_pattern):
    """Apply the cleaning steps to one tweet, using `url_pattern` for URLs."""
    # Remove twitter retweet handles (retweets)
    cleaned = _RETWEET_PREFIX.sub('', tweet)
    # Remove urls (the pattern requires a leading space)
    cleaned = url_pattern.sub('', cleaned)
    # Remove emojis
    cleaned = remove_emojis(cleaned)
    # Remove hashtags
    cleaned = _HASHTAG.sub('', cleaned)
    # Remove references to other twitter accounts
    cleaned = _MENTION.sub('', cleaned)
    # Remove other handles ('&...' and '$...' tokens)
    cleaned = _OTHER_HANDLE.sub('', cleaned)
    return cleaned


def clean_tweets(tweets):
    """Clean every tweet in the iterable `tweets`.

    Returns a list of strings, in input order, with retweet prefixes, URLs,
    emojis, hashtags, @-mentions and '&'/'$' handles removed.
    """
    return [_strip_twitter_markup(t, _URL_FOR_LIST) for t in tweets]


def clean_tweet(tweet):
    """Clean a single tweet string; the single-item counterpart of clean_tweets."""
    return _strip_twitter_markup(tweet, _URL_FOR_SINGLE)

In [9]:
# Add a column holding the cleaned text of every tweet, then display the frame.
df_tweets_en['clean_tweet'] = clean_tweets(df_tweets_en.tweet_text)
df_tweets_en

Unnamed: 0,tweet_language,tweet_text,tweet_time,clean_tweet
8,en,"As sun and cloud give way to moon and shadow, ...",2015-02-16 16:19,"As sun and cloud give way to moon and shadow, ..."
10,en,"Down in the comfort of strangers, I...",2014-07-28 23:02,"Down in the comfort of strangers, I..."
11,en,Im laughing more than i should #USA,2014-07-28 09:24,Im laughing more than i should
12,en,"No, I'm not saying I'm sorry",2014-08-08 00:43,"No, I'm not saying I'm sorry"
32,en,Laugh it all off in your face,2014-08-17 10:46,Laugh it all off in your face
33,en,It takes courage to make a fool of yourself. –...,2014-12-27 08:54,It takes courage to make a fool of yourself. –...
34,en,I want to be normal and live with a boyfriend,2015-03-16 14:33,I want to be normal and live with a boyfriend
41,en,« Reverie is when ideas float in our mind with...,2015-01-08 07:45,« Reverie is when ideas float in our mind with...
42,en,Meet The Democrat’s Next “OBAMA” https://t.co/...,2017-06-09 15:39,Meet The Democrat’s Next “OBAMA”
43,en,Kellyanne Conway denies reports of leaking con...,2017-06-12 12:39,Kellyanne Conway denies reports of leaking con...


#### Tweet tokenization

In [12]:
import nltk
from nltk.corpus import stopwords

stopwords_eng = stopwords.words('english')
stopwords_eng += ['would', 'http', 'https']
# Frozen copy used for membership tests. Looking words up in the list rescans it
# for every token. Rebuild this set if stopwords_eng is modified later.
_STOPWORDS_ENG = frozenset(stopwords_eng)

wnl = nltk.WordNetLemmatizer()

def tokenize_tweet(tweet, lemmatize=False):
    """Return the distinct tokens of `tweet` as a set (see tokenize_tweet_to_list)."""
    return set(tokenize_tweet_to_list(tweet, lemmatize))

def tokenize_tweet_to_list(tweet, lemmatize=False):
    """Split `tweet` into lower-cased word tokens, keeping duplicates and order.

    Tokens are runs of word characters. A token is dropped when its lower-cased
    form is a stopword, when it is at most two characters long, or when it
    consists only of digits. With `lemmatize=True` each kept token is passed
    through the WordNet lemmatizer.
    """
    kept = []
    for raw in nltk.regexp_tokenize(tweet, r'\w+'):
        word = raw.lower()
        if word in _STOPWORDS_ENG or len(raw) <= 2 or raw.isdigit():
            continue
        kept.append(wnl.lemmatize(word) if lemmatize else word)
    return kept

In [13]:
# Tokenize each cleaned tweet into a set of lemmatized tokens, then show the first row.
df_tweets_en['tokens'] = [tokenize_tweet(tweet, lemmatize=True) for tweet in df_tweets_en.clean_tweet]
df_tweets_en.iloc[0]

tweet_language                                                   en
tweet_text        As sun and cloud give way to moon and shadow, ...
tweet_time                                         2015-02-16 16:19
clean_tweet       As sun and cloud give way to moon and shadow, ...
tokens            {way, world, make, moon, rhythm, faith, sun, s...
Name: 8, dtype: object

Group tweets by month:

In [14]:
# Parse the timestamp strings so the tweets can be bucketed by calendar period.
df_tweets_en['tweet_time'] = pd.to_datetime(df_tweets_en['tweet_time'])

# One group per calendar month.
g = df_tweets_en.groupby(pd.Grouper(key='tweet_time', freq='M'))

# Each month becomes a list holding one token set per tweet.
df_monthly_grouped_tweets = g['tokens'].apply(list).to_frame(name="tweet_token_list")
df_monthly_grouped_tweets

Unnamed: 0_level_0,tweet_token_list
tweet_time,Unnamed: 1_level_1
2009-11-30,"[{correctly, keep, trying, swimming, lol, retw..."
2009-12-31,[]
2010-01-31,"[{itunes, album, reminder, link, post}]"
2010-02-28,"[{chile60th, see, patagonia, orbit, ever, day,..."
2010-03-31,"[{dam, alp, southern, zealand, big, new, man, ..."
2010-04-30,"[{world, 1f1dkq, hello, com, twitpic}]"
2010-05-31,"[{comic, way, best}]"
2010-06-30,"[{enjoy, cure, save, interval, birth, death}, ..."
2010-07-31,"[{buddy, year, whiz, quinn, sullivan, guitar, ..."
2010-08-31,"[{think, workday, two, need, choice, night, le..."


Functions to calculate TF-IDF:

In [15]:
def idf(w, docs):
    """Smoothed inverse document frequency of `w`.

    Computes log(len(docs) / (1 + number of documents containing w)). Because
    of the +1 in the denominator the result is zero or negative for a word that
    occurs in all, or all but one, of the documents.
    """
    containing = sum(1 for doc in docs if w in doc)
    return np.log(len(docs) / (1 + containing))

def tfidf(docs):
    """Return {word: term frequency * idf} over a collection of token containers.

    `docs` is a sequence of token collections (sets or lists). Term frequency is
    the total number of occurrences across all documents. The idf factor is the
    same formula as idf(). An empty `docs` yields an empty dict.
    """
    from collections import Counter

    if len(docs) == 0:
        return dict()

    term_freq = Counter(token for doc in docs for token in doc)

    # Count, in a single pass, how many documents contain each word. Calling
    # idf() per word would rescan every document for every vocabulary entry.
    doc_freq = Counter()
    for doc in docs:
        doc_freq.update(set(doc))

    n_docs = len(docs)
    return {w: count * np.log(n_docs / (1 + doc_freq[w])) for w, count in term_freq.items()}

## Part 2 - Create and save wordclouds for each month based on calculated TF-IDF

In [20]:
# Generate a word cloud image
import os
from os import path
from PIL import Image
# Directory holding the mask image: this file's directory when __file__ is defined, otherwise the current working directory.
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
# Load the mask image for the word cloud as a NumPy array.
twitter_mask = np.array(Image.open(path.join(d, "twitter-logo.jpg")))

def save_wordcloud(date, tf_idf, num_tweets):
    """Render a word cloud from `tf_idf` and save it as a PNG.

    Parameters
    ----------
    date : str
        Two '-'-separated parts. The parts are swapped in the file name, so
        "12-2012" is saved as wordcloud-2012-12.png. Also shown in the title.
    tf_idf : dict
        Word -> weight mapping handed to WordCloud.generate_from_frequencies.
    num_tweets : int
        Number shown in the figure title.
    """
    wordcloud = WordCloud(background_color="white", max_words=150, mask = twitter_mask).generate_from_frequencies(tf_idf)

    reverse_date = date.split("-")
    target = "vocab-wordcloud-lemmatized/wordcloud-%s-%s.png" % (reverse_date[1], reverse_date[0])
    # Saving fails if the output directory does not exist yet.
    os.makedirs(os.path.dirname(target), exist_ok=True)

    fig = plt.figure(figsize=(10, 10))
    try:
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title("Word-cloud based on TF-IDF of %d tweets (%s)" % (num_tweets, date))
        plt.axis("off")
        plt.savefig(target)
    finally:
        # Always release the figure; this function is called once per month in a loop.
        plt.close(fig)

In [17]:
# Sanity check on the first month. Scores can be negative because idf() takes
# log(len(docs) / (1 + number of documents containing the word)).
tfidf(df_monthly_grouped_tweets.iloc[0].tweet_token_list)

{'correctly': -0.6931471805599453,
 'keep': -0.6931471805599453,
 'trying': -0.6931471805599453,
 'swimming': -0.6931471805599453,
 'lol': -0.6931471805599453,
 'retweet': -0.6931471805599453}

#### Save only wordclouds having more than 20 tokens (words) in a month
**Note:** the following code block can take a long time to finish (a few hours at least).

In [21]:
# Render and save one word cloud per month, skipping months whose
# vocabulary has 20 distinct words or fewer.
for month_end, row in df_monthly_grouped_tweets.iterrows():
    monthly_docs = row.tweet_token_list
    tf_idf = tfidf(monthly_docs)
    if len(tf_idf) <= 20:
        continue
    date = "%d-%d" % (month_end.month, month_end.year)
    num_tweets = len(monthly_docs)
    print('working on %s' % date)
    save_wordcloud(date, tf_idf, num_tweets)

working on 12-2012
working on 1-2013
working on 4-2013
working on 5-2013
working on 6-2013
working on 7-2013
working on 8-2013
working on 9-2013
working on 10-2013
working on 11-2013
working on 12-2013
working on 1-2014
working on 2-2014
working on 3-2014
working on 4-2014
working on 5-2014
working on 6-2014
working on 7-2014
working on 8-2014
working on 9-2014
working on 10-2014
working on 11-2014
working on 12-2014
working on 1-2015
working on 2-2015
working on 3-2015
working on 4-2015
working on 5-2015
working on 6-2015
working on 7-2015
working on 8-2015
working on 9-2015
working on 10-2015
working on 11-2015
working on 12-2015
working on 1-2016
working on 2-2016
working on 3-2016
working on 4-2016
working on 5-2016
working on 6-2016
working on 7-2016
working on 8-2016
working on 9-2016
working on 10-2016
working on 11-2016
working on 12-2016
working on 1-2017
working on 2-2017
working on 3-2017
working on 4-2017
working on 5-2017
working on 6-2017
working on 7-2017
working on 8-20

## Part 3 - Grouping tweets

#### Group tokens by day and save to file
In case of exceeding memory limit, we will later load the data from text file directly.

In [None]:
# One TF-IDF dictionary per month, computed from that month's list of token sets.
df_monthly_grouped_tweets['tfidf'] = list(map(tfidf, df_monthly_grouped_tweets['tweet_token_list']))

In [None]:
df_tweets_en['tweet_time'] = pd.to_datetime(df_tweets_en['tweet_time'])


def _flatten_token_sets(token_sets):
    """Concatenate the tokens of every tweet in a group into one flat list."""
    flat = []
    for tokens in token_sets:
        flat.extend(tokens)
    return flat


# One group per calendar day.
g = df_tweets_en.groupby(pd.Grouper(key='tweet_time', freq='D'))

# Each day becomes a single flat list with the tokens of all of its tweets.
df_daily_grouped_tokens = g['tokens'].apply(_flatten_token_sets)
df_daily_grouped_tokens

In [None]:
# Save daily grouped tokens.
# header=False is explicit because the file is read back with names=[...], which
# treats every line as data, and the Series.to_csv default for `header` differs
# between pandas versions.
df_daily_grouped_tokens.to_csv('daily_en_tokens.csv', header=False)

#### Group tokens by month and save to file

In [None]:
# Each row is one day: the date and the string form of that day's token list.
df_daily_tokens = pd.read_csv('daily_en_tokens.csv', names=['day','tokens'])
# Drop days without tokens. .copy() gives an independent frame, so the column
# assignment below does not write into a slice of the frame that was read.
df_daily_tokens = df_daily_tokens[df_daily_tokens.tokens != '[]'].copy()
df_daily_tokens['day'] = pd.to_datetime(df_daily_tokens.day)
df_daily_tokens

In [None]:
# Bucket the per-day rows by calendar month.
gm = df_daily_tokens.groupby(pd.Grouper(key='day', freq='M'))

# Collect each month's daily token strings into one list (one entry per day).
df_monthly_grouped_tokens = gm['tokens'].apply(list)
df_monthly_grouped_tokens

To use the list of tokens, we first need to change it from string to real list. "[a, b]" -> [a, b]

In [None]:
def evaluate(tokens_list):
    """Turn string-encoded literals into Python objects, e.g. "['a', 'b']" -> ['a', 'b'].

    Each string in `tokens_list` is parsed with ast.literal_eval. Entries that
    are not strings are returned unchanged, so running this again on data that
    has already been converted does not fail.
    """
    parsed = []
    for tl in tokens_list:
        parsed.append(ast.literal_eval(tl) if isinstance(tl, str) else tl)
    return parsed
# Run evaluate over each month's list of string-encoded token lists.
df_monthly_grouped_tokens = df_monthly_grouped_tokens.apply(evaluate)

In [None]:
# Display the monthly token lists after conversion.
df_monthly_grouped_tokens

In [None]:
# Save monthly grouped token lists.
# header=False is explicit because the file is read back with names=[...], which
# treats every line as data, and the Series.to_csv default for `header` differs
# between pandas versions.
df_monthly_grouped_tokens.to_csv('monthly_en_token_lists.csv', header=False)

## Part 4 - TF-IDF for daily tweets grouped by month

In [None]:
# Load the monthly token lists and index them by month. token_lists is left as
# the raw string read from the file.
df_monthly_tokens = (
    pd.read_csv('monthly_en_token_lists.csv', names=['date', 'token_lists'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
df_monthly_tokens

Get list of all tokens throughout the dataset:

In [None]:
# Tokenize every month's token_lists value and gather all tokens in one flat list.
all_tokens = list()
for row in df_monthly_tokens.token_lists:
    all_tokens.extend(tokenize_tweet_to_list(row))
len(all_tokens)

Lemmatize tokens:

In [None]:
# Lemmatize tokens: run every token through the WordNet lemmatizer and
# report how many tokens there are.
wnl = nltk.WordNetLemmatizer()
all_tokens_lemmatized = [wnl.lemmatize(t) for t in all_tokens]
len(all_tokens_lemmatized)

In [None]:
# Rank words on the lemmatized tokens. The monthly counts written to file are
# computed on lemmatized tokens, so the top-word list must use the same forms,
# otherwise a top word that is not its own lemma would always count as 0.
all_most_common = nltk.FreqDist(all_tokens_lemmatized).most_common(150)
all_most_common

Save the monthly frequencies for the 150 most common tokens (words):

In [None]:
# Keep only the word from each (word, count) pair.
top_words = [word for word, _count in all_most_common]
len(top_words)

In [None]:
wnl = nltk.WordNetLemmatizer()
# Save monthly frequencies of the top words to file: a header row, then one row
# per month with a "year-month" label followed by one count per top word.
# The encoding is explicit because tokens may contain non-ASCII word characters
# and the platform default encoding is not always UTF-8.
with open('monthly_top_words_frequencies.csv', 'w', encoding='utf-8') as the_file:
    the_file.write('date,' + ','.join(top_words) + '\n')
    for month, row in df_monthly_tokens.iterrows():
        tokens_lemmatized = [wnl.lemmatize(t) for t in tokenize_tweet_to_list(row.token_lists)]
        freq_dist = nltk.FreqDist(tokens_lemmatized)

        # Words that do not occur in this month are written as 0.
        counts = ["%d" % freq_dist.get(word, 0) for word in top_words]

        the_file.write("%d-%d" % (month.year, month.month) + ',' + ','.join(counts) + '\n')

## Part 5 - Plot monthly summary of vocabulary based on TF-IDF

In [1]:
import datetime

In [36]:
# Months kept for plotting (both bounds are included in the slice).
window_start = datetime.date(year=2014, month=1, day=1)
window_end = datetime.date(year=2018, month=9, day=1)

df_monthly_top_words_freq = (
    pd.read_csv('monthly_top_words_frequencies.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
    .loc[window_start:window_end]
)
# Replace the timestamps with short labels such as "Jan-14" for the x axis.
df_monthly_top_words_freq.index = df_monthly_top_words_freq.index.strftime('%b-%y')


In [37]:
# ad-hoc join counts for words "america" and "american"
# Guarded so that re-running the cell after "american" has already been merged
# and dropped is a no-op instead of a KeyError.
if "american" in df_monthly_top_words_freq.columns:
    df_monthly_top_words_freq["america"] = df_monthly_top_words_freq["america"].add(df_monthly_top_words_freq["american"])
    df_monthly_top_words_freq = df_monthly_top_words_freq.drop(columns=['american'])

In [8]:
import plotly.plotly as py
import cufflinks as cf
import pandas as pd
import numpy as np
import plotly

In [38]:
import os

# SECURITY: credentials are read from the environment instead of being stored in
# the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running this cell.
# The API key that used to be hard-coded here should be treated as leaked and rotated.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

labels = list(df_monthly_top_words_freq)

# Stacked bar chart of the monthly counts for the first 100 word columns.
df_monthly_top_words_freq.iplot(kind='bar', barmode='stack', fill=True, y=labels[0:100],  xTitle='Date', yTitle='Number of tweets', colorscale='spectral')

#### Selecting words for later manual filtering

In [None]:
# Maximum and mean monthly count for each of the first 100 word columns.
word_mean = {
    col: {"max": df_monthly_top_words_freq[col].max(), "mean": df_monthly_top_words_freq[col].mean()}
    for col in list(df_monthly_top_words_freq)[:100]
}

word_mean

In [None]:
# Words whose highest monthly count exceeds 5000, with their max/mean statistics.
[(word, word_mean[word]) for word in word_mean if word_mean[word]["max"] > 5000]

**Selected words:** america, breaking, hillary, president, obama, people, new, trump