# Text Analysis

Below is some basic initilization, which involves connecting to our remote MongoDB database.

In [16]:
# Custome helper file
import elections_helper as helper
from elections_helper import display_table

import numpy as np
import re, string, operator, pickle, nltk, pprint

from nltk.tokenize import TweetTokenizer
from __future__ import division

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer

### Globally Defined Variables

In [3]:
# retrieve common stop words form nltk
stopwords = nltk.corpus.stopwords.words('english')

# Retrieved dicationary containing word mapped with its happiness index
words_happiness = pickle.load(open('./data_files/sentiment.pickle','rb'))

In [4]:
client = helper.setup_mongo_client(properties_file='./properties/db.properties')

tweet_collection, _ = helper.get_collections(client)

## Understanding our Data

Initially, let's display some of the content in the tweets, practice making queries and looking and commonly used words.

In [5]:
N = 5
cur = tweet_collection.find({}, projection={'text': True}, limit = N)

for i in xrange(N):
    tweet_text = cur.next()
    print "Tweet #%s: %s" % (i+1, tweet_text['text'])

Tweet #1: RT @Ethan_Booker: TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is beautiful…
Tweet #2: TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is bea… https://t.co/IpquTCKM9F
Tweet #3: RT @FoxNews: #DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU #Election2016…
Tweet #4: #DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU… https://t.co/eMohElI9aR
Tweet #5: WikiLeaks: DNC And CNN Colluded On Questions For Trump, Cruz https://t.co/VDETfDgyLi


Things to note:
* Excess text for a retweeted message. ex. RT @MassDeception1:
* Links in tweets
* Mentions in tweets

First off, we don't need to include retweeted message since they will have the same sentiment as the original. We can use regular expressions to filter out links in the format https://t.co/... and mentions as well.

In [6]:
all_hashtags = set([])

def tweet_tokenizer(text):
    '''
    
    '''
    # matches mentions and twitter links
    remove_pattern = r"(?:@[\w]*|https://t.co/[\w]*)" 
   
    # matches text found after hashtag
    hashtag_pattern = r"#([^\s]*)"
    
    # pattern to remove puncuation except punctuation x
    punc_pattern = lambda x:  r"[^\w\d" + x + "\s]+"
    
    # match words from hashtag
    hashtag_word_pattern = r'([A-Z][^A-Z]*|[a-z][a-z]*)'
    
    # create a list of all hashtags (not including hash symbol)
    hashtags = re.findall(hashtag_pattern, text)
    
    # add hashtags to global set
    all_hashtags.update(hashtags)
    
    # remove punctuation from hashtags in case it exists
    hashtags = [re.sub(punc_pattern(''),' ', s) for s in hashtags]
    
    # split hashtag into words
    hashtag_tokens = re.findall(hashtag_word_pattern, ' '.join(hashtags))

    # remove mentions and links from tweet
    text = re.sub(remove_pattern,'', text)
    
    # replaces punctuation with a space, removes hashtags
    text = re.sub(punc_pattern("'#"),' ', text)
    
    # create a combined string of tweet text and words from hashtag
    text = text + ' '.join(hashtag_tokens)
    
    # split text at whitespace
    tokens = set(text.split())
    
    # remove if not in the alphabet and not in stopwords, set to lowercase
    return [t.lower() for t in tokens if t.lower() not in stopwords and t.isalpha()]

In [7]:
N = 5
cur = tweet_collection.find({'retweeted': False}, projection={'text': True}, limit = N)

for i in xrange(N):
    tweet = cur.next()
    text = tweet['text']
    tokens = tweet_tokenizer(text)
    
    print "Tweet #%s: %s" % (i+1, tokens)
    print text

Tweet #1: [u'ear', u'trump', u'bea', u'mask', u'like', u'whispers', u'hillary', u'sou', u'aide', u'hideous', u'look']
TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is bea… https://t.co/IpquTCKM9F
Tweet #2: [u'daughter', u'trump', u'hillary', u'donald', u'lara', u'clinton', u'interview', u'new', u'law', u'blasted']
#DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU… https://t.co/eMohElI9aR
Tweet #3: [u'dnc', u'cruz', u'questions', u'cnn', u'wikileaks', u'colluded', u'trump']
WikiLeaks: DNC And CNN Colluded On Questions For Trump, Cruz https://t.co/VDETfDgyLi
Tweet #4: [u'never', u'clinton', u'believed', u'emails', u'person', u'hillary', u'johnpodesta', u'wikileaks', u'podesta']
Hillary Clinton is not a person who can be believed #NeverHillary #wikileaks #johnpodesta #PodestaEmails… https://t.co/ity0m0nJLL
Tweet #5: [u'obama', u'clinton', u'annihilate', u'would', u'

In [9]:
def tweet_iterable(text_only=True):
    cur = tweet_collection.find({'retweeted': False})
    for tweet in cur:
        if text_only:
            yield tweet['text']
        else:
            yield tweet

In [10]:
def merge_dicts(*dict_args):
    '''
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    '''
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

In [11]:
def get_dict_representation(X, feature_names, merge=False):
    '''
    Convert sparse matrix representation to dictionary.
    '''
    dict_vectorizer = DictVectorizer()

    # set feature names so dictionaries can be unpacked
    dict_vectorizer.feature_names_ = feature_names

    # merge dictionaries
    if merge:
        return merge_dicts(*dict_vectorizer.inverse_transform(X))
    # keep seperate
    else:
        return dict_vectorizer.inverse_transform(X)

In [12]:
# def display_table(data, title=None, limit=20, **kwargs):
#     if title:            
#         print title + ' (limited to %s results)' % limit
#     if type(data[0]) == tuple:
#         data = [[str(tup[0]), str(tup[1])] for tup in data[:limit]]
#     print tabulate(data[:limit], tablefmt="fancy_grid", **kwargs)
#     print '\n'

In [13]:
def sort_dict(d, reverse=True):
    return sorted(d.items(), key=operator.itemgetter(1), reverse=reverse)

In [14]:
# Create a count vectorizer to convert tweets to bag of word representation
count_vectorizer = CountVectorizer(tokenizer=tweet_tokenizer)

# Create a matrix passing in a function that iterates over each tweet
count = count_vectorizer.fit_transform(tweet_iterable())

# Save the indices of
feature_names = count_vectorizer.get_feature_names()

# Get a dictionary representation of the bag of words matrix
# This can be used to create a tweet id to token dictionary
count_dict = get_dict_representation(count, feature_names)

In [17]:
unique_words = merge_dicts(*count_dict).keys()

freq_dist = {}
for tweet_tokens in count_dict:
    for k in tweet_tokens.keys():
        if freq_dist.get(k,0) == 0:
            freq_dist[k] = 1
        else:
            freq_dist[k] += 1
            
print "Total number of unique words is %s\n" % len(unique_words)
sorted_freq_dist = sort_dict(freq_dist)
display_table(sorted_freq_dist, title="Most frequent words from Tweets", headers=['Words','Frequency'])

Total number of unique words is 36971

Most frequent words from Tweets (limited to 20 results)
╒═══════════════════════╤═════════════╕
│ Words                 │   Frequency │
╞═══════════════════════╪═════════════╡
│ trump                 │       39772 │
├───────────────────────┼─────────────┤
│ clinton               │        6660 │
├───────────────────────┼─────────────┤
│ electionday           │        6602 │
├───────────────────────┼─────────────┤
│ donald                │        6514 │
├───────────────────────┼─────────────┤
│ president             │        5959 │
├───────────────────────┼─────────────┤
│ america               │        4803 │
├───────────────────────┼─────────────┤
│ hillary               │        4767 │
├───────────────────────┼─────────────┤
│ election              │        4617 │
├───────────────────────┼─────────────┤
│ vote                  │        4176 │
├───────────────────────┼─────────────┤
│ people                │        3268 │
├───────────────────────┼

In [18]:
# Create a transformer to convert to a tf-idf representation
tfidf_transformer = TfidfTransformer(sublinear_tf=True)

# Calculate tf-idf using bag of words matrix
tfidf = tfidf_transformer.fit_transform(count)

# Create a dictionary representation of the sparse matrix
tfidf_dict = get_dict_representation(tfidf, feature_names, merge=True)

In [19]:
sorted_tfidf = sort_dict(tfidf_dict)
display_table(sorted_tfidf, title="Highest TF-IDF words from Tweets", headers=['Words','TF-IDF'])

# lets find out how frequent the top words are
tfidf_words, _ = zip(*sorted_tfidf)
data = [ [word[0], freq_dist[word[0]]] for word in sorted_tfidf]

display_table(data, title="Highest TF-IDF words from Tweets", headers=['Words','Frequency'])

Highest TF-IDF words from Tweets (limited to 20 results)
╒═════════════════════════════╤══════════╕
│ Words                       │   TF-IDF │
╞═════════════════════════════╪══════════╡
│ oceans                      │        1 │
├─────────────────────────────┼──────────┤
│ trumpette                   │        1 │
├─────────────────────────────┼──────────┤
│ tempting                    │        1 │
├─────────────────────────────┼──────────┤
│ safe                        │        1 │
├─────────────────────────────┼──────────┤
│ happenelectionfinalthoughts │        1 │
├─────────────────────────────┼──────────┤
│ kitchen                     │        1 │
├─────────────────────────────┼──────────┤
│ nervio                      │        1 │
├─────────────────────────────┼──────────┤
│ clintonclintonvstrump       │        1 │
├─────────────────────────────┼──────────┤
│ bawbag                      │        1 │
├─────────────────────────────┼──────────┤
│ triumphedtrump              │        1

# Sentiment Analysis

In [20]:
def get_sentiment(tokens):
    
    combined_happiness = 0
    words_with_no_sentiment = set()
    words_sentiment_count = 0
    
    # Go through each token and if we have a sentiment for it, add it to the combined happiness score
    for token in tokens:
        # get the happiness value, otherwise return zero
        happiest_value = words_happiness.get(token,0)
        
        # if a happiness value exists keep track 
        if happiest_value != 0:
            combined_happiness += happiest_value
            words_sentiment_count += 1
        # save the words that have no happiness index
        else:
            words_with_no_sentiment.add(token)
    
    # Safe check to avoid division by 0
    if combined_happiness == 0: 
        avg_sentiment_score = 0 
    else: 
        avg_sentiment_score = combined_happiness / words_sentiment_count
    
    return avg_sentiment_score, words_with_no_sentiment

In [21]:
tweet_sentiment = {}
tweets_with_no_sentiment = []
all_words_with_no_sentiment = set()

for tweet in tweet_iterable(text_only=False):
    text = tweet['text']
    _id = tweet['_id']
    
    tokens = tweet_tokenizer(text)
    
    avg_sentiment_score, words_with_no_sentiment = get_sentiment(tokens)
    
    # update set of words with no sentiment
    if len(words_with_no_sentiment) > 0:
        all_words_with_no_sentiment = all_words_with_no_sentiment.union(words_with_no_sentiment)
    
    # If we didn't find any sentiment for the tweet, save the tweet id
    if avg_sentiment_score == 0:
        tweets_with_no_sentiment.append(_id)
    # otherwise save the sentiment to the dictionary
    else:
        tweet_sentiment[_id] = avg_sentiment_score

In [22]:
print len(tweet_sentiment)
cur = tweet_collection.find({'retweeted': False})
print cur.count()
print len(tweets_with_no_sentiment)
print len(all_words_with_no_sentiment)

71265
71370
105
26128


In [25]:
values = tweet_sentiment.values()

average_sentiment = np.average(values)
std_sentiment = np.std(values)
happiest_value = max(values)
saddest_value = min(values)

tweet_percent = (len(tweet_sentiment) / (len(tweets_with_no_sentiment) + len(tweet_sentiment))) * 100
word_percent = (1 - len(all_words_with_no_sentiment) / len(unique_words)) * 100

happiest_tweet_id = tweet_sentiment.keys()[values.index(happiest_value)]
saddest_tweet_id = tweet_sentiment.keys()[values.index(saddest_value)]

happy_tweet = tweet_collection.find_one({'_id': happiest_tweet_id})['text']
sad_tweet = tweet_collection.find_one({'_id': saddest_tweet_id})['text']

print "The average sentiment of tweet sentiment is   %2.2f" % average_sentiment
print "The stanard deviation of tweet sentiment is   %2.2f" % std_sentiment
print ''
print "Percentage of tweets that have a sentiment    %2.2f%%" % tweet_percent
print "Percentage of words that have a sentiment     %2.2f%%" % word_percent
print ''
print "The happiest tweet is                         %2.2f" % happiest_value
print "Tweet text: %s" % happy_tweet
print ''
print "The saddest tweet is                          %2.2f" % saddest_value
print "Tweet text: %s" % sad_tweet

The average sentiment of tweet sentiment is   5.57
The stanard deviation of tweet sentiment is   0.47

Percentage of tweets that have a sentiment    99.85%
Percentage of words that have a sentiment     29.33%

The happiest tweet is                         8.18
Tweet text: @IcomOfficiel @UNESCO Excellent.
BUT coinciding with https://t.co/2UoluNULzY

The saddest tweet is                          1.56
Tweet text: @xtremevicky "Harambe died for this?"
#Elections2016 😂 https://t.co/ygtf0tnjl4


In [None]:
for w in tweet_tokenizer(happy_tweet):
    print words_happiness.get(w,0)

In [None]:
headers = ['Tweet ID','Happiness Index']

display_table(sort_dict(tweet_sentiment), title="Happiest Tweets", limit=10, headers=headers)

display_table(sort_dict(tweet_sentiment, reverse=False), title="Unhappiest Tweets", limit=10, headers=headers)

In [26]:
sad_tweet

u'@xtremevicky "Harambe died for this?"\n#Elections2016 \U0001f602 https://t.co/ygtf0tnjl4'