# Text Analysis

Below is some basic initilization, which involves connecting to our remote MongoDB database.

In [352]:
# Custom helper file
import elections_helper as helper
from elections_helper import display_table

import numpy as np
import re, string, operator, pickle, nltk, pprint, math

# from nltk.stem.snowball import SnowballStemmer

from __future__ import division

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer

### Globally Defined Variables

In [None]:
# Twitter users really like these words but we just don't get it.
other_stopwords = ['like','get']

# retrieve common stop words form nltk
stopwords = set(nltk.corpus.stopwords.words('english') + other_stopwords)

# Retrieved dicationary containing word mapped with its happiness index
words_happiness = pickle.load(open('./data_files/sentiment.pickle','rb'))

### Connect to MongoDB

In [3]:
# Connect to MongoDB
client = helper.setup_mongo_client(properties_file='./properties/db.properties')

tweet_collection, _ = helper.get_collections(client)

## Understanding our Data

Initially, let's display some of the content in the tweets, practice making queries and looking and commonly used words.

In [4]:
N = 5
cur = tweet_collection.find({}, projection={'text': True}, limit = N)

for i in xrange(N):
    tweet_text = cur.next()
    print "Tweet #%s: %s" % (i+1, tweet_text['text'])

Tweet #1: RT @Ethan_Booker: TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is beautiful…
Tweet #2: TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is bea… https://t.co/IpquTCKM9F
Tweet #3: RT @FoxNews: #DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU #Election2016…
Tweet #4: #DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU… https://t.co/eMohElI9aR
Tweet #5: WikiLeaks: DNC And CNN Colluded On Questions For Trump, Cruz https://t.co/VDETfDgyLi


Things to note:
* Excess text for a retweeted message. ex. RT @MassDeception1:
* Links in tweets
* Mentions in tweets

First off, we don't need to include retweeted message since they will have the same sentiment as the original. We can use regular expressions to filter out links in the format https://t.co/... and mentions as well.

In [353]:
all_hashtags = set([])

def tweet_tokenizer(text, extra_stopwords=set([])):
    '''
    
    '''
    # matches mentions, ampersands and twitter links
    remove_pattern = r"(?:@[\w]*|https://t.co/[\w]*|&[\w]*)" 
   
    # matches text found after hashtag
    hashtag_pattern = r"#([^\s]*)"
    
    # pattern to remove puncuation except punctuation x
    punc_pattern = lambda x:  r"[^\w\d" + x + "\s]+"
    
    # match words from hashtag
    hashtag_word_pattern = r'([A-Z][^A-Z]*|[a-z][a-z]*)'
    
    # create a list of all hashtags (not including hash symbol)
    hashtags = re.findall(hashtag_pattern, text)
    
    # add hashtags to global set
    all_hashtags.update(hashtags)
    
    # remove punctuation from hashtags in case it exists
    hashtags = [re.sub(punc_pattern(''),' ', s) for s in hashtags]
    
    # split hashtag into words
    hashtag_tokens = re.findall(hashtag_word_pattern, ' '.join(hashtags))

    # remove mentions and links from tweet
    text = re.sub(remove_pattern,'', text)
    
    # replaces punctuation with a space, removes hashtags
    text = re.sub(punc_pattern("'#"),' ', text)
    
    # create a combined string of tweet text and words from hashtag
    text = text + ' '.join(hashtag_tokens)
    
    # split text at whitespace
    tokens = set(text.split())
    
    # try stemming
    # stemmer = SnowballStemmer("english")
    # stemmer.stem(t.lower())
    
    # remove if not in the alphabet and not in stopwords, set to lowercase
    return [t.lower() for t in tokens \
                if t.lower() not in stopwords.union(extra_stopwords) and t.isalpha() and len(t) > 2]

In [6]:
N = 5
cur = tweet_collection.find({'retweeted': False}, projection={'text': True}, limit = N)

for i in xrange(N):
    tweet = cur.next()
    text = tweet['text']
    tokens = tweet_tokenizer(text)
    
    print "Tweet #%s: %s" % (i+1, tokens)
    print text

Tweet #1: [u'ear', u'trump', u'bea', u'mask', u'like', u'whispers', u'hillary', u'sou', u'aide', u'hideous', u'look']
TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is bea… https://t.co/IpquTCKM9F
Tweet #2: [u'daughter', u'trump', u'hillary', u'donald', u'lara', u'clinton', u'interview', u'new', u'law', u'blasted']
#DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU… https://t.co/eMohElI9aR
Tweet #3: [u'dnc', u'cruz', u'questions', u'cnn', u'wikileaks', u'colluded', u'trump']
WikiLeaks: DNC And CNN Colluded On Questions For Trump, Cruz https://t.co/VDETfDgyLi
Tweet #4: [u'never', u'clinton', u'believed', u'emails', u'person', u'hillary', u'johnpodesta', u'wikileaks', u'podesta']
Hillary Clinton is not a person who can be believed #NeverHillary #wikileaks #johnpodesta #PodestaEmails… https://t.co/ity0m0nJLL
Tweet #5: [u'obama', u'clinton', u'annihilate', u'would', u'

In [361]:
def tweet_generator(query, get_string=None):
    
    # if query is a list we need to aggregate
    if type(query) is list:
        cur = tweet_collection.aggregate(query)
    else:
        cur = tweet_collection.find(query)

    for document in cur:
        _id = document['_id']
        if get_string:
            result = get_string(document)
            if result:
                yield (result, _id)
        # otherwise yield document
        else:
            yield (document, _id)
    

In [8]:
def merge_dicts(*dict_args):
    '''
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    '''
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

In [9]:
def get_dict_representation(X, feature_names, merge=False):
    '''
    Convert sparse matrix representation to dictionary.
    '''
    dict_vectorizer = DictVectorizer()

    # set feature names so dictionaries can be unpacked
    dict_vectorizer.feature_names_ = feature_names

    # merge dictionaries
    if merge:
        return merge_dicts(*dict_vectorizer.inverse_transform(X))
    # keep seperate
    else:
        return dict_vectorizer.inverse_transform(X)

In [10]:
# def display_table(data, title=None, limit=20, **kwargs):
#     if title:            
#         print title + ' (limited to %s results)' % limit
#     if type(data[0]) == tuple:
#         data = [[str(tup[0]), str(tup[1])] for tup in data[:limit]]
#     print tabulate(data[:limit], tablefmt="fancy_grid", **kwargs)
#     print '\n'

In [11]:
def sort_dict(d, reverse=True):
    return sorted(d.items(), key=operator.itemgetter(1), reverse=reverse)

## Calculate Frequency Distribution and TF-IDF

For TF-IDF, we will combine all the tweets from the same location and same query since the document size for each individual tweet is too small.

In [360]:
def get_bag_words(query, get_string=None, extra_stopwords=set([])):
    # Create a count vectorizer to convert tweets to bag of word representation
    # count_vectorizer = CountVectorizer(tokenizer=tokenizer)
    
    # gen = tweet_generator(query, get_string)

    # Create a matrix passing in a generator containing tweets
    # bag_of_words = count_vectorizer.fit_transform(gen)
    bag_words = {}
    words = []
    count = 0
    
    for text, _id in tweet_generator(query, get_string):
        
        tokens = tweet_tokenizer(text, extra_stopwords)
        
        bag_words[_id] = tokens
        
        words.extend(tokens)
        
        count += 1
        
    print "Total number of items retrieved is %s" % count

    # Save the indices of
    # feature_names = count_vectorizer.get_feature_names()

    # Get a dictionary representation of the bag of words matrix
    # This can be used to create a tweet id to token dictionary
    # bag_of_words_dictionary = get_dict_representation(bag_of_words, feature_names)
    
    # return bag_of_words, bag_of_words_dictionary, feature_names
    return bag_words, set(words)

In [196]:
def get_term_frequency(bag_words):
    
    tf = {}
    
#     def word_count(inner):
#         # For each word from the list of tokens
#         for k in inner.keys():
#             # if the key does not exist, set to 1 
#             if freq_dist.get(k,0) == 0:
#                 freq_dist[k] = 1
#             # add one if the key exists
#             else:
#                 freq_dist[k] += 1
    
#     if type(bag_of_words) is list:
#         for inner in bag_of_words:
#             word_count(inner)
#     else:
#         word_count(X)

    for _, tokens in bag_words.iteritems():
        for word in tokens:
            # if the key does not exist, set to 1 
            if tf.get(word,0) == 0:
                tf[word] = 1
            # add one if the key exists
            else:
                tf[word] += 1
        
    return tf, sort_dict(tf)
    

In [322]:
def get_tfidf(bag_words, tf, normalize=True, min_frequency=1):
    
    idf = {}
    tfidf = {}
    
    for _id, tokens in bag_words.iteritems():
        
        tfidf[_id] = {}
        
        for word in set(tokens):
            
            tfidf[_id][word] = None
            
            # if the key does not exist, set to 1 
            if idf.get(word,0) == 0:
                idf[word] = 1
            # add one if the key exists
            else:
                idf[word] += 1
                
    N = len(bag_words)
    print "Size of corpus (number of documents) %s" % N
    
    for _id, inner in tfidf.iteritems():
        for word in inner.keys():
            frequency = tf[word]
            if frequency <= min_frequency:
                continue
            if normalize:
                frequency = (1 + math.log(frequency,10))

            tfidf[_id][word] = math.log(1 + N / idf[word],10) * frequency
    
    # Create a transformer to convert to a tf-idf representation
    # tfidf_transformer = TfidfTransformer(sublinear_tf=True)

    # Calculate tf-idf using bag of words matrix
    # tfidf = tfidf_transformer.fit_transform(bag_of_words)

    # Create a dictionary representation of the sparse matrix
    # tfidf_dictionary = get_dict_representation(tfidf, feature_names, merge=True)
    
    #if sort:
    #    tfidf_sorted = sort_dict(tfidf_dictionary)
    
    #return tfidf, tfidf_dictionary, tfidf_sorted
    return tfidf, sort_dict(merge_dicts(*[v for k,v in tfidf.iteritems()]))

### All Tweets

First, all the tweets will be used to calculate the frequency distribution and TF-IDF (not including retweets). 

1. Calculating the frequency distribution.
2. Calculating the TF-DF.

In [None]:
query = {'retweeted': False}
get_string = lambda x: x['text']
# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string)

# Find the frequency distribution
tf, tf_sorted = get_term_frequency(bag_words)
print "Total number of unique words is %s\n" % len(unique_words)

print tf_sorted[:10]
# display_table(tf_sorted, title="Most frequent words from Tweets", headers=['Words','Frequency'])

Many of the frequent words contain the words we used in our queries when getting Twitter data. Let's take a look at what are the most popular words when we ignore these words.

In [None]:
query_stopwords = [u'financial', u'mexico', u'narrates', u'new', u'final', u'market', u'trump ', \
                   u'riot', u'cabinet', u'hillary', u'president', u'america', u'day', u'thoughts', \
                   u'stock', u'clinton ', u'positions', u'weed', u'planet', u'electionnight', u'clinton', \
                   u'obama', u'elections2016 ', u'elect', u'trump', u'canadian', u'donald', u'election', \
                   u'earth', u'still','night']

In [None]:
# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string, query_stopwords)

# Find the frequency distribution
_, tf_sorted = get_term_frequency(bag_words)
print tf_sorted[:5]

Next we will calculate the TF-IDF in three ways:

1. Smooth IDF and TF 
> ( 1 + log(size/occurences) ) * tf
2. Smooth IDF and normalized TF
> ( 1 + log(size/occurences) ) * (1 + log(tf))
3. Smooth IDF and normalized TF with a minimum TF of 10 
> ( 1 + log(size/occurences) ) * (1 + log(tf)) if tf >= 10

In [None]:
tfidf, tfidf_sorted = get_tfidf(bag_words, tf, normalize=False)
print tfidf_sorted[:5]

_, tfidf_sorted = get_tfidf(bag_words, tf)
print tfidf_sorted[:5]

_, tfidf_sorted = get_tfidf(bag_words, tf, min_frequency=10)
print tfidf_sorted[:5]


### Places

Next, tweets that have a location of origin will be used and retweets will not be included.

In [358]:
query = [
    {
        '$match': {'retweeted': False, 'place': {'$ne': None}}
    },
    {
        "$group": {
            "_id": "$place.country",
            'text': {'$push': '$text'}
        }
    },
        {
        "$project": {
            "location": "$_id.location",
            'all_text': '$text'
        }
    }
]

get_string = lambda x: ' '.join(x['all_text']) if len(x['all_text']) > 1000 else None

# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string, query_stopwords)

# Find the frequency distribution
tf, tf_sorted = get_term_frequency(bag_words)
print "Total number of unique words is %s\n" % len(unique_words)

display_table(tf_sorted, title="Most frequent words from Tweets", limit=10, headers=['Words','Frequency'])

Number of items retrieved is [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Total number of unique words is 0

Most frequent words from Tweets (limited to 10 results)


IndexError: list index out of range

In [328]:
tfidf, tfidf_sorted = get_tfidf(bag_words, tf, normalize=True, min_frequency=2)

for place, values in tfidf.iteritems():
    print place
    print sort_dict(values)[:3]
    print
# display_table(tfidf_sorted, title="Highest TF-IDF words from Tweets", limit=15, headers=['Words','TF-IDF'])

# tfidf_sorted.reverse()
# display_table(tfidf_sorted, title="Lowest TF-IDF words from Tweets", limit=15, headers=['Words','Frequency'])

Size of corpus (number of documents) 12
Canada
[(u'dinah', 1.353897758656518), (u'groping', 1.353897758656518), (u'sanctuary', 1.353897758656518)]

United Kingdom
[(u'weekly', 1.353897758656518), (u'brazilian', 1.353897758656518), (u'brexiteers', 1.353897758656518)]

Australia
[(u'barking', 1.6454294022461016), (u'aap', 1.353897758656518), (u'presidentelect', 1.2483122772269863)]

South Africa
[(u'malema', 1.6454294022461016), (u'scoop', 1.353897758656518), (u'wud', 1.353897758656518)]

United Arab Emirates
[(u'modiji', 1.353897758656518), (u'surgical', 1.353897758656518), (u'evasion', 1.353897758656518)]

Mexico
[(u'puto', 1.6454294022461016), (u'jihadis', 1.353897758656518), (u'vive', 1.353897758656518)]

India
[(u'hindus', 1.6454294022461016), (u'jai', 1.6454294022461016), (u'din', 1.6454294022461016)]

France
[(u'vive', 1.353897758656518), (u'brexiteers', 1.353897758656518), (u'demonstrators', 1.353897758656518)]

United States
[(u'tantrums', 1.6454294022461016), (u'damns', 1.64542

### Queries

In [None]:
query = [{'$match': {'retweeted': False}}, 
         {"$group": { "_id": "$root_query", 'text': {'$push': '$text'}}},
         {"$project": {'all_text': '$text'}}]

get_string = lambda x: ' '.join(x['all_text'])

# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string)

# Find the frequency distribution
tf, tf_sorted = get_term_frequency(bag_words)

print "Total number of unique words is %s\n" % len(unique_words)

display_table(tf_sorted, title="Most frequent words from Tweets", limit=10, headers=['Words','Frequency'])

# Sentiment Analysis

In [20]:
def get_sentiment(tokens):
    
    combined_happiness = 0
    words_with_no_sentiment = set()
    words_sentiment_count = 0
    
    # Go through each token and if we have a sentiment for it, add it to the combined happiness score
    for token in tokens:
        # get the happiness value, otherwise return zero
        happiest_value = words_happiness.get(token,0)
        
        # if a happiness value exists keep track 
        if happiest_value != 0:
            combined_happiness += happiest_value
            words_sentiment_count += 1
        # save the words that have no happiness index
        else:
            words_with_no_sentiment.add(token)
    
    # Safe check to avoid division by 0
    if combined_happiness == 0: 
        avg_sentiment_score = 0 
    else: 
        avg_sentiment_score = combined_happiness / words_sentiment_count
    
    return avg_sentiment_score, words_with_no_sentiment

In [21]:
tweet_sentiment = {}
tweets_with_no_sentiment = []
all_words_with_no_sentiment = set()

for tweet in tweet_iterable(text_only=False):
    text = tweet['text']
    _id = tweet['_id']
    
    tokens = tweet_tokenizer(text)
    
    avg_sentiment_score, words_with_no_sentiment = get_sentiment(tokens)
    
    # update set of words with no sentiment
    if len(words_with_no_sentiment) > 0:
        all_words_with_no_sentiment = all_words_with_no_sentiment.union(words_with_no_sentiment)
    
    # If we didn't find any sentiment for the tweet, save the tweet id
    if avg_sentiment_score == 0:
        tweets_with_no_sentiment.append(_id)
    # otherwise save the sentiment to the dictionary
    else:
        tweet_sentiment[_id] = avg_sentiment_score

In [22]:
print len(tweet_sentiment)
cur = tweet_collection.find({'retweeted': False})
print cur.count()
print len(tweets_with_no_sentiment)
print len(all_words_with_no_sentiment)

71265
71370
105
26128


In [25]:
values = tweet_sentiment.values()

average_sentiment = np.average(values)
std_sentiment = np.std(values)
happiest_value = max(values)
saddest_value = min(values)

tweet_percent = (len(tweet_sentiment) / (len(tweets_with_no_sentiment) + len(tweet_sentiment))) * 100
word_percent = (1 - len(all_words_with_no_sentiment) / len(unique_words)) * 100

happiest_tweet_id = tweet_sentiment.keys()[values.index(happiest_value)]
saddest_tweet_id = tweet_sentiment.keys()[values.index(saddest_value)]

happy_tweet = tweet_collection.find_one({'_id': happiest_tweet_id})['text']
sad_tweet = tweet_collection.find_one({'_id': saddest_tweet_id})['text']

print "The average sentiment of tweet sentiment is   %2.2f" % average_sentiment
print "The stanard deviation of tweet sentiment is   %2.2f" % std_sentiment
print ''
print "Percentage of tweets that have a sentiment    %2.2f%%" % tweet_percent
print "Percentage of words that have a sentiment     %2.2f%%" % word_percent
print ''
print "The happiest tweet is                         %2.2f" % happiest_value
print "Tweet text: %s" % happy_tweet
print ''
print "The saddest tweet is                          %2.2f" % saddest_value
print "Tweet text: %s" % sad_tweet

The average sentiment of tweet sentiment is   5.57
The stanard deviation of tweet sentiment is   0.47

Percentage of tweets that have a sentiment    99.85%
Percentage of words that have a sentiment     29.33%

The happiest tweet is                         8.18
Tweet text: @IcomOfficiel @UNESCO Excellent.
BUT coinciding with https://t.co/2UoluNULzY

The saddest tweet is                          1.56
Tweet text: @xtremevicky "Harambe died for this?"
#Elections2016 😂 https://t.co/ygtf0tnjl4


In [None]:
for w in tweet_tokenizer(happy_tweet):
    print words_happiness.get(w,0)

In [None]:
headers = ['Tweet ID','Happiness Index']

display_table(sort_dict(tweet_sentiment), title="Happiest Tweets", limit=10, headers=headers)

display_table(sort_dict(tweet_sentiment, reverse=False), title="Unhappiest Tweets", limit=10, headers=headers)