# Text Analysis

Below is some basic initilization, which involves connecting to our remote MongoDB database.

In [1]:
# Custom helper file
import elections_helper as helper
from elections_helper import display_table

import numpy as np
import re, string, operator, pickle, nltk, pprint, math

# from nltk.stem.snowball import SnowballStemmer
from __future__ import division

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer

### Globally Defined Variables

In [2]:
# Twitter users really like these words but we just don't get it.
other_stopwords = ['like','get']

# retrieve common stop words form nltk
stopwords = set(nltk.corpus.stopwords.words('english') + other_stopwords)

# Retrieved dicationary containing word mapped with its happiness index
words_happiness = pickle.load(open('./data_files/sentiment.pickle','rb'))

### Connect to MongoDB

In [3]:
# Connect to MongoDB
client = helper.setup_mongo_client(properties_file='./properties/db.properties')

tweet_collection, _ = helper.get_collections(client)

### Helper Functions
TODO move these into election helper.

In [4]:
def sort_dict(d, reverse=True):
    return sorted(d.items(), key=operator.itemgetter(1), reverse=reverse)

In [5]:
def merge_dicts(*dict_args):
    '''
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    '''
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

## Understanding our Data

Initially, let's play around with our data a little bit. We're going to practice making simple queries and displaying some of the results.

In [6]:
# Query the database and save result to list
tweets = [tweet for tweet in tweet_collection.find({}, projection={'text': True}).limit(5)]

num = 0
for tweet in tweets:
    num += 1
    print "Tweet #%s: %s" % (num, tweet['text'])

Tweet #1: RT @Ethan_Booker: TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is beautiful…
Tweet #2: TRUMP: look at this Hillary mask. it's hideous! just like her sou--
AIDE: *whispers in ear*
TRUMP: this mask is bea… https://t.co/IpquTCKM9F
Tweet #3: RT @FoxNews: #DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU #Election2016…
Tweet #4: #DonaldTrump's daughter-in-law, Lara Trump, blasted #HillaryClinton in a new interview. https://t.co/apANzmnyQU… https://t.co/eMohElI9aR
Tweet #5: WikiLeaks: DNC And CNN Colluded On Questions For Trump, Cruz https://t.co/VDETfDgyLi


There's a few things we noticed about the data and it's summarized in the list below:
* Excess text for a retweeted message. ex. RT @MassDeception1:
* Links in tweets
* Mentions in tweets
* Useless puncuation
* Lots of information saved in hashtags!

In order to perform any analysis on our data we beed to tokenize it first. Tokenizing is basically converting a string into a list of words. When tokenizing the tweets we can create regular expressions to help handle the potenial issues listed above.

In [7]:
# Create a global variable called all_hashtags, we can 
all_hashtags = []

def tweet_tokenizer(text, extra_stopwords=set([])):
    '''
    
    '''
    # matches mentions, ampersands and twitter links
    remove_pattern = r"(?:@[\w]*|https://t.co/[\w]*|&[\w]*)" 
   
    # matches text found after hashtag
    hashtag_pattern = r"#([^\s]*)"
    
    # pattern to remove puncuation except punctuation x
    punc_pattern = lambda x:  r"[^\w\d" + x + "\s]+"
    
    # match words from hashtag
    hashtag_word_pattern = r'([A-Z][^A-Z]*|[a-z][a-z]*)'
    
    # create a list of all hashtags (not including hash symbol)
    hashtags = re.findall(hashtag_pattern, text)
    
    # add hashtags to global set
    all_hashtags.extend(hashtags)
    
    # remove punctuation from hashtags in case it exists
    hashtags = [re.sub(punc_pattern(''),' ', s) for s in hashtags]
    
    # split hashtag into words
    hashtag_tokens = re.findall(hashtag_word_pattern, ' '.join(hashtags))

    # remove mentions and links from tweet
    text = re.sub(remove_pattern,'', text)
    
    # replaces punctuation with a space, removes hashtags
    text = re.sub(punc_pattern("'#"),' ', text)
    
    # create a combined string of tweet text and words from hashtag
    text = text + ' '.join(hashtag_tokens)
    
    # split text at whitespace
    tokens = text.split()
    
    # try stemming
    # stemmer = SnowballStemmer("english")
    # stemmer.stem(t.lower())
    
    # remove if not in the alphabet and not in stopwords, set to lowercase
    return [t.lower() for t in tokens \
                if t.lower() not in stopwords.union(extra_stopwords) and t.isalpha() and len(t) > 2]

Now testing the tweet_tokenizer function . . .

In [8]:
tweets = [tweet for tweet in tweet_collection.find({'retweeted': False}, projection={'text': True}).limit(5)]

num = 0
for tweet in tweets:
    num += 1
    text = tweet['text']
    tokens = tweet_tokenizer(text)  
    print "Tweet #%s: %s" % (num, tokens)

Tweet #1: [u'trump', u'look', u'hillary', u'mask', u'hideous', u'sou', u'aide', u'whispers', u'ear', u'trump', u'mask', u'bea']
Tweet #2: [u'daughter', u'law', u'lara', u'trump', u'blasted', u'new', u'interview', u'donald', u'trump', u'hillary', u'clinton']
Tweet #3: [u'wikileaks', u'dnc', u'cnn', u'colluded', u'questions', u'trump', u'cruz']
Tweet #4: [u'hillary', u'clinton', u'person', u'believed', u'never', u'hillary', u'wikileaks', u'johnpodesta', u'podesta', u'emails']
Tweet #5: [u'bloomberg', u'poll', u'obama', u'would', u'annihilate', u'trump', u'romney', u'would', u'clobber', u'hillary', u'clinton']


## Calculate Frequency Distribution and TF-IDF

For TF-IDF, we will combine all the tweets from the same location and same query since the document size for each individual tweet is too small.

In [9]:
def tweet_generator(query, get_string=None):
    
    # if query is a list we need to aggregate
    if type(query) is list:
        cur = tweet_collection.aggregate(query)
    else:
        cur = tweet_collection.find(query)

    for document in cur:
        _id = document['_id']
        if get_string:
            result = get_string(document)
            if result:
                yield (result, _id)
        # otherwise yield document
        else:
            yield document    

In [10]:
def get_bag_words(query, get_string=None, extra_stopwords=set([])):

    bag_words = {}
    words = []
    count = 0
    
    for text, _id in tweet_generator(query, get_string):
        
        tokens = tweet_tokenizer(text, extra_stopwords)
        
        bag_words[_id] = tokens
        
        words.extend(tokens)
        
        count += 1
        
    print "Total number of items retrieved is %s" % count

    return bag_words, set(words)

In [11]:
def get_term_frequency(bag_words):
    
    tf = {}
    for _, tokens in bag_words.iteritems():
        for word in tokens:
            # if the key does not exist, set to 1 
            if tf.get(word,None) is None:
                tf[word] = 1
            # add one if the key exists
            else:
                tf[word] += 1
        
    return tf, sort_dict(tf)
    

In [12]:
def get_tfidf(bag_words, tf, normalize=True, min_frequency=1):
    
    idf = {}
    tfidf = {}
    
    for _id, tokens in bag_words.iteritems():
        
        tfidf[_id] = {}
        
        for word in set(tokens):
            
            tfidf[_id][word] = None
            
            # if the key does not exist, set to 1 
            if idf.get(word,0) == 0:
                idf[word] = 1
            # add one if the key exists
            else:
                idf[word] += 1
                
    N = len(bag_words)
    print "Size of corpus (number of documents) %s" % N
    
    for _id, inner in tfidf.iteritems():
        for word in inner.keys():
            frequency = tf[word]
            if frequency <= min_frequency:
                continue
            if normalize:
                frequency = (1 + math.log(frequency,10))

            tfidf[_id][word] = math.log(1 + N / idf[word],10) * frequency
    
    return tfidf, sort_dict(merge_dicts(*[v for k,v in tfidf.iteritems()]))

### All Tweets

First, all the tweets will be used to calculate the frequency distribution and TF-IDF (not including retweets). 

1. Calculating the frequency distribution.
2. Calculating the TF-DF.

In [13]:
query = {'retweeted': False}
get_string = lambda x: x['text']
# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string)

# Find the frequency distribution
tf, tf_sorted = get_term_frequency(bag_words)
print "Total number of unique words is %s\n" % len(unique_words)

print "Most frequent words: "
print tf_sorted[:10]
# display_table(tf_sorted, title="Most frequent words from Tweets", headers=['Words','Frequency'])

Total number of items retrieved is 71370
Total number of unique words is 34078

Most frequent words: 
[(u'trump', 46373), (u'election', 14873), (u'day', 9544), (u'president', 8609), (u'clinton', 7593), (u'donald', 7477), (u'hillary', 5694), (u'america', 5619), (u'night', 4975), (u'vote', 4925)]


Many of the frequent words contain the words we used in our queries when getting Twitter data. Let's take a look at what are the most popular words when we ignore these words.

In [14]:
query_stopwords = [u'financial', u'mexico', u'narrates', u'new', u'final', u'market', u'trump ', \
                   u'riot', u'cabinet', u'hillary', u'president', u'america', u'day', u'thoughts', \
                   u'stock', u'clinton ', u'positions', u'weed', u'planet', u'electionnight', u'clinton', \
                   u'obama', u'elections2016 ', u'elect', u'trump', u'canadian', u'donald', u'election', \
                   u'earth', u'still','night']

In [15]:
# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string, query_stopwords)

# Find the frequency distribution
_, tf_sorted = get_term_frequency(bag_words)
print "Most frequent words: "
print tf_sorted[:5]

Total number of items retrieved is 71370
Most frequent words: 
[(u'vote', 4925), (u'people', 3456), (u'win', 2581), (u'voted', 2540), (u'world', 2092)]


Next we will calculate the TF-IDF in three ways:

1. Smooth IDF and TF 
> ( 1 + log(size/occurences) ) * tf
2. Smooth IDF and normalized TF
> ( 1 + log(size/occurences) ) * (1 + log(tf))
3. Smooth IDF and normalized TF with a minimum TF of 100
> ( 1 + log(size/occurences) ) * (1 + log(tf)) if tf >= 10

In [16]:
print 'Smooth IDF and TF'
tfidf, tfidf_sorted = get_tfidf(bag_words, tf, normalize=False)
print tfidf_sorted[:10]

print '\nSmooth IDF and normalized TF'
_, tfidf_sorted = get_tfidf(bag_words, tf)
print tfidf_sorted[:10]

print '\nMinimum word frequency of 100 TF-IDF:'
_, tfidf_sorted = get_tfidf(bag_words, tf, min_frequency=100)
print tfidf_sorted[:10]

Smooth IDF and TF
Size of corpus (number of documents) 71370
[(u'vote', 6104.075305499893), (u'people', 4685.963054884916), (u'win', 3806.7569528162117), (u'voted', 3797.525176872505), (u'world', 3260.189736881924), (u'wins', 3232.526477994719), (u'one', 3014.9385702047366), (u'white', 2838.4426995117924), (u'going', 2821.594692643993), (u'would', 2811.2446362677156)]

Smooth IDF and normalized TF
Size of corpus (number of documents) 71370
[(u'dubai', 9.298709173761077), (u'blah', 9.090160063745648), (u'counting', 9.020739498696368), (u'bye', 8.949604717595758), (u'gang', 8.881102413833117), (u'shirt', 8.869836422815819), (u'biden', 8.861901883759453), (u'neck', 8.846747184391567), (u'choosing', 8.769699129411945), (u'death', 8.750707003324493)]

Minimum word frequency of 100 TF-IDF:
Size of corpus (number of documents) 71370
[(u'dubai', 9.298709173761077), (u'counting', 9.020739498696368), (u'bye', 8.949604717595758), (u'biden', 8.861901883759453), (u'death', 8.750707003324493), (u'we

From the above we can see that not normalizing the term frequency will cause a few issues. When not normalized the highest TF-IDF values generally tend to be the most frequent values. By normalizing 

### Places

Next, tweets that have a location of origin will be used and retweets will not be included.

In [17]:
query = [
    {
        '$match': {'retweeted': False, 'place': {'$ne': None}}
    },
    {
        "$group": {
            "_id": "$place.country",
            'text': {'$push': '$text'}
        }
    },
        {
        "$project": {
            "location": "$_id.location",
            'all_text': '$text'
        }
    }
]

get_string = lambda x: ' '.join(x['all_text']) if len(x['all_text']) > 1000 else None

# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string, query_stopwords)

# Find the frequency distribution
tf, tf_sorted = get_term_frequency(bag_words)
print "Total number of unique words is %s\n" % len(unique_words)

Total number of items retrieved is 12
Total number of unique words is 25334



In [18]:
tfidf, tfidf_sorted = get_tfidf(bag_words, tf, normalize=True, min_frequency=15)

for place, values in tfidf.iteritems():
    print place
    print sort_dict(values)[:10]
    print
# display_table(tfidf_sorted, title="Highest TF-IDF words from Tweets", limit=15, headers=['Words','TF-IDF'])

# tfidf_sorted.reverse()
# display_table(tfidf_sorted, title="Lowest TF-IDF words from Tweets", limit=15, headers=['Words','Frequency'])

Size of corpus (number of documents) 12
Canada
[(u'dubai', 2.5929178581611345), (u'impacts', 2.198995798670775), (u'vancouver', 1.9958915652655798), (u'cst', 1.9614356285831662), (u'brazil', 1.8677162513475947), (u'toronto', 1.699450340776646), (u'heinous', 1.6507769157817132), (u'sweden', 1.614268076122521), (u'samuel', 1.5763678326709614), (u'popovich', 1.5406137538341171)]

United Kingdom
[(u'zuma', 2.5960414635294673), (u'choosewisely', 2.080967721759438), (u'brazilian', 1.9958915652655798), (u'syllabus', 1.7697669357399435), (u'swedish', 1.623161630509354), (u'presidente', 1.4564449945154023), (u'imstillwithher', 1.4163263090279765), (u'fights', 1.4095318928498548), (u'australia', 1.3902425212688523), (u'department', 1.3544752401323321)]

Australia
[(u'auspol', 3.034688994791726), (u'qand', 2.5632170671285213), (u'turnbull', 2.4552648023662274), (u'qanda', 2.180170061161796), (u'poori', 2.021487376982741), (u'abbott', 1.8626974772987794), (u'heinous', 1.6507769157817132), (u'hai',

### Queries

In [19]:
query = [{'$match': {'retweeted': False}}, 
         {"$group": { "_id": "$root_query", 'text': {'$push': '$text'}}},
         {"$project": {'all_text': '$text'}}]

get_string = lambda x: ' '.join(x['all_text'])

# Find a bag of words representation
bag_words, unique_words = get_bag_words(query, get_string)

# Find the frequency distribution
tf, tf_sorted = get_term_frequency(bag_words)

print "Total number of unique words is %s\n" % len(unique_words)

Total number of items retrieved is 13
Total number of unique words is 33675



In [20]:
tfidf, tfidf_sorted = get_tfidf(bag_words, tf, normalize=True, min_frequency=15)

for query, values in tfidf.iteritems():
    print query
    print sort_dict(values)[:10]
    print

Size of corpus (number of documents) 13
#NewTrumpCabinetPositions
[(u'department', 2.164801273190773), (u'bannon', 2.087645828010348), (u'french', 2.0638396777689993), (u'positions', 2.057262581619161), (u'defence', 2.013542215107921), (u'popovich', 1.9940490050081678), (u'salary', 1.9288821224592476), (u'gregg', 1.928740016689785), (u'article', 1.9185206964602164), (u'blind', 1.9070988971789762)]

stock OR market OR financial OR obama OR weed OR canadian OR mexico
[(u'concern', 2.127592247229762), (u'final', 2.1209754283788196), (u'thoughts', 2.117053452658006), (u'fourth', 2.0666565439729006), (u'difficult', 2.0666565439729006), (u'pak', 2.0320841499224453), (u'japan', 2.013542215107921), (u'kentucky', 1.9805450917472285), (u'however', 1.9805450917472285), (u'brazil', 1.974530975893134)]

#TrumpNarratesPlanetEarth
[(u'narrates', 3.052695597386898), (u'trumpnarratesplanetearth', 2.5563800409399255), (u'hotels', 2.1921597050143062), (u'bannon', 2.087645828010348), (u'sue', 1.9735016076

In [21]:
# check to make sure this is handled properly
tf['imstillwithher']

109

# Sentiment Analysis

In [22]:
def get_sentiment(tokens, min_sentiment_tokens=3):
    
    combined_happiness = 0
    words_with_no_sentiment = set()
    words_sentiment_count = 0
    
    # Go through each token and if we have a sentiment for it, add it to the combined happiness score
    for token in tokens:
        # get the happiness value, otherwise return zero
        happiest_value = words_happiness.get(token,0)
        
        # if a happiness value exists keep track 
        if happiest_value != 0:
            combined_happiness += happiest_value
            words_sentiment_count += 1
        # save the words that have no happiness index
        else:
            words_with_no_sentiment.add(token)
    
    
    
    # Safe check to avoid division by 0
    if combined_happiness == 0: 
        avg_sentiment_score = 0 
    else: 
        avg_sentiment_score = combined_happiness / words_sentiment_count
    
    # if the tweet does not have a minimum of 4 words that have sentiment, return a score of zero
    if words_sentiment_count <= min_sentiment_tokens:
        return 0, words_with_no_sentiment
    
    return avg_sentiment_score, words_with_no_sentiment

In [23]:
tweet_sentiment = {}
tweets_with_no_sentiment = []
all_words_with_no_sentiment = set()

for tweet in tweet_generator({'retweeted': False}):
    text = tweet['text']
    _id = tweet['_id']
    
    tokens = tweet_tokenizer(text)
    
    avg_sentiment_score, words_with_no_sentiment = get_sentiment(tokens)
    
    # update set of words with no sentiment
    if len(words_with_no_sentiment) > 0:
        all_words_with_no_sentiment = all_words_with_no_sentiment.union(words_with_no_sentiment)
    
    # If we didn't find any sentiment for the tweet, save the tweet id
    if avg_sentiment_score == 0:
        tweets_with_no_sentiment.append(_id)
    # otherwise save the sentiment to the dictionary
    else:
        tweet_sentiment[_id] = avg_sentiment_score

In [24]:
values = tweet_sentiment.values()

average_sentiment = np.average(values)
std_sentiment = np.std(values)
happiest_value = max(values)
saddest_value = min(values)

tweet_percent = (len(tweet_sentiment) / (len(tweets_with_no_sentiment) + len(tweet_sentiment))) * 100
word_percent = (1 - len(all_words_with_no_sentiment) / len(unique_words)) * 100

happiest_tweet_id = tweet_sentiment.keys()[values.index(happiest_value)]
saddest_tweet_id = tweet_sentiment.keys()[values.index(saddest_value)]

happy_tweet = tweet_collection.find_one({'_id': happiest_tweet_id})['text']
sad_tweet = tweet_collection.find_one({'_id': saddest_tweet_id})['text']

print "The average sentiment of tweet sentiment is   %2.2f" % average_sentiment
print "The stanard deviation of tweet sentiment is   %2.2f" % std_sentiment
print ''
print "Percentage of tweets that have a sentiment    %2.2f%%" % tweet_percent
print "Percentage of words that have a sentiment     %2.2f%%" % word_percent
print ''
print "The happiest tweet is                         %2.2f" % happiest_value
print "Tweet text: %s" % happy_tweet
print ''
print "The saddest tweet is                          %2.2f" % saddest_value
print "Tweet text: %s" % sad_tweet

The average sentiment of tweet sentiment is   5.57
The stanard deviation of tweet sentiment is   0.45

Percentage of tweets that have a sentiment    86.68%
Percentage of words that have a sentiment     23.01%

The happiest tweet is                         7.53
Tweet text: @realDonaldTrump Congratulations For Great victory All Indian with you #electionday

The saddest tweet is                          2.62
Tweet text: Is this the End of Terrorism ( #Elections2016 ) and Corruption? ( #ModiFightsCorruption )


In [25]:
headers = ['Tweet ID','Happiness Index']

display_table(sort_dict(tweet_sentiment), title="Happiest Tweets", limit=10, headers=headers)

display_table(sort_dict(tweet_sentiment, reverse=False), title="Unhappiest Tweets", limit=10, headers=headers)

Happiest Tweets (limited to 10 results)
╒════════════════════╤═══════════════════╕
│           Tweet ID │   Happiness Index │
╞════════════════════╪═══════════════════╡
│ 796291278740697088 │           7.53    │
├────────────────────┼───────────────────┤
│ 796291597507796992 │           7.53    │
├────────────────────┼───────────────────┤
│ 795703881317552128 │           7.29    │
├────────────────────┼───────────────────┤
│ 796809782812442624 │           7.26    │
├────────────────────┼───────────────────┤
│ 797587597136633856 │           7.23333 │
├────────────────────┼───────────────────┤
│ 796302267833991168 │           7.21    │
├────────────────────┼───────────────────┤
│ 796329231420706816 │           7.19    │
├────────────────────┼───────────────────┤
│ 796380493684084736 │           7.185   │
├────────────────────┼───────────────────┤
│ 796299322971168768 │           7.17556 │
├────────────────────┼───────────────────┤
│ 796268118528114690 │           7.17    │
╘═════════════

In [26]:
# for debugging
print len(tweet_sentiment)
cur = tweet_collection.find({'retweeted': False})
print cur.count()
print len(tweets_with_no_sentiment)
print len(all_words_with_no_sentiment)

61863
71370
9507
25926


## Sentiment of Communities

In [34]:
communities = pickle.load(open('./data_files/community.txt','rb'))

ImportError: No module named __builtin__

## Sentiment of Notable Twitter Users

# Testing