In [152]:
# http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/

In [167]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import pandas as pd
import numpy as np
import re
import string
import nltk
import collections

In [154]:
tweets = pd.read_csv('twitter_sentiment/training_sample.csv', names=['polarity', 'id', 'datetime', 'query', 'user', 'text'])
tweets.drop(['id','datetime','query','user'], axis=1, inplace=True)

# 0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
# 1 - the id of the tweet (2087)
# 2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
# 3 - the query (lyx). If there is no query, then this value is NO_QUERY.
# 4 - the user that tweeted (robotickilldozr)
# 5 - the text of the tweet (Lyx is cool)

tweets.head()

Unnamed: 0,polarity,text
0,0,I so hate homeworks -.- My head hurts so bad
1,0,Lots of revision to do tonight too for my fina...
2,0,Caught myself looking up the iphone. Promised ...
3,0,@cherrytreerec I can't see anything Stupid Yo...
4,0,@happyahma - welcome back! Sorry to hear about...


In [155]:
is_word = re.compile('[A-Za-z]')
def is_punc(word): #returns true if word is all punctuation or numbers
    return not is_word.search(word)

punc_re = re.compile('[%s]$' % re.escape(string.punctuation))
def trim_punc(word):
    return punc_re.sub('', word)

def tweet_words(tweet):
    words = tweet.split()
    clean_words = []
    for word in words:
        # remove usernames
        if word[0] == '@':
            continue
        # remove hashtags bc mostly not used on reddit
        if word[0] == '#':
            continue
        # remove all-punctuation words e.g. smileys, dashes
        elif is_punc(word):
            continue
        # keep clean words
        else:
            word = trim_punc(word)
            clean_words.append(word)
    return clean_words

tweets['words'] = tweets['text'].apply(tweet_words)
tweets.head()

Unnamed: 0,polarity,text,words
0,0,I so hate homeworks -.- My head hurts so bad,"[I, so, hate, homeworks, My, head, hurts, so, ..."
1,0,Lots of revision to do tonight too for my fina...,"[Lots, of, revision, to, do, tonight, too, for..."
2,0,Caught myself looking up the iphone. Promised ...,"[Caught, myself, looking, up, the, iphone, Pro..."
3,0,@cherrytreerec I can't see anything Stupid Yo...,"[I, can't, see, anything, Stupid, YouTube, and..."
4,0,@happyahma - welcome back! Sorry to hear about...,"[welcome, back, Sorry, to, hear, about, the, a..."


In [156]:
# To do: Consider stemming

In [157]:
# Corpus only has positive and negative tweets (polarity=={0,4})
tweets['pos'] = tweets['polarity']==4
tweets.head()

Unnamed: 0,polarity,text,words,pos
0,0,I so hate homeworks -.- My head hurts so bad,"[I, so, hate, homeworks, My, head, hurts, so, ...",False
1,0,Lots of revision to do tonight too for my fina...,"[Lots, of, revision, to, do, tonight, too, for...",False
2,0,Caught myself looking up the iphone. Promised ...,"[Caught, myself, looking, up, the, iphone, Pro...",False
3,0,@cherrytreerec I can't see anything Stupid Yo...,"[I, can't, see, anything, Stupid, YouTube, and...",False
4,0,@happyahma - welcome back! Sorry to hear about...,"[welcome, back, Sorry, to, hear, about, the, a...",False


In [158]:
# Translates list of words to dict of {word:True} for NLTK NaiveBayesClassifier
def word_feats(words):
        return dict([(word, True) for word in words])

In [223]:
def evaluate_model():
    # Feature extraction
    posfeats = [(word_feats(split_tweet), True) for split_tweet in tweets[ tweets['pos']==True ]['words']]
    negfeats = [(word_feats(split_tweet), False) for split_tweet in tweets[ tweets['pos']==False ]['words']]

    # Test-Train Split (to do: k-fold cross-val)
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
    
    # Train model
    classifier = NaiveBayesClassifier.train(trainfeats)
    
    # Test model
    pred_actual = [(classifier.classify(testfeat), label) for (testfeat, label) in testfeats]

    # Calculate scores
    tp = np.sum([pred==True and actual==True for (pred, actual) in pred_actual])
    tn = np.sum([pred==False and actual==False for (pred, actual) in pred_actual])
    fp = np.sum([pred==True and actual==False for (pred, actual) in pred_actual])
    fn = np.sum([pred==False and actual==True for (pred, actual) in pred_actual])

    accuracy = (tp+tn)*1.0 / len(testfeats)
    precision = tp*1.0 / (tp+fp)
    recall = tp*1.0 / (tp+fn)
    
    # Print scores
    print 'accuracy:  ', accuracy
    print 'precision: ', precision
    print 'recall:    ', recall
    classifier.show_most_informative_features()

train on 1194 instances, test on 399 instances
accuracy:   0.641604010025
precision:  0.678832116788
recall:     0.484375
Most Informative Features
                   sorry = True            False : True   =      9.0 : 1.0
                    miss = True            False : True   =      7.6 : 1.0
                   won't = True            False : True   =      7.1 : 1.0
                  didn't = True            False : True   =      7.1 : 1.0
                    wish = True            False : True   =      6.5 : 1.0
                    down = True            False : True   =      5.9 : 1.0
                    hate = True            False : True   =      5.8 : 1.0
                    than = True             True : False  =      5.4 : 1.0
                 Twitter = True             True : False  =      5.4 : 1.0
                    left = True            False : True   =      5.3 : 1.0


```
Results with sample:

Most Informative Features
                   sorry = True              neg : pos    =      9.0 : 1.0
                    miss = True              neg : pos    =      7.6 : 1.0
                   won't = True              neg : pos    =      7.1 : 1.0
                  didn't = True              neg : pos    =      7.1 : 1.0
                    wish = True              neg : pos    =      6.5 : 1.0
                    down = True              neg : pos    =      5.9 : 1.0
                    hate = True              neg : pos    =      5.8 : 1.0
                 Twitter = True              pos : neg    =      5.4 : 1.0
                    than = True              pos : neg    =      5.4 : 1.0
                    left = True              neg : pos    =      5.3 : 1.0

accuracy:  0.6365914786967418
precision: 0.671532846715
recall:    0.479166666667
F-measure: 0.559270516717
```

```
Results with full set:
    
    Most Informative Features
            #squarespace = True              neg : pos    =    211.0 : 1.0
                     447 = True              neg : pos    =     97.0 : 1.0
                    Poem = True              pos : neg    =     80.3 : 1.0
                saddened = True              neg : pos    =     79.7 : 1.0
             condolences = True              neg : pos    =     53.8 : 1.0
                    sadd = True              neg : pos    =     46.3 : 1.0
                     228 = True              neg : pos    =     45.7 : 1.0
                   Boooo = True              neg : pos    =     40.2 : 1.0
                  (tears = True              neg : pos    =     39.0 : 1.0
                     SAD = True              neg : pos    =     38.2 : 1.0
                   
accuracy:    0.748685
precision:   0.847306016424
recall:      0.606705
F-measure: 0.707098902123
```