In [1]:
# http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/

In [2]:
import nltk
#from nltk.classify import NaiveBayesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.externals import joblib
from sklearn.base import TransformerMixin
from PostStemmer import PostStemmer
import pandas as pd
import numpy as np
import re, string, collections
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Column layout of the raw CSV (the file has no header row):
#   0 - polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
#   1 - id of the tweet (2087)
#   2 - date of the tweet (Sat May 16 23:58:44 UTC 2009)
#   3 - the query (lyx); NO_QUERY when there is none
#   4 - the user that tweeted (robotickilldozr)
#   5 - the text of the tweet (Lyx is cool)
#
# Only the polarity and the text are kept; the other four columns are
# dropped right after loading.
tweets = pd.read_csv(
    'twitter_sentiment/training_sample.csv',
    names=['polarity', 'id', 'datetime', 'query', 'user', 'text'],
).drop(['id', 'datetime', 'query', 'user'], axis=1)

tweets.head()

Unnamed: 0,polarity,text
0,0,I so hate homeworks -.- My head hurts so bad
1,0,Lots of revision to do tonight too for my fina...
2,0,Caught myself looking up the iphone. Promised ...
3,0,@cherrytreerec I can't see anything Stupid Yo...
4,0,@happyahma - welcome back! Sorry to hear about...


In [4]:
# The corpus contains only negative (polarity 0) and positive (polarity 4)
# tweets, so one boolean column is enough to carry the label:
# True for polarity 4, False otherwise.
tweets['pos'] = tweets['polarity'].isin([4])
tweets.head()

Unnamed: 0,polarity,text,pos
0,0,I so hate homeworks -.- My head hurts so bad,False
1,0,Lots of revision to do tonight too for my fina...,False
2,0,Caught myself looking up the iphone. Promised ...,False
3,0,@cherrytreerec I can't see anything Stupid Yo...,False
4,0,@happyahma - welcome back! Sorry to hear about...,False


In [5]:
# Train/test split: 70% of the tweets for fitting, 30% held out for scoring.
#
# The seed is pinned so the partition is the same on every run. Without it
# the reported metrics change from run to run, and a classifier that was
# fitted in an earlier session and then reloaded from disk could be scored
# on tweets it had already seen during training.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['text'], tweets['pos'], test_size=0.3, random_state=42)

In [6]:
# Switch for this cell:
#   'train' - fit a fresh pipeline on the training split and persist it
#   'load'  - reuse the pipeline persisted by an earlier 'train' run
train_or_load = 'train'

# One definition of where the fitted pipeline lives, shared by save and load
# so the two paths cannot drift apart.
MODEL_PATH = 'twitter_sentiment/classifier.pkl'

if train_or_load == 'train':

    # Raw tweet text flows through four steps:
    #   stem  - project-local PostStemmer transformer
    #   vect  - TfidfVectorizer with the IDF weighting switched off
    #   prune - keep the 800 features with the highest chi-squared score
    #   clf   - multinomial naive Bayes on the remaining features
    classifier = Pipeline([
        ('stem', PostStemmer()),
        ('vect', TfidfVectorizer(use_idf=False)),
        ('prune', SelectKBest(chi2, k=800)),
        ('clf', MultinomialNB()),
    ])

    classifier.fit(X_train, y_train)

    # Save
    joblib.dump(classifier, MODEL_PATH)

elif train_or_load == 'load':
    # Load trained model.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that come from a trusted source.
    classifier = joblib.load(MODEL_PATH)

else:
    # A misspelled mode used to fall through to the load branch and silently
    # reuse a stale model; fail loudly instead.
    raise ValueError(
        "train_or_load must be 'train' or 'load', got %r" % (train_or_load,))

In [7]:
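# NOTE: disabled earlier draft of the train/load step; the Pipeline cell above
# covers the same steps (chi2 feature pruning, MultinomialNB, joblib persistence).
# It references `content`, which is not defined in any cell shown here.
# vectorizer = CountVectorizer()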

# word_counts = vectorizer.fit_transform(content)

# train_or_load = 'train'

# if( train_or_load == 'train' ):
#     # Train model
#     classifier = MultinomialNB()
#     classifier.fit(X_train, y_train)
#     # Prune features
#     ch2 = SelectKBest(chi2, k=800)
#     X_train = ch2.fit_transform(X_train, y_train)
#     X_test = ch2.transform(X_test)
#     best_words = np.array(vectorizer.get_feature_names())[ch2.get_support()]
#     # Save
#     joblib.dump(classifier, 'twitter_sentiment/classifier.pkl')
#     joblib.dump(best_words, 'twitter_sentiment/best_words.pkl')
    
# # Load trained model    
# else:
#     classifier = joblib.load('twitter_sentiment/classifier.pkl')
#     best_words = joblib.load('twitter_sentiment/best_words.pkl')

In [8]:
# Score the fitted pipeline on the held-out tweets
pred = classifier.predict(X_test)

# Print one line per metric, each computed from the true labels and the
# predictions. A single parenthesised expression is valid both as a Python 2
# print statement and as a Python 3 print() call, and the padded labels keep
# the values aligned in a column.
for label, metric in (('accuracy:  ', accuracy_score),
                      ('precision: ', precision_score),
                      ('recall:    ', recall_score)):
    print('%s %s' % (label, metric(y_test, pred)))

accuracy:   0.688284518828
precision:  0.751724137931
recall:     0.490990990991


In [None]:
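# Tune number of word features
# (disabled: relies on `vectorizer` and `word_counts`, which are only defined
# in commented-out code earlier in this notebook)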
# chi2scores = chi2(word_counts, tweets['pos'])[0]
# best_words = np.array(vectorizer.get_feature_names())[np.argsort(-1*chi2scores)[:600]]
# print best_words[:30]
# word_scores = zip(vectorizer.get_feature_names(), chi2scores)
# best_word_scores = sorted(word_scores, key=lambda (word,chi2sc): chi2sc, reverse=True)
# plt.hist(chi2scores, bins=40)
# plt.vlines(1.5, 0, 2500)
# print best_word_scores[575]