In [1]:
%load_ext autoreload
%autoreload 2

# A simple sentiment prototype

In [2]:
import os  # manipulate paths
import pandas as pd  # SQL-like operations and convenience functions
import joblib  # save and load models

Download the Sentiment140 data from [their website](http://help.sentiment140.com/for-students) or directly from the [Stanford site](http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip), and set `DATA_DIR` to the directory in which you have put the `CSV` files.

In [3]:
# Directory that holds the extracted Sentiment140 CSV files; edit to match your setup.
DATA_DIR = "./../data"
# Path to the training CSV inside DATA_DIR.
training_csv_file = os.path.join(
    DATA_DIR,
    'training.1600000.processed.noemoticon.csv',
)

In [4]:
training_csv_file  # echo the resolved path to confirm where the CSV is expected

'./../data/training.1600000.processed.noemoticon.csv'

## A peek at the data

In [5]:
# Column labels are supplied explicitly via `names=` rather than read from the file.
names = ('polarity', 'id', 'date', 'query', 'author', 'text')
df = pd.read_csv(
    training_csv_file,
    names=names,
    encoding='latin1',
)

In [6]:
pd.set_option('display.max_colwidth', 140)  # widen columns so tweet text is less truncated
df.head(5)  # preview the first five rows

Unnamed: 0,polarity,id,date,query,author,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [7]:
df.tail(5)  # preview the last five rows

Unnamed: 0,polarity,id,date,query,author,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity @SpeakingUpH4H


In [8]:
# Recode the raw polarity labels: 0 becomes -1 and 4 becomes +1.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['polarity']`: that chained in-place form
# operates on a temporary Series under pandas Copy-on-Write and would leave
# `df` (and therefore `target`) with the original 0/4 labels.
df['polarity'] = df['polarity'].replace({0: -1, 4: 1})
text = df['text']  # tweet bodies, used as model input
target = df['polarity'].values  # recoded labels as a NumPy array

In [9]:
# Sanity check: labels and texts must have the same number of entries.
print(len(target), len(text))

(1600000, 1600000)


## Train the model

Set 20% of the data aside to test the trained model

In [10]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for validation; the fixed seed keeps the split
# reproducible across runs.
text_train, text_validation, target_train, target_validation = (
    train_test_split(text, target, test_size=0.2, random_state=42)
)



Build a pipeline

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline

# Step 1: count unigrams and bigrams, with the vocabulary limited to 100000 entries.
vectorizer = CountVectorizer(
    max_features=100000,
    ngram_range=(1, 2),
)
# Step 2: keep the 5000 features ranked best by the chi-squared score function.
feature_selector = SelectKBest(score_func=chi2, k=5000)
# Step 3: logistic regression with built-in cross-validation, run with n_jobs=4.
classifier = LogisticRegressionCV(n_jobs=4)

This next cell took ~3 minutes to run on my machine

In [12]:
# The fitted pipeline is cached on disk so the fit only runs when no cache exists.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load a
# 'model.pkl' that you produced yourself. Delete the file to force a retrain
# after changing the vectorizer, feature selector or classifier settings,
# otherwise the stale cached model is silently reused.
MODEL_PATH = 'model.pkl'

if os.path.exists(MODEL_PATH):
    sentiment_pipeline = joblib.load(MODEL_PATH)
else:
    # `Pipeline` documents `steps` as a *list* of (name, estimator) tuples.
    sentiment_pipeline = Pipeline([
        ('v', vectorizer),
        ('f', feature_selector),
        ('c', classifier),
    ])
    sentiment_pipeline.fit(text_train, target_train)
    joblib.dump(sentiment_pipeline, MODEL_PATH)

## Test the model

In [13]:
# Smoke-test the fitted pipeline on a few hand-written phrases.
sample_phrases = ['bad', 'good', "didnt like", "today was a good day", "i hate this product"]
print(sentiment_pipeline.predict(sample_phrases))

[-1  1 -1  1 -1]


In [14]:
# Compare predictions with the true labels for the first ten validation tweets.
# The loop variables deliberately avoid the names `text` and `target`: reusing
# them would overwrite the notebook-level Series/array of the same names and
# break any earlier cell that is re-run afterwards.
preview_texts = text_validation[:10]
preview_targets = target_validation[:10]
# Predict once for the whole slice instead of once per tweet.
preview_predictions = sentiment_pipeline.predict(preview_texts)
for predicted, actual, tweet in zip(preview_predictions, preview_targets, preview_texts):
    print(predicted, actual, '\t', tweet)

(1, -1, '\t', u'@chrishasboobs AHHH I HOPE YOUR OK!!! ')
(1, -1, '\t', u'@misstoriblack cool , i have no tweet apps  for my razr 2')
(1, -1, '\t', u'@TiannaChaos i know  just family drama. its lame.hey next time u hang out with kim n u guys like have a sleepover or whatever, ill call u')
(-1, -1, '\t', u"School email won't open  and I have geography stuff on there to revise! *Stupid School* :'(")
(1, -1, '\t', u'upper airways problem ')
(-1, -1, '\t', u"Going to miss Pastor's sermon on Faith... ")
(1, 1, '\t', u'on lunch....dj should come eat with me ')
(-1, -1, '\t', u'@piginthepoke oh why are you feeling like that? ')
(-1, -1, '\t', u'gahh noo!peyton needs to live!this is horrible ')
(1, 1, '\t', u'@mrstessyman thank you glad you like it! There is a product review bit on the site  Enjoy knitting it!')


In [15]:
# Score the fitted pipeline on the held-out validation split.
sentiment_pipeline.score(text_validation, target_validation)

0.79943750000000002

## What did the model learn?

In [16]:
# Recover the names of the features that survived selection, in the order the
# classifier sees them.
fitted_vectorizer = sentiment_pipeline.steps[0][1]
fitted_selector = sentiment_pipeline.steps[1][1]

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; prefer the new accessor and fall back to the old
# one so the cell runs on either side of that change.
if hasattr(fitted_vectorizer, 'get_feature_names_out'):
    vocabulary = fitted_vectorizer.get_feature_names_out()
else:
    vocabulary = fitted_vectorizer.get_feature_names()

# `get_support(indices=True)` yields the vocabulary positions kept by the selector.
feature_names = [vocabulary[i] for i in fitted_selector.get_support(indices=True)]

def show_most_informative_features(feature_names, clf, n=1000):
    """Print the lowest- and highest-weighted features side by side.

    Pairs each entry of ``clf.coef_[0]`` with the feature name at the same
    position, sorts the pairs in ascending order, and prints ``n`` rows: the
    left columns walk up from the smallest coefficient, the right columns
    walk down from the largest.

    Parameters
    ----------
    feature_names : sequence
        Names aligned position-by-position with ``clf.coef_[0]``.
    clf : object
        Any object exposing ``coef_`` whose first row holds the coefficients.
    n : int, optional
        Number of rows to print (default 1000).
    """
    ranked = list(zip(clf.coef_[0], feature_names))
    ranked.sort()
    lowest = ranked[:n]
    highest = ranked[::-1][:n]
    for low_pair, high_pair in zip(lowest, highest):
        # Each pair is (coefficient, name); concatenating gives the 4 format args.
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (low_pair + high_pair))

In [17]:
# Pass the pipeline's third step as the object whose coefficients are listed.
fitted_classifier = sentiment_pipeline.steps[2][1]
show_most_informative_features(feature_names, fitted_classifier, n=500)

	-3.4605	not happy      		2.6788	no problem     
	-3.2820	clean me       		2.6236	no worries     
	-2.9184	not looking    		2.5599	cannot wait    
	-2.9167	inaperfectworld		2.3206	cant wait      
	-2.9130	sad            		2.2872	no prob        
	-2.8566	passed away    		2.2462	smiling        
	-2.6917	sadly          		2.1924	nothing wrong  
	-2.6387	not nice       		2.1125	not bad        
	-2.6384	gutted         		2.0138	sad sad        
	-2.6232	not cool       		1.9194	congratulations
	-2.6003	no luck        		1.7807	fuzzball       
	-2.5969	disappointing  		1.7781	no probs       
	-2.4976	heartbreaking  		1.7736	welcome        
	-2.4807	sadd           		1.7362	musicmonday    
	-2.4713	heartbroken    		1.7098	hate hate      
	-2.4325	boohoo         		1.6675	smile          
	-2.4235	rip            		1.6655	yayyy          
	-2.4127	not fun        		1.6282	woooo          
	-2.4052	poor           		1.6259	thankyou       
	-2.4027	dontyouhate    		1.5978	just sayin     
	-2.3918	bummer     