In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('sentiment_analysis_models/input/dataset.csv')

In [3]:
data

Unnamed: 0,id,text,sentiment
0,126301956951117826,@Twitter CEO points to @Apple as 'corporate me...,positive
1,126107965991297024,Not Bad! @Apple Sells Over 4 Million #IPhones ...,positive
2,126008369562652672,Kind of excited. On my way to my last class ri...,positive
3,126183339945234432,@Apple downloads of iOS 5 are proving popular ...,positive
4,125922999651139584,One word - #wow. RT @jldavid iPhone 4S First W...,positive
5,126352268705538048,Come to the dark side 📱“@gretcheneclark: Hey ...,positive
6,126287654093471745,At the bus with my iPhone ;) thxx @apple,positive
7,126256230397259776,@apple @jilive @DanielPink: Apple sells 4 mill...,positive
8,126360935509135362,RT @PhillipRowntree: Just registered as an @ap...,positive
9,126180209501286400,Lmfao look at the argument I had with Siri !!\...,positive


In [4]:
tweets = data['text'].tolist()
sentiments = data['sentiment'].tolist()

# Cleaning the data

In [5]:
from message_tokenize import tokenize_and_stem

# TFIDF matrix

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer



In [7]:
count_vect = CountVectorizer(stop_words='english', tokenizer=tokenize_and_stem, 
                             max_df=0.9, min_df=0.01, ngram_range=(1,3))
X_train_counts = count_vect.fit_transform(tweets)

In [8]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [9]:
len(count_vect.get_feature_names())

95

# Classifying

In [10]:
clf = MultinomialNB().fit(X_train_tfidf, sentiments)

In [11]:
clf.score(X_train_counts, sentiments)

0.73503401360544218

# Persist Model

In [12]:
from sklearn.externals import joblib

In [13]:
joblib.dump(count_vect, 'sentiment_analysis_models/count_vect.pkl')
joblib.dump(tfidf_transformer, 'sentiment_analysis_models/tfidf_transformer.pkl')
joblib.dump(clf, 'sentiment_analysis_models/clf.pkl')

['sentiment_analysis_models/clf.pkl']

# Testing the prediction

In [14]:
from predict_message_sentiment import predict_sentiment

In [15]:
predict_sentiment('I love this')

'positive'

In [16]:
predict_sentiment('I hate this')

'negative'

In [17]:
predict_sentiment('Just some common text')

'neutral'