# Sentiment Analysis

In [39]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

#### Load data

In [19]:
df = pd.read_csv("data.txt", sep="\t", header=None, names=["liked", "text"])

In [20]:
df.head()

Unnamed: 0,liked,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


#### Preprocess data

In [26]:
stopset = set(stopwords.words("english"))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents="ascii", stop_words=stopset) # most of these args are just preprocessing the data

In [27]:
y = df.liked
X = vectorizer.fit_transform(df.text)

In [30]:
print(y.shape)
print(X.shape)

(6918,)
(6918, 2011)


#### Train

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [33]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

#### Evaluate model

In [37]:
clf.classes_

array([0, 1])

In [36]:
clf.predict_proba(X_test)

array([[9.99997127e-01, 2.87272653e-06],
       [9.85756577e-01, 1.42434227e-02],
       [2.50196092e-02, 9.74980391e-01],
       ...,
       [8.93510067e-04, 9.99106490e-01],
       [9.59830779e-01, 4.01692213e-02],
       [9.98244314e-01, 1.75568616e-03]])

In [34]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.9992027171740706

#### Use model with new data

In [53]:
review1 = ["This movie was good", "bad"]
review1_vector = vectorizer.transform(review1)
clf.predict(review1_vector)

array([1, 0])