# **Sentiment Analysis**

In [1]:
# Load the data
import pandas as pd

# Location of the reviews CSV (fetched over HTTP by pandas).
url = 'https://raw.githubusercontent.com/jeffprosise/Machine-Learning/master/Data/reviews.csv'

# Decode the file as ISO-8859-1 rather than pandas' default encoding.
df = pd.read_csv(filepath_or_buffer=url, encoding='ISO-8859-1')

In [2]:
# Defines a function for cleaning text by removing punctuation characters, converting multiple
# spaces to single spaces, and converting characters to lowercase
import string, re

# Translation table that deletes every ASCII punctuation character
# (each character in string.punctuation is mapped to None).
table = str.maketrans(dict.fromkeys(string.punctuation))

def clean_text(text):
    """Return a normalized copy of ``text``.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. convert the result to lowercase;
      3. collapse each run of one or more spaces into a single space.

    Only the space character is collapsed; tabs and newlines are left as-is,
    and leading/trailing spaces are reduced to one space, not stripped.
    """
    without_punctuation = text.translate(table)
    lowered = without_punctuation.lower()
    return re.sub(' +', ' ', lowered)

In [3]:
# Clean the text in the dataset
# Normalize every entry of the Text column by passing it through clean_text;
# the cleaned text replaces the original column.
df['Text'] = df['Text'].apply(clean_text)
df.head()

Unnamed: 0,Text,Sentiment
0,once again mr costner has dragged out a movie ...,0
1,this is an example of why the majority of acti...,0
2,first of all i hate those moronic rappers who ...,0
3,not even the beatles could write songs everyon...,0
4,brass pictures movies is not a fitting word fo...,0


In [4]:
# Vectorize the text
from sklearn.feature_extraction.text import CountVectorizer

# Count-based features over unigrams and bigrams, with a short custom
# stop-word list and min_df=10 to drop rare terms.
# NOTE(review): the vocabulary is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=['i', 'we', 'you', 'the', 'and', 'am', 'are'],
    min_df=10,
)
vectors = vectorizer.fit_transform(df['Text'])

In [5]:
# Split the dataset into a training set and a test set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    df['Sentiment'],
    test_size=0.2,
    random_state=0,
)

In [6]:
# Train a classifier
from sklearn.linear_model import LogisticRegression

# Allow the solver up to 500 iterations, then fit on the training split.
# fit() is left as the last expression so the cell displays the estimator.
model = LogisticRegression(max_iter=500)
model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=500)

In [7]:
# Score the model
# Evaluate the fitted model on the held-out test split and display the score.
test_score = model.score(X_test, y_test)
test_score

0.9084

In [8]:
# Assess accuracy with a confusion matrix
from sklearn.metrics import confusion_matrix

# Predict labels for the test split, then tabulate them against the true
# labels (true labels are passed first, predictions second).
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4531,  471],
       [ 445, 4553]], dtype=int64)

In [9]:
# Score a review
# Clean the review the same way the training text was cleaned, vectorize it
# with the fitted vocabulary, and display the predicted probability in
# column 1 for the single input row.
review = [clean_text('The long lines and poor customer service really turned me off.')]
review_vector = vectorizer.transform(review)
model.predict_proba(review_vector)[0, 1]

0.1647511889712159

In [10]:
# Score another review
# Clean the review the same way the training text was cleaned, vectorize it
# with the fitted vocabulary, and display the predicted probability in
# column 1 for the single input row.
review = [clean_text('One of the most delightful experiences I have ever had!')]
review_vector = vectorizer.transform(review)
model.predict_proba(review_vector)[0, 1]

0.8442029860805697

In [11]:
# Save the model and the vocabulary
import pickle

# Persist the trained classifier and the vectorizer's vocabulary to SEPARATE
# files. Writing both to 'sentiment.pkl' would let the second dump overwrite
# the first, leaving only the vocabulary on disk and losing the model.
# The `with` blocks make sure each file is flushed and closed.
with open('sentiment.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vocabulary.pkl', 'wb') as vocabulary_file:
    pickle.dump(vectorizer.vocabulary_, vocabulary_file)