In [12]:
# Import the necessary libraries
import urllib.request
import os
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [13]:
# Reading the .csv file from Pandas and examining the first 3 rows
# The shape of our data is (50000,2), we have 50K rows with 2 columns of review and sentiment. We are going to define a helper function which will help us in various steps of cleaning the text data such as stopwords removal, lowering the case, etc.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [14]:
# Cleaning the text data
# init Objects
# In the below function, we are lowering the cases of sentences, breaking them into tokens, preserving the tokens if they are not part of predefined stopwords and finally stemming them for the root form.
# Once all the below gets completed we are returning the clean text in the end.
tokenizer=RegexpTokenizer(r'\w+')
en_stopwords=set(stopwords.words('english'))
ps=PorterStemmer()
def getStemmedReview(review):
    review=review.lower()
    review=review.replace("<br /><br />"," ")
    #Tokenize
    tokens=tokenizer.tokenize(review)
    new_tokens=[token for token in tokens if token not in  en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in new_tokens]
    clean_review=' '.join(stemmed_tokens)
    return clean_review

In [15]:
# Cleaning all the reviews and splitting our data for training and testing.
# As our total length of data is 50K rows, we are splitting it into 35K rows for training and 15K for testing purposes.
df['review'].apply(getStemmedReview)
X_train = df.loc[:35000, 'review'].values
y_train = df.loc[:35000, 'sentiment'].values
X_test = df.loc[35000:, 'review'].values
y_test = df.loc[35000:, 'sentiment'].values

In [16]:
# Transforming words into feature vectors
# To feed the data to the Machine Learning model, we have to convert categorical data, such as text or words, into a numerical form.
# We are going to use TfidfVectorizer for this purpose which is already present in the scikit-learn library
# we perform the fit operation only on the training set and once the vectorizer learns completely from the training data, we use the same learning to transform our test data.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8',decode_error='ignore')
vectorizer.fit(X_train)
X_train=vectorizer.transform(X_train)
X_test=vectorizer.transform(X_test)

In [10]:
# Creating the model and checking the score on training and test data
# Here we are using the LogisticRegression model because it’s easy to interpret in terms of probability of the output. Feel free to explore other models as per your preference.
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(solver='liblinear')
model.fit(X_train,y_train)
print("Score on training data is: "+str(model.score(X_train,y_train)))
print("Score on testing data is: "+str(model.score(X_test,y_test)))

Score on training data is: 0.935973257906917
Score on testing data is: 0.8976666666666666


In [19]:
# Let’s verify our model’s output on a single review
# Below is our test point df.iloc[35000,0], which is same as our first point in testing data i.e. X_test[0]
#If you haven't seen the gong show TV series then you won't like this movie much at all, not that knowing the series makes this a great movie. <br /><br />I give it a 5 out of 10 because a few things make it kind of amusing that help make up for its obvious problems.<br />
#When we perform prediction on above point we can see below that our model is performing well enough.
# Here 0 denotes a negative sentiment
model.predict(X_test[0])
# 78% probability that the given text is negative
model.predict_proba(X_test[0])

array([[0.78833439, 0.21166561]])

In [20]:
# Model deployment
# Serializing fitted scikit-learn estimators
# Because we don’t want to retrain our model every time the web-application loads we go for the option of model persistence through Python’s in-built pickle module
import joblib
joblib.dump(en_stopwords,'stopwords.pkl') 
joblib.dump(model,'model.pkl')
joblib.dump(vectorizer,'vectorizer.pkl')

['vectorizer.pkl']