In [None]:
#Importing required libraries
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np #provides a multidimensional array
import pandas as pd #open source data analysis library
import seaborn as sns
from IPython.display import display
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer # To convert a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder #to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.


In [None]:
#Step 2: Load the Dataset
# The spreadsheet is read from a path relative to the current working directory.
df = pd.read_excel(io="imdb_dataset copy.xlsx")
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no visible cell reads from /content/drive (the dataset is loaded
# from a relative path) — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Step 3: Performing EDA
# A notebook cell only renders its LAST bare expression, so every intermediate
# summary is passed to display() explicitly; otherwise it is computed and
# silently thrown away.
display(df.shape)                  # (rows, columns)
display(df.isnull().sum())         # missing values per column
display(df.describe())             # summary statistics

df.info()                          # prints dtypes / non-null counts itself

display(df["sentiment"].unique())  # distinct class labels

# Class distribution. In the recorded run this is 285 negative vs 255 positive,
# i.e. roughly (not perfectly) balanced.
df["sentiment"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     540 non-null    object
 1   sentiment  540 non-null    object
dtypes: object(2)
memory usage: 8.6+ KB


negative    285
positive    255
Name: sentiment, dtype: int64

In [None]:
#Step 3: Applying LabelEncoding
# Turn the textual target into integer codes (Negative: 0, Positive: 1).
label = LabelEncoder()
label.fit(df["sentiment"])
# Overwrite the column with its encoded form; `label` is kept so the codes can
# be mapped back to the original class names later if needed.
df["sentiment"] = label.transform(df["sentiment"])

In [None]:
#Step 4: Dividing data into ind and dep, removing special char, stemming
x = df["review"]      # independent variable: raw review text
y = df["sentiment"]   # dependent variable: encoded sentiment label

# The stop-word list must be present locally before stopwords.words() is called.
nltk.download("stopwords")

p = PorterStemmer()
# Build the stop-word set ONCE. Rebuilding it for every token (as a call inside
# the comprehension would) re-reads the word list each time for no benefit.
stop_words = set(stopwords.words("english"))


def clean_review(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words, and Porter-stem the rest.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review, tokens joined by single spaces.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(p.stem(word) for word in tokens if word not in stop_words)


# Iterate over the values directly instead of x[i]: label-based lookup by
# position only works while the index happens to be 0..n-1.
corpus = [clean_review(text) for text in x]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Step 5: Vectorize the Text Data
# TF-IDF features over the cleaned corpus, limited to 5000 terms.
# (Despite the name `cv`, this is a TfidfVectorizer, not a CountVectorizer.)
cv = TfidfVectorizer(max_features=5000)

# Fit once and densify. A second `Y = cv.fit_transform(corpus).toarray()` used
# to follow here: it refitted the vectorizer and built another dense copy of the
# same matrix that nothing used, under a name easily confused with the label
# vector `y`. It has been removed.
#
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split performed in the next step, so its vocabulary and IDF
# weights include the test documents. Consider fitting on the training split
# only.
x = cv.fit_transform(corpus).toarray()

In [None]:
#Step 6: Split the Dataset
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each of the four partitions as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((432, 5000), (108, 5000), (432,), (108,))

In [None]:
#Step 7: Define naive-bayes model
# fit() returns the estimator itself, so construction and training can be chained.
mnb = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out rows; scored in the evaluation step.
pred = mnb.predict(X_test)

In [None]:
#Step 8: Evaluate the Model
# Accuracy, confusion matrix and per-class precision/recall/F1, one per line.
print(
    accuracy_score(y_test, pred),
    confusion_matrix(y_test, pred),
    classification_report(y_test, pred),
    sep="\n",
)


0.6018518518518519
[[49  4]
 [39 16]]
              precision    recall  f1-score   support

           0       0.56      0.92      0.70        53
           1       0.80      0.29      0.43        55

    accuracy                           0.60       108
   macro avg       0.68      0.61      0.56       108
weighted avg       0.68      0.60      0.56       108



In [None]:
#Step 8: Define function to test model
# Persist the fitted vectorizer and classifier. `with` guarantees the file
# handles are flushed and closed; a bare open() passed to pickle never was.
with open("count-Vectorozer.pkl", 'wb') as fh:
    pickle.dump(cv, fh)
with open("Movie_Review_Classification.pkl", 'wb') as fh:
    pickle.dump(mnb, fh)

#Loading my naive-bayes model
# NOTE: pickle.load can execute arbitrary code. These files were written just
# above, so they are trusted here; never load a pickle from an untrusted source.
with open("count-Vectorozer.pkl", 'rb') as fh:
    save_cv = pickle.load(fh)
with open("Movie_Review_Classification.pkl", 'rb') as fh:
    model = pickle.load(fh)

# Text-cleaning resources for inference, built once rather than on every call.
_inference_stemmer = PorterStemmer()
_inference_stop_words = frozenset(stopwords.words("english"))


def test_model(sentence):
    """Classify a single raw review sentence as positive or negative.

    The sentence is cleaned exactly like the training corpus (non-letters
    replaced by spaces, lower-cased, English stop words removed, Porter
    stemming) before it is vectorised. Without this step, words whose stem
    differs from their surface form would not match the stemmed vocabulary the
    vectorizer was fitted on.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    str
        "Positive review!" if the model predicts class 1, otherwise
        "Negative review!".
    """
    tokens = re.sub("[^a-zA-Z]", " ", sentence).lower().split()
    cleaned = " ".join(
        _inference_stemmer.stem(tok)
        for tok in tokens
        if tok not in _inference_stop_words
    )
    sen = save_cv.transform([cleaned]).toarray()
    res = model.predict(sen)[0]
    if res == 1:
        return "Positive review!"
    return "Negative review!"

In [None]:
#Step 9: Testing
# Run a few hand-written sentences through the classifier and print each verdict.
for sen in (
    "This is a great movie",
    "This is nice movie",
    "I have never seen such kind of a movie in my entire life. I'm quite stunned",
):
    res = test_model(sen)
    print(res)


Positive review!
Positive review!
Negative review!
