In [24]:
#preapre a model of sentiment analysis of movies dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report,precision_score,recall_score,f1_score,confusion_matrix

In [25]:
df= pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [26]:
from nltk.corpus import stopwords

In [27]:
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Raj
[nltk_data]     Aryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
print(stopwords)

{'few', 'other', 'just', "they're", 'am', 'mightn', 'were', "you've", 'by', 'and', 'has', 'did', "you'd", "he's", 'y', 'our', 'having', "we've", 'mustn', 'my', "don't", "i've", 'of', 'so', 'here', "aren't", 'we', 'theirs', 'when', "weren't", 'before', 'hasn', 'no', 'up', 'for', 'yourselves', 'all', 'off', 'whom', 'which', 'about', 'as', 'until', 'same', "you're", 'under', 'at', 'ourselves', "i'm", 'm', 'between', "hadn't", 'through', 'more', "we'll", 'itself', "we'd", 'you', "it'll", 'only', 'himself', 'weren', 'if', 'isn', "they've", 'an', 'o', "mustn't", 'doing', 'where', 'with', "she's", 'into', 'do', "they'd", 'myself', 'd', 'any', 'me', "haven't", 's', 'there', 'not', 'the', 'than', 'to', 'while', 'needn', 'shouldn', 'will', 'she', "they'll", 'be', "won't", 'then', 'he', 'was', "it'd", 'shan', "hasn't", 'hers', 'won', 'yourself', 'out', "we're", "didn't", 'these', 're', 'her', 'it', "needn't", "should've", "wouldn't", 'haven', 'on', "she'll", 'each', 'because', 'too', "i'll", 'som

In [29]:
df.shape

(50000, 2)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [31]:
df["review"].value_counts()

review
Loved today's show!!! It was a variety and not solely cooking (which would have been great too). Very stimulating and captivating, always keeping the viewer peeking around the corner to see what was coming up next. She is as down to earth and as personable as you get, like one of us which made the show all the more enjoyable. Special guests, who are friends as well made for a nice surprise too. Loved the 'first' theme and that the audience was invited to play along too. I must admit I was shocked to see her come in under her time limits on a few things, but she did it and by golly I'll be writing those recipes down. Saving time in the kitchen means more time with family. Those who haven't tuned in yet, find out what channel and the time, I assure you that you won't be disappointed.                                                                                                                                                                                                         

In [32]:
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [33]:
#mapping the sentiments to some numerical value
df["sentiment"] = df["sentiment"].map({
    "positive": 1,
    "negative": 0
})


In [34]:
#clean the text
def clean_text(text):
    text = re.sub(r"[^a-zA-Z]"," ",text).lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords]
    return " ".join(tokens)

In [35]:
df["cleaned_review"] = df["review"].apply(clean_text)

In [36]:
df["cleaned_review"].head()

0    one reviewers mentioned watching oz episode ho...
1    wonderful little production br br filming tech...
2    thought wonderful way spend time hot summer we...
3    basically family little boy jake thinks zombie...
4    petter mattei love time money visually stunnin...
Name: cleaned_review, dtype: object

In [37]:
#feature Extraction
vectorizer = CountVectorizer(max_features= 5000)
X = vectorizer.fit_transform(df["cleaned_review"])
y = df["sentiment"]

In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [39]:
#Train the model
model = MultinomialNB()#for text classification
model.fit(X_train,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [40]:
#make the pred
y_pred = model.predict(X_test)

In [41]:
#calculate the performance matrix
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_pred,y_test)
recall = recall_score(y_pred,y_test)
f1 = f1_score(y_pred,y_test)
cm = confusion_matrix(y_pred,y_test)
classification_rep = classification_report(y_test,y_pred)

In [42]:
print("Accuracy : ", accuracy)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1 Score : ", f1)
print("Confusion Matrix : \n", cm)
print("Classification Report : \n", classification_rep)

Accuracy :  0.8512
Precision :  0.8492736077481841
Recall :  0.8503030303030303
F1 Score :  0.8497880072683223
Confusion Matrix : 
 [[4303  747]
 [ 741 4209]]
Classification Report : 
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      5044
           1       0.85      0.85      0.85      4956

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [43]:
#save the moel and vectorizer
import joblib
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
