In [1]:
#prepare sentiment analysis of imdb dataset

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix, classification_report

import nltk
import re

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\achyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\achyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\achyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
print(stop_words)

{"i'd", 'him', 'am', 'because', 'a', 'an', 'not', 'my', 'some', 'after', 'don', 'during', 'myself', 'only', 'until', 'it', "you'd", 'further', 'most', 'itself', 'her', "she'll", 'yours', 'does', 'what', 'yourselves', 'other', 'were', 'who', "he'd", 'all', 'here', 'needn', 'won', 'that', "we're", 'you', 'such', 'ourselves', "i'll", "it'll", 'both', 'at', 'no', 'isn', 'hadn', 'shan', 'same', 'to', "shan't", "mustn't", 'but', 'and', 'those', 'they', 'mustn', 'll', 'each', "we've", 'will', 'are', "she'd", 's', "won't", 'shouldn', 'hers', 'how', "it's", "i'm", 'ma', 'just', "they've", 'between', 'herself', "they're", 'your', 'the', 'very', 'in', 'again', "they'd", 'which', 'own', 'these', 'for', 'there', 'ain', "needn't", 'from', "couldn't", 'or', 'have', "you'll", 'hasn', 'weren', 'haven', 'so', 'then', "she's", "mightn't", 'whom', 'if', 'where', 'be', 'should', 'up', "you're", 'as', 'me', "we'll", "that'll", "he'll", "you've", 'above', 'he', 'once', 'had', 'his', 'o', 'into', 'while', 'm'

In [5]:
# Load the dataset into DataFrame
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
df.shape

(50000, 2)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
df['review'].value_counts()

review
Loved today's show!!! It was a variety and not solely cooking (which would have been great too). Very stimulating and captivating, always keeping the viewer peeking around the corner to see what was coming up next. She is as down to earth and as personable as you get, like one of us which made the show all the more enjoyable. Special guests, who are friends as well made for a nice surprise too. Loved the 'first' theme and that the audience was invited to play along too. I must admit I was shocked to see her come in under her time limits on a few things, but she did it and by golly I'll be writing those recipes down. Saving time in the kitchen means more time with family. Those who haven't tuned in yet, find out what channel and the time, I assure you that you won't be disappointed.                                                                                                                                                                                                         

In [9]:
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [10]:
#Mapping the sentiment labels to numerical values
df["sentiment"]=df["sentiment"].map({
    "positive": 1,
     "negative": 0
})

In [11]:
def clean_text(text):
    text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove all non-alphabetic characters
    tokens = text.split()  # Split the text into tokens
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    return " ".join(tokens)  # Join the tokens back into a single string


In [12]:
#clean the reviews
df["cleaned_review"] = df["review"].apply(clean_text)


In [13]:
df["cleaned_review"]

0        One reviewers mentioned watching Oz episode ho...
1        A wonderful little production br br The filmin...
2        I thought wonderful way spend time hot summer ...
3        Basically family little boy Jake thinks zombie...
4        Petter Mattei Love Time Money visually stunnin...
                               ...                        
49995    I thought movie right good job It creative ori...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I Catholic taught parochial elementary schools...
49998    I going disagree previous comment side Maltin ...
49999    No one expects Star Trek movies high art fans ...
Name: cleaned_review, Length: 50000, dtype: object

In [14]:
#frequency of words

vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_review"])



In [15]:
y= df["sentiment"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [17]:
model = MultinomialNB()

model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [18]:
#make the pred
y_pred = model.predict(X_test)

In [19]:
#calculate the performance matrix
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_pred,y_test)
recall = recall_score(y_pred,y_test)
f1 = f1_score(y_pred,y_test)
cm = confusion_matrix(y_pred,y_test)
classification_rep = classification_report(y_test,y_pred)

In [None]:
print("Accuracy : ", accuracy)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1 Score : ", f1)
print("Confusion Matrix : \n", cm)
print("Classification Report : \n", classification_rep)

Accuracy :  0.8502
Precision :  0.8473903552292121
Recall :  0.8541708341668334
F1 Score :  0.8507670850767085
Confusion Matrix : 
 [[4232  769]
 [ 729 4270]]
Classification Report : 
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [21]:
import joblib

joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']