In [60]:
# IMPORTING LIBRARIES

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Importing Dataset
dataset = pd.read_csv(r'C:\Users\anand\OneDrive - Sheridan College\College\Machine Learning\Projects\Sentiment Analysis\IMDB Dataset.csv')

# Basic EDA
print(dataset.head())
print("---------------------------------------------------------------------------")
print(dataset.info())
print("---------------------------------------------------------------------------")
print("Columns in the dataset:", dataset.columns)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
---------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
---------------------------------------------------------------------------
Columns in the dataset: Index(['review', 'sentiment'], dtype='object')


In [3]:
# Check for missing values
print("Missing values:")
print(dataset.isnull().sum())

Missing values:
review       0
sentiment    0
dtype: int64


In [7]:
# Cleaning the Text
# Using Lemmatization

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
corpus = []
for review in dataset['review']:
    review = re.sub('[^a-zA-Z]', ' ', review) 
    review = re.sub('br', ' ', review) 
    review = review.split()
    lemmatizer = WordNetLemmatizer()
    all_stopwords = stopwords.words('english')
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(all_stopwords) and len(word) > 2]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...


In [10]:
print(corpus[:5])

['One reviewer mentioned watching episode hooked They right exactly happened The first thing struck utality unflinching scene violence set right word Trust show faint hearted timid This show pull punch regard drug sex violence Its hardcore classic use word called nickname given Oswald Maximum Security State Penitentary focus mainly Emerald City experimental section prison cell glass front face inwards privacy high agenda City home many Aryans Muslims gangsta Latinos Christians Italians Irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare Forget pretty picture painted mainstream audience forget charm forget romance mess around The first episode ever saw struck nasty surreal say ready watched developed taste got accustomed high level graphic violence Not violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experience Watc

In [61]:
# Creating the Bag of Words Model - Tokenization

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=50000)
x = cv.fit_transform(corpus)
sentiment_mapping = {'negative': 0, 'positive': 1}
y = dataset['sentiment'].map(sentiment_mapping).values

In [62]:
# Splitting the dataset into Training and Test Set

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [63]:
# Constructing the Logistic Regression Model

from sklearn.linear_model import LogisticRegression
lr_regressor = LogisticRegression(max_iter=1000)
lr_regressor.fit(x_train, y_train)

In [64]:
# Predicting and Making the Confusion Matrix for the Logistic Regression Model

y_pred_lr = lr_regressor.predict(x_test)
lr_cm = confusion_matrix(y_test, y_pred_lr)
print(lr_cm)

# Evaluating the Logistic Regression Model

lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy of Logistic Regression Model: {lr_accuracy}")

[[4444  591]
 [ 563 4402]]
Accuracy of Logistic Regression Model: 0.8846


In [65]:
# Constructing the Multinomial Naive Bayes Model

from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB()
nb_classifier.fit(x_train, y_train)

In [67]:
# Predicting and Making the Confusion Matrix 

y_pred_nb = nb_classifier.predict(x_test)
nb_cm = confusion_matrix(y_test, y_pred_nb)
print(nb_cm)

# Evaluating the model

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy of Multinomial Naive Bayes Model: {nb_accuracy}")

[[4422  613]
 [ 808 4157]]
Accuracy of Multinomial Naive Bayes Model: 0.8579
