# Import Libraries

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re
import string

import nltk
from nltk.corpus import stopwords
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Berkay\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Berkay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Berkay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data Preprocessing

In [11]:
# Load the dataset
df = pd.read_csv('IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
# Convert sentiment values to numerical (1 for positive, 0 for negative)
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [13]:
# Text preprocessing: Remove HTML tags, URLs, convert to lowercase, and remove punctuation
df['review'] = (
    df['review']
    .str.replace(r'<[^<>]*>', '', regex=True)  # Remove HTML tags
    .str.replace(r'https?://\S+|www\.\S+', '', regex=True)  # Remove URLs
    .str.lower()  # Convert text to lowercase
    .str.replace(f"[{string.punctuation}]", '', regex=True)  # Remove punctuation
)

In [14]:
stop_words = set(stopwords.words('english'))

# Remove stopwords and apply lemmatization
tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

df['review'] = df['review'].apply(
    lambda x: ' '.join(
        lemmatizer.lemmatize(word) for word in tokenizer.tokenize(x) if word not in stop_words
    )
)

In [15]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stunni...,1


# Classifier

In [16]:
train, test = train_test_split(df, test_size = 0.25, random_state = 42)

X_train, y_train = train['review'], train['sentiment']
X_test, y_test = test['review'], test['sentiment']

In [17]:
tfidf = TfidfVectorizer()
x_train_vector = tfidf.fit_transform(X_train)

x_test_vector = tfidf.transform(X_test)

In [18]:
multi_clf = MultinomialNB()
multi_clf.fit(x_train_vector, y_train.values)

predict_NB = multi_clf.predict(x_test_vector)

In [19]:
print("Classification Report: \n\n", classification_report(y_test, predict_NB))
print("Confusion Matrix: \n\n", confusion_matrix(y_test, predict_NB))
print("Accuracy: \n\n", accuracy_score(y_test, predict_NB))

Classification Report: 

               precision    recall  f1-score   support

           0       0.85      0.89      0.87      6157
           1       0.88      0.84      0.86      6343

    accuracy                           0.87     12500
   macro avg       0.87      0.87      0.87     12500
weighted avg       0.87      0.87      0.87     12500

Confusion Matrix: 

 [[5459  698]
 [ 988 5355]]
Accuracy: 

 0.86512
