# importing required libraries


In [2]:
import pandas as pd #for data manipulation
import numpy as np #for data analysis
#for visualization
import matplotlib.pyplot as plt
import seaborn as sns
#visualization of frequent words
from wordcloud import WordCloud
#natural language processing (NLP)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re #cleaning text
#ML tools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer# vectorization
from sklearn.naive_bayes import MultinomialNB#naive bayes for text classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# loading Dataset ,displaying head lines (first 5)

In [3]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape #dataset structure 2 columns 50000 lines


(50000, 2)

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
print(stopwords.words('english')[:10])#English stopwords 


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


# text cleaning 

In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#lemmatizer creation ( grouping many forms of a word in one word )
#used in advanced research znd chatboxes
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

#function to clean text
def clean_text(text):
    #removing html tags
    text = re.sub(r'<.*?>', '', text)
    #only letters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    #convert to lowercase
    text = text.lower()
    #Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    #words back into a single string
    return ' '.join(words)

#clean the review column
df['clean_review'] = df['review'].apply(clean_text)

#example
df[['review', 'clean_review']].head()


Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter mattei love time money visually stunnin...


In [8]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define a function to clean text
def clean_text(text):
    #removing html tags
    text = re.sub(r'<.*?>', '', text)
    #only letters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    #convert to lowercase
    text = text.lower()
    #Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    #words back into a single string
    return ' '.join(words)

#clean the review column
df['clean_review'] = df['review'].apply(clean_text)

#example
df[['review', 'clean_review']].head(10)


Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter mattei love time money visually stunnin...
5,"Probably my all-time favorite movie, a story o...",probably time favorite movie story selflessnes...
6,I sure would like to see a resurrection of a u...,sure would like see resurrection dated seahunt...
7,"This show was an amazing, fresh & innovative i...",show amazing fresh innovative idea first aired...
8,Encouraged by the positive comments about this...,encouraged positive comment film looking forwa...
9,If you like original gut wrenching laughter yo...,like original gut wrenching laughter like movi...


# vectorization

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#vectorization TF IDF
vectorizer = TfidfVectorizer(max_features=5000)  #5000 most frequent words
X = vectorizer.fit_transform(df['clean_review']).toarray()#transform clean_review to an array

#target variable (pos/neg)
y = df['sentiment']

#dividing datasets into 2 subsets train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)


Training shape: (40000, 5000)
Testing shape: (10000, 5000)


# training the model ,prediction and performance evaluation

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#creation and training model 
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

#prediction
y_pred = nb_model.predict(X_test)

#performance evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8554

 Confusion Matrix:
 [[4206  755]
 [ 691 4348]]

 Classification Report:
               precision    recall  f1-score   support

    negative       0.86      0.85      0.85      4961
    positive       0.85      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

