# Exploratory Data Analysis 

In [1]:
import pandas as pd

In [2]:
# Load the movie review dataset
data = pd.read_csv("movie_reviews.csv")

In [3]:
# Basic information about the dataset
print("Basic information about the dataset:")
print(data.info())

Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   sentiment  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB
None


In [4]:
# Explore the size of the dataset (number of rows and columns)
print("Size of the dataset:")
print(data.shape)

Size of the dataset:
(1000, 2)


In [5]:
# Check for missing values in the dataset
print("Missing Values:")
print(data.isnull().sum())

Missing Values:
review       0
sentiment    0
dtype: int64


In [6]:
# Analyze the distribution of the target variable 'sentiment'
print("Distribution of the target variable 'sentiment':")
print(data['sentiment'].value_counts())

Distribution of the target variable 'sentiment':
negative    524
positive    476
Name: sentiment, dtype: int64


# Data Preprocessing

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords (run this once)
nltk.download('stopwords')
nltk.download('punkt')

# Function to clean and preprocess the text data
def preprocess_text(text):
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text into individual words
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Join the words back into a string
    cleaned_text = ' '.join(words)
    return cleaned_text

# Apply text preprocessing to the 'review' column and create a new 'cleaned_text' column
data['cleaned_text'] = data['review'].apply(preprocess_text)

# Separate features (X) and target (y)
X = data['cleaned_text']
y = data['sentiment']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Manisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Manisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# To view cleaned_text
data.head()

Unnamed: 0,review,sentiment,cleaned_text
0,I really liked this Summerslam due to the look...,positive,really liked summerslam due look arena curtain...
1,Not many television shows appeal to quite as m...,positive,many television shows appeal quite many differ...
2,The film quickly gets to a major chase scene w...,negative,film quickly gets major chase scene ever incre...
3,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve onebr br ...
4,Expectations were somewhat high for me when I ...,negative,expectations somewhat high went see movie thou...


# Text Vectorization

In [9]:
from sklearn.model_selection import train_test_split

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-Words Vectorization
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model training and evaluation

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Logistic Regression for Sentiment Analysis
# Training the model using BoW representation
log_reg_bow = LogisticRegression()
log_reg_bow.fit(X_train_bow, y_train)

# Training the model using TF-IDF representation
log_reg_tfidf = LogisticRegression()
log_reg_tfidf.fit(X_train_tfidf, y_train)

# Evaluating Logistic Regression Models
# Classification report and confusion matrix for BoW representation
print("Logistic Regression with BoW Representation:")
y_pred_bow = log_reg_bow.predict(X_test_bow)
print(classification_report(y_test, y_pred_bow))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bow))

# Classification report and confusion matrix for TF-IDF representation
print("\nLogistic Regression with TF-IDF Representation:")
y_pred_tfidf = log_reg_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_tfidf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tfidf))

# Naive Bayes for Sentiment Analysis
# Training the model using BoW representation
naive_bayes_bow = MultinomialNB()
naive_bayes_bow.fit(X_train_bow, y_train)

# Training the model using TF-IDF representation
naive_bayes_tfidf = MultinomialNB()
naive_bayes_tfidf.fit(X_train_tfidf, y_train)

# Evaluating Naive Bayes Models
# Classification report and confusion matrix for BoW representation
print("\nNaive Bayes with BoW Representation:")
y_pred_bow_nb = naive_bayes_bow.predict(X_test_bow)
print(classification_report(y_test, y_pred_bow_nb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bow_nb))

# Classification report and confusion matrix for TF-IDF representation
print("\nNaive Bayes with TF-IDF Representation:")
y_pred_tfidf_nb = naive_bayes_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_tfidf_nb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tfidf_nb))

# Support Vector Machine (SVM) for Sentiment Analysis
# Training the model using BoW representation
svm_bow = SVC()
svm_bow.fit(X_train_bow, y_train)

# Training the model using TF-IDF representation
svm_tfidf = SVC()
svm_tfidf.fit(X_train_tfidf, y_train)

# Evaluating SVM Models
# Classification report and confusion matrix for BoW representation
print("\nSVM with BoW Representation:")
y_pred_bow_svm = svm_bow.predict(X_test_bow)
print(classification_report(y_test, y_pred_bow_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bow_svm))

# Classification report and confusion matrix for TF-IDF representation
print("\nSVM with TF-IDF Representation:")
y_pred_tfidf_svm = svm_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_tfidf_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tfidf_svm))

Logistic Regression with BoW Representation:
              precision    recall  f1-score   support

    negative       0.87      0.75      0.81       114
    positive       0.72      0.85      0.78        86

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.79       200
weighted avg       0.81      0.80      0.80       200

Confusion Matrix:
[[86 28]
 [13 73]]

Logistic Regression with TF-IDF Representation:
              precision    recall  f1-score   support

    negative       0.85      0.83      0.84       114
    positive       0.78      0.80      0.79        86

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200

Confusion Matrix:
[[95 19]
 [17 69]]

Naive Bayes with BoW Representation:
              precision    recall  f1-score   support

    negative       0.84      0.90      0.87       114
    positive       0.86      0.