In [2]:
# Load necessary libraries and modules
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import random

In [3]:
# Load the dataset
data = pd.read_csv("Dataset.csv")
print("Dataset shape:", data.shape)

Dataset shape: (50000, 2)


In [4]:
# Check for missing values
print("Missing values:")
print(data.isnull().sum())

Missing values:
Review       0
Sentiment    0
dtype: int64


In [5]:
# Perform Exploratory Data Analysis (EDA)
# Print distribution of sentiment
print("Distribution of sentiment:")
sentiment_counts = data['Sentiment'].value_counts()
print(sentiment_counts)

Distribution of sentiment:
Sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [6]:
# Text preprocessing
stop_words = set(stopwords.words('english'))

In [7]:
# Define function to preprocess text data
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(filtered_tokens)

In [8]:
# Apply text preprocessing to 'Review' column
data['Review'] = data['Review'].apply(preprocess_text)

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data['Review'], data['Sentiment'], test_size=0.2, random_state=42)

In [10]:
# TF-IDF vectorization
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

In [11]:
# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [12]:
# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
# Model training
# Initialize Support Vector Machine (SVM) classifier
svm_classifier = LinearSVC()

In [14]:
# Train the SVM classifier
svm_classifier.fit(X_train_tfidf, y_train)



In [15]:
# Model evaluation
# Make predictions on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

In [16]:
# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.89      0.89      4961
    positive       0.89      0.90      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [17]:
# Print accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8958


In [22]:
# Test the model on a random sentence
# Generate a random sentence
random_sentence = "This movie exceeded all my expectations. I loved every minute of it!"

# Preprocess the random sentence
random_sentence = preprocess_text(random_sentence)

# Transform the preprocessed sentence using TF-IDF vectorizer
random_sentence_tfidf = tfidf_vectorizer.transform([random_sentence])

# Make prediction on the random sentence
predicted_sentiment = svm_classifier.predict(random_sentence_tfidf)[0]

In [23]:
# Print the random sentence and predicted sentiment
print("\nRandom Sentence:", random_sentence)
print("Predicted Sentiment:", predicted_sentiment)


Random Sentence: movie exceeded expectations loved every minute
Predicted Sentiment: positive
