In [4]:
import os
import pandas as pd

# Dataset location, resolved relative to the notebook's working directory.
file_name = 'IMDB Dataset.csv'

# Fail fast: every later cell needs `data`, so a missing file must stop the run
# here with a clear error instead of surfacing later as a NameError on `data`.
if not os.path.exists(file_name):
    raise FileNotFoundError(f"Error: {file_name} not found in the current directory.")

# Load the CSV into a DataFrame (the preview below shows `review` and `sentiment` columns).
data = pd.read_csv(file_name)

print("Dataset loaded successfully!")

print(data.head())  # Display the first few rows


Dataset loaded successfully!
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Step 1: Encode sentiment labels
# The string labels become integer class ids stored in a new column. The fitted
# encoder is kept because later cells read `label_encoder.classes_` for report names.
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['sentiment'])

# Step 2: Split the data into training and testing sets
# 80/20 hold-out, seeded for repeatability and stratified on the encoded label.
X, y = data['review'], data['sentiment_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 3: Text vectorization using TF-IDF
# The vectorizer is fitted on the training reviews only; the test reviews are
# merely transformed with that fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=20000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Cell output: shapes of the training and testing feature matrices
(X_train_tfidf.shape, X_test_tfidf.shape)


((40000, 20000), (10000, 20000))

In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import mode
import numpy as np

# Step 4: K-means Clustering
# Unsupervised baseline: two clusters, matching the two sentiment classes
# (positive, negative). The labels are not used while fitting.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_train_tfidf)

# Cluster id for every test review
kmeans_labels = kmeans.predict(X_test_tfidf)

# Cluster ids are arbitrary, so each one is translated into a sentiment label:
# a cluster takes the most frequent training label among the training reviews
# that were assigned to it.
kmeans_mapped_labels = np.zeros_like(kmeans_labels)
for cluster_id in range(2):  # 2 clusters
    train_members = (kmeans.labels_ == cluster_id)
    majority_sentiment = mode(y_train[train_members])[0]
    test_members = (kmeans_labels == cluster_id)
    kmeans_mapped_labels[test_members] = majority_sentiment

# Score the mapped cluster predictions against the held-out labels
kmeans_accuracy = accuracy_score(y_test, kmeans_mapped_labels)
kmeans_report = classification_report(
    y_test,
    kmeans_mapped_labels,
    target_names=label_encoder.classes_,
)

# Cell output: (accuracy, report text)
(kmeans_accuracy, kmeans_report)


(0.5182,
 '              precision    recall  f1-score   support\n\n    negative       0.52      0.40      0.45      5000\n    positive       0.51      0.64      0.57      5000\n\n    accuracy                           0.52     10000\n   macro avg       0.52      0.52      0.51     10000\nweighted avg       0.52      0.52      0.51     10000\n')

In [10]:
from sklearn.svm import LinearSVC

# Step 5: Support Vector Machine (SVM)
# Supervised baseline: a seeded linear SVM fitted on the TF-IDF training features.
svm = LinearSVC(random_state=42).fit(X_train_tfidf, y_train)

# Predicted class ids for the held-out reviews
svm_predictions = svm.predict(X_test_tfidf)

# Score the predictions; class names in the report come from the label encoder
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(
    y_test,
    svm_predictions,
    target_names=label_encoder.classes_,
)

# Cell output: (accuracy, report text)
(svm_accuracy, svm_report)


(0.8942,
 '              precision    recall  f1-score   support\n\n    negative       0.90      0.89      0.89      5000\n    positive       0.89      0.90      0.89      5000\n\n    accuracy                           0.89     10000\n   macro avg       0.89      0.89      0.89     10000\nweighted avg       0.89      0.89      0.89     10000\n')

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 6: Neural Network Alternative (MLP)
# Feed-forward classifier on the same TF-IDF features: two hidden layers
# (64 then 32 units), ReLU activations, Adam solver, at most 75 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=75,
    random_state=42,
)

# Fit on the training split
mlp.fit(X_train_tfidf, y_train)

# Predicted class ids for the held-out reviews
mlp_predictions = mlp.predict(X_test_tfidf)

# Score the predictions; class names in the report come from the label encoder
mlp_accuracy = accuracy_score(y_test, mlp_predictions)
mlp_report = classification_report(
    y_test,
    mlp_predictions,
    target_names=label_encoder.classes_,
)

print("MLP Accuracy:", mlp_accuracy)
print("\nClassification Report:\n", mlp_report)
