In [1]:
import os
import pandas as pd
def load_reviews_from_folder(folder_path, sentiment):
    reviews = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), encoding="utf-8") as file:
                review = file.read()
                reviews.append((review, sentiment))
    return reviews
train_pos_path = "C:/Users/YASHRAJ/Downloads/aclImdb_v1/aclImdb/train/pos"
train_neg_path = "C:/Users/YASHRAJ/Downloads/aclImdb_v1/aclImdb/train/neg"
test_pos_path = "C:/Users/YASHRAJ/Downloads/aclImdb_v1/aclImdb/test/pos"
test_neg_path = "C:/Users/YASHRAJ/Downloads/aclImdb_v1/aclImdb/test/neg"

reviews = []
reviews += load_reviews_from_folder(train_pos_path, 1)
reviews += load_reviews_from_folder(train_neg_path, 0)
reviews += load_reviews_from_folder(test_pos_path, 1)
reviews += load_reviews_from_folder(test_neg_path, 0)

df = pd.DataFrame(reviews, columns=["review", "label"])
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv("imdb_reviews.csv", index=False)
print("Saved to imdb_reviews.csv")

Saved to imdb_reviews.csv


In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv("imdb_reviews.csv")  
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)           
    text = re.sub(r'[^a-z\s]', '', text)        
    text = re.sub(r'\s+', ' ', text).strip()    
    return text
df['cleaned_review'] = df['review'].apply(preprocess_text)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['cleaned_review'])

df[['cleaned_review', 'label']].to_csv("imdb_reviews_preprocessed.csv", index=False)
print("✅ Preprocessed data saved as 'imdb_reviews_preprocessed.csv'")

✅ Preprocessed data saved as 'imdb_reviews_preprocessed.csv'


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
df = pd.read_csv("imdb_reviews_preprocessed.csv")

X_text = df['cleaned_review']
y = df['label']
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(X_text)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("🔍 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

🔍 Accuracy: 0.8815

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.88      5074
           1       0.87      0.89      0.88      4926

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

🧮 Confusion Matrix:
 [[4441  633]
 [ 552 4374]]


In [4]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("🔹 MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\n", classification_report(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

🔹 MultinomialNB Accuracy: 0.8537

               precision    recall  f1-score   support

           0       0.86      0.85      0.85      5074
           1       0.85      0.86      0.85      4926

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
 [[4300  774]
 [ 689 4237]]


In [5]:
from sklearn.svm import SVC
svc_model = SVC(kernel='linear')  
svc_model.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)
print("🔹 SVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("\n", classification_report(y_test, y_pred_svc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))

🔹 SVC Accuracy: 0.8776

               precision    recall  f1-score   support

           0       0.88      0.87      0.88      5074
           1       0.87      0.88      0.88      4926

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

Confusion Matrix:
 [[4438  636]
 [ 588 4338]]


In [6]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("🔹 Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

🔹 Random Forest Accuracy: 0.8483

               precision    recall  f1-score   support

           0       0.84      0.86      0.85      5074
           1       0.85      0.84      0.84      4926

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
 [[4361  713]
 [ 804 4122]]


In [4]:
from sklearn.svm import SVC
svc_model_rbf = SVC(kernel='rbf')  
svc_model_rbf.fit(X_train, y_train)
y_pred_svc = svc_model_rbf.predict(X_test)
print("🔹 SVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("\n", classification_report(y_test, y_pred_svc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))

🔹 SVC Accuracy: 0.8844

               precision    recall  f1-score   support

           0       0.89      0.88      0.89      5074
           1       0.88      0.89      0.88      4926

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

Confusion Matrix:
 [[4456  618]
 [ 538 4388]]


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv("imdb_reviews_preprocessed.csv")
X = df['cleaned_review'].astype(str)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=max_length))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_pad, y_train, epochs=20, batch_size=64, validation_split=0.2)
y_pred_probs = model.predict(X_test_pad)
y_pred = (y_pred_probs > 0.5).astype(int)
print("🔍 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Epoch 1/20




[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 163ms/step - accuracy: 0.5163 - loss: 0.6925 - val_accuracy: 0.5226 - val_loss: 0.6884
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 162ms/step - accuracy: 0.5743 - loss: 0.6688 - val_accuracy: 0.6062 - val_loss: 0.6170
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 164ms/step - accuracy: 0.6829 - loss: 0.6118 - val_accuracy: 0.7157 - val_loss: 0.5774
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 164ms/step - accuracy: 0.7659 - loss: 0.5243 - val_accuracy: 0.7579 - val_loss: 0.5308
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 163ms/step - accuracy: 0.7887 - loss: 0.4903 - val_accuracy: 0.7284 - val_loss: 0.6813
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 164ms/step - accuracy: 0.8164 - loss: 0.4605 - val_accuracy: 0.5729 - val_loss: 0.6781
Epoch 7/20
[1m500/50

In [6]:
import joblib
joblib.dump(svc_model_rbf, "svm_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
print("✅ Model and vectorizer saved successfully!")

✅ Model and vectorizer saved successfully!
