# 🤖 Model Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import json
import os
from tqdm import tqdm
import warnings
# Silence only deprecation-style noise. A blanket 'ignore' would also hide
# warnings raised while fitting (e.g. scikit-learn's ConvergenceWarning, a
# UserWarning subclass), so a model that failed to converge could be
# compared and saved without anyone noticing.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

print("✅ Libraries loaded!")

In [None]:
# Read the processed reviews CSV into a DataFrame; the trailing `df.head()`
# is left as the cell's last expression so the notebook renders a preview.
reviews_csv = '../data/processed/processed_reviews.csv'
df = pd.read_csv(reviews_csv)
print("✅ Data loaded! Preview:")
df.head()

In [None]:
# Keep only rows that have both text and a label. An empty field in the CSV
# is read back by pandas as NaN; TfidfVectorizer raises on NaN documents and
# a NaN label breaks the stratified split, so drop such rows up front.
labelled = df.dropna(subset=['processed', 'sentiment'])
dropped = len(df) - len(labelled)
if dropped:
    print(f"⚠️ Dropped {dropped} rows with missing text or label")

X = labelled['processed']
y = labelled['sentiment']

# Stratify on the label so train and test keep the same class proportions.
print("🔀 Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Fit the vocabulary and IDF weights on the training split only, then reuse
# the fitted vectorizer on the test split so no test text leaks into it.
print("🔤 TF-IDF vectorizing...")
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2), min_df=2, max_df=0.95)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

print("✅ Vectorization complete. Feature count:", X_train_vec.shape[1])

In [None]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Naive Bayes': MultinomialNB(),
    'Linear SVM': LinearSVC(random_state=42, max_iter=10000)
}

results = {}
trained_models = {}

print("🔧 Training models...\n")
for name in tqdm(models):
    model = models[name]
    print(f"▶️ {name}")
    %time model.fit(X_train_vec, y_train)

    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    print(f"✅ Test Accuracy: {acc:.4f}")

    print("🔁 Cross-validating...")
    %time cv = cross_val_score(model, X_train_vec, y_train, cv=5).mean()
    print(f"📈 CV Accuracy: {cv:.4f}")

    results[name] = {'Test Accuracy': acc, 'CV Accuracy': cv}
    trained_models[name] = model


In [None]:

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\n📊 Comparison of Models:")
print(results_df)

# Pick the winner by cross-validated accuracy, not test accuracy: choosing
# on the test split and then reporting that same score would make the
# reported figure optimistically biased. The test accuracy printed below is
# therefore that of a model selected without looking at the test split.
best_model_name = results_df['CV Accuracy'].idxmax()
best_model = trained_models[best_model_name]
print(f"\n🏆 Best Model: {best_model_name} | 🎯 Accuracy: {results_df.loc[best_model_name, 'Test Accuracy']:.4f}")


In [None]:
# 💾 Save the best model
# Create the directory the artifacts are actually written to. The paths below
# point at ../models, so creating ./models would leave joblib.dump failing
# when ../models does not exist yet.
os.makedirs('../models', exist_ok=True)

model_filename = "../models/best_model.pkl"
vectorizer_filename = "../models/tfidf_vectorizer.pkl"

# Persist the fitted vectorizer alongside the classifier: the model can only
# score text transformed with the same vocabulary and IDF weights.
joblib.dump(best_model, model_filename)
joblib.dump(tfidf, vectorizer_filename)

print(f"✅ Model saved to: {model_filename}")
print(f"✅ Vectorizer saved to: {vectorizer_filename}")

In [None]:
# Summarise the selected model: its name, both accuracy figures from the
# comparison table, and the width of the TF-IDF training matrix.
best_row = results_df.loc[best_model_name]
model_info = {
    "model": best_model_name,
    "test_accuracy": best_row["Test Accuracy"],
    "cv_accuracy": best_row["CV Accuracy"],
    "feature_count": X_train_vec.shape[1],
}

# Write the summary as indented JSON.
with open("../models/model_info.json", "w") as out:
    out.write(json.dumps(model_info, indent=2))

print("📦 Model info saved to: ../models/model_info.json")
