# Complete Classification Model Training - Google Colab Ready
## Trains Random Forest, SVM, LSTM, CNN and saves them for prediction app

**STEP 1**: Upload your `quiz_data.csv` file when prompted below!

This notebook:
1. ✅ Handles file upload (for Colab)
2. ✅ Trains all 4 models (Random Forest, SVM, LSTM, CNN)
3. ✅ Evaluates them (calculates accuracy, precision, recall, etc.)
4. ✅ **SAVES the models** so the prediction app can use them
5. ✅ Creates a downloadable ZIP file

In [None]:
# Check if running on Colab.
# IN_COLAB is read by later cells to choose between the Colab upload/download
# flow and plain local file access.
try:
    import google.colab  # noqa: F401 -- imported only to probe for the Colab runtime
    IN_COLAB = True
    print("‚úÖ Running on Google Colab")
except ImportError:
    # Catch only "package not installed". A bare `except:` would also swallow
    # KeyboardInterrupt and unrelated failures and misreport them as "local".
    IN_COLAB = False
    print("‚úÖ Running locally")

In [None]:
# Install required packages
!pip install -q scikit-learn torch pandas numpy matplotlib seaborn joblib tqdm

In [None]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
import pickle
import os
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

print("‚úÖ Imports complete!")

## 📤 UPLOAD YOUR DATA FILE

**For Colab**: Click the button below to upload `quiz_data.csv`

**For Local**: Make sure `quiz_data.csv` is in the same directory

In [None]:
# Upload file (Colab) or use local file.
# Sets CSV_PATH, which the data-loading cell reads.
if IN_COLAB:
    print("üì§ Please upload your quiz_data.csv file:")
    # Colab-only module, so it is imported here rather than in the shared import cell.
    from google.colab import files
    uploaded = files.upload()

    # `uploaded` maps filenames to contents. Presumably it is empty when the
    # upload dialog is dismissed -- stop here instead of hitting an IndexError.
    if not uploaded:
        raise ValueError("No file was uploaded. Re-run this cell and select quiz_data.csv.")

    # Get the uploaded filename (first entry if several files were selected)
    CSV_PATH = next(iter(uploaded))
    print(f"‚úÖ Uploaded: {CSV_PATH}")
else:
    CSV_PATH = 'quiz_data.csv'
    if not os.path.exists(CSV_PATH):
        print(f"‚ùå File not found: {CSV_PATH}")
        print("Please make sure quiz_data.csv is in the same directory!")
        # Fail at the point of the problem; otherwise "Run All" continues and
        # the error only surfaces later inside pd.read_csv.
        raise FileNotFoundError(f"File not found: {CSV_PATH}")
    print(f"‚úÖ Found: {CSV_PATH}")

In [None]:
# Configuration
# Column to predict. Change to 'difficulty' or 'question_type' for other tasks.
TARGET_COLUMN = 'subject'

# Every artifact for this target is written below its own folder.
SAVE_DIR = './saved_models/' + TARGET_COLUMN
os.makedirs(SAVE_DIR, exist_ok=True)

print(f"üìä Target: {TARGET_COLUMN}", f"üíæ Save directory: {SAVE_DIR}", sep="\n")

## Step 1: Load and Prepare Data

In [None]:
# Load data
print("Loading data...")
df = pd.read_csv(CSV_PATH)

print(f"‚úÖ Loaded {len(df)} rows")
print(f"\nColumns: {df.columns.tolist()}")

# Fail fast with a readable message when the CSV lacks the text or target
# column, instead of an opaque KeyError from dropna below.
required_columns = ['question', TARGET_COLUMN]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{CSV_PATH} is missing required column(s) {missing_columns}; "
        f"available columns: {df.columns.tolist()}"
    )

# Clean data: drop rows without text or label, then force the text to str
df = df.dropna(subset=required_columns)
df['question'] = df['question'].astype(str)

if df.empty:
    raise ValueError(
        f"No usable rows left in {CSV_PATH} after dropping rows with missing "
        f"'question' or '{TARGET_COLUMN}' values."
    )

print(f"\n‚úÖ After cleaning: {len(df)} samples")
print(f"\nClass distribution:")
print(df[TARGET_COLUMN].value_counts())

# Prepare X (raw question text) and y (raw labels)
X = df['question'].values
y = df[TARGET_COLUMN].values

# Encode labels as integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Save label encoder so predictions can be mapped back to the original labels
joblib.dump(label_encoder, f'{SAVE_DIR}/label_encoder.pkl')
print(f"\n‚úÖ Label encoder saved!")
print(f"Number of classes: {len(label_encoder.classes_)}")
print(f"Classes: {label_encoder.classes_}")

In [None]:
# Two-stage stratified split: 80% goes to training, then the held-out 20% is
# halved into validation and test (10% of the data each).
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_encoded,
    test_size=0.2, random_state=split_seed, stratify=y_encoded,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5, random_state=split_seed, stratify=y_temp,
)

for split_name, split_texts in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {len(split_texts)} samples")

## Step 2: Train Random Forest (ML Model #1)

In [None]:
rf_banner = "=" * 60
print("\n" + rf_banner)
print("üå≤ TRAINING RANDOM FOREST")
print(rf_banner)

# Stage 1 -- TF-IDF features. The vectorizer is fitted on the training split
# only and then applied unchanged to the validation and test splits.
print("\n1. Creating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tfidf_vectorizer.transform(split) for split in (X_val, X_test))

# Persist the fitted vectorizer next to the models
joblib.dump(tfidf_vectorizer, f'{SAVE_DIR}/tfidf_vectorizer.pkl')
print("   ‚úÖ TF-IDF vectorizer saved!")
print(f"   Feature dimensions: {X_train_tfidf.shape[1]}")

# Stage 2 -- fit the forest on the TF-IDF matrix
print("\n2. Training Random Forest...")
rf_params = dict(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1, verbose=1)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_tfidf, y_train)

# Stage 3 -- accuracy on the held-out test split
print("\n3. Evaluating...")
y_pred_rf = rf_model.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"   ‚úÖ Test Accuracy: {accuracy_rf:.4f} ({accuracy_rf*100:.2f}%)")

# Stage 4 -- persist the trained model
joblib.dump(rf_model, f'{SAVE_DIR}/random_forest.pkl')
print("   ‚úÖ Model saved!")

## Step 3: Train SVM (ML Model #2)

In [None]:
print("\n" + "="*60)
print("üéØ TRAINING SVM")
print("="*60)

# Base estimator. It is intentionally NOT fitted on its own: with an integer
# `cv`, CalibratedClassifierCV clones and refits the estimator on each fold,
# so a separate linear_svm.fit(...) would train a model that is never used.
linear_svm = LinearSVC(C=1.0, random_state=42, max_iter=2000, verbose=1)

# Train + calibrate in one step. Sigmoid calibration over 3 folds gives the
# margin-based LinearSVC a predict_proba() method.
print("\n1. Training LinearSVM with probability calibration...")
svm_model = CalibratedClassifierCV(linear_svm, method='sigmoid', cv=3)
svm_model.fit(X_train_tfidf, y_train)
print("   ‚úÖ Training and calibration complete!")

# Evaluate on the held-out test split
print("\n2. Evaluating...")
y_pred_svm = svm_model.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"   ‚úÖ Test Accuracy: {accuracy_svm:.4f} ({accuracy_svm*100:.2f}%)")

# Save the calibrated model (this is the object that is used for prediction)
joblib.dump(svm_model, f'{SAVE_DIR}/svm.pkl')
print("   ‚úÖ Model saved!")

## Step 4: Prepare Data for Deep Learning Models

In [None]:
print("\n" + "="*60)
print("üî§ PREPARING DATA FOR DEEP LEARNING")
print("="*60)

# Count lower-cased, whitespace-separated tokens over the training split only
print("\n1. Building vocabulary...")
word_counts = Counter(
    token
    for text in X_train
    for token in str(text).lower().split()
)

# Ids 0 and 1 are reserved for padding and unknown tokens; the 10,000 most
# frequent training tokens receive ids 2, 3, ... in order of frequency.
vocab = {'<PAD>': 0, '<UNK>': 1}
vocab.update(
    (token, token_id)
    for token_id, (token, _freq) in enumerate(word_counts.most_common(10000), start=2)
)

vocab_size = len(vocab)
print(f"   ‚úÖ Vocabulary size: {vocab_size}")

# Persist the mapping so the same token ids can be reproduced later
with open(f'{SAVE_DIR}/vocab.pkl', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)
print("   ‚úÖ Vocabulary saved!")

# Text to sequence function
def text_to_sequence(text, max_length=512):
    """Encode `text` as a list of exactly `max_length` vocabulary ids.

    The text is converted with str(), lower-cased and split on whitespace.
    Tokens beyond `max_length` are dropped, tokens missing from the global
    `vocab` map to the '<UNK>' id, and shorter inputs are right-padded with
    the '<PAD>' id.
    """
    unk_id = vocab['<UNK>']
    pad_id = vocab['<PAD>']
    tokens = str(text).lower().split()[:max_length]
    ids = [vocab.get(token, unk_id) for token in tokens]
    ids.extend([pad_id] * (max_length - len(ids)))
    return ids

# Convert every split to a 2-D array of token ids (one padded row per text)
def _encode_texts(texts, desc):
    """Run text_to_sequence over `texts` behind a progress bar labelled `desc`."""
    return np.array([text_to_sequence(text) for text in tqdm(texts, desc=desc)])

print("\n2. Converting texts to sequences...")
X_train_seq = _encode_texts(X_train, "Train")
X_val_seq = _encode_texts(X_val, "Val")
X_test_seq = _encode_texts(X_test, "Test")

print(f"   ‚úÖ Sequence shape: {X_train_seq.shape}")

In [None]:
# Create PyTorch datasets
class TextDataset(Dataset):
    """Pairs token-id sequences with integer labels as LongTensors."""

    def __init__(self, sequences, labels):
        # Both inputs are converted to int64 tensors once, up front.
        self.sequences, self.labels = torch.LongTensor(sequences), torch.LongTensor(labels)

    def __len__(self):
        return self.sequences.size(0)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return sequence, label

# Wrap each encoded split in a TextDataset
train_dataset, val_dataset, test_dataset = (
    TextDataset(sequences, targets)
    for sequences, targets in (
        (X_train_seq, y_train),
        (X_val_seq, y_val),
        (X_test_seq, y_test),
    )
)

# Create dataloaders: only the training loader shuffles, so validation and
# test batches are always served in the same order.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("‚úÖ Dataloaders created!")

## Step 5: Define Deep Learning Models

In [None]:
# LSTM Model
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier over batches of token-id sequences.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the LSTM, per direction.
        num_classes: number of output logits.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied before the output layer and,
            when num_layers > 1, between the stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=2, dropout=0.3):
        super().__init__()
        # Index 0 is declared as the padding id (padding_idx=0).
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is passed with a single layer, so disable it there.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=lstm_dropout, bidirectional=True)
        # Forward and backward final states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for `x`, a batch of token-id sequences (batch first)."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-2] / hidden[-1] are the last layer's forward / backward final states.
        # NOTE(review): sequences are not packed, so for right-padded input the
        # forward state is taken after any trailing padding steps. Switching to
        # pack_padded_sequence would change the trained function and must be
        # mirrored wherever the saved weights are loaded -- confirm before changing.
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        hidden = self.dropout(hidden)
        return self.fc(hidden)

# CNN Model
class CNNClassifier(nn.Module):
    """Text CNN with parallel 1-D convolutions and max-over-time pooling.

    One Conv1d branch is built per entry of `filter_sizes`; each branch is
    reduced to `num_filters` values by pooling over its full output length,
    and the concatenated branch features feed a linear output layer.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, num_classes, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        branches = []
        for width in filter_sizes:
            branches.append(nn.Conv1d(embedding_dim, num_filters, kernel_size=width))
        self.convs = nn.ModuleList(branches)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for `x`, a batch of token-id sequences."""
        # Conv1d expects channels before length, so swap the last two axes.
        channels_first = self.embedding(x).permute(0, 2, 1)
        features = []
        for conv in self.convs:
            activated = torch.relu(conv(channels_first))
            # A pooling window as long as the feature map keeps one max per filter.
            features.append(torch.max_pool1d(activated, activated.shape[2]).squeeze(2))
        merged = self.dropout(torch.cat(features, dim=1))
        return self.fc(merged)

print("‚úÖ Model architectures defined!")

## Step 6: Train LSTM (DL Model #1)

In [None]:
print("\n" + "="*60)
print("üîÑ TRAINING LSTM")
print("="*60)

# Seed torch so weight initialisation and DataLoader shuffling start from a
# fixed state on every run (same seed value as the scikit-learn models).
torch.manual_seed(42)

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Hyperparameters are declared once and used both to build the model and to
# write lstm_config.pkl, so the saved config cannot drift from the saved weights.
num_classes = len(label_encoder.classes_)
lstm_config = {
    'vocab_size': vocab_size,
    'embedding_dim': 300,
    'hidden_dim': 128,
    'num_classes': num_classes,
    'num_layers': 2,
    'dropout': 0.3
}

# Initialize model
lstm_model = LSTMClassifier(**lstm_config).to(device)

print(f"\nModel parameters: {sum(p.numel() for p in lstm_model.parameters()):,}")

# Training setup (criterion and num_epochs are reused by the CNN cell)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
num_epochs = 10

# Training loop: one pass over train_loader per epoch, reporting the mean
# batch loss and the running training accuracy.
print(f"\nTraining for {num_epochs} epochs...")
for epoch in range(num_epochs):
    lstm_model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for sequences, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        sequences, labels = sequences.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = lstm_model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # argmax over the class axis; no need for the legacy `.data` attribute
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_acc = correct / total
    print(f"  Loss: {total_loss/len(train_loader):.4f}, Accuracy: {train_acc:.4f}")

# Evaluate on test set (eval mode disables dropout; no_grad skips autograd)
print("\nEvaluating on test set...")
lstm_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for sequences, labels in test_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        predicted = lstm_model(sequences).argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_lstm = correct / total
print(f"‚úÖ Test Accuracy: {accuracy_lstm:.4f} ({accuracy_lstm*100:.2f}%)")

# Save weights plus the constructor arguments needed to rebuild the model
torch.save(lstm_model.state_dict(), f'{SAVE_DIR}/lstm_model.pt')
with open(f'{SAVE_DIR}/lstm_config.pkl', 'wb') as f:
    pickle.dump(lstm_config, f)
print("‚úÖ LSTM model saved!")

## Step 7: Train CNN (DL Model #2)

In [None]:
print("\n" + "="*60)
print("üß† TRAINING CNN")
print("="*60)

# NOTE: this cell reuses device, num_classes, criterion and num_epochs from the
# LSTM training cell, so that cell must have been run first.

# Re-seed so the CNN's initial weights and batch order do not depend on how
# much randomness the LSTM cell consumed.
torch.manual_seed(42)

# Hyperparameters are declared once and used both to build the model and to
# write cnn_config.pkl, so the saved config cannot drift from the saved weights.
cnn_config = {
    'vocab_size': vocab_size,
    'embedding_dim': 300,
    'num_filters': 100,
    'filter_sizes': [3, 4, 5],
    'num_classes': num_classes,
    'dropout': 0.3
}

# Initialize model
cnn_model = CNNClassifier(**cnn_config).to(device)

print(f"\nModel parameters: {sum(p.numel() for p in cnn_model.parameters()):,}")

# Training setup
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001)

# Training loop: one pass over train_loader per epoch, reporting the mean
# batch loss and the running training accuracy.
print(f"\nTraining for {num_epochs} epochs...")
for epoch in range(num_epochs):
    cnn_model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for sequences, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        sequences, labels = sequences.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = cnn_model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # argmax over the class axis; no need for the legacy `.data` attribute
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_acc = correct / total
    print(f"  Loss: {total_loss/len(train_loader):.4f}, Accuracy: {train_acc:.4f}")

# Evaluate on test set (eval mode disables dropout; no_grad skips autograd)
print("\nEvaluating on test set...")
cnn_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for sequences, labels in test_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        predicted = cnn_model(sequences).argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_cnn = correct / total
print(f"‚úÖ Test Accuracy: {accuracy_cnn:.4f} ({accuracy_cnn*100:.2f}%)")

# Save weights plus the constructor arguments needed to rebuild the model
torch.save(cnn_model.state_dict(), f'{SAVE_DIR}/cnn_model.pt')
with open(f'{SAVE_DIR}/cnn_config.pkl', 'wb') as f:
    pickle.dump(cnn_config, f)
print("‚úÖ CNN model saved!")

## Step 8: Summary and Visualization

In [None]:
summary_rule = "=" * 80
print("\n" + summary_rule)
print("üéâ TRAINING COMPLETE!")
print(summary_rule)

# Test-set accuracy of each model, as percentages
print("\nüìä MODEL ACCURACIES:")
print(f"  üå≤ Random Forest:  {accuracy_rf*100:.2f}%")
print(f"  üéØ SVM:            {accuracy_svm*100:.2f}%")
print(f"  üîÑ LSTM:           {accuracy_lstm*100:.2f}%")
print(f"  üß† CNN:            {accuracy_cnn*100:.2f}%")

# List every artifact written to SAVE_DIR together with its size
print(f"\nüíæ All models saved to: {SAVE_DIR}")
print("\nüìÅ Saved files:")
bytes_per_mb = 1024 * 1024
for artifact in sorted(os.listdir(SAVE_DIR)):
    artifact_mb = os.path.getsize(os.path.join(SAVE_DIR, artifact)) / bytes_per_mb
    print(f"  ‚úÖ {artifact:30} ({artifact_mb:.2f} MB)")

print("\n" + summary_rule)

In [None]:
# Bar chart comparing the four test accuracies (in percent)
models = ['Random Forest', 'SVM', 'LSTM', 'CNN']
accuracies = [score * 100 for score in (accuracy_rf, accuracy_svm, accuracy_lstm, accuracy_cnn)]
colors = ['#FF6B6B', '#FF8E53', '#4ECDC4', '#44A08D']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(models, accuracies, color=colors, alpha=0.8, edgecolor='black', linewidth=2)
ax.set_ylabel('Accuracy (%)', fontsize=14, fontweight='bold')
ax.set_title(f'Model Comparison - {TARGET_COLUMN.upper()} Classification', fontsize=16, fontweight='bold')
# Start the axis just below the weakest model so the differences stay visible.
ax.set_ylim(min(accuracies) - 5, 100)
ax.grid(axis='y', alpha=0.3, linestyle='--')

# Print each bar's value just above it
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2.
    label_y = bar.get_height() + 0.5
    ax.text(label_x, label_y, f'{acc:.2f}%',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

fig.tight_layout()
fig.savefig(f'{SAVE_DIR}/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úÖ Comparison chart saved!")

## Step 9: Download Models (For Colab)

In [None]:
if IN_COLAB:
    print("üì¶ Creating ZIP file for download...")
    # Build the archive in pure Python instead of `!zip -r`. `zip` updates an
    # existing archive in place, so files from earlier runs would linger in the
    # ZIP; make_archive rewrites saved_models.zip from the current contents of
    # ./saved_models and does not need the `zip` binary.
    import shutil
    shutil.make_archive('saved_models', 'zip', root_dir='.', base_dir='saved_models')

    print("\n‚úÖ ZIP file created!")
    print("üì• Click below to download:")

    # Colab-only module, so it is imported here rather than in the shared import cell.
    from google.colab import files
    files.download('saved_models.zip')

    print("\n" + "="*80)
    print("üìù NEXT STEPS:")
    print("="*80)
    print("1. Extract saved_models.zip in your project folder")
    print("2. Run: streamlit run prediction_app.py")
    print("3. Start making predictions!")
    print("="*80)
else:
    # Running locally: the artifacts are already on disk under ./saved_models
    print("\n‚úÖ Models saved locally!")
    print("\nYou can now run: streamlit run prediction_app.py")