In [None]:
from preprocessing import X_train, y_train, X_test, y_test

Convert to PyTorch Tensors

In [None]:
# No earlier cell imports torch, so bring it into scope here; without this the
# cell fails with a NameError on a fresh kernel.
import torch

# Features are converted to float32 and labels to int64 (torch.long); the
# labels are later passed as targets to nn.CrossEntropyLoss.
# NOTE(review): assumes X_*/y_* from `preprocessing` are array-likes that
# torch.tensor accepts directly (e.g. NumPy arrays) -- confirm.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np
import time

# Baseline random forest with hand-picked hyperparameters. Its test accuracy
# and fit time are kept for comparison against the tuned model.
rf_original = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    min_samples_split=5,
    n_jobs=-1,
    random_state=42
)

# Time the fit only: the timer is stopped before scoring so that the reported
# "Training time" does not include test-set evaluation. perf_counter() is a
# monotonic clock, unlike time.time().
start_time = time.perf_counter()
rf_original.fit(X_train, y_train)
original_train_time = time.perf_counter() - start_time

original_accuracy = rf_original.score(X_test, y_test)
print(f"Original Random Forest Accuracy: {original_accuracy:.4f}")
print(f"Training time: {original_train_time:.2f} seconds")

Finding best hyperparameters


In [None]:
# Search space for the random forest: 3 * 4 * 3 * 3 * 3 = 324 combinations,
# each evaluated with 3-fold cross-validation (972 fits, plus the final refit).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

print("\nPerforming hyperparameter tuning with GridSearchCV...")
grid_search = GridSearchCV(
    # Parallelism is applied at the search level only. Leaving the forest's own
    # n_jobs at its default avoids nesting a second all-cores pool inside every
    # search worker; with random_state fixed this does not change the fitted
    # models or their scores.
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    n_jobs=-1,  # Use all available cores
    scoring='accuracy',
    verbose=1
)

# Wall-clock time of the whole search (all CV fits and the refit on X_train).
start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
tuning_time = time.perf_counter() - start_time

# Best parameters and results
print(f"\nBest parameters found: {grid_search.best_params_}")
print(f"GridSearchCV took {tuning_time:.2f} seconds to complete")

# Rebuild a forest from the best parameters found by the grid search and fit it
# again on the training set, so that its fit time can be measured in isolation.
rf_optimized = RandomForestClassifier(
    **grid_search.best_params_,
    random_state=42,
    n_jobs=-1
)

# Time the fit only; scoring happens after the timer is stopped so the
# reported "Training time" is comparable and not inflated by evaluation.
start_time = time.perf_counter()
rf_optimized.fit(X_train, y_train)
optimized_train_time = time.perf_counter() - start_time

optimized_accuracy = rf_optimized.score(X_test, y_test)

print(f"\nOptimized Random Forest Accuracy: {optimized_accuracy:.4f}")
print(f"Training time: {optimized_train_time:.2f} seconds")
# The printed value is the difference of the two accuracies times 100,
# i.e. a difference in percentage points.
print(f"Accuracy improvement: {(optimized_accuracy - original_accuracy) * 100:.2f}%")

# Bar chart comparing test accuracy of the baseline and the tuned forest.
models = ['Original RF', 'Optimized RF']
accuracies = [original_accuracy, optimized_accuracy]

plt.figure(figsize=(10, 6))
bars = plt.bar(models, accuracies, color=['#3498db', '#2ecc71'])

# Zoom the y-axis onto the range the bars occupy. The lower limit is clamped at
# 0.0 (not 0.9): with a 0.9 floor, any accuracy below ~0.9 would push the lower
# limit above the upper one and hide or invert the bars.
y_lower = max(0.0, min(accuracies) - 0.05)
y_upper = min(1.0, max(accuracies) + 0.02)
plt.ylim(y_lower, y_upper)
plt.xlabel('Random Forest Models')
plt.ylabel('Accuracy')
plt.title('Random Forest Accuracy Comparison')

# Add accuracy values on top of bars, centred horizontally on each bar via
# text alignment rather than a hand-tuned x offset.
for bar, accuracy in zip(bars, accuracies):
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.005,
        f"{accuracy:.4f}",
        ha='center',
        va='bottom',
        fontweight='bold'
    )

plt.tight_layout()
plt.show()

# Feature importances of the optimized model.
feature_importances = rf_optimized.feature_importances_

# No feature names are available in this notebook, so features are labelled by
# column index. The count is taken from the importances themselves; the
# previous `X.shape[1]` referenced a name that is never defined.
feature_names = [f"Feature {i}" for i in range(len(feature_importances))]

# Sort features by importance (descending) and keep at most the top 10, so the
# plot also works when the data has fewer than 10 features.
indices = np.argsort(feature_importances)[::-1]
top_n = min(10, len(feature_importances))
top_10_indices = indices[:top_n]

plt.figure(figsize=(10, 6))
plt.bar(range(top_n), feature_importances[top_10_indices])
plt.xticks(range(top_n), [feature_names[i] for i in top_10_indices])
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title(f'Top {top_n} Feature Importances (Optimized Random Forest)')
plt.tight_layout()
plt.show()

KNN: K-Nearest Neighbors

In [None]:
# Neither name is imported by any earlier cell (accuracy_score is only imported
# further down, in the ANN cell), so import both here to survive
# Restart & Run All.
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter tuning for KNN
print("\nPerforming hyperparameter tuning for KNN...")
# NOTE(review): 'minkowski' with KNeighborsClassifier's default p=2 is the same
# distance as 'euclidean', so that part of the grid looks redundant -- confirm
# before trimming it.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

start_time = time.time()
grid_search.fit(X_train, y_train)
tuning_time = time.time() - start_time

print(f"Best parameters: {grid_search.best_params_}")
print(f"GridSearchCV took {tuning_time:.2f} seconds")


# Visualize accuracy vs k value: every k in 1..20 is refit with the other
# hyperparameters pinned to the grid-search winners and scored on the test set.
k_values = list(range(1, 21))
# Loop-invariant: the best parameters minus n_neighbors, built once.
fixed_params = {
    key: value
    for key, value in grid_search.best_params_.items()
    if key != 'n_neighbors'
}
accuracies = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, **fixed_params)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

plt.figure(figsize=(10, 6))
plt.plot(k_values, accuracies, marker='o', linestyle='-', color='#3498db')
plt.title('KNN Accuracy vs k Value')
plt.xlabel('k (Number of Neighbors)')
plt.ylabel('Accuracy')
plt.xticks(k_values)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# best_score_ is the mean cross-validated accuracy of the winning combination,
# not a test-set figure.
print(f"Best KNN accuracy: {grid_search.best_score_:.4f}")
# Also report held-out accuracy of the refit best estimator so KNN can be
# compared with the other models, which all report test accuracy.
print(f"KNN test accuracy (best estimator): {grid_search.score(X_test, y_test):.4f}")

Support Vector Machine (SVM)

In [None]:
# accuracy_score is first imported in a later cell, so import it here to keep
# this cell runnable top-to-bottom on a fresh kernel.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# RBF-kernel SVM with C=1.0 and gamma='scale'.
svm = SVC(kernel='rbf', C=1.0, gamma='scale')  # RBF kernel is often best for faces
svm.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# accuracy_score is first imported in a later cell, so import it here to keep
# this cell runnable top-to-bottom on a fresh kernel.
from sklearn.metrics import accuracy_score

# max_iter is raised to 1000 to give the solver more iterations to converge.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

ANN: Artificial Neural Network

In [None]:
import torch.optim as optim
from sklearn.metrics import accuracy_score

# `nn` is not imported by any other cell, so the class definition would fail
# with a NameError on a fresh kernel without this import.
import torch.nn as nn


class ANNModel(nn.Module):
    """Feed-forward classifier with two equally sized ReLU hidden layers.

    Layout: Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, hidden_size) -> ReLU -> Linear(hidden_size, num_classes).

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of both hidden layers.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names are kept as-is so existing state_dict keys
        # (fc1.*, fc2.*, fc3.*) remain loadable.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw output of the last linear layer for a batch `x`.

        No softmax is applied here; the result has `num_classes` values along
        the last dimension.
        """
        hidden = self.relu(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

import torch

# Model parameters
input_size = X_train.shape[1]  # Number of features
hidden_size = 64  # Number of neurons in hidden layers
# `label_encoder` is never defined or imported in this notebook, so the class
# count is derived from the label tensors instead: the output layer must cover
# the largest class index seen in either split.
# NOTE(review): assumes labels are integer class indices starting at 0 (they
# are cast to torch.long and fed to CrossEntropyLoss) -- confirm against
# `preprocessing`.
num_classes = int(max(y_train_tensor.max().item(), y_test_tensor.max().item())) + 1

# Seed torch's RNG so weight initialisation is repeatable across runs, in line
# with the random_state=42 used for the other models in this notebook.
torch.manual_seed(42)

# Initialize the model
model = ANNModel(input_size, hidden_size, num_classes)

# ============================
# 3. Train the Model
# ============================
# torch / nn are not imported by any other cell; import them here so the
# training loop runs on a fresh kernel.
import torch
import torch.nn as nn

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
batch_size = 32

# Per-epoch accuracy history, used for plotting after training.
train_accuracies = []
val_accuracies = []

num_train = len(X_train_tensor)

for epoch in range(num_epochs):
    model.train()

    # Draw a fresh sample order every epoch so mini-batches are not always the
    # same contiguous slices of the training set.
    permutation = torch.randperm(num_train)
    running_loss = 0.0

    for i in range(0, num_train, batch_size):
        # Get mini-batch
        batch_idx = permutation[i:i + batch_size]
        X_batch = X_train_tensor[batch_idx]
        y_batch = y_train_tensor[batch_idx]

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion averages over the batch, so weight by batch size to get a
        # correct per-sample mean over the (possibly shorter) last batch.
        running_loss += loss.item() * len(batch_idx)

    # Mean training loss over the epoch (instead of just the last batch's loss).
    epoch_loss = running_loss / max(num_train, 1)

    # Evaluate with gradients disabled and the model in eval mode.
    model.eval()
    with torch.no_grad():
        # Training accuracy
        train_outputs = model(X_train_tensor)
        train_pred_classes = torch.argmax(train_outputs, dim=1)
        train_accuracy = accuracy_score(y_train_tensor.numpy(), train_pred_classes.numpy())
        train_accuracies.append(train_accuracy)

        # "Validation" accuracy is computed on the test tensors; this notebook
        # has no separate validation split.
        val_outputs = model(X_test_tensor)
        val_pred_classes = torch.argmax(val_outputs, dim=1)
        val_accuracy = accuracy_score(y_test_tensor.numpy(), val_pred_classes.numpy())
        val_accuracies.append(val_accuracy)

    # Print loss and accuracies for every epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, "
          f"Train Accuracy: {train_accuracy * 100:.2f}%, "
          f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# ============================
# 4. Evaluate the Model
# ============================
# Final forward pass over the test tensors in eval mode with gradients off.
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)

# Predicted class = index of the largest output score per sample.
y_pred_classes = y_pred.argmax(dim=1).numpy()
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Final Test Accuracy: {accuracy * 100:.2f}%")

# ============================
# 5. Save the Model
# ============================
# Only the parameters (state_dict) are written, not the module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, "ann_model.pth")
print("Model saved as ann_model.pth")

# ============================
# 6. Plot Training and Validation Accuracy
# ============================
import matplotlib.pyplot as plt

# One point per epoch, numbered from 1.
epoch_axis = range(1, num_epochs + 1)

fig, ax = plt.subplots(figsize=(10, 6))
for history, label in (
    (train_accuracies, 'Train Accuracy'),
    (val_accuracies, 'Validation Accuracy'),
):
    ax.plot(epoch_axis, history, label=label)

ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Training and Validation Accuracy')
ax.legend()
ax.grid()
fig.tight_layout()
plt.show()

XGBoost

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import numpy as np


# ===================================
# 2. Basic XGBoost Model
# ===================================
print("=== XGBoost Models ===")

# Untuned reference model: 100 trees, learning rate 0.1, fixed seed.
xgb_basic = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_basic.fit(X_train, y_train)

# Test-set accuracy of the reference model.
y_pred = xgb_basic.predict(X_test)
xgb_basic_acc = accuracy_score(y_test, y_pred)

# Accuracy per XGBoost variant, keyed by display name; plotted further down.
xgb_results = {'XGBoost (basic)': xgb_basic_acc}
print(f"XGBoost (basic) Accuracy: {xgb_basic_acc:.4f}")


Hyperparameter Tuning

# Search space: 2 * 3 * 3 * 2 * 2 = 72 combinations.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[4, 6, 8],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive 5-fold cross-validated search, scored by accuracy, on all cores.
xgb_improved = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
xgb_improved.fit(X_train, y_train)

# Predictions come from the search object, i.e. from its refit best estimator.
y_pred = xgb_improved.predict(X_test)
xgb_improved_acc = accuracy_score(y_test, y_pred)
xgb_results['XGBoost (improved)'] = xgb_improved_acc

print(f"XGBoost (improved) Accuracy: {xgb_improved_acc:.4f}")
print(f"Best XGBoost Parameters: {xgb_improved.best_params_}")

# ================================
# 4. Plot XGBoost Results
# ================================
# Bar per XGBoost variant, in the order the results were recorded.
models = list(xgb_results)
accuracies = [xgb_results[model_name] for model_name in models]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, accuracies, color=['#66b3ff', '#ff9999'])
ax.set_xlabel('XGBoost Models')
ax.set_ylabel('Accuracy')
ax.set_title('XGBoost Model Accuracy Comparison')
# Fixed y-range of 0.5 to 1.0.
ax.set_ylim(0.5, 1.0)
fig.tight_layout()
plt.show()