In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os

os.makedirs('../visuals', exist_ok=True)

# Load Data
df = pd.read_csv('../data/processed_data.csv')

# Define Target (Categorical)
target = 'health_risk_level_encoded'
# Ensure target exists
if target not in df.columns:
    raise ValueError("Target column not found!")

X = df.drop(columns=[target, 'health_risk_level', 'Health_Risk_Score'], errors='ignore')
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")

Train Shape: (171, 55), Test Shape: (43, 55)


In [2]:
results_list = []

def evaluate_model(name, model):
    start_time = time.time()
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        elapsed = time.time() - start_time
        
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        
        # CONFUSION MATRIX PLOT 
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'{name}\nConfusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        
        safe_name = name.replace(" ", "_").replace("(","").replace(")","").replace("=","_")
        plt.savefig(f'../visuals/cm_{safe_name}.png')
        plt.close() # Keep notebook clean
        
        return {
            'Algorithm': name,
            'Accuracy': round(acc, 4),
            'Precision': round(prec, 4),
            'Recall': round(rec, 4),
            'F1 Score': round(f1, 4),
            'Runtime': round(elapsed, 4)
        }
    except Exception as e:
        print(f"Failed for {name}: {e}")
        return None

In [3]:
# Logistic regression
results_lr = []

model = LogisticRegression(max_iter=1000)
res = evaluate_model('Logistic Regression', model)
if res: results_lr.append(res)

df_lr = pd.DataFrame(results_lr)
display(df_lr)
df_lr.to_csv('../visuals/table_LogisticRegression_classification.csv', index=False)
results_list.extend(results_lr)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Runtime
0,Logistic Regression,0.9767,0.9778,0.9767,0.9747,0.0121


In [4]:
# Support Vector Machine (SVM)
results_svm = []
model_linear = SVC(kernel='linear', C=1.0)
res1 = evaluate_model('SVM (Linear)', model_linear)
if res1: results_svm.append(res1)

model_rbf = SVC(kernel='rbf', C=1.0)
res2 = evaluate_model('SVM (RBF)', model_rbf)
if res2: results_svm.append(res2)

df_svm = pd.DataFrame(results_svm)
display(df_svm)
df_svm.to_csv('../visuals/table_SVM_classification.csv', index=False)
results_list.extend(results_svm)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Runtime
0,SVM (Linear),1.0,1.0,1.0,1.0,0.0051
1,SVM (RBF),0.9302,0.866,0.9302,0.8968,0.0036


In [5]:
# Decision Tree
results_dt = []
model = DecisionTreeClassifier(max_depth=5, criterion='gini')
res = evaluate_model('Decision Tree (Depth=5)', model)
if res: results_dt.append(res)

df_dt = pd.DataFrame(results_dt)
display(df_dt)
df_dt.to_csv('../visuals/table_DecisionTree_classification.csv', index=False)
results_list.extend(results_dt)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Runtime
0,Decision Tree (Depth=5),1.0,1.0,1.0,1.0,0.0045


In [6]:
# Random Forest Classifier
results_rf = []
estimators = [50, 100]
for n in estimators:
    model = RandomForestClassifier(n_estimators=n, random_state=42)
    res = evaluate_model(f'Random Forest (n={n})', model)
    if res: results_rf.append(res)

df_rf = pd.DataFrame(results_rf)
display(df_rf)
df_rf.to_csv('../visuals/table_RandomForest_classification.csv', index=False)
results_list.extend(results_rf)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Runtime
0,Random Forest (n=50),0.9535,0.9575,0.9535,0.9429,0.0639
1,Random Forest (n=100),0.9767,0.9778,0.9767,0.9747,0.0841


In [7]:
# KNN
results_knn = []
neighbors = [3, 5, 7]
for k in neighbors:
    model = KNeighborsClassifier(n_neighbors=k)
    res = evaluate_model(f'KNN (k={k})', model)
    if res: results_knn.append(res)

df_knn = pd.DataFrame(results_knn)
display(df_knn)
df_knn.to_csv('../visuals/table_KNN_classification.csv', index=False)
results_list.extend(results_knn)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Runtime
0,KNN (k=3),0.8372,0.7988,0.8372,0.8171,0.2017
1,KNN (k=5),0.8605,0.8023,0.8605,0.8295,0.0086
2,KNN (k=7),0.7907,0.7368,0.7907,0.7618,0.009


In [8]:
# Overall Comparison
final_df = pd.DataFrame(results_list)
final_df = final_df.sort_values(by='F1 Score', ascending=False)
display(final_df)
final_df.to_csv('../visuals/classification_comparison.csv', index=False)

best_row = final_df.iloc[0]
print(f"\nBest Classifier: {best_row['Algorithm']}")

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Runtime
1,SVM (Linear),1.0,1.0,1.0,1.0,0.0051
3,Decision Tree (Depth=5),1.0,1.0,1.0,1.0,0.0045
0,Logistic Regression,0.9767,0.9778,0.9767,0.9747,0.0121
5,Random Forest (n=100),0.9767,0.9778,0.9767,0.9747,0.0841
4,Random Forest (n=50),0.9535,0.9575,0.9535,0.9429,0.0639
2,SVM (RBF),0.9302,0.866,0.9302,0.8968,0.0036
7,KNN (k=5),0.8605,0.8023,0.8605,0.8295,0.0086
6,KNN (k=3),0.8372,0.7988,0.8372,0.8171,0.2017
8,KNN (k=7),0.7907,0.7368,0.7907,0.7618,0.009



Best Classifier: SVM (Linear)
