# In [None]:
# Mental Health Prediction Project

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Read the raw CSV into a DataFrame; every later step works on `data`.
# NOTE(review): the path is absolute and machine-specific.
dataset_path = "C:/Users/adart/Downloads/archive (9)/Mental Health Dataset2.csv"
data = pd.read_csv(dataset_path)


# Basic Data Exploration
# Prints the frame's dimensions, its first and last rows, and the
# column/dtype summary so the raw data can be inspected before preprocessing.

print("\n--- DATA SHAPE ---")
print(data.shape)

print("\n--- FIRST 5 ROWS ---")
print(data.head())

print("\n--- LAST 5 ROWS ---")
print(data.tail())

print("\n--- DATA INFO ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the report.
data.info()


# Handle missing values
# Forward-fill copies the previous row's value into each gap; the trailing
# back-fill covers gaps in the first rows, which have no predecessor and would
# otherwise stay NaN.
data = data.ffill().bfill()

# Encode categorical data
# Every non-numeric column is replaced by integer codes. The test is
# "not numeric" rather than `dtype == "object"` because text columns are not
# always stored with object dtype (e.g. pandas' dedicated string dtype).
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# integer codes of any column can be mapped back to the original labels.
label_encoders = {}
le = LabelEncoder()  # stays defined even if no column needs encoding
for col in data.columns:
    if not pd.api.types.is_numeric_dtype(data[col]):
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

# Define the target column. Everything below refers to it through
# `target_col`, so changing the target only requires editing this one line.
target_col = "treatment"
print("\nTarget column chosen:", target_col)

# Value counts of target: how many rows fall into each class.

print("\n--- TARGET VALUE COUNTS ---")
print(data[target_col].value_counts())

# Correlation with target: take the target's column of the correlation
# matrix and list the columns from most positive to most negative.
# The lookup uses `target_col` (not a repeated literal) so it cannot drift
# out of sync with the definition above.

print("\n--- CORRELATION WITH TARGET ---")
corr_target = data.corr()[target_col].sort_values(ascending=False)
print(corr_target)

# Separate the label column from the predictor columns.
y = data[target_col]
X = data.drop(columns=target_col)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardise the features. The scaler is fitted on the training rows only
# and then applied unchanged to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Training models
# Each classifier is fitted on the training split and scored on the test
# split. The per-model names (`knn_pred`, `knn_acc`, ...) are kept because the
# reporting and plotting code further down reads them.

# Seed for the estimators that use randomness internally (decision tree,
# random forest); without it their metrics change from run to run even
# though the train/test split itself is seeded.
MODEL_SEED = 42


def _fit_and_score(model):
    """Fit *model* on the training split and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        A tuple ``(pred, acc, prec, recall, f1)``: the test-set predictions
        followed by accuracy and the weighted-average precision, recall and
        F1 score.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Shared keyword arguments of the three class-averaged metrics.
    weighted = {"average": "weighted", "zero_division": 0}
    return (
        pred,
        accuracy_score(y_test, pred),
        precision_score(y_test, pred, **weighted),
        recall_score(y_test, pred, **weighted),
        f1_score(y_test, pred, **weighted),
    )


# ---- KNN ----
knn = KNeighborsClassifier()
knn_pred, knn_acc, knn_prec, knn_recall, knn_f1 = _fit_and_score(knn)

# ---- Naive Bayes ----
nb = GaussianNB()
nb_pred, nb_acc, nb_prec, nb_recall, nb_f1 = _fit_and_score(nb)

# ---- Decision Tree ----
dt = DecisionTreeClassifier(random_state=MODEL_SEED)
dt_pred, dt_acc, dt_prec, dt_recall, dt_f1 = _fit_and_score(dt)

# ---- Support Vector Machine ----
svm = SVC()
svm_pred, svm_acc, svm_prec, svm_recall, svm_f1 = _fit_and_score(svm)

# ---- Random Forest ----
rf = RandomForestClassifier(random_state=MODEL_SEED)
rf_pred, rf_acc, rf_prec, rf_recall, rf_f1 = _fit_and_score(rf)

# Console report: one section per metric, with the five models listed in the
# same fixed order inside every section.
row_labels = ("KNN:", "Naive Bayes:", "Decision Tree:", "SVM:", "Random Forest:")
report_sections = (
    (
        "\n===== ACCURACY (All Models) =====",
        (knn_acc, nb_acc, dt_acc, svm_acc, rf_acc),
    ),
    (
        "\n===== PRECISION (All Models) =====",
        (knn_prec, nb_prec, dt_prec, svm_prec, rf_prec),
    ),
    (
        "\n===== RECALL (All Models) =====",
        (knn_recall, nb_recall, dt_recall, svm_recall, rf_recall),
    ),
    (
        "\n===== F1 SCORE (All Models) =====",
        (knn_f1, nb_f1, dt_f1, svm_f1, rf_f1),
    ),
)
for section_heading, section_scores in report_sections:
    print(section_heading)
    for row_label, score in zip(row_labels, section_scores):
        print(row_label, score)

# Visualizations

# Bar chart of how many rows fall into each target class.
target_ax = plt.figure().gca()
sns.countplot(x=y, ax=target_ax)
plt.title("Treatment Distribution")
plt.show()


# Heatmap of the pairwise correlations between all columns of `data`.
corr_matrix = data.corr()
heatmap_ax = plt.figure().gca()
sns.heatmap(corr_matrix, cmap="coolwarm", ax=heatmap_ax)
plt.title("Correlation Heatmap")
plt.show()

# Bar charts comparing the five models, one figure per metric
# (accuracy, precision, recall). Bars appear in the order of `model_ticks`.
model_ticks = ["KNN", "NB", "DT", "SVM", "RF"]
bar_charts = [
    ("Accuracy Comparison", [knn_acc, nb_acc, dt_acc, svm_acc, rf_acc]),
    ("Precision Comparison", [knn_prec, nb_prec, dt_prec, svm_prec, rf_prec]),
    (
        "Recall Comparison",
        [knn_recall, nb_recall, dt_recall, svm_recall, rf_recall],
    ),
]
for chart_title, bar_heights in bar_charts:
    plt.figure()
    plt.bar(model_ticks, bar_heights)
    plt.title(chart_title)
    plt.show()

# Confusion Matrix (Best Model by Accuracy)
# Candidates are listed in tie-breaking order: max() returns the first entry
# holding the highest accuracy, so on a tie the earlier model wins.
candidates = [
    ("KNN", knn_acc, knn_pred),
    ("Naive Bayes", nb_acc, nb_pred),
    ("Decision Tree", dt_acc, dt_pred),
    ("SVM", svm_acc, svm_pred),
    ("Random Forest", rf_acc, rf_pred),
]
best_name, best_acc, best_pred = max(candidates, key=lambda entry: entry[1])

# Heatmap of the winner's confusion matrix on the test split, with the raw
# integer counts written into the cells.
best_matrix = confusion_matrix(y_test, best_pred)
plt.figure()
sns.heatmap(best_matrix, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - " + best_name)
plt.show()





