# Mental Health Prediction

### Dataset

In [11]:
import torch
import numpy as np
from torch.utils.data import Dataset


class MentalDataset(Dataset):
    """In-memory dataset pairing a feature matrix with its targets.

    Both inputs are copied into float32 tensors when the dataset is built.
    The targets are reshaped into a single column, so indexing with an
    integer yields a ``(features, target)`` pair whose target has one
    element.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.float32)

        self.X = features
        # Column layout: one target value per row.
        self.y = targets.view(-1, 1)

    def __len__(self) -> int:
        """Return the number of rows in the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx: int):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

### Model implementation

In [12]:
import torch
import torch.nn as nn


class LinearBaseline(nn.Module):
    """
    Linear reference model (logistic regression).

    A single affine layer with one output unit, followed by a sigmoid.
    """

    def __init__(self, input_dim: int):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Map ``x`` through the affine layer and squash it with a sigmoid."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

class MentalHealthModelNN(nn.Module):
    """
    Small feed-forward network for binary classification (depression yes/no).

    Two hidden layers (128 then 64 units), each followed by ReLU and
    dropout with p=0.3, then a single sigmoid unit.
    Output: a probability between 0 and 1.
    """

    def __init__(self, input_dim: int):
        super().__init__()
        layers = [
            # hidden block 1
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            # hidden block 2
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            # output head: one unit squashed into a probability
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the layer stack and return its output."""
        probabilities = self.net(x)
        return probabilities


### Trainer Loop

In [13]:
from dataclasses import dataclass
from typing import Dict, List
from src.dataset import MentalDataset

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.utils.data import DataLoader, RandomSampler
import torch.optim as optim


# 1. Chargement & prétraitement des données

def load_and_preprocess(path: str):
    """
    Load the training CSV, clean and encode it, and split it 80/20.

    Pipeline: drop the ``id`` and ``Name`` columns, fill missing numeric
    values with the column median and missing categorical values with
    "Unknown", one-hot encode the categorical columns, shuffle the rows
    with a fixed NumPy seed (42), then standardise both splits using the
    statistics of the training split.

    Returns the tuple
    ``(X_train, y_train, X_val, y_val, mean, std, feature_cols)`` where
    ``mean`` / ``std`` are the per-feature training statistics (``std``
    already includes a 1e-8 offset) and ``feature_cols`` is the column
    order after one-hot encoding.
    """
    target_col = "Depression"

    numeric_features = [
        "Age",
        "Academic Pressure",
        "Work Pressure",
        "CGPA",
        "Study Satisfaction",
        "Job Satisfaction",
        "Work/Study Hours",
        "Financial Stress",
    ]
    categorical_features = [
        "Gender",
        "City",
        "Working Professional or Student",
        "Profession",
        "Sleep Duration",
        "Dietary Habits",
        "Degree",
        "Have you ever had suicidal thoughts ?",
        "Family History of Mental Illness",
    ]

    # Read the file and discard the identifier columns right away.
    frame = pd.read_csv(path).drop(columns=["id", "Name"])

    # Target vector and feature frame.
    y = frame[target_col].astype("float32").values
    X = frame.drop(columns=[target_col])

    # Missing values: median for numeric columns, "Unknown" for categorical.
    medians = X[numeric_features].median()
    X[numeric_features] = X[numeric_features].fillna(medians)
    X[categorical_features] = X[categorical_features].fillna("Unknown")

    # One-hot encode; remember the resulting column order so that other
    # data can be aligned on it later.
    X = pd.get_dummies(X, columns=categorical_features)
    feature_cols = X.columns.tolist()
    X = X.astype("float32").values

    # Seeded shuffle, then an 80 / 20 cut.
    np.random.seed(42)
    order = np.random.permutation(len(X))
    cut = int(0.8 * len(X))
    train_idx, val_idx = order[:cut], order[cut:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # Standardise with training statistics only; the offset keeps the
    # division finite for constant columns.
    mean = X_train.mean(axis=0, keepdims=True)
    std = X_train.std(axis=0, keepdims=True) + 1e-8

    X_train = (X_train - mean) / std
    X_val = (X_val - mean) / std

    return X_train, y_train, X_val, y_val, mean, std, feature_cols

# 2. Trainer

@dataclass
class MentalHealthTrainer:
    """Train and evaluate a binary classifier that outputs probabilities.

    The model passed to ``train`` / ``eval`` must return values in [0, 1]
    with the same shape as the targets, because the loss is
    ``F.binary_cross_entropy`` applied directly to the model output.
    """

    batch_size: int
    n_epochs: int
    eval_samples: int  # upper bound on samples drawn for each train/val evaluation
    patience: int = 3  # epochs without a new best val accuracy before stopping

    def train(
        self,
        model: torch.nn.Module,
        train_set: MentalDataset,
        val_set: MentalDataset,
        optimizer: optim.Optimizer,
        device: torch.device,
        model_name: str = "model",
    ) -> float:
        """
        Train ONE model and return its best validation accuracy.

        After every epoch the model is evaluated on (a random subset of)
        the train and validation sets. Whenever the validation accuracy
        reaches a new best, the weights are written to
        ``best_model_weights_{model_name}.pt``. Training stops early once
        ``self.patience`` consecutive epochs bring no improvement.

        The model is left with its last-epoch weights, not the best ones;
        reload the saved file to get the best checkpoint.

        Returns 0.0 when ``n_epochs`` is 0 (nothing is trained or saved).
        """
        model.to(device)

        n_params = sum(p.numel() for p in model.parameters())
        print(f"\n=== Training {model_name} ({n_params:,} params) ===")

        train_loader = DataLoader(
            train_set,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=False,
            num_workers=0,
        )

        # Accuracy is never negative, so starting below zero guarantees the
        # first evaluated epoch is recorded as best and a checkpoint file
        # always exists after training, even if validation accuracy is 0.0.
        best_val = -1.0
        counter = 0

        for epoch in range(self.n_epochs):
            # ---- TRAIN ----
            model.train()
            train_loss = 0.0
            train_acc = 0.0
            n_train = 0

            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                optimizer.zero_grad()
                preds = model(X_batch)          # probabilities in [0, 1]
                loss = F.binary_cross_entropy(preds, y_batch)
                loss.backward()
                optimizer.step()

                bs = X_batch.size(0)
                with torch.no_grad():
                    predicted = (preds >= 0.5).float()
                    acc = (predicted == y_batch).float().mean().item()

                # Weight by batch size so a smaller final batch does not
                # skew the epoch averages.
                train_loss += loss.item() * bs
                train_acc += acc * bs
                n_train += bs

            train_loss /= n_train
            train_acc /= n_train

            # ---- EVAL ----
            metrics_train = self.eval(model, train_set, device)
            metrics_val = self.eval(model, val_set, device)
            val_acc = metrics_val["accuracy"]

            print(
                f"[{model_name}] Epoch {epoch+1:02d} | "
                f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
                f"Eval Train Acc: {metrics_train['accuracy']:.4f} | "
                f"Eval Val Acc: {metrics_val['accuracy']:.4f}"
            )

            # Early stopping + checkpoint of the best weights
            if val_acc > best_val:
                best_val = val_acc
                counter = 0
                torch.save(model.state_dict(), f"best_model_weights_{model_name}.pt")
            else:
                counter += 1
                if counter >= self.patience:
                    print(f"Early stopping for {model_name}.")
                    break

        # Hide the internal sentinel when no epoch was run.
        return max(best_val, 0.0)

    # Called with parentheses: the bare ``@torch.inference_mode`` form is
    # only accepted by recent PyTorch releases.
    @torch.inference_mode()
    def eval(
        self,
        model: torch.nn.Module,
        dataset: MentalDataset,
        device: torch.device,
    ) -> Dict[str, float]:
        """
        Compute mean accuracy and BCE loss on a random subset of ``dataset``.

        At most ``self.eval_samples`` samples are drawn without replacement,
        so when the dataset is larger than that the metrics vary from call
        to call. The model is switched to eval mode and left in that mode.

        Returns a dict with the keys ``"accuracy"`` and ``"loss"``.
        """
        model.eval()

        n = len(dataset)
        n_samples = min(self.eval_samples, n)

        sampler = RandomSampler(
            dataset,
            replacement=False,
            num_samples=n_samples,
        )

        loader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            sampler=sampler,
            num_workers=0,
        )

        all_acc: List[torch.Tensor] = []
        all_loss: List[torch.Tensor] = []

        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            preds = model(X_batch)
            # Per-element losses, so the final mean is over samples rather
            # than over batches of possibly different sizes.
            loss = F.binary_cross_entropy(preds, y_batch, reduction="none")

            predicted = (preds >= 0.5).float()
            acc = (predicted == y_batch).float()

            all_acc.append(acc)
            all_loss.append(loss)

        acc_tensor = torch.cat(all_acc).mean()
        loss_tensor = torch.cat(all_loss).mean()

        return {
            "accuracy": float(acc_tensor),
            "loss": float(loss_tensor),
        }

### Prédiction

In [14]:
import numpy as np
import pandas as pd


# mêmes listes que dans load_and_preprocess
# Numeric feature columns (missing values are filled with the median).
NUM_COLS = [
    "Age",
    "Academic Pressure",
    "Work Pressure",
    "CGPA",
    "Study Satisfaction",
    "Job Satisfaction",
    "Work/Study Hours",
    "Financial Stress",
]

# Categorical feature columns (missing values become "Unknown", then the
# columns are one-hot encoded).
CAT_COLS = [
    "Gender",
    "City",
    "Working Professional or Student",
    "Profession",
    "Sleep Duration",
    "Dietary Habits",
    "Degree",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
]

# Identifier columns that carry no predictive signal.
DROP_COLS = ["id", "Name"]


def preprocess_test(df_test: pd.DataFrame, feature_cols, mean, std) -> np.ndarray:
    """Apply the training-time preprocessing to ``df_test``.

    The frame is cleaned and one-hot encoded, its columns are aligned on
    ``feature_cols``, and the result is standardised with ``mean`` /
    ``std``. ``df_test`` itself is not modified.

    Args:
        df_test: Raw frame containing every column of ``NUM_COLS`` and
            ``CAT_COLS``. The ``DROP_COLS`` columns are removed when
            present and may be absent.
        feature_cols: Column order produced by the training-time one-hot
            encoding. Columns missing from the test frame are filled with
            0; columns not in this list (e.g. categories unseen during
            training) are discarded.
        mean: Per-feature mean used for standardisation.
        std: Per-feature standard deviation used for standardisation.

    Returns:
        Array with one row per input row and one column per entry of
        ``feature_cols``, free of NaN values.

    Note:
        Missing numeric values are imputed with the median of ``df_test``
        itself, not with the training medians.
    """
    # errors="ignore": inference frames do not always carry id / Name.
    df = df_test.drop(columns=DROP_COLS, errors="ignore")

    # Missing values
    df[NUM_COLS] = df[NUM_COLS].fillna(df[NUM_COLS].median())
    df[CAT_COLS] = df[CAT_COLS].fillna("Unknown")

    # One-hot encoding
    df = pd.get_dummies(df, columns=CAT_COLS)

    # Align the columns on the ones seen during training
    df = df.reindex(columns=feature_cols, fill_value=0.0)

    X_test = df.astype("float32").values

    # Standardise with the supplied statistics. The offset protects against
    # a zero std; if ``std`` already contains the same offset (as the value
    # returned by load_and_preprocess does) it is simply applied twice.
    X_test = (X_test - mean) / (std + 1e-8)

    # A numeric column that is entirely missing in df_test (typical for
    # single-row inference) has a NaN median and therefore survives the
    # imputation above. Map those entries to 0, i.e. to ``mean`` on the
    # standardised scale, so the model never receives NaN inputs.
    X_test[np.isnan(X_test)] = 0.0

    return X_test

### Evaluation et comparaison des modèles

In [15]:
from src.trainer import MentalHealthTrainer, load_and_preprocess
from src.dataset import MentalDataset
from src.model import LinearBaseline, MentalHealthModelNN

import torch
import torch.optim as optim


def main():
    """Train every candidate model, compare them and export the best one.

    Loads ``./data/train.csv``, trains the linear baseline and the neural
    network with the same trainer settings, prints a comparison table of
    their best validation accuracies, then bundles the winning weights
    with the preprocessing statistics into ``mental_health_model.pt``.
    """
    # 1) Load & preprocess the data, then wrap both splits as datasets
    data = load_and_preprocess("./data/train.csv")
    X_train, y_train, X_val, y_val, mean, std, feature_cols = data

    train_set = MentalDataset(X_train, y_train)
    val_set = MentalDataset(X_val, y_val)
    input_dim = X_train.shape[1]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trainer = MentalHealthTrainer(batch_size=64, n_epochs=25, eval_samples=10_000)

    # 2) Candidate models, all built before any training starts
    models = {
        "linear": LinearBaseline(input_dim),
        "nn": MentalHealthModelNN(input_dim),
    }

    # 3) Train each candidate with its own optimizer and keep its best
    #    validation accuracy
    val_scores = {}
    for name, model in models.items():
        val_scores[name] = trainer.train(
            model=model,
            train_set=train_set,
            val_set=val_set,
            optimizer=optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4),
            device=device,
            model_name=name,
        )

    # 4) Comparison table
    border = "+---------+--------------------+"
    print("\n=== Model comparison (validation accuracy) ===")
    print(border)
    print("| Model   | Val Accuracy       |")
    print(border)
    for name, acc in val_scores.items():
        print(f"| {name:<7} | {acc*100:>6.2f}%            |")
    print(border)

    # 5) Winner: highest validation accuracy (first one listed on a tie)
    best_model_name, best_val = max(val_scores.items(), key=lambda item: item[1])
    print(f"\nBest model: {best_model_name} ({best_val*100:.2f}% val accuracy)")

    # 6) Single "global" checkpoint for predict.py: the winner's weights are
    #    reloaded from disk and stored alongside the preprocessing statistics.
    checkpoint = {
        "model_type": best_model_name,   # "linear" or "nn"
        "state_dict": torch.load(
            f"best_model_weights_{best_model_name}.pt", map_location="cpu"
        ),
        "mean": mean,
        "std": std,
        "feature_cols": feature_cols,
    }
    torch.save(checkpoint, "mental_health_model.pt")
    print("Saved best model in mental_health_model.pt")


# Run the train / compare / export pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()



=== Training linear (356 params) ===
[linear] Epoch 01 | Train Loss: 0.4875 Acc: 0.9012 | Eval Train Acc: 0.9306 | Eval Val Acc: 0.9331
[linear] Epoch 02 | Train Loss: 0.3639 Acc: 0.9337 | Eval Train Acc: 0.9328 | Eval Val Acc: 0.9355
[linear] Epoch 03 | Train Loss: 0.3449 Acc: 0.9361 | Eval Train Acc: 0.9358 | Eval Val Acc: 0.9360
[linear] Epoch 04 | Train Loss: 0.3381 Acc: 0.9368 | Eval Train Acc: 0.9351 | Eval Val Acc: 0.9369
[linear] Epoch 05 | Train Loss: 0.3351 Acc: 0.9369 | Eval Train Acc: 0.9379 | Eval Val Acc: 0.9374
[linear] Epoch 06 | Train Loss: 0.3337 Acc: 0.9368 | Eval Train Acc: 0.9385 | Eval Val Acc: 0.9391
[linear] Epoch 07 | Train Loss: 0.3330 Acc: 0.9370 | Eval Train Acc: 0.9375 | Eval Val Acc: 0.9393
[linear] Epoch 08 | Train Loss: 0.3037 Acc: 0.9370 | Eval Train Acc: 0.9392 | Eval Val Acc: 0.9379
[linear] Epoch 09 | Train Loss: 0.2473 Acc: 0.9370 | Eval Train Acc: 0.9393 | Eval Val Acc: 0.9386
[linear] Epoch 10 | Train Loss: 0.1836 Acc: 0.9376 | Eval Train Acc: 0.