# Student Lifestyle Project — EDA, Modeling, and Mediation


In [5]:

# --- 1) Setup & Load Data ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PolynomialFeatures, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, confusion_matrix
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import statsmodels.api as sm

plt.rcParams['figure.dpi'] = 120  # crisper plots

CSV_PATH = r"./data/student_lifestyle_dataset.csv"

df = pd.read_csv(CSV_PATH)

def canonical_rename(cols):
    m = {}
    for c in cols:
        lc = c.lower().strip()
        # Map any GPA/grade-like field to 'grades'
        if ('gpa' in lc and 'gap' not in lc) or ('grade' in lc) or ('score' in lc and 'stress' not in lc):
            m[c] = 'grades'
        elif 'sleep' in lc:
            m[c] = 'sleep'
        elif 'study' in lc or 'hours_stud' in lc:
            m[c] = 'study'
        elif 'exerc' in lc or 'workout' in lc or 'fitness' in lc:
            m[c] = 'exercise'
        elif 'social' in lc or 'friends' in lc or 'interaction' in lc:
            m[c] = 'social'
        elif 'stress' in lc or 'anxiety' in lc or 'pressure' in lc:
            m[c] = 'stress'
        else:
            m[c] = c
    return m

df = df.rename(columns=canonical_rename(df.columns))
print("Shape:", df.shape)
df.head(10)


Shape: (2000, 9)


Unnamed: 0,Student_ID,study,Extracurricular_Hours_Per_Day,sleep,social,Physical_Activity_Hours_Per_Day,stress,Gender,grades
0,1,6.9,3.8,8.7,2.8,1.8,Moderate,Male,7.48
1,2,5.3,3.5,8.0,4.2,3.0,Low,Female,6.88
2,3,5.1,3.9,9.2,1.2,4.6,Low,Male,6.68
3,4,6.5,2.1,7.2,1.7,6.5,Moderate,Male,7.2
4,5,8.1,0.6,6.5,2.2,6.6,High,Male,8.78
5,6,6.0,2.1,8.0,0.3,7.6,Moderate,Female,7.12
6,7,8.0,0.7,5.3,5.7,4.3,High,Male,7.7
7,8,8.4,1.8,5.6,3.0,5.2,High,Male,8.0
8,9,5.2,3.6,6.3,4.0,4.9,Low,Male,7.05
9,10,7.7,0.7,9.8,4.5,1.3,Moderate,Female,6.9


In [11]:
# helpers

def print_metrics(name, y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    print(f"{name:22s} | RMSE = {rmse:.4f} | R^2 = {r2:.4f}")
    return rmse, r2

class TabDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = torch.from_numpy(y.astype(np.float32)).view(-1, 1)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.y[i]

class MLPRegressor(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.10),
            nn.Linear(64, 1),
        )
    def forward(self, x): return self.net(x)

def train_mlp(Xtr, ytr, Xva, yva, Xte, yte, seed=42):
    torch.manual_seed(seed)
    np.random.seed(seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    train_loader = DataLoader(TabDataset(Xtr, ytr), batch_size=64, shuffle=True)
    val_loader   = DataLoader(TabDataset(Xva, yva), batch_size=256, shuffle=False)
    test_loader  = DataLoader(TabDataset(Xte, yte), batch_size=256, shuffle=False)

    model = MLPRegressor(in_dim=Xtr.shape[1]).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    loss_fn = nn.MSELoss()

    def eval_mse(loader):
        model.eval()
        losses = []
        with torch.no_grad():
            for xb, yb in loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                losses.append(loss_fn(pred, yb).item())
        return float(np.mean(losses))

    best = float("inf")
    patience, pat = 12, 0
    best_state = None

    for epoch in range(1, 201):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            opt.step()

        val_mse = eval_mse(val_loader)
        if val_mse < best - 1e-6:
            best = val_mse
            pat = 0
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            pat += 1

        if epoch % 10 == 0 or epoch == 1:
            print(f"Epoch {epoch:3d} | val MSE = {val_mse:.4f}")
        if pat >= patience:
            print(f"Early stop at epoch {epoch} (best val MSE={best:.4f})")
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    preds = []
    with torch.no_grad():
        for xb, _ in test_loader:
            xb = xb.to(device)
            preds.append(model(xb).cpu().numpy().ravel())
    y_pred = np.concatenate(preds)
    print_metrics("MLP (Deep Learning)", yte, y_pred)
    return y_pred


Numeric-only: Classical + Deep Learning (same split)

In [12]:
assert "grades" in df.columns, "Need df['grades'] as target."

# y
y_all = pd.to_numeric(df["grades"], errors="coerce").values.astype(np.float32)

# X numeric only (exclude grades)
X_num = df.drop(columns=["grades"]).select_dtypes(include=[np.number]).copy()

# Drop ID-like numeric columns if present
for col in ["Student_ID", "student_id", "id"]:
    if col in X_num.columns:
        X_num = X_num.drop(columns=[col])

mask = np.isfinite(y_all)
X_num = X_num.loc[mask].reset_index(drop=True)
y_num = y_all[mask]

# Split (shared across models)
Xtr_df, Xtemp_df, ytr, ytemp = train_test_split(X_num, y_num, test_size=0.30, random_state=42)
Xva_df, Xte_df, yva, yte     = train_test_split(Xtemp_df, ytemp, test_size=0.50, random_state=42)

# Preprocess: impute + scale
num_pre = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler())
])
num_pre.fit(Xtr_df)

Xtr = num_pre.transform(Xtr_df).astype(np.float32)
Xva = num_pre.transform(Xva_df).astype(np.float32)
Xte = num_pre.transform(Xte_df).astype(np.float32)

print("=== Numeric-only models (same split) ===")

# Linear Regression
lr = LinearRegression()
lr.fit(Xtr, ytr)
print_metrics("Linear Regression", yte, lr.predict(Xte))

# Random Forest
rf = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
rf.fit(Xtr_df.fillna(Xtr_df.median()), ytr)  # RF can use raw scale; just impute
rf_pred = rf.predict(Xte_df.fillna(Xtr_df.median()))
print_metrics("Random Forest", yte, rf_pred)

# MLP
_ = train_mlp(Xtr, ytr, Xva, yva, Xte, yte)


=== Numeric-only models (same split) ===
Linear Regression      | RMSE = 0.4944 | R^2 = 0.5318




Random Forest          | RMSE = 0.5350 | R^2 = 0.4519
Epoch   1 | val MSE = 28.0116
Epoch  10 | val MSE = 1.0674
Epoch  20 | val MSE = 0.3942
Epoch  30 | val MSE = 0.3709
Epoch  40 | val MSE = 0.3354
Epoch  50 | val MSE = 0.3251
Epoch  60 | val MSE = 0.3331
Epoch  70 | val MSE = 0.3856
Early stop at epoch 74 (best val MSE=0.3146)
MLP (Deep Learning)    | RMSE = 0.5132 | R^2 = 0.4956




In [13]:
assert "grades" in df.columns, "Need df['grades'] as target."

y_all = pd.to_numeric(df["grades"], errors="coerce").values.astype(np.float32)

X_all = df.drop(columns=["grades"]).copy()

# Drop ID-like columns if present
for col in ["Student_ID", "student_id", "id"]:
    if col in X_all.columns:
        X_all = X_all.drop(columns=[col])

mask = np.isfinite(y_all)
X_all = X_all.loc[mask].reset_index(drop=True)
y_all = y_all[mask]

# Split (shared across models)
Xtr_df, Xtemp_df, ytr, ytemp = train_test_split(X_all, y_all, test_size=0.30, random_state=42)
Xva_df, Xte_df, yva, yte     = train_test_split(Xtemp_df, ytemp, test_size=0.50, random_state=42)

# Identify numeric/categorical
num_cols = Xtr_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in Xtr_df.columns if c not in num_cols]

# Preprocess: numeric impute+scale, categorical impute+onehot
pre = ColumnTransformer([
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")),
                      ("sc", StandardScaler())]), num_cols),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat_cols),
])

pre.fit(Xtr_df)

def transform_any(Xdf):
    Xt = pre.transform(Xdf)
    if hasattr(Xt, "toarray"):
        Xt = Xt.toarray()
    return Xt.astype(np.float32)

Xtr = transform_any(Xtr_df)
Xva = transform_any(Xva_df)
Xte = transform_any(Xte_df)

print("=== Numeric + categorical models (same split) ===")

# Linear Regression (on transformed features)
lr = LinearRegression()
lr.fit(Xtr, ytr)
print_metrics("Linear Regression", yte, lr.predict(Xte))

# Random Forest (needs dense numeric input) — use same transformed X
rf = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
rf.fit(Xtr, ytr)
print_metrics("Random Forest", yte, rf.predict(Xte))

# MLP
_ = train_mlp(Xtr, ytr, Xva, yva, Xte, yte)


=== Numeric + categorical models (same split) ===
Linear Regression      | RMSE = 0.4954 | R^2 = 0.5300




Random Forest          | RMSE = 0.5308 | R^2 = 0.4605
Epoch   1 | val MSE = 37.4054
Epoch  10 | val MSE = 0.6986
Epoch  20 | val MSE = 0.4531
Epoch  30 | val MSE = 0.3874
Epoch  40 | val MSE = 0.3536
Epoch  50 | val MSE = 0.3464
Epoch  60 | val MSE = 0.3659
Early stop at epoch 60 (best val MSE=0.3366)
MLP (Deep Learning)    | RMSE = 0.5053 | R^2 = 0.5110


