### 01 Preprocessing

This notebook prepares the cleaned dataset for modelling by:
- Dropping non-informative ID columns
- Splitting features (X) and target (y)
- Binary-encoding Gender, Suicidal Thoughts, and Family History as 0/1
- Performing a stratified 80/20 train-test split
- Standardizing numeric features and one-hot encoding the remaining categorical variables
- Fitting preprocessing on training data only (avoid data leakage)
- Saving transformed data and feature names for 02_model_training_evaluation


Import libraries

In [39]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

import joblib

Config

In [40]:
# Input file and the name of the label column.
DATA_PATH = "student_depression_dataset_cleaned.csv"
TARGET = "Depression"

# Random seed passed to the train/test split so the partition is reproducible.
SEED = 42

# Names of the two-valued columns that are mapped to 0/1 further down.
COL_GENDER, COL_SUICIDAL, COL_FAM_HIST = (
    "Gender",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
)


Load dataset

In [41]:
# Read the cleaned dataset from DATA_PATH into a DataFrame.
df = pd.read_csv(DATA_PATH)

# Quick sanity summary: dimensions, column names, then a preview of the rows.
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist(), sep="\n")
print("\nHead:")
df.head()

Dataset shape: (21733, 18)

Columns:
['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']

Head:


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33,Visakhapatnam,Student,5,0,8.97,2,0,'5-6 hours',Healthy,B.Pharm,Yes,3,1,No,1
1,8,Female,24,Bangalore,Student,2,0,5.9,5,0,'5-6 hours',Moderate,BSc,No,3,2,Yes,0
2,26,Male,31,Srinagar,Student,3,0,7.03,5,0,'Less than 5 hours',Healthy,BA,No,9,1,Yes,0
3,30,Female,28,Varanasi,Student,3,0,5.59,2,0,'7-8 hours',Moderate,BCA,Yes,4,5,Yes,1
4,32,Female,25,Jaipur,Student,4,0,8.13,3,0,'5-6 hours',Moderate,M.Tech,Yes,1,1,No,0


Drop ID column

In [43]:
# The row identifier is non-informative for modelling, so remove it when present.
has_id_column = "id" in df.columns
if has_id_column:
    df = df.drop(columns="id")
    print("Dropped column: id")

print("Shape after dropping id (if any):", df.shape)

Dropped column: id
Shape after dropping id (if any): (21733, 17)


Define X / y

In [44]:
# Fail fast with a descriptive message when the label column is absent.
target_present = TARGET in df.columns
if not target_present:
    raise ValueError(f"Target column '{TARGET}' not found. Available columns: {df.columns.tolist()}")

# Take independent copies so later edits to X / y never write back into df.
y = df[TARGET].copy()
X = df.drop(columns=TARGET).copy()

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts(dropna=False))


X shape: (21733, 16)
y distribution:
 Depression
1    11983
0     9750
Name: count, dtype: int64


Convert (Gender, Suicidal Thoughts, Family History) to binary variables

In [45]:
def normalize_str(s):
    """Return ``s`` converted to ``str`` with surrounding whitespace removed.

    Missing values (anything ``pd.isna`` reports as missing) are handed back
    unchanged so they remain detectable after normalisation.
    """
    return s if pd.isna(s) else str(s).strip()

# Label -> 0/1 code for each two-valued column.
binary_mappings = {
    COL_GENDER: {"Male": 0, "Female": 1},
    COL_SUICIDAL: {"No": 0, "Yes": 1},
    COL_FAM_HIST: {"No": 0, "Yes": 1}
}

for col, mapping in binary_mappings.items():
    if col not in X.columns:
        print(f"\nWarning: Column not found in X -> {col}")
        continue

    # Idempotency guard: if this cell is re-run, the column already holds the
    # numeric codes. Stringifying and re-mapping them would match no key in the
    # mapping and turn the whole column into NaN, so leave it untouched.
    already_encoded = (
        pd.api.types.is_numeric_dtype(X[col])
        and X[col].dropna().isin(list(mapping.values())).all()
    )
    if already_encoded:
        print(f"\n[{col}] already binary-encoded; skipping.")
        continue

    # Trim whitespace so labels such as "Yes " still match the mapping keys.
    labels = X[col].apply(normalize_str)

    # Show unique values before mapping (helps catch unexpected labels)
    observed = sorted(labels.dropna().unique().tolist())
    print(f"\n[{col}] unique values BEFORE mapping:", observed)

    # Series.map() replaces labels missing from the dict with NaN without any
    # warning, so reject unexpected labels explicitly instead of losing data.
    unexpected = [value for value in observed if value not in mapping]
    if unexpected:
        raise ValueError(
            f"Column '{col}' contains labels with no binary mapping: {unexpected}. "
            f"Expected one of {list(mapping)}."
        )

    X[col] = labels.map(mapping)
    print(f"[{col}] unique values AFTER mapping:", sorted(X[col].dropna().unique().tolist()))



[Gender] unique values BEFORE mapping: ['Female', 'Male']
[Gender] unique values AFTER mapping: [0, 1]

[Have you ever had suicidal thoughts ?] unique values BEFORE mapping: ['No', 'Yes']
[Have you ever had suicidal thoughts ?] unique values AFTER mapping: [0, 1]

[Family History of Mental Illness] unique values BEFORE mapping: ['No', 'Yes']
[Family History of Mental Illness] unique values AFTER mapping: [0, 1]


Check data types (after converting the binary variables)

In [46]:
print("\nDtype counts:\n", X.dtypes.value_counts())

# Identify columns by type.
# "number" matches every numeric dtype rather than only int64/float64, and any
# column that is not numeric is treated as categorical. This way no column can
# fall outside both lists and be silently discarded by a downstream transformer
# configured with remainder="drop".
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

print("\nNumeric columns:")
print(num_cols)

print("\nCategorical columns (to be one-hot encoded):")
print(cat_cols)


Dtype counts:
 int64      10
object      5
float64     1
Name: count, dtype: int64

Numeric columns:
['Gender', 'Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness']

Categorical columns (to be one-hot encoded):
['City', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree']


Split the data 80/20 (80% train, 20% test), stratified by the target

In [47]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)

print("Train:", X_train.shape, "Test:", X_test.shape)

# Report the class balance of each partition as proportions.
for split_label, split_target in (("Train", y_train), ("Test", y_test)):
    print(f"\n{split_label} target distribution:\n", split_target.value_counts(normalize=True))

Train: (17386, 16) Test: (4347, 16)

Train target distribution:
 Depression
1    0.551363
0    0.448637
Name: proportion, dtype: float64

Test target distribution:
 Depression
1    0.551415
0    0.448585
Name: proportion, dtype: float64


Numeric Scaling & Categorical Encoding


In [48]:
# Numeric branch: a single standardisation step.
scale_step = ("scaler", StandardScaler())
numeric_preprocess = Pipeline(steps=[scale_step])

# Categorical branch: one-hot encoding; handle_unknown="ignore" stops transform
# from raising on categories that were not present when the encoder was fitted.
encode_step = ("onehot", OneHotEncoder(handle_unknown="ignore"))
categorical_preprocess = Pipeline(steps=[encode_step])

# Route each column list to its branch; columns in neither list are dropped.
column_branches = [
    ("num", numeric_preprocess, num_cols),
    ("cat", categorical_preprocess, cat_cols),
]
preprocess = ColumnTransformer(transformers=column_branches, remainder="drop")

Fit on TRAIN dataset and transform

In [49]:
# Learn scaling statistics and category vocabularies from the training rows
# only, then apply that fitted transformer unchanged to the test rows.
X_train_trans = preprocess.fit_transform(X_train)
X_test_trans = preprocess.transform(X_test)

for split_label, matrix in (("train", X_train_trans), ("test", X_test_trans)):
    print(f"Transformed {split_label} shape:", matrix.shape)

Transformed train shape: (17386, 87)
Transformed test shape: (4347, 87)


Feature Names

In [50]:
# Output column names of the fitted transformer, in the order of the matrix columns.
feature_names = preprocess.get_feature_names_out()

n_features = len(feature_names)
print("Total features after preprocessing:", n_features)
print("First 30 feature names:\n", feature_names[:30])

Total features after preprocessing: 87
First 30 feature names:
 ['num__Gender' 'num__Age' 'num__Academic Pressure' 'num__Work Pressure'
 'num__CGPA' 'num__Study Satisfaction' 'num__Job Satisfaction'
 'num__Have you ever had suicidal thoughts ?' 'num__Work/Study Hours'
 'num__Financial Stress' 'num__Family History of Mental Illness'
 'cat__City_Agra' 'cat__City_Ahmedabad' 'cat__City_Bangalore'
 'cat__City_Bhopal' 'cat__City_Chennai' 'cat__City_Delhi'
 'cat__City_Faridabad' 'cat__City_Ghaziabad' 'cat__City_Hyderabad'
 'cat__City_Indore' 'cat__City_Jaipur' 'cat__City_Kalyan'
 'cat__City_Kanpur' 'cat__City_Kolkata' 'cat__City_Lucknow'
 'cat__City_Ludhiana' 'cat__City_Meerut' 'cat__City_Mumbai'
 'cat__City_Nagpur']


Saving Preprocessed Data and Artifacts

In [51]:
# File name -> object, written in this order to the current working directory.
artifacts = {
    "preprocess_pipeline.joblib": preprocess,
    "X_train_trans.joblib": X_train_trans,
    "X_test_trans.joblib": X_test_trans,
    "y_train.joblib": y_train,
    "y_test.joblib": y_test,
    "feature_names.joblib": feature_names,
}
for artifact_file, artifact_obj in artifacts.items():
    joblib.dump(artifact_obj, artifact_file)

# Summarise what was written.
saved_summary = [
    "Saved:",
    "- preprocess_pipeline.joblib",
    "- X_train_trans.joblib / X_test_trans.joblib",
    "- y_train.joblib / y_test.joblib",
    "- feature_names.joblib",
]
for summary_line in saved_summary:
    print(summary_line)

Saved:
- preprocess_pipeline.joblib
- X_train_trans.joblib / X_test_trans.joblib
- y_train.joblib / y_test.joblib
- feature_names.joblib
