## Imports

In [2]:
import pandas as pd

from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [12]:
# Location of the raw CSV, relative to the notebook's working directory.
data_path = Path("..") / "data" / "depression_data.csv"

In [13]:
# Read the raw dataset into a DataFrame; kept under its own name so the
# unmodified data remains available for reference.
original_df = pd.read_csv(data_path)

In [14]:
# Work on a copy so the preprocessing below never modifies original_df.
df = original_df.copy()

In [15]:
# "Name" is not used as a feature anywhere below, so remove it up front.
# Derive df from original_df instead of dropping in place: an in-place drop
# raises KeyError when the cell is re-run (the column is already gone),
# whereas this form gives the same result on every execution.
df = original_df.drop(columns="Name")
df.head()

Unnamed: 0,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,31,Married,Bachelor's Degree,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,55,Married,High School,1,Non-smoker,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes
2,78,Widowed,Master's Degree,1,Non-smoker,Sedentary,Employed,125332.79,Low,Unhealthy,Good,No,No,Yes,No
3,58,Divorced,Master's Degree,3,Non-smoker,Moderate,Unemployed,9992.78,Moderate,Moderate,Poor,No,No,No,No
4,18,Single,High School,0,Non-smoker,Sedentary,Unemployed,8595.08,Low,Moderate,Fair,Yes,No,Yes,Yes


In [16]:
# Quick check of (rows, columns) once the Name column has been removed.
df.shape

(413768, 15)

## Standard Scaler

In [17]:
# Numeric features that get standardized.
num_cols = [
    "Age",
    "Number of Children",
    "Income",
]

In [18]:
# StandardScaler with default settings; it is fitted on num_cols in the
# following cell.
scaler = StandardScaler()

In [19]:
# Fit the scaler on the numeric columns and overwrite them with the
# standardized values.
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

## One-Hot Encoder

In [20]:
# Nominal (unordered) categorical features that get one-hot encoded.
onehot_cols = [
    "Marital Status",
    "Smoking Status",
    "Physical Activity Level",
]

In [21]:
# Dense output so the result can be wrapped directly in a DataFrame;
# categories not seen during fit are ignored at transform time.
onehot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore",
)

In [22]:
# Fit + transform first, then ask the fitted encoder for the generated
# column names; the frame reuses df's index so it lines up on concat.
onehot_values = onehot_encoder.fit_transform(df[onehot_cols])
onehot_feature_names = onehot_encoder.get_feature_names_out(onehot_cols)
onehot_encoded = pd.DataFrame(
    onehot_values,
    index=df.index,
    columns=onehot_feature_names,
)

In [23]:
# Swap the raw nominal columns for their one-hot indicator columns.
df_without_nominal = df.drop(columns=onehot_cols)
df = pd.concat([df_without_nominal, onehot_encoded], axis="columns")

## Ordinal Encoder

In [24]:
# Ordered categorical features. The order of this list matters: the encoder's
# `categories` argument is matched to these columns by position.
ordinal_cols = ["Education Level", "Alcohol Consumption", "Dietary Habits", "Sleep Patterns"]

In [25]:
# Category rankings used by the ordinal encoder; position in each list
# becomes the encoded integer (first entry -> 0).
edu_order = [
    "High School",
    "Associate Degree",
    "Bachelor's Degree",
    "Master's Degree",
    "PhD",
]
alc_order = [
    "Low",
    "Moderate",
    "High",
]
diet_order = [
    "Unhealthy",
    "Moderate",
    "Healthy",
]
sleep_order = [
    "Poor",
    "Fair",
    "Good",
]

In [26]:
# One category ranking per ordinal column, in the same order as ordinal_cols.
ordinal_categories = [edu_order, alc_order, diet_order, sleep_order]

# Values outside the declared rankings are encoded as -1 instead of raising.
ord_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

In [27]:
# Encode the ordinal columns and keep the original column names and index
# so the result can replace them one-for-one.
ordinal_values = ord_encoder.fit_transform(df[ordinal_cols])
ord_df = pd.DataFrame(
    ordinal_values,
    index=df.index,
    columns=ordinal_cols,
)

In [28]:
# Swap the raw ordinal columns for their integer-coded versions.
df_without_ordinal = df.drop(columns=ordinal_cols)
df = pd.concat([df_without_ordinal, ord_df], axis="columns")

## Binary Encoder

In [29]:
# Label -> integer code for every two-valued column.
yes_no_codes = {"Yes": 1, "No": 0}
binary_codes = {
    "Employment Status": {"Unemployed": 0, "Employed": 1},
    "History of Mental Illness": yes_no_codes,
    "History of Substance Abuse": yes_no_codes,
    "Family History of Depression": yes_no_codes,
    "Chronic Medical Conditions": yes_no_codes,
}

# Series.map() yields NaN for any label missing from the mapping, so an
# unexpected label -- or re-running this cell on already-encoded 0/1 values --
# would silently blank the column. Encode into a scratch dict, validate every
# column, and only then write back to df.
binary_encoded = {}
for col, codes in binary_codes.items():
    encoded = df[col].map(codes)
    # Values that were present before mapping but are NaN afterwards had no
    # entry in the mapping (genuinely missing inputs are left as NaN).
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in {col!r}: {list(unmapped)}")
    binary_encoded[col] = encoded

for col, encoded in binary_encoded.items():
    df[col] = encoded

In [30]:
# Display the encoded frame for a visual check before saving.
df

Unnamed: 0,Age,Number of Children,Employment Status,Income,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,Marital Status_Divorced,Marital Status_Married,...,Smoking Status_Current,Smoking Status_Former,Smoking Status_Non-smoker,Physical Activity Level_Active,Physical Activity Level_Moderate,Physical Activity Level_Sedentary,Education Level,Alcohol Consumption,Dietary Habits,Sleep Patterns
0,-0.991298,0.566692,0,-0.600532,1,0,1,1,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0
1,0.330380,-0.241681,1,-0.195730,1,0,0,1,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0
2,1.596988,-0.241681,1,1.838100,0,0,1,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,2.0
3,0.495590,1.375065,0,-1.001105,0,0,0,0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,0.0
4,-1.707206,-1.050054,0,-1.035510,1,0,1,1,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413763,1.046289,-1.050054,1,1.441799,0,0,0,0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,2.0,2.0
413764,-1.266647,-1.050054,1,1.134778,1,1,0,1,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0
413765,0.440520,-1.050054,1,0.657038,0,0,1,1,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0
413766,1.211499,0.566692,0,-0.642590,0,1,0,0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0


## Save the data

In [31]:
# Persist the preprocessed frame; the relative path resolves against the
# notebook's working directory, and the index is not written out.
output_path = "clean_data.csv"
df.to_csv(output_path, index=False)