In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import joblib

In [2]:
train_df = pd.read_csv('../data/raw/mental_health_survey_data.csv')

In [3]:
print(train_df.shape)
print(train_df.info())

(1259, 27)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 1

In [4]:
#Drop Unnecessary Columns
train_df.drop(['comments', 'state', 'Timestamp'], axis=1, inplace=True)

In [5]:
#Handling Missing Values
train_df.fillna('NaN', inplace=True)

In [7]:
# Fix Age column properly
train_df['Age'] = train_df['Age'].replace('NaN', np.nan)
train_df['Age'] = train_df['Age'].astype(float)
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)


In [8]:
# -------- Clean Gender Column --------
male_str = [
    "male","m","male-ish","maile","mal","male (cis)","make","male ",
    "man","msle","mail","malr","cis man","cis male","cis male"
]

female_str = [
    "cis female","f","female","woman","femake","female ",
    "cis-female/femme","female (cis)","femail"
]

trans_str = [
    "trans-female","something kinda male?","queer/she/they","non-binary",
    "nah","all","enby","fluid","genderqueer","androgyne","agender",
    "male leaning androgynous","guy (-ish) ^_^","trans woman","neuter",
    "female (trans)","queer",
    "ostensibly male, unsure what that really means"
]

train_df['Gender'] = train_df['Gender'].str.lower()

train_df['Gender'] = train_df['Gender'].replace(male_str, 'male')
train_df['Gender'] = train_df['Gender'].replace(female_str, 'female')
train_df['Gender'] = train_df['Gender'].replace(trans_str, 'trans')

In [None]:
# Remove junk gender values
train_df = train_df[~train_df['Gender'].isin(['a little about you', 'p'])]

In [None]:
# -------- Fix Age Outliers --------
train_df.loc[train_df['Age'] < 18, 'Age'] = train_df['Age'].median()
train_df.loc[train_df['Age'] > 120, 'Age'] = train_df['Age'].median()

In [12]:
# -------- Fix Specific Columns --------
train_df['self_employed'] = train_df['self_employed'].replace('NaN', 'No')
train_df['work_interfere'] = train_df['work_interfere'].replace('NaN', "Don't know")

In [13]:
# -------- Drop Country (Not Used) --------
train_df.drop(['Country'], axis=1, inplace=True)

In [15]:
# -------- Encode Categorical Features --------
categorical_cols = train_df.select_dtypes(include=['object']).columns

encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    encoders[col] = le

In [16]:
# -------- Scale Age --------
scaler = MinMaxScaler()
train_df['Age'] = scaler.fit_transform(train_df[['Age']])

In [17]:
# -------- Final Feature Selection --------
feature_cols = [
    'Age',
    'Gender',
    'family_history',
    'benefits',
    'care_options',
    'anonymity',
    'leave',
    'work_interfere'
]

X = train_df[feature_cols]
y = train_df['treatment']

In [18]:
# -------- Save Artifacts --------
joblib.dump(encoders, '../models/encoders.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(feature_cols, '../models/features.pkl')

train_df.to_csv('../data/processed/cleaned_data.csv', index=False)
