In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
from optuna.samplers import TPESampler
import warnings
import shap

In [4]:
train_df = pd.read_csv('dataset/train.csv', encoding='utf-8', index_col='id')
test_df = pd.read_csv('dataset/test.csv', encoding='utf-8', index_col='id')
orginal_df = pd.read_csv('dataset/final_depression_dataset_1.csv', encoding='utf-8')

In [5]:
train_df.head(5)

Unnamed: 0_level_0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [6]:
# Show data types, non-null counts, memory usage
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140700 entries, 0 to 140699
Data columns (total 19 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Name                                   140700 non-null  object 
 1   Gender                                 140700 non-null  object 
 2   Age                                    140700 non-null  float64
 3   City                                   140700 non-null  object 
 4   Working Professional or Student        140700 non-null  object 
 5   Profession                             104070 non-null  object 
 6   Academic Pressure                      27897 non-null   float64
 7   Work Pressure                          112782 non-null  float64
 8   CGPA                                   27898 non-null   float64
 9   Study Satisfaction                     27897 non-null   float64
 10  Job Satisfaction                       112790 non-null  float

In [7]:
# Summarizes statistics of numeric columns
train_df.describe()

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,140700.0,27897.0,112782.0,27898.0,27897.0,112790.0,140700.0,140696.0,140700.0
mean,40.388621,3.142273,2.998998,7.658636,2.94494,2.974404,6.252679,2.988983,0.181713
std,12.384099,1.380457,1.405771,1.464466,1.360197,1.416078,3.853615,1.413633,0.385609
min,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0,0.0
25%,29.0,2.0,2.0,6.29,2.0,2.0,3.0,2.0,0.0
50%,42.0,3.0,3.0,7.77,3.0,3.0,6.0,3.0,0.0
75%,51.0,4.0,4.0,8.92,4.0,4.0,10.0,4.0,0.0
max,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0,1.0


In [8]:
# Check for missing values
train_df.isnull().sum()

Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

In [9]:
# Check for duplicates
train_df.duplicated().sum()

0

In [11]:
# Replace all Nan (Not a Number) values in the dataframes, with string 'None', then convert all columns to string
train_df = train_df.fillna('None').astype('string')
test_df = test_df.fillna('None').astype('string')

In [12]:
# Extract the target column (Depression) from the training dataframe (y)
# and drop it from the training dataframes (x)
y = train_df['Depression']
x = train_df.drop(columns=['Depression'], axis=1)

In [14]:
catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength':0,
    'l2_leaf_reg': 0.7047064221215757,
    'random_seed':42,
    'verbose':False,
    'task_type': 'CPU'
}

In [15]:
cv = StratifiedKFold(5, shuffle=True, random_state=0)
cv_splits = cv.split(x, y)
scores = []
test_preds = []
X_test_pool = Pool(test_df, cat_features=x.columns.values)
for i, (train_idx, val_idx) in enumerate(cv_splits):
    model = CatBoostClassifier(**catboost_params)
    X_train_fold, X_val_fold = x.loc[train_idx], x.loc[val_idx]
    y_train_fold, y_val_fold = y.loc[train_idx], y.loc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=x.columns.values)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=x.columns.values)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=False, early_stopping_rounds=200)
    val_pred = model.predict(X_valid_pool)
    score = accuracy_score(y_val_fold, val_pred)
    scores.append(score)
    test_pred = model.predict_proba(X_test_pool)[:, 1]
    test_preds.append(test_pred)
    print(f'Fold {i + 1} accuracy_score: {score}')
print(f'Cross-validated accuracy_score: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

Fold 1 accuracy_score: 0.9405828002842929
Fold 2 accuracy_score: 0.9380597014925374
Fold 3 accuracy_score: 0.9408315565031983
Fold 4 accuracy_score: 0.9410092395167022
Fold 5 accuracy_score: 0.9393745557924662
Cross-validated accuracy_score: 0.940 +/- 0.001


In [16]:
print(f'Max accuracy_score score: {np.max(scores):.3f}')
print(f'Min accuracy_score score: {np.min(scores):.3f}')

Max accuracy_score score: 0.941
Min accuracy_score score: 0.938


In [17]:
sample_submission = pd.read_csv('dataset/sample_submission.csv', encoding='utf-8')
sample_submission['Depression'] = np.round(np.mean(test_preds, axis=0))
sample_submission

Unnamed: 0,id,Depression
0,140700,0.0
1,140701,0.0
2,140702,0.0
3,140703,1.0
4,140704,0.0
...,...,...
93795,234495,0.0
93796,234496,1.0
93797,234497,0.0
93798,234498,1.0


In [18]:
sample_submission.to_csv('prediction/20241121_Submission.csv', index=False)