In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
from optuna.samplers import TPESampler
import warnings
import shap
warnings.filterwarnings("ignore")

In [2]:
train_df = pd.read_csv('dataset/train.csv', index_col='id', encoding='utf-8')
test_df = pd.read_csv('dataset/test.csv', index_col='id', encoding='utf-8')
original_df = pd.read_csv('dataset/final_depression_dataset_1.csv', encoding='utf-8')

In [3]:
original_df['Depression'] = original_df['Depression'].map({"Yes": 1, "No": 0})

In [4]:
train_df = pd.concat([train_df, original_df], ignore_index=True)

train_df = train_df.fillna('None').astype('string')
test_df = test_df.fillna('None').astype('string')

In [5]:
train_workingprofessional = train_df[train_df['Working Professional or Student'] != 'Student']
test_workingprofessional = test_df[test_df['Working Professional or Student'] != 'Student'].copy()
y = train_workingprofessional['Depression']
X = train_workingprofessional.drop(['Depression'], axis=1)

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

In [6]:
catboost_params1 = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': 0.08,
        'iterations': 1000,
        'depth': 4,
        'random_strength':1,
        'l2_leaf_reg': 0.5,
        'min_data_in_leaf' : 2,
        'random_seed':42,
        'verbose':False,
        'task_type': 'CPU'
    }

In [7]:
cv = StratifiedKFold(3, shuffle=True, random_state=0)
cv_splits = cv.split(X, y)
scores1 = []
test_preds1 = []
X_test_pool = Pool(test_workingprofessional, cat_features=X.columns.values)
for i, (train_idx, val_idx) in enumerate(cv_splits):
    model = CatBoostClassifier(**catboost_params1)
    X_train_fold, X_val_fold = X.loc[train_idx], X.loc[val_idx]
    y_train_fold, y_val_fold = y.loc[train_idx], y.loc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=X.columns.values)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=X.columns.values)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=500, early_stopping_rounds=200)
    val_pred = model.predict(X_valid_pool)
    score = accuracy_score(y_val_fold, val_pred)
    scores1.append(score)
    test_pred1 = model.predict_proba(X_test_pool)[:, 1]
    test_preds1.append(test_pred1)
    #print(f'Fold {i + 1} accuracy_score: {score}')
print(f'workingprofessional:{np.mean(scores1):.4f};')

0:	test: 0.9260975	best: 0.9260975 (0)	total: 217ms	remaining: 3m 36s
500:	test: 0.9719919	best: 0.9719924 (496)	total: 47.2s	remaining: 47s
999:	test: 0.9723871	best: 0.9723871 (999)	total: 1m 20s	remaining: 0us

bestTest = 0.9723871084
bestIteration = 999

0:	test: 0.9389861	best: 0.9389861 (0)	total: 73.6ms	remaining: 1m 13s
500:	test: 0.9712558	best: 0.9712636 (494)	total: 36.2s	remaining: 36s
999:	test: 0.9718514	best: 0.9718519 (998)	total: 1m 15s	remaining: 0us

bestTest = 0.9718519144
bestIteration = 998

Shrink model to first 999 iterations.
0:	test: 0.9184922	best: 0.9184922 (0)	total: 75.3ms	remaining: 1m 15s
500:	test: 0.9693865	best: 0.9693865 (500)	total: 43.3s	remaining: 43.1s
999:	test: 0.9699581	best: 0.9699581 (998)	total: 1m 22s	remaining: 0us

bestTest = 0.9699581372
bestIteration = 998

Shrink model to first 999 iterations.
workingprofessional:0.9623;


In [8]:
train_student = train_df[train_df['Working Professional or Student'] == 'Student']
test_student = test_df[test_df['Working Professional or Student'] == 'Student'].copy()

y = train_student['Depression']
X = train_student.drop(['Depression'], axis=1)

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

In [9]:
catboost_params2 = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': 0.08,
        'iterations': 1000,
        'depth': 4,
        'random_strength':1,
        'l2_leaf_reg': 0.5,
        'min_data_in_leaf' : 2,
        'random_seed':42,
        'verbose':False,
        'task_type': 'CPU'
    }

In [10]:
cv = StratifiedKFold(3, shuffle=True, random_state=0)
cv_splits = cv.split(X, y)
scores2 = []
test_preds2 = []
X_test_pool = Pool(test_student, cat_features=X.columns.values)
for i, (train_idx, val_idx) in enumerate(cv_splits):
    model = CatBoostClassifier(**catboost_params2)
    X_train_fold, X_val_fold = X.loc[train_idx], X.loc[val_idx]
    y_train_fold, y_val_fold = y.loc[train_idx], y.loc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=X.columns.values)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=X.columns.values)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=500, early_stopping_rounds=200)
    val_pred = model.predict(X_valid_pool)
    score = accuracy_score(y_val_fold, val_pred)
    scores2.append(score)
    test_pred2 = model.predict_proba(X_test_pool)[:, 1]
    test_preds2.append(test_pred2)
    #print(f'Fold {i + 1} accuracy_score: {score}')
print(f'student:{np.mean(scores2):.4f};')

0:	test: 0.8783904	best: 0.8783904 (0)	total: 218ms	remaining: 3m 38s
500:	test: 0.9226983	best: 0.9227856 (357)	total: 25.9s	remaining: 25.8s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9227855908
bestIteration = 357

Shrink model to first 358 iterations.
0:	test: 0.8846805	best: 0.8846805 (0)	total: 49.8ms	remaining: 49.8s
500:	test: 0.9240055	best: 0.9240714 (438)	total: 21.6s	remaining: 21.5s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9242246024
bestIteration = 628

Shrink model to first 629 iterations.
0:	test: 0.8790044	best: 0.8790044 (0)	total: 45.8ms	remaining: 45.7s
500:	test: 0.9216078	best: 0.9217410 (386)	total: 22.6s	remaining: 22.6s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9217410466
bestIteration = 386

Shrink model to first 387 iterations.
student:0.8502;


In [11]:
test_workingprofessional = test_workingprofessional.reset_index()
test_student = test_student.reset_index()

preds_workingprofessional = np.round(np.mean(test_preds1, axis=0))
preds_student = np.round(np.mean(test_preds2, axis=0))

submission_workingprofessional = pd.DataFrame({'id': test_workingprofessional['id'], 'Depression': preds_workingprofessional})
submission_student = pd.DataFrame({'id': test_student['id'], 'Depression': preds_student})

submission = pd.concat([submission_student, submission_workingprofessional], axis=0)
submission_s94344 = submission.sort_values(by='id', ascending=True)

In [12]:
submission_s94344

Unnamed: 0,id,Depression
0,140700,0.0
1,140701,0.0
2,140702,0.0
0,140703,1.0
3,140704,0.0
...,...,...
75024,234495,0.0
75025,234496,1.0
18771,234497,0.0
75026,234498,1.0


In [13]:
submission_s94344.to_csv('20241127_submission_model1.csv', index=False)

In [90]:
train_df = pd.read_csv('dataset/train.csv', encoding='utf-8')
test_df = pd.read_csv('dataset/test.csv', encoding='utf-8')
submission_df = pd.read_csv('dataset/sample_submission.csv', encoding='utf-8')

In [91]:
train_df.columns = [col.replace(' ', '_') for col in train_df.columns]
test_df.columns = [col.replace(' ', '_') for col in test_df.columns]

In [92]:
sleep={
        "More than 8 hours":9,
        'Less than 5 hours':4,
        '5-6 hours':5.5,
        '7-8 hours':7.5,
        '1-2 hours':1.5,
        '6-8 hours':7,
        '4-6 hours':5,
        '6-7 hours':6.5,
        '10-11 hours':10.5,
        '8-9 hours':8.5,
        '9-11 hours':10,
        '2-3 hours':2.5,
        '3-4 hours':3.5,
        'Moderate':6,
        '4-5 hours':4.5,
        '9-6 hours':7.5,
        '1-3 hours':2,
        '1-6 hours':4,
        '8 hours':8,
        '10-6 hours':8,
        'Unhealthy':3,
        'Work_Study_Hours':6,
        '3-6 hours':3.5,
        '9-5':7,
        '9-5 hours':7,
}

train_df['Sleep_Duration'] = train_df['Sleep_Duration'].map(sleep)
test_df['Sleep_Duration'] = test_df['Sleep_Duration'].map(sleep)
sleep_med = train_df['Sleep_Duration'].median()
train_df.fillna({'Sleep_Duration':sleep_med}, inplace=True)
test_df.fillna({'Sleep_Duration':sleep_med}, inplace=True)

In [93]:
gender={
    'Male':0,
    'Female':1,
}

work={
    'Working Professional':1,
    'Student':0,
}

Thoughts={
    'No':0,
    'Yes':1,
}

History={
    'No':0,
    'Yes':1,
}

train_df['Working_Professional_or_Student'] = train_df['Working_Professional_or_Student'].map(work)
test_df['Working_Professional_or_Student'] = test_df['Working_Professional_or_Student'].map(work)

train_df['Gender'] = train_df['Gender'].map(gender)
test_df['Gender'] = test_df['Gender'].map(gender)

train_df['Have_you_ever_had_suicidal_thoughts_?'] = train_df['Have_you_ever_had_suicidal_thoughts_?'].map(Thoughts)
test_df['Have_you_ever_had_suicidal_thoughts_?'] = test_df['Have_you_ever_had_suicidal_thoughts_?'].map(Thoughts)

train_df['Family_History_of_Mental_Illness'] = train_df['Family_History_of_Mental_Illness'].map(History)
test_df['Family_History_of_Mental_Illness'] = test_df['Family_History_of_Mental_Illness'].map(History)

In [94]:
train_df['Work_Hours'] = train_df.apply(
    lambda row: np.nan if pd.isna(row['Work/Study_Hours'])
    else row['Work/Study_Hours'] if row['Working_Professional_or_Student'] == 1
    else 0,
    axis=1
)

train_df['Study_Hours'] = train_df.apply(
    lambda row: np.nan if pd.isna(row['Work/Study_Hours'])
    else row['Work/Study_Hours'] if row['Working_Professional_or_Student'] == 0
    else 0,
    axis=1
)


test_df['Work_Hours'] = test_df.apply(
    lambda row: np.nan if pd.isna(row['Work/Study_Hours'])
    else row['Work/Study_Hours'] if row['Working_Professional_or_Student'] == 1
    else 0,
    axis=1
)

test_df['Study_Hours'] = test_df.apply(
    lambda row: np.nan if pd.isna(row['Work/Study_Hours'])
    else row['Work/Study_Hours'] if row['Working_Professional_or_Student'] == 0
    else 0,
    axis=1
)


train_df.drop(['Work/Study_Hours'], axis=1, inplace=True)
test_df.drop(['Work/Study_Hours'], axis=1, inplace=True)

In [95]:
train_df['Academic_Pressure'] = train_df.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 1
    else (np.nan if pd.isna(row['Academic_Pressure']) else row['Academic_Pressure']),
    axis=1
)

test_df['Academic_Pressure'] = test_df.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 1
    else (np.nan if pd.isna(row['Academic_Pressure']) else row['Academic_Pressure']),
    axis=1
)


train_df['Work_Pressure'] = train_df.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 0
    else (np.nan if pd.isna(row['Work_Pressure']) else row['Work_Pressure']),
    axis=1
)

test_df['Work_Pressure'] = test_df.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 0
    else (np.nan if pd.isna(row['Work_Pressure']) else row['Work_Pressure']),
    axis=1
)

In [96]:
diet={
    'More Healty':0,
    'Healthy':1,
    'Less than Healthy':2,
    'Less Healthy':2,
    'Moderate':3,
    'Unhealthy':4,
    'No Healthy':4,
}

train_df['Dietary_Habits'] = train_df['Dietary_Habits'].map(diet)
test_df['Dietary_Habits'] = test_df['Dietary_Habits'].map(diet)

In [97]:
degree = {
    "BCom": "B.Com", "B.Com": "B.Com", "B.Comm": "B.Com",
    "B.Tech": "B.Tech", "BTech": "B.Tech", "B.T": "B.Tech",
    "BSc": "B.Sc", "B.Sc": "B.Sc", "Bachelor of Science": "B.Sc",
    "BArch": "B.Arch", "B.Arch": "B.Arch",
    "BA": "B.A", "B.A": "B.A",
    "BBA": "BBA", "BB": "BBA",
    "BCA": "BCA",
    "BE": "BE",
    "BEd": "B.Ed", "B.Ed": "B.Ed",
    "BPharm": "B.Pharm", "B.Pharm": "B.Pharm",
    "BHM": "BHM",
    "LLB": "LLB", "LL B": "LLB", "LL BA": "LLB", "LL.Com": "LLB", "LLCom": "LLB",
    "MCom": "M.Com", "M.Com": "M.Com",
    "M.Tech": "M.Tech", "MTech": "M.Tech", "M.T": "M.Tech",
    "MSc": "M.Sc", "M.Sc": "M.Sc", "Master of Science": "M.Sc",
    "MBA": "MBA",
    "MCA": "MCA",
    "MD": "MD",
    "ME": "ME",
    "MEd": "M.Ed", "M.Ed": "M.Ed",
    "MArch": "M.Arch", "M.Arch": "M.Arch",
    "MPharm": "M.Pharm", "M.Pharm": "M.Pharm",
    "MA": "MA", "M.A": "MA",
    "MPA": "MPA",
    "LLM": "LLM",
    "PhD": "PhD",
    "MBBS": "MBBS",
    "CA": "CA",
    "Class 12": "Class 12", "12th": "Class 12",
    "Class 11": "Class 11", "11th": "Class 11"
}

train_df['Degree'] = train_df['Degree'].map(degree)
test_df['Degree'] = test_df['Degree'].map(degree)

In [98]:
nanlist=['Work_Hours','Study_Hours','Profession','Academic_Pressure','Work_Pressure','CGPA','Study_Satisfaction','Job_Satisfaction','Dietary_Habits','Degree','Financial_Stress']

In [99]:
train_df['Work_Stress'] = train_df.apply(
    lambda row:(row['Financial_Stress'] + row['Work_Pressure'] - row['Job_Satisfaction'])if row['Working_Professional_or_Student'] == 1
    else 0,
    axis=1
)

train_df['Academic_Stress'] = train_df.apply(
    lambda row:(row['Financial_Stress'] + row['Academic_Pressure'] - row['Study_Satisfaction'])if row['Working_Professional_or_Student'] == 0
    else 0,
    axis=1
)

test_df['Work_Stress'] = test_df.apply(
    lambda row:(row['Financial_Stress'] + row['Work_Pressure'] - row['Job_Satisfaction'])if row['Working_Professional_or_Student'] == 1
    else 0,
    axis=1
)

test_df['Academic_Stress'] = test_df.apply(
    lambda row:(row['Financial_Stress'] + row['Academic_Pressure'] - row['Study_Satisfaction'])if row['Working_Professional_or_Student'] == 0
    else 0,
    axis=1
)

In [100]:
nanlist=['Work_Stress','Academic_Stress','Work_Hours','Study_Hours','Profession','Academic_Pressure','Work_Pressure','CGPA','Study_Satisfaction','Job_Satisfaction','Dietary_Habits','Degree','Financial_Stress']

In [101]:
train_df.fillna({'Degree': 'Unknown', 'Profession': 'Unknown'}, inplace=True)
test_df.fillna({'Degree': 'Unknown', 'Profession': 'Unknown'}, inplace=True)

In [102]:
from category_encoders import TargetEncoder
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)


target_encoder = TargetEncoder(cols=['Profession', 'Degree'])


for train_index, val_index in kf.split(train_df, train_df['Depression']):
    train_fold = train_df.iloc[train_index]
    val_fold = train_df.iloc[val_index]


    train_fold_encoded = target_encoder.fit_transform(train_fold[['Profession', 'Degree']], train_fold['Depression'])
    val_fold_encoded = target_encoder.transform(val_fold[['Profession', 'Degree']])


    train_df.loc[val_index, ['Profession', 'Degree']] = val_fold_encoded

test_encoded = target_encoder.transform(test_df[['Profession', 'Degree']])
test_df[['Profession', 'Degree']] = test_encoded

In [103]:
mean_n = train_df.groupby('Name')['Depression'].mean()
train_df['Name'] = train_df['Name'].map(mean_n)
test_df['Name'] = test_df['Name'].map(mean_n)

mean_n = train_df.groupby('City')['Depression'].mean()
train_df['City'] = train_df['City'].map(mean_n)
test_df['City'] = test_df['City'].map(mean_n)

In [104]:
for col in nanlist:
    med=train_df[col].median()
    train_df.fillna({col:med}, inplace=True)
    test_df.fillna({col:med}, inplace=True)

In [105]:
newnan=["Name","City"]

for col in newnan:
    med=train_df[col].median()
    test_df.fillna({col:med}, inplace=True)

In [106]:
id_col_tra= train_df['id']
train_df.drop(['id'],axis = 1,inplace = True)
id_col_test= test_df['id']
test_df.drop(['id'],axis = 1,inplace = True)

In [107]:
y = train_df['Depression']
train = train_df.drop(['Depression'],axis=1)
X = train

In [108]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=607)

def objective(trial):
    colsample_bytree= trial.suggest_float('colsample_bytree',0,1)
    n_estimators = trial.suggest_int('n_estimators', 400,1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01,0.1)
    reg_lambda = trial.suggest_float('reg_lambda', 0,4)
    reg_alpha = trial.suggest_float('reg_alpha', 0,4)
    max_depth = trial.suggest_int('max_depth', 2,10)
    gamma = trial.suggest_float('gamma', 0,0.5)
    eval_metric='auc'

    model = XGBClassifier(
    colsample_bytree = colsample_bytree,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    reg_alpha=reg_alpha,
    reg_lambda=reg_lambda,
    gamma=gamma,
    eval_metric='auc',
    random_state=607
)
    model.fit(X_train, y_train)
    score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    return score

In [109]:
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.RandomSampler(seed=607))
optuna.logging.set_verbosity(optuna.logging.WARNING)

def log_best_trial(study, trial):
    if study.best_trial == trial:
        print(f"New best trial: {trial.number} with value: {trial.value} and params: {trial.params}")


study.optimize(objective, n_trials=100,callbacks=[log_best_trial])

New best trial: 0 with value: 0.9737262650324346 and params: {'colsample_bytree': 0.9241441149445456, 'n_estimators': 480, 'learning_rate': 0.01934695515938726, 'reg_lambda': 0.995746986500277, 'reg_alpha': 0.7922495581786353, 'max_depth': 3, 'gamma': 0.00960148482380424}
New best trial: 1 with value: 0.9747485543853113 and params: {'colsample_bytree': 0.1621141939217663, 'n_estimators': 866, 'learning_rate': 0.01830911692556304, 'reg_lambda': 2.9106714720433975, 'reg_alpha': 2.3081746881941307, 'max_depth': 8, 'gamma': 0.2014396057468147}
New best trial: 2 with value: 0.9752102590611635 and params: {'colsample_bytree': 0.8807953818921426, 'n_estimators': 412, 'learning_rate': 0.0758420999774204, 'reg_lambda': 0.7061383279618365, 'reg_alpha': 2.210813909235981, 'max_depth': 3, 'gamma': 0.36577013322844903}
New best trial: 3 with value: 0.975361279127712 and params: {'colsample_bytree': 0.14492566230028125, 'n_estimators': 714, 'learning_rate': 0.0638419910172031, 'reg_lambda': 0.799541

In [110]:
best_params = study.best_params
best_score = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_score:.6f}")

Best Hyperparameters: {'colsample_bytree': 0.1644800532756021, 'n_estimators': 654, 'learning_rate': 0.06302637503740172, 'reg_lambda': 0.8524760019100182, 'reg_alpha': 3.754808385795471, 'max_depth': 5, 'gamma': 0.2215562152216386}
Best Accuracy: 0.975481


In [111]:
n_estimators = best_params['n_estimators']
reg_alpha = best_params['reg_alpha']
learning_rate = best_params['learning_rate']
reg_lambda = best_params['reg_lambda']
max_depth = best_params['max_depth']
colsample_bytree = best_params['colsample_bytree']
gamma = best_params['gamma']

In [112]:
best_xgb=XGBClassifier(
    colsample_bytree = colsample_bytree,
    n_estimators = n_estimators,
    learning_rate = learning_rate,
    reg_alpha = reg_alpha,
    reg_lambda = reg_lambda,
    max_depth = max_depth,
    gamma=gamma,
    eval_metric='auc',
    random_state=607
)

eval_set = [(X_train, y_train), (X_test, y_test)]

best_xgb.fit(X_train,y_train,eval_set=eval_set,verbose=False)

y_pred = best_xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {accuracy:.4f}')
roc_auc = roc_auc_score(y_test, best_xgb.predict_proba(X_test)[:, 1])
print(roc_auc)

accuracy: 0.9392
0.9754808503763771


In [113]:
x_test_df = test_df[X_train.columns]
y_test_pred = best_xgb.predict(x_test_df)
test_df['predicted'] = y_test_pred

In [114]:
test_df['predicted']

0        0
1        0
2        0
3        1
4        0
        ..
93795    0
93796    1
93797    0
93798    1
93799    0
Name: predicted, Length: 93800, dtype: int32

In [115]:
test_df.columns

Index(['Name', 'Gender', 'Age', 'City', 'Working_Professional_or_Student',
       'Profession', 'Academic_Pressure', 'Work_Pressure', 'CGPA',
       'Study_Satisfaction', 'Job_Satisfaction', 'Sleep_Duration',
       'Dietary_Habits', 'Degree', 'Have_you_ever_had_suicidal_thoughts_?',
       'Financial_Stress', 'Family_History_of_Mental_Illness', 'Work_Hours',
       'Study_Hours', 'Work_Stress', 'Academic_Stress', 'predicted'],
      dtype='object')

In [117]:
submission_df["id"] = id_col_test
submission_df["Depression"] = test_df['predicted']
submission_df

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0
...,...,...
93795,234495,0
93796,234496,1
93797,234497,0
93798,234498,1


In [118]:
submission_df.to_csv('20241127_submission_model2.csv', index=False)

third model

In [119]:
train_df = pd.read_csv('dataset/train.csv', index_col='id' ,encoding='utf-8')
original_df = pd.read_csv('dataset/final_depression_dataset_1.csv', encoding='utf-8')


In [120]:
original_df['Depression'] = original_df['Depression'].map({"Yes": 1, "No": 0})

In [121]:
train_df = pd.concat([train_df, original_df])

In [123]:
test_df = pd.read_csv('dataset/test.csv', index_col='id')
train_df = train_df.fillna('None').astype('string')
test_df = test_df.fillna('None').astype('string')

y = train_df['Depression']
X = train_df.drop(['Depression'], axis=1)

In [124]:
catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength':0,
    'l2_leaf_reg': 0.7047064221215757,
    'task_type':'CPU',
    'random_seed':42,
    'verbose':False
}

In [None]:
cv = StratifiedKFold(5, shuffle=True, random_state=0)
cv_splits = cv.split(X, y)
scores = []
test_preds = []
X_test_pool = Pool(test_df, cat_features=X.columns.values)
for i, (train_idx, val_idx) in enumerate(cv_splits):
    model = CatBoostClassifier(**catboost_params)
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=X.columns.values)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=X.columns.values)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=False, early_stopping_rounds=200)
    val_pred = model.predict(X_valid_pool)
    score = accuracy_score(y_val_fold, val_pred)
    scores.append(score)
    test_pred = model.predict_proba(X_test_pool)[:, 1]
    test_preds.append(test_pred)
    print(f'Fold {i + 1} accuracy_score: {score}')
print(f'Cross-validated accuracy_score: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')
print(f'Max accuracy_score score: {np.max(scores):.3f}')
print(f'Min accuracy_score score: {np.min(scores):.3f}')

Fold 1 accuracy_score: 0.9399343850342036
Fold 2 accuracy_score: 0.9403511221248823
Fold 3 accuracy_score: 0.9405954416948797
