## Imports

In [1]:
import pandas as pd
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# Read the competition's train and test splits in one pass; the generator
# yields the frames in the same order as the paths are listed.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/analyze-the-insights-over-mental-health-data/train.csv',
        '/kaggle/input/analyze-the-insights-over-mental-health-data/test.csv',
    )
)

## Data Cleaning

In [2]:
# Set the test ids aside first: they are removed from the feature frame below
# but are needed again when the submission file is assembled.
ID = test['id']

# 'Name' and 'id' are removed from both frames so they are not fed to the model.
identifier_cols = ['Name', 'id']
train = train.drop(columns=identifier_cols)
test = test.drop(columns=identifier_cols)

ID

0        140700
1        140701
2        140702
3        140703
4        140704
          ...  
93795    234495
93796    234496
93797    234497
93798    234498
93799    234499
Name: id, Length: 93800, dtype: int64

In [3]:
# Report how many distinct values each training column holds.
# dropna=False keeps missing values in the count, as a plain unique() does.
columns = list(train.columns)

for col_name in columns:
    print(col_name, train[col_name].nunique(dropna=False))

Gender 2
Age 43
City 98
Working Professional or Student 2
Profession 65
Academic Pressure 6
Work Pressure 6
CGPA 332
Study Satisfaction 6
Job Satisfaction 6
Sleep Duration 36
Dietary Habits 24
Degree 116
Have you ever had suicidal thoughts ? 2
Work/Study Hours 13
Financial Stress 6
Family History of Mental Illness 2
Depression 2


### One-hot encoding

In [4]:
# Separate the label column from the predictor columns.
target_name = 'Depression'
train_X = train.drop(columns=target_name)
train_y = train[target_name]

In [5]:
# Partition the feature columns by dtype so that EVERY column is routed to
# exactly one transformer. ColumnTransformer drops any column not listed in a
# transformer (remainder='drop' is its default), so selecting only specific
# dtypes such as int64/float64 would silently discard e.g. int32, float32 or
# bool columns. Selecting "number" vs. "everything else" closes that gap.
numeric_cols = train_X.select_dtypes(include='number').columns
categorical_cols = train_X.select_dtypes(exclude='number').columns

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories that appear only at predict time
        # are encoded as all-zero rows instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        # Numeric columns are forwarded to the model unchanged.
        ('num', 'passthrough', numeric_cols)
    ]
)

In [6]:
# Preprocessing and the classifier live in one Pipeline so the encoder is
# re-fitted inside every CV fold rather than on the full training set.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # n_jobs=1: the grid search below already parallelises across candidates
    # and folds; letting every XGBoost fit also grab all threads makes the two
    # levels of parallelism contend for the same cores.
    # random_state is pinned explicitly so the row subsampling (subsample=0.8)
    # is reproducible and does not rely on a library default.
    ('model', XGBClassifier(n_jobs=1, random_state=0))
])

# Exhaustive grid: 2 x 2 x 2 x 2 = 16 candidates, each evaluated with 5-fold CV.
params = {
    'model__n_estimators': [300, 500],
    'model__max_depth': [4, 6],
    'model__learning_rate': [0.03, 0.05],
    'model__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    pipeline,
    params,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)

grid_search.fit(train_X, train_y)

print("Best params (grid):", grid_search.best_params_)
# Report the score alongside the parameters so the selection can be judged.
print("Best CV ROC AUC (grid):", grid_search.best_score_)

Best params (grid): {'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__n_estimators': 500, 'model__subsample': 0.8}


In [7]:
# Preview the expected submission layout.
sample_path = "/kaggle/input/analyze-the-insights-over-mental-health-data/sample_submission.csv"
sample = pd.read_csv(sample_path)
sample.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,0
4,140704,0


In [8]:
# Score the test set with the pipeline refitted on the best grid candidate.
best_model = grid_search.best_estimator_

# predict_proba returns one column per class; column 1 is taken as the
# positive-class score.
class_probabilities = best_model.predict_proba(test)
predicted = class_probabilities[:, 1]

# Scores at or above 0.5 are labelled 1, everything else 0.
predicted_binary = (predicted >= 0.5).astype(int)

# Pair the ids saved earlier with the hard labels and write the submission.
final_submission = pd.DataFrame({"id": ID}).assign(Depression=predicted_binary)
final_submission.to_csv('submission.csv', index=False)