## Winning code (simplified): Busara Mental Health Challenge by Steven Simba

In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the training and test CSVs from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Preview the first rows of the training frame as the cell output.
df_train.head()

Unnamed: 0,surveyid,village,survey_date,femaleres,age,married,children,hhsize,edu,hh_children,...,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,depressed,day_of_week
0,926,91,23-Nov-61,1,28.0,1,4,6,10,0,...,0,0.0,0,0.0,0.0,1,0.0,0,0,5
1,747,57,24-Oct-61,1,23.0,1,3,5,8,0,...,0,0.0,1,4.804611,4.804611,0,0.0,0,1,3
2,1190,115,05-Oct-61,1,22.0,1,3,5,9,0,...,0,0.0,0,8.007685,8.007685,1,0.0,0,0,5
3,1065,97,23-Sep-61,1,27.0,1,2,4,10,2,...,0,0.0,0,0.0,0.0,1,1.249199,0,0,0
4,806,42,12-Sep-61,0,59.0,0,4,6,10,4,...,0,0.0,0,0.0,0.0,0,0.0,0,0,3


## Data Cleaning

In [2]:
# Stack train and test so every transform below is applied identically to
# both; `seps` records where the training rows end so the combined frame can
# be split apart again afterwards.
seps = df_train.shape[0]
comb = pd.concat([df_train, df_test], axis=0)


def _age_to_float(raw):
    """Convert one raw 'age' cell to float, mapping the ".d" placeholder to 0.0."""
    # NOTE(review): ".d" looks like a missing-value marker; it is encoded as
    # age 0 here (as before) rather than NaN -- confirm this is intended.
    text = str(raw)
    return float("0" if text == ".d" else text)


# Single pass over 'age' instead of three separate apply() calls.
comb['age'] = comb['age'].map(_age_to_float)

# Turn the survey date strings into integer codes.
# NOTE(review): LabelEncoder numbers the distinct strings in sorted order, so
# for "dd-Mon-yy" text the codes are not chronological -- confirm acceptable.
le = LabelEncoder()
comb['survey_date'] = le.fit_transform(comb['survey_date'].map(str))

# Fill gaps in every column that has missing values by interpolating along the
# row order. The target column is deliberately excluded: interpolating it
# would invent labels for unlabeled rows (presumably all test rows).
TARGET_COL = 'depressed'
null_counts = comb.isnull().sum()
colNull = [
    col for col, n_missing in null_counts.items()
    if n_missing > 0 and col != TARGET_COL
]
for col in colNull:
    comb[col] = comb[col].interpolate()

In [3]:
# Split the combined frame back at the original train/test boundary.
# The training part is built with a non-mutating chain: calling
# dropna(inplace=True) on a slice of `comb` is a chained-assignment hazard
# whose warning would be hidden by the notebook's blanket warnings filter.
train = comb.iloc[:seps].dropna().reset_index(drop=True)
test = comb.iloc[seps:]

# Separate the target from the features; the test frame only needs the
# target column removed so its columns line up with x_train.
y_train = train['depressed']
x_train = train.drop(columns=['depressed'])
x_test = test.drop(columns=['depressed'])

## Top score was a product of blending three models: random forest, gradient boosting, and extreme gradient boosting (XGBoost)

In [4]:
# --- Model 1: gradient boosting, hard class predictions ---------------------
gb_model = GradientBoostingClassifier(n_estimators=90, max_depth=3, random_state=8)
gb_model.fit(x_train, y_train)
gb_pred = gb_model.predict(x_test)

# --- Model 2: XGBoost, thresholded on the positive-class probability --------
xgb_model = xgb.XGBClassifier(seed=3)
xgb_model.fit(x_train, y_train)
p_pred = xgb_model.predict_proba(x_test)

# XGBoost votes 1 only when P(class 1) lies strictly between 0.5 and 0.6.
# NOTE(review): probabilities >= 0.6 vote 0 here, which looks unusual; kept
# unchanged because this is presented as the winning configuration -- confirm.
xgb_pred = [int(0.5 < proba[1] < 0.6) for proba in p_pred]

# --- Model 3: random forest, hard class predictions -------------------------
rf_model = RandomForestClassifier(random_state=3, n_estimators=20)
rf_model.fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)

# --- Blend: a row is predicted 1 if any of the three models votes 1 ---------
blend = [
    1 if (gb_vote > 0 or xgb_vote > 0 or rf_vote > 0) else 0
    for gb_vote, xgb_vote, rf_vote in zip(gb_pred, xgb_pred, rf_pred)
]

# Write the submission file with the id column first, then the prediction.
submiss = pd.DataFrame({"surveyid": x_test['surveyid'], "depressed": blend})[['surveyid', 'depressed']]
submiss.to_csv("gfinal.csv", index=False)