In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled training table and the unlabelled test table.
train, test = (pd.read_csv(csv_name) for csv_name in ('TRAIN.csv', 'TEST.csv'))

In [None]:
# Preview the first rows instead of rendering the whole frame.
train.head()

In [None]:
# Separate statements so the cell does not end in a (None, None) tuple
# that the notebook would echo as output.
print(train.shape)
print(test.shape)

In [None]:
# Per-column summary indexed by the train columns: share of nulls and mean
# in each table, plus the train dtypes. Test-side values are aligned by
# column name, so columns absent from `test` come out as NaN.
feat_research = (
    train.isna().sum().div(train.shape[0])
    .to_frame('train_null_share')
    .assign(
        test_null_share=test.isna().sum().div(test.shape[0]),
        train_dtypes=train.dtypes,
        train_mean=train.mean(),
        test_mean=test.mean(),
    )
)

feat_research

In [None]:
import matplotlib.pyplot as plt

In [None]:

# Overlay train/test histograms for every feature the two tables share.
# The first train column is skipped (presumably the row id -- TODO confirm).
# Only columns that also exist in `test` are plotted, so the grid has exactly
# one panel per plotted feature and no lookup in `test` can fail.
shared_feats = [col for col in train.columns[1:] if col in test.columns]

# squeeze=False keeps `ax` two-dimensional even when a single feature is plotted.
fig, ax = plt.subplots(len(shared_feats), 1, figsize=(15, 25), squeeze=False)

for row, feat in enumerate(shared_feats):
    panel = ax[row, 0]
    panel.hist(train[feat], label=feat+'_train', alpha=0.5, color='blue')
    panel.hist(test[feat], label=feat+'_test', alpha=0.5, color='red')
    panel.legend()

plt.show()

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlations between the train columns.
train_corr_ax = sns.heatmap(train.corr())
train_corr_ax.set_title('train_corr')
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the test columns.

test_corr_ax = sns.heatmap(test.corr())
test_corr_ax.set_title('test_corr')
plt.show()

In [None]:
# Remove the `id` column from both tables.
# NOTE(review): the previous comment here described dropping *doctor*
# (correlated with *stress*); this cell only drops `id` -- confirm intent.
train, test = (frame.drop(columns=['id']) for frame in (train, test))

In [None]:
# fix nulls


def fill_feature_nulls(df, sport_median):
    """Return a copy of ``df`` with nulls in the imputed features filled.

    * ``pernicious_1`` / ``pernicious_2`` correlate a little with ``sex``,
      so a missing value becomes 1 where ``sex == 2`` and 0 otherwise.
    * ``sport`` nulls are replaced with ``sport_median``; non-null values
      are left untouched.

    Must run while ``sex`` is still coded 1/2 (before it is shifted to 0/1).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``sex``, ``pernicious_1``, ``pernicious_2``, ``sport``.
    sport_median : float
        Value used for missing ``sport`` entries.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    filled = df.copy()
    # Row-wise default: 1 for sex == 2, 0 for everything else (incl. null sex).
    sex_based_default = (filled['sex'] == 2).astype(int)
    for col in ('pernicious_1', 'pernicious_2'):
        # fillna with a Series fills each null from the same index label.
        filled[col] = filled[col].fillna(sex_based_default)
    # Fill only the gaps; assigning the bare median would flatten the column.
    filled['sport'] = filled['sport'].fillna(sport_median)
    return filled


# Take the median from train and reuse it for test so both tables are
# imputed with the same statistic.
sport_median = train['sport'].median()
train = fill_feature_nulls(train, sport_median)
test = fill_feature_nulls(test, sport_median)

In [None]:
# one-hot-encoding categorical features

# Train defines the dummy columns; `stress_3` is dropped as the reference level.
stress_train_ohe = pd.get_dummies(train['stress'], prefix='stress', dtype=int).drop(columns='stress_3')

# Force test onto exactly the train dummy columns: a level missing from test
# becomes an all-zero column, and `stress_3` or a level unseen in train is
# discarded, so both tables always end up with the same features.
stress_test_ohe = (
    pd.get_dummies(test['stress'], prefix='stress', dtype=int)
    .reindex(columns=stress_train_ohe.columns, fill_value=0)
)

train = train.drop(columns='stress')
test = test.drop(columns='stress')

train = train.join(stress_train_ohe)
test = test.join(stress_test_ohe)

In [None]:
# *sex*: 1/2 -> 0/1
# Recode both tables: the model is fitted on train and applied to test,
# so the two must share one encoding.
train['sex'] -= 1
test['sex'] -= 1

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the labelled rows for validation; the seed is fixed so the
# split is repeatable.

features = train.drop(columns='insomnia')
target = train['insomnia']
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, random_state=13
)

In [None]:
# Baseline model: standardise the features, then fit a logistic regression.

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss


pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(random_state=13)),
]).fit(X_train, y_train)

# Validation log loss; as the last expression it is shown as the cell output.
val_proba = pipe.predict_proba(X_val)
log_loss(y_val, val_proba)

In [None]:
# CatBoost on degree-2 polynomial features, standardised before fitting.

from catboost import CatBoostClassifier
from sklearn.preprocessing import PolynomialFeatures


pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures(2)),
    ('scaler', StandardScaler()),
    ('cbr', CatBoostClassifier(silent=True, random_seed=13)),
]).fit(X_train, y_train)

# Validation log loss; as the last expression it is shown as the cell output.
val_proba = pipe.predict_proba(X_val)
log_loss(y_val, val_proba)

In [None]:
# catboostclassifier gridsearchcv

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


pipe = Pipeline([('poly', PolynomialFeatures(1)),
                 ('scaler', StandardScaler()),
                 ('cbr', CatBoostClassifier(silent=True, random_seed=13, loss_function='Logloss'))])

# Grid over the number of boosting iterations and the tree depth.
cbr_par = {'cbr__iterations': [20, 50, 100, 250],
           'cbr__depth': [3, 5, 7]}

# Select by (negated) log loss so the search optimises the metric reported
# below; with no `scoring`, GridSearchCV falls back to the estimator's own
# score, which for a classifier is accuracy.
cbr_gscv = GridSearchCV(pipe, cbr_par, scoring='neg_log_loss')
cbr_gscv.fit(X_train, y_train)
cbr_best = cbr_gscv.best_estimator_

print('best params: {}'.format(cbr_gscv.best_params_))
print('logloss on best model: {}'.format(log_loss(y_val, cbr_best.predict_proba(X_val))))

In [None]:
# feature importances for best model

# CatBoost is fitted on the output of the `poly` step, not on X_train itself,
# and that step adds a bias column. Label the importances with the names the
# `poly` step produces so each value lines up with its feature.
expanded_names = cbr_best.named_steps['poly'].get_feature_names_out(X_train.columns)
importances = pd.Series(cbr_best.named_steps['cbr'].feature_importances_,
                        index=expanded_names)
importances.sort_values(ascending=False)

In [None]:
# Score the test table on exactly the training features, in training order;
# a column missing from `test` raises a KeyError here instead of producing
# silently misaligned predictions. Column 1 of predict_proba is kept.
test_pred = cbr_best.predict_proba(test[X_train.columns])[:, 1]

plt.hist(test_pred)
plt.title('test pred distribution')
plt.show()

In [None]:
# `id` was dropped from `test` before modelling, so re-read it from the source
# file. No rows are added or removed from `test` in this notebook, so the ids
# line up with `test_pred` by position.
test_ids = pd.read_csv('TEST.csv', usecols=['id'])
test_submission = test_ids.assign(insomnia=test_pred)
test_submission.head()

In [None]:
# Write the submission to disk without the DataFrame index.
submission_path = 'test_submission.csv'
test_submission.to_csv(submission_path, index=False)