In [116]:
import numpy as np
import pandas as pd
# Both files share one layout: ';' as the field separator and the literal
# string 'None' standing in for a missing value.
train, test = (
    pd.read_csv(csv_path, sep=';', na_values='None')
    for csv_path in ('./ml5/train.csv', './ml5/test.csv')
)

In [117]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [119]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,5,18888,1,154,85.0,130,80,1,1,0.0,0.0,1.0
1,6,19042,2,170,69.0,130,90,1,1,,0.0,1.0
2,7,20432,1,160,70.0,120,75,1,1,0.0,0.0,0.0
3,10,18133,2,185,94.0,130,90,1,1,,0.0,1.0
4,11,16093,2,170,76.0,120,80,1,1,0.0,0.0,1.0


In [148]:
# Stack the train features (target column removed) on top of the test rows so
# that imputation and encoding are applied to both with identical parameters.
# `columns=` / `axis=` are spelled out by keyword: pandas 2.0 no longer accepts
# the axis as a positional argument to DataFrame.drop or pd.concat.
data = pd.concat([train.drop(columns=['cardio']), test], axis=0)

In [149]:
# Impute every missing value with the median of its own column.
column_medians = data.median()
data = data.fillna(column_medians)

In [150]:
# Feature groups: the first list is standardised, the second is one-hot encoded.
numeric_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
# NOTE(review): the raw frames also carry an 'active' column, but it is not
# listed here and therefore never reaches the model - confirm this is intended.
category_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco']

data_numeric = data.loc[:, numeric_cols]
data_category = data.loc[:, category_cols]

# Z-score each numeric column (subtract the column mean, divide by its std).
data_numeric = (data_numeric - data_numeric.mean()) / data_numeric.std()
# One indicator column per distinct value of every categorical feature.
data_category = pd.get_dummies(data_category, columns=category_cols)

# Reassemble id + scaled numerics + indicators side by side. `axis` is passed
# by keyword because pandas 2.0 made every pd.concat argument after `objs`
# keyword-only; the old positional `1` raises a TypeError there.
data = pd.concat([data['id'], data_numeric, data_category], axis=1)

In [217]:
# Preview the first five rows of the processed feature table.
data.head(n=5)

Unnamed: 0,id,age,height,weight,ap_hi,ap_lo,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0.0,smoke_1.0,alco_0.0,alco_1.0
0,0,-0.433878,0.444742,-0.847051,-0.110882,-0.090103,0,1,1,0,0,1,0,0,1,0,1,0
1,1,0.308908,-1.016679,0.752449,0.06189,-0.034987,1,0,0,0,1,1,0,0,1,0,1,0
2,2,-0.246056,0.079387,-0.707964,0.004299,-0.145218,1,0,0,0,1,1,0,0,1,0,1,0
3,3,-0.745565,0.566527,0.543818,0.11948,0.020128,0,1,1,0,0,1,0,0,1,0,1,0
4,4,-0.805878,-1.016679,-1.264312,-0.168472,-0.200334,1,0,1,0,0,1,0,0,1,0,1,0


In [174]:
# Split the shared feature table back into its two parts by inner-joining on
# `id`: the train part also regains its `cardio` target column.
train_keys = train[['id', 'cardio']]
test_keys = test[['id']]
train = data.merge(train_keys, how='inner', on='id')
test = data.merge(test_keys, how='inner', on='id')

In [178]:
# Model inputs: `id` is only a row key and `cardio` is the target, so neither
# is used as a feature. `columns=` replaces the positional axis argument,
# which DataFrame.drop no longer accepts as of pandas 2.0.
X_train = train.drop(columns=['cardio', 'id'])
y_train = train['cardio']
X_test = test.drop(columns=['id'])

In [212]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Built-in scorer: negated log-loss computed from predict_proba. It is the same
# metric as make_scorer(log_loss, greater_is_better=False, needs_proba=True),
# without relying on `needs_proba`, which newer scikit-learn releases
# deprecate and then drop.
scoring = 'neg_log_loss'
# Shuffled 4-fold split with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=4, shuffle=True, random_state=12345)
# Candidate inverse regularisation strengths for the logistic regression.
param_grid = {'C': [1, 10, 100, 1000]}

# Everything after `param_grid` is keyword-only in current scikit-learn, so
# `scoring` must be passed by name. `return_train_score=True` keeps the
# train-score columns in cv_results_ (they are off by default in newer releases).
grid = GridSearchCV(
    LogisticRegression(),
    param_grid,
    scoring=scoring,
    cv=kfold,
    return_train_score=True,
)

In [213]:
# Run the cross-validated search, then show the per-candidate results table.
grid.fit(X_train, y_train)
cv_report = pd.DataFrame(grid.cv_results_)
cv_report

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.375325,0.012558,-0.579438,-0.57857,1,{'C': 1},4,-0.581269,-0.572999,-0.579327,-0.580132,-0.579406,-0.579677,-0.577751,-0.58147,0.065044,0.00392,0.001246,0.003283
1,0.357905,0.008362,-0.579227,-0.578351,10,{'C': 10},3,-0.581252,-0.572888,-0.579042,-0.579887,-0.579192,-0.579414,-0.577424,-0.581216,0.060165,0.000191,0.001359,0.003223
2,0.339097,0.007906,-0.579207,-0.57833,100,{'C': 100},1,-0.581251,-0.572877,-0.579014,-0.579862,-0.579171,-0.579388,-0.577393,-0.581191,0.051088,0.000264,0.00137,0.003217
3,0.324458,0.008128,-0.579208,-0.57833,1000,{'C': 1000},2,-0.581251,-0.572876,-0.579011,-0.57986,-0.579169,-0.579385,-0.577402,-0.581199,0.043163,0.000411,0.001367,0.003219


In [215]:
# Keep only the second probability column returned by the best estimator
# (presumably P(cardio == 1), given a 0/1 target - confirm via `classes_`).
y_test = grid.best_estimator_.predict_proba(X_test)[:, 1]

In [216]:
# One predicted probability per line, in the row order of X_test.
submission_lines = ['%s\n' % proba for proba in y_test]
with open('submission.txt', 'w') as dst:
    dst.write(''.join(submission_lines))