In [4]:
# Import Libraries
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

In [5]:
# Read in Data

data = pd.read_csv("train.csv", index_col='id')

more_data = pd.read_csv('kidney stone urine analysis.csv')

data = pd.concat([data, more_data])
data.tail()

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
74,1.025,7.9,721,23.6,301,9.04,1
75,1.017,4.81,410,13.3,195,0.58,1
76,1.024,5.4,803,21.8,394,7.82,1
77,1.016,6.81,594,21.4,255,12.2,1
78,1.015,6.03,416,12.8,178,9.39,1


In [6]:
numerical_transformer = Pipeline(steps=[
                                ('scaler', StandardScaler())
                                ])

num_features = ['cond', 'calc']

preprocessor = ColumnTransformer([
                                ('numericals', numerical_transformer, num_features)
                                ])

pipeline = Pipeline([
                    ('preprocessing', preprocessor),
                    ('model', XGBClassifier(learning_rate=0.1, random_state=8))
])

In [9]:
X=data.drop('target', axis=1)
y=data['target']

In [8]:
params = {
    #'model__learning_rate': np.arange(0.05, 0.15, 0.05),
    'model__max_depth': [3,4],
    'model__n_estimators': [10,12],
    'model__colsample_bytree': [0.75, 0.8, 0.85], 
    'model__gamma': [0.15, 0.2],             
    'model__reg_alpha': [0.8, 0.85, 0.9],            
    'model__reg_lambda': [0.85, 0.9],           
    'model__scale_pos_weight': [0.5, 0.6, 0.7],       
    'model__subsample': [0.75, 0.8]         
}

gscv = GridSearchCV(estimator=pipeline,
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [10]:
gscv.fit(X,y)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


In [11]:
print(f'Best Score {gscv.best_score_}')
print(f'Best Parameters {gscv.best_params_}')

Best Score 0.8148347107438016
Best Parameters {'model__colsample_bytree': 0.75, 'model__gamma': 0.2, 'model__max_depth': 4, 'model__n_estimators': 10, 'model__reg_alpha': 0.85, 'model__reg_lambda': 0.9, 'model__scale_pos_weight': 0.5, 'model__subsample': 0.75}


In [12]:
sample = pd.read_csv("sample_submission.csv", index_col='id')
sample.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
414,0.5
415,0.5
416,0.5
417,0.5
418,0.5


In [13]:
X_test = pd.read_csv('test.csv')
X_test.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc
0,414,1.017,5.24,345,11.5,152,1.16
1,415,1.02,5.68,874,29.0,385,3.46
2,416,1.024,5.36,698,19.5,354,13.0
3,417,1.02,5.33,668,25.3,252,3.46
4,418,1.011,5.87,567,29.0,457,2.36


In [14]:
y_test = pd.DataFrame(gscv.predict_proba(X_test)[:,1], index=X_test['id'], columns=['target'])
y_test.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
414,0.244335
415,0.379965
416,0.537478
417,0.337397
418,0.310878


In [15]:
y_test.to_csv('predictions.csv')