# Women in Data Science Datathon 2020
## Contributors:
1. Aarthe Jayaprakash
2. Chaitanya Chaphalkar
3. Eashani Deorukhkar

Importing environment variables and dataset.
Dataset: https://www.kaggle.com/c/17807/download-all

In [1]:
import numpy as np
import lightgbm as lgb
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
import pandas as pd
import os
import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support, roc_auc_score)
from sklearn.model_selection import StratifiedKFold, GroupKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import cross_val_score
%matplotlib inline 

In [2]:
data = pd.read_csv("training_v2.csv")
test = pd.read_csv("unlabeled.csv")

In [3]:
data.hospital_death.value_counts()

0    83798
1     7915
Name: hospital_death, dtype: int64

The data is skewed by a huge ratio. It needs to be upsampled for a fair analysis.

### Upsampling the data to remove imbalance in the target column.

In [4]:
df_majority = data[data.hospital_death==0]
df_minority = data[data.hospital_death==1]

In [5]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=83798,    # to match majority class
                                 random_state= 303) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.hospital_death.value_counts()

1    83798
0    83798
Name: hospital_death, dtype: int64

In [6]:
data=df_upsampled

### Setting the target column and looking at column names.

In [7]:
y = data['hospital_death']
X = data
X = X.drop(['hospital_death'], axis=1)

In [8]:
test = test.drop(['hospital_death'], axis=1)
test.columns

Index(['encounter_id', 'patient_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'ethnicity', 'gender', 'height',
       'hospital_admit_source',
       ...
       'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
       'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'apache_3j_bodysystem',
       'apache_2_bodysystem'],
      dtype='object', length=185)

In [9]:
X.head()
X.columns

Index(['encounter_id', 'patient_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'ethnicity', 'gender', 'height',
       'hospital_admit_source',
       ...
       'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
       'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'apache_3j_bodysystem',
       'apache_2_bodysystem'],
      dtype='object', length=185)

## Data Cleaning

### Removing columns that have a high degree of correlation with one another (>0.9)

In [10]:
# threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix
corr_matrix = X.corr().abs()
corr_matrix.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,...,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
encounter_id,1.0,0.015342,0.007626,0.003655,0.003362,0.006607,0.00861,0.005902,0.003817,,...,0.002492,0.004714,0.002806,0.010884,0.010205,0.007714,0.00109,0.002507,0.004454,0.004721
patient_id,0.015342,1.0,0.00714,0.004471,0.007138,0.001852,0.004753,0.007158,0.021116,,...,0.005151,0.006286,0.002369,0.004249,0.000116,0.001577,0.005193,0.009341,0.000951,0.006784
hospital_id,0.007626,0.00714,1.0,0.013256,0.027932,0.041729,0.020469,0.011263,0.007547,,...,0.015605,0.005726,0.013256,0.001238,0.023915,0.002574,0.006815,0.000989,0.003505,0.004299
age,0.003655,0.004471,0.013256,1.0,0.120479,0.021266,0.110775,0.020419,0.046786,,...,0.137545,0.0714,0.038805,0.053937,0.062828,0.052939,0.007532,0.024258,0.020561,0.003582
bmi,0.003362,0.007138,0.027932,0.120479,1.0,0.015265,0.056373,0.005975,0.003669,,...,0.040753,0.016057,0.024159,0.011689,0.174443,0.015488,0.040154,0.015329,0.021958,0.054457


In [11]:
# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()

# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print(to_drop)
print('There are %d columns to remove.' % (len(to_drop)))
#Drop the columns with high correlations
X = X.drop(columns = to_drop)
test=test.drop(columns=to_drop)

['paco2_for_ph_apache', 'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min', 'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min', 'h1_mbp_noninvasive_max', 'h1_mbp_noninvasive_min', 'h1_sysbp_noninvasive_max', 'h1_sysbp_noninvasive_min', 'd1_albumin_max', 'd1_albumin_min', 'd1_bilirubin_max', 'd1_bilirubin_min', 'd1_bun_max', 'd1_bun_min', 'd1_creatinine_max', 'd1_creatinine_min', 'd1_glucose_max', 'd1_hemaglobin_min', 'd1_hematocrit_max', 'd1_hematocrit_min', 'd1_platelets_min', 'd1_wbc_max', 'd1_wbc_min', 'h1_albumin_min', 'h1_bilirubin_max', 'h1_bilirubin_min', 'h1_bun_max', 'h1_bun_min', 'h1_calcium_min', 'h1_creatinine_max', 'h1_creatinine_min', 'h1_glucose_min', 'h1_hco3_min', 'h1_hemaglobin_max', 'h1_hemaglobin_min', 'h1_hematocrit_max', 'h1_hematocrit_min', 'h1_inr_max', 'h1_inr_min', 'h1_lactate_min', 'h1_platelets_max', 'h1_platelets_min', 'h1_potassium_min', 

## Handling missing values

### Identifying columns having more than 75% missing values

In [12]:
# Train missing values (in percent)
train_missing = (X.isnull().sum() / len(X)).sort_values(ascending = False)
train_missing.head()
train_missing = train_missing.index[train_missing > 0.75]
print('There are %d columns with more than 75%% missing values' % len(train_missing))
X = X.drop(columns = train_missing)
test=test.drop(columns=train_missing)

There are 17 columns with more than 75% missing values


In [13]:
col=list(X.nunique().sort_values().index) 
l=list(X.nunique().sort_values())
col_list = pd.DataFrame({'col': col,'list':l})

### Displaying columns along with number of unique values.

In [14]:
pd.set_option('display.max_rows', col_list.shape[0]+1)
print(col_list)

                               col   list
0               readmission_status      1
1                       arf_apache      2
2                        cirrhosis      2
3                 intubated_apache      2
4                diabetes_mellitus      2
5            apache_post_operative      2
6                gcs_unable_apache      2
7                  hepatic_failure      2
8                ventilated_apache      2
9                 elective_surgery      2
10               immunosuppression      2
11                        leukemia      2
12                        lymphoma      2
13     solid_tumor_with_metastasis      2
14                          gender      2
15                            aids      2
16                   icu_stay_type      3
17                 gcs_eyes_apache      4
18               gcs_verbal_apache      5
19                icu_admit_source      5
20                gcs_motor_apache      6
21                       ethnicity      6
22                        icu_type

### Categorizing columns as single, binary, categorical, IDs and miscellaneous.

1. If the column has a single unique value it can be removed as it carries no value to our predictions. 
2. If the column has binary values, missing cells can be replaced with most frequent value(mode).
3. If the column is a category(less than 30 unique values), missing values can be replaced with most frequent value(mode).
4. If the columns are IDs they need to be excluded from training models.
5. All the remaining features were numeric(float64) and are replaced with average(mean) value.

In [15]:
single=set(col_list['col'][col_list['list']==1])
binary=set(col_list['col'][col_list['list']==2])
category=set(col_list['col'][(col_list['list']<=30) & (col_list['list']>2)])
ids=set(col_list['col'][col_list['list']>90000])
misc=set(set(col_list['col'][(col_list['list']>30)& (col_list['list']<=90000)])-set(ids))

In [16]:
X=X.drop(columns=single)
test=test.drop(columns=single)

In [17]:
ids.add('hospital_id')
ids.add('icu_id')
ids

{'encounter_id', 'hospital_id', 'icu_id', 'patient_id'}

In [18]:
misc.remove('hospital_id')
misc.remove('icu_id')

Filling missing values with appropriate replacements.

In [19]:
for col in binary:
    X[col].fillna(X[col].mode()[0], inplace=True)
    test[col].fillna(test[col].mode()[0], inplace=True)

In [20]:
for i in X[category]:
    X[i].fillna(X[i].mode()[0], inplace=True)
    test[i].fillna(X[i].mode()[0], inplace=True)  

In [21]:
for col in misc:
    X[col].fillna(X[col].mean(), inplace=True)
    test[col].fillna(X[col].mean(), inplace=True)

In [22]:
test.isna().sum().sum()

0

Removing ID columns from X variable.

In [23]:
X_col=list(X.columns.difference(ids))

In [24]:
X = pd.get_dummies(X[X_col])
test=pd.get_dummies(test)

### Making sure all there are no differences between X and test indexes.

In [25]:
train_df=list(set(X.columns).difference(set(test.columns)))
df_train=list(set(test.columns).difference(set(X.columns)))
train_df

['hospital_admit_source_Observation']

In [26]:
X = X.drop(columns = train_df)

### Calculating feature importances to remove columns with zero value to the analysis

In [27]:
# Initialize an empty array to hold feature importances
feature_importances = np.zeros(X.shape[1])

In [28]:
%%time
# Create the model with several hyperparameters
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')  

for i in range(2):
    
    # Split into training and validation set
    train_features, valid_features, train_y, valid_y = train_test_split(X
                                                                        , y, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)],eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    feature_importances += model.feature_importances_

Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.955175	valid_0's binary_logloss: 0.280351
[400]	valid_0's auc: 0.977465	valid_0's binary_logloss: 0.209917
[600]	valid_0's auc: 0.986838	valid_0's binary_logloss: 0.165247
[800]	valid_0's auc: 0.991365	valid_0's binary_logloss: 0.134865
[1000]	valid_0's auc: 0.993973	valid_0's binary_logloss: 0.112442
[1200]	valid_0's auc: 0.995322	valid_0's binary_logloss: 0.0958812
[1400]	valid_0's auc: 0.99628	valid_0's binary_logloss: 0.0838154
[1600]	valid_0's auc: 0.996799	valid_0's binary_logloss: 0.0748834
[1800]	valid_0's auc: 0.99717	valid_0's binary_logloss: 0.0677781
[2000]	valid_0's auc: 0.9975	valid_0's binary_logloss: 0.0622224
[2200]	valid_0's auc: 0.997722	valid_0's binary_logloss: 0.0580422
[2400]	valid_0's auc: 0.99794	valid_0's binary_logloss: 0.0547563
[2600]	valid_0's auc: 0.998104	valid_0's binary_logloss: 0.0522396
[2800]	valid_0's auc: 0.998204	valid_0's binary_logloss: 0.0504159
[3000]	valid_0

In [29]:
# Make sure to average feature importances! 
feature_importances = feature_importances / 2
feature_importances = pd.DataFrame({'feature': list(X.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)
feature_importances.head()

Unnamed: 0,feature,importance
5,apache_4a_hospital_death_prob,2497.5
91,pre_icu_los_days,2490.0
41,d1_platelets_max,2325.5
9,bmi,2150.0
0,age,2099.5


In [30]:
# Find the features with zero importance
zero_features = list(feature_importances[feature_importances['importance'] == 0.0]['feature'])
print('There are %d features with 0.0 importance' % len(zero_features))
feature_importances.tail()
# Drop features with zero importance
X.drop(columns = zero_features, inplace=True)
test.drop(columns=zero_features,inplace=True)

There are 10 features with 0.0 importance


Making sure there is no difference between indexes of X and test frames after removing few columns.

In [31]:
train_df=list(set(X.columns).difference(set(test.columns)))
df_train=list(set(test.columns).difference(set(X.columns)))
train_df

[]

### Splitting the X and y variable for modeling.

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

### xgboost model with binary logistic tree. Runtime:16 mins.

In [33]:
%%time
xgb3=XGBClassifier(
    n_estimators=3000,
    objective='binary:logistic',
    booster="gbtree",
    learning_rate=0.01,
    scale_pos_weight=1,
    max_depth=4,
    min_child_weight=6,
    gamma=0,
    subsample=0.4,
    colsample_bytree=0.8,
    reg_alpha=0.08,
#         n_jobs=-1
)
xgb3.fit(X_train, y_train)
scores = cross_val_score(xgb3, X_train, y_train, cv=5, scoring='roc_auc')
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.93784421 0.93643406 0.93540328 0.93603462 0.93918211]
Mean: 0.9369796557312581
Standard Deviation: 0.001361911478161707
Wall time: 18min 8s


In [34]:
y_pred = xgb3.predict(test[X_train.columns])
newdata = pd.DataFrame({'encounter_id': test['encounter_id'],'hospital_death':y_pred})
newdata.to_csv('submissionXG.csv', index=False)

#### Results in 0.61 AUC after submission.

### Switching to LightGBM model and hyper parameter tuning with randomsearch.

#### Initial run with randomized values for 'cat_smooth', min_data_per_group' and 'max_cat_threshold'. Running it multiple times to find reasonable values.

In [35]:
clf = lgb.LGBMClassifier(silent=True, random_state = 304, metric='roc_auc', n_jobs=4)
params ={'cat_smooth' : sp_randint(1, 100), 'min_data_per_group': sp_randint(1,1000), 'max_cat_threshold': sp_randint(1,100)}

In [36]:
fit_params={"early_stopping_rounds":2, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_train, y_train),(X_test,y_test)],
            'eval_names': ['train','valid'],
            'verbose': 300,
            'categorical_feature': 'auto'}

In [37]:
gs = RandomizedSearchCV( estimator=clf, param_distributions=params, scoring='roc_auc',cv=3, refit=True,random_state=304,verbose=True)

In [38]:
gs.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training until validation scores don't improve for 2 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.943806	valid's auc: 0.935552
Training until validation scores don't improve for 2 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.944643	valid's auc: 0.935952
Training until validation scores don't improve for 2 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.944234	valid's auc: 0.935835
Training until validation scores don't improve for 2 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.943806	valid's auc: 0.935552
Training until validation scores don't improve for 2 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.944643	valid's auc: 0.935952
Training until validation scores don't improve for 2 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.944234	valid's auc: 0.935835
Training until validation scores don't improve for 2

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.7min finished


Training until validation scores don't improve for 2 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.945751	valid's auc: 0.937304
Best score reached: 0.9355465962753253 with params: {'cat_smooth': 32, 'max_cat_threshold': 75, 'min_data_per_group': 82} 


In [39]:
gs.best_params_, gs.best_score_

({'cat_smooth': 32, 'max_cat_threshold': 75, 'min_data_per_group': 82},
 0.9355465962753253)

#### Based on previous results, finding optimal 'learning_rate' and 'num_iterations'

In [40]:
clf2 = lgb.LGBMClassifier(random_state=304, metric = 'roc_auc', cat_smooth = 32, max_cat_threshold = 75, min_data_per_group = 82, n_jobs=-1)

In [41]:
params_2 = {'learning_rate': [0.04, 0.05, 0.08, 0.1],   
            'num_iterations': [1000, 1200, 1400, 1600]}

In [42]:
gs2 = RandomizedSearchCV(estimator=clf2, param_distributions=params_2, scoring='roc_auc',cv=3,refit=True,random_state=304,verbose=True)

In [43]:
%%time
gs2.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs2.best_score_, gs2.best_params_))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973123	valid's auc: 0.960687
[600]	train's auc: 0.990784	valid's auc: 0.979357
[900]	train's auc: 0.995531	valid's auc: 0.987421
Early stopping, best iteration is:
[924]	train's auc: 0.995735	valid's auc: 0.987852




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973699	valid's auc: 0.960203
[600]	train's auc: 0.991079	valid's auc: 0.978942
[900]	train's auc: 0.995841	valid's auc: 0.986973
Early stopping, best iteration is:
[1072]	train's auc: 0.997038	valid's auc: 0.989753




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973246	valid's auc: 0.960471
[600]	train's auc: 0.990646	valid's auc: 0.978793
Early stopping, best iteration is:
[844]	train's auc: 0.995193	valid's auc: 0.986388




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973123	valid's auc: 0.960687
[600]	train's auc: 0.990784	valid's auc: 0.979357
[900]	train's auc: 0.995531	valid's auc: 0.987421
Early stopping, best iteration is:
[924]	train's auc: 0.995735	valid's auc: 0.987852




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973699	valid's auc: 0.960203
[600]	train's auc: 0.991079	valid's auc: 0.978942
[900]	train's auc: 0.995841	valid's auc: 0.986973
Early stopping, best iteration is:
[1072]	train's auc: 0.997038	valid's auc: 0.989753




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973246	valid's auc: 0.960471
[600]	train's auc: 0.990646	valid's auc: 0.978793
Early stopping, best iteration is:
[844]	train's auc: 0.995193	valid's auc: 0.986388




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.951034	valid's auc: 0.941721
[600]	train's auc: 0.973711	valid's auc: 0.960795
[900]	train's auc: 0.985212	valid's auc: 0.972383
[1200]	train's auc: 0.990976	valid's auc: 0.979699
Did not meet early stopping. Best iteration is:
[1400]	train's auc: 0.993228	valid's auc: 0.983174




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.950982	valid's auc: 0.941338
[600]	train's auc: 0.974214	valid's auc: 0.961077
[900]	train's auc: 0.985605	valid's auc: 0.972439
[1200]	train's auc: 0.99126	valid's auc: 0.979481
Did not meet early stopping. Best iteration is:
[1400]	train's auc: 0.993563	valid's auc: 0.982997




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.951405	valid's auc: 0.942109
[600]	train's auc: 0.973939	valid's auc: 0.961204
[900]	train's auc: 0.985232	valid's auc: 0.972669
[1200]	train's auc: 0.990997	valid's auc: 0.979838
Did not meet early stopping. Best iteration is:
[1400]	train's auc: 0.993324	valid's auc: 0.983303




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.95786	valid's auc: 0.947156
[600]	train's auc: 0.980188	valid's auc: 0.967003
[900]	train's auc: 0.989689	valid's auc: 0.977787
[1200]	train's auc: 0.993881	valid's auc: 0.984198
Early stopping, best iteration is:
[1402]	train's auc: 0.995468	valid's auc: 0.987141




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958466	valid's auc: 0.947517
[600]	train's auc: 0.9807	valid's auc: 0.96734
[900]	train's auc: 0.990388	valid's auc: 0.97827
[1200]	train's auc: 0.994491	valid's auc: 0.984604
Early stopping, best iteration is:
[1447]	train's auc: 0.996207	valid's auc: 0.987988




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958494	valid's auc: 0.947672
[600]	train's auc: 0.980465	valid's auc: 0.967058
[900]	train's auc: 0.989795	valid's auc: 0.977878
[1200]	train's auc: 0.993936	valid's auc: 0.984161
Early stopping, best iteration is:
[1396]	train's auc: 0.995489	valid's auc: 0.987043




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.95786	valid's auc: 0.947156
[600]	train's auc: 0.980188	valid's auc: 0.967003
[900]	train's auc: 0.989689	valid's auc: 0.977787
[1200]	train's auc: 0.993881	valid's auc: 0.984198
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.993881	valid's auc: 0.984198




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958466	valid's auc: 0.947517
[600]	train's auc: 0.9807	valid's auc: 0.96734
[900]	train's auc: 0.990388	valid's auc: 0.97827
[1200]	train's auc: 0.994491	valid's auc: 0.984604
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.994491	valid's auc: 0.984604




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958494	valid's auc: 0.947672
[600]	train's auc: 0.980465	valid's auc: 0.967058
[900]	train's auc: 0.989795	valid's auc: 0.977878
[1200]	train's auc: 0.993936	valid's auc: 0.984161
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.993936	valid's auc: 0.984161




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.979657	valid's auc: 0.966654
[600]	train's auc: 0.993951	valid's auc: 0.984239
Early stopping, best iteration is:
[854]	train's auc: 0.996748	valid's auc: 0.990077




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.980695	valid's auc: 0.966642
[600]	train's auc: 0.994301	valid's auc: 0.984024
Early stopping, best iteration is:
[828]	train's auc: 0.996873	valid's auc: 0.989365




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.980125	valid's auc: 0.966913
[600]	train's auc: 0.993923	valid's auc: 0.983808
Early stopping, best iteration is:
[731]	train's auc: 0.995916	valid's auc: 0.987763




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973123	valid's auc: 0.960687
[600]	train's auc: 0.990784	valid's auc: 0.979357
[900]	train's auc: 0.995531	valid's auc: 0.987421
Early stopping, best iteration is:
[924]	train's auc: 0.995735	valid's auc: 0.987852




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973699	valid's auc: 0.960203
[600]	train's auc: 0.991079	valid's auc: 0.978942
[900]	train's auc: 0.995841	valid's auc: 0.986973
Did not meet early stopping. Best iteration is:
[1000]	train's auc: 0.996615	valid's auc: 0.988709




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973246	valid's auc: 0.960471
[600]	train's auc: 0.990646	valid's auc: 0.978793
Early stopping, best iteration is:
[844]	train's auc: 0.995193	valid's auc: 0.986388




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.95786	valid's auc: 0.947156
[600]	train's auc: 0.980188	valid's auc: 0.967003
[900]	train's auc: 0.989689	valid's auc: 0.977787
[1200]	train's auc: 0.993881	valid's auc: 0.984198
Did not meet early stopping. Best iteration is:
[1400]	train's auc: 0.995447	valid's auc: 0.987108




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958466	valid's auc: 0.947517
[600]	train's auc: 0.9807	valid's auc: 0.96734
[900]	train's auc: 0.990388	valid's auc: 0.97827
[1200]	train's auc: 0.994491	valid's auc: 0.984604
Did not meet early stopping. Best iteration is:
[1400]	train's auc: 0.995941	valid's auc: 0.98742




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958494	valid's auc: 0.947672
[600]	train's auc: 0.980465	valid's auc: 0.967058
[900]	train's auc: 0.989795	valid's auc: 0.977878
[1200]	train's auc: 0.993936	valid's auc: 0.984161
Early stopping, best iteration is:
[1396]	train's auc: 0.995489	valid's auc: 0.987043




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.979657	valid's auc: 0.966654
[600]	train's auc: 0.993951	valid's auc: 0.984239
Early stopping, best iteration is:
[854]	train's auc: 0.996748	valid's auc: 0.990077




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.980695	valid's auc: 0.966642
[600]	train's auc: 0.994301	valid's auc: 0.984024
Early stopping, best iteration is:
[828]	train's auc: 0.996873	valid's auc: 0.989365




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.980125	valid's auc: 0.966913
[600]	train's auc: 0.993923	valid's auc: 0.983808
Early stopping, best iteration is:
[731]	train's auc: 0.995916	valid's auc: 0.987763




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.95786	valid's auc: 0.947156
[600]	train's auc: 0.980188	valid's auc: 0.967003
[900]	train's auc: 0.989689	valid's auc: 0.977787
Did not meet early stopping. Best iteration is:
[1000]	train's auc: 0.991341	valid's auc: 0.980153




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958466	valid's auc: 0.947517
[600]	train's auc: 0.9807	valid's auc: 0.96734
[900]	train's auc: 0.990388	valid's auc: 0.97827
Did not meet early stopping. Best iteration is:
[1000]	train's auc: 0.992111	valid's auc: 0.980658




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.958494	valid's auc: 0.947672
[600]	train's auc: 0.980465	valid's auc: 0.967058
[900]	train's auc: 0.989795	valid's auc: 0.977878
Did not meet early stopping. Best iteration is:
[1000]	train's auc: 0.991539	valid's auc: 0.980279


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  9.4min finished


Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.983404	valid's auc: 0.969095
[600]	train's auc: 0.997358	valid's auc: 0.986344
Early stopping, best iteration is:
[841]	train's auc: 0.999482	valid's auc: 0.99196
Best score reached: 0.9890742216490487 with params: {'num_iterations': 1000, 'learning_rate': 0.1} 
Wall time: 9min 38s


In [44]:
gs2.best_params_, gs2.best_score_

({'num_iterations': 1000, 'learning_rate': 0.1}, 0.9890742216490487)

#### Based on previous results, finding optimal 'max_bin' and 'max_depth' values.

In [45]:
clf3 = lgb.LGBMClassifier(random_state=304, metric = 'roc_auc', cat_smooth = 32, max_cat_threshold = 75, min_data_per_group = 82, num_iterations=1200, learning_rate=0.08, n_jobs=-1)

In [46]:
params_3={
    'max_bin': [800, 1000, 1200],
    'max_depth': [13, 14, 15]
}

In [47]:
gs3 = RandomizedSearchCV(estimator=clf3, param_distributions=params_3, scoring='roc_auc',cv=3,refit=True,random_state=304,verbose=True)

In [48]:
%%time
gs3.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs3.best_score_, gs3.best_params_))

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973973	valid's auc: 0.961104
[600]	train's auc: 0.991429	valid's auc: 0.979732
[900]	train's auc: 0.995918	valid's auc: 0.987803
Early stopping, best iteration is:
[921]	train's auc: 0.996097	valid's auc: 0.988226




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973475	valid's auc: 0.960085
[600]	train's auc: 0.990881	valid's auc: 0.978458
[900]	train's auc: 0.995943	valid's auc: 0.987005
Early stopping, best iteration is:
[982]	train's auc: 0.996536	valid's auc: 0.988371




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973301	valid's auc: 0.960273
[600]	train's auc: 0.990866	valid's auc: 0.97926
[900]	train's auc: 0.99568	valid's auc: 0.987333
Early stopping, best iteration is:
[1002]	train's auc: 0.996518	valid's auc: 0.989184




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973632	valid's auc: 0.960873
[600]	train's auc: 0.990936	valid's auc: 0.979561
Early stopping, best iteration is:
[706]	train's auc: 0.993343	valid's auc: 0.983402




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973892	valid's auc: 0.960493
[600]	train's auc: 0.991378	valid's auc: 0.97931
Early stopping, best iteration is:
[843]	train's auc: 0.995451	valid's auc: 0.986277




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973982	valid's auc: 0.960326
[600]	train's auc: 0.990904	valid's auc: 0.97899
[900]	train's auc: 0.995873	valid's auc: 0.987565
Early stopping, best iteration is:
[1050]	train's auc: 0.996899	valid's auc: 0.990016




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973234	valid's auc: 0.960082
[600]	train's auc: 0.990964	valid's auc: 0.979481
Early stopping, best iteration is:
[866]	train's auc: 0.99549	valid's auc: 0.987181




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.974226	valid's auc: 0.960661
[600]	train's auc: 0.991384	valid's auc: 0.979395
[900]	train's auc: 0.995991	valid's auc: 0.987511
Early stopping, best iteration is:
[960]	train's auc: 0.996452	valid's auc: 0.988556




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973963	valid's auc: 0.960919
[600]	train's auc: 0.990964	valid's auc: 0.9793
[900]	train's auc: 0.995838	valid's auc: 0.987712
Early stopping, best iteration is:
[956]	train's auc: 0.99632	valid's auc: 0.988818




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973466	valid's auc: 0.960488
[600]	train's auc: 0.990831	valid's auc: 0.979254
[900]	train's auc: 0.995726	valid's auc: 0.987518
Early stopping, best iteration is:
[992]	train's auc: 0.996406	valid's auc: 0.989088




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973352	valid's auc: 0.960254
[600]	train's auc: 0.990875	valid's auc: 0.9787
[900]	train's auc: 0.995845	valid's auc: 0.987137
Early stopping, best iteration is:
[988]	train's auc: 0.996523	valid's auc: 0.98867




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973239	valid's auc: 0.960422
[600]	train's auc: 0.990737	valid's auc: 0.979253
[900]	train's auc: 0.995796	valid's auc: 0.987707
Early stopping, best iteration is:
[1096]	train's auc: 0.997123	valid's auc: 0.990729




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973511	valid's auc: 0.960478
[600]	train's auc: 0.990795	valid's auc: 0.97928
Early stopping, best iteration is:
[880]	train's auc: 0.9955	valid's auc: 0.987224




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973638	valid's auc: 0.959946
[600]	train's auc: 0.99091	valid's auc: 0.978574
[900]	train's auc: 0.995841	valid's auc: 0.98694
Early stopping, best iteration is:
[962]	train's auc: 0.996345	valid's auc: 0.988068




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973669	valid's auc: 0.960943
[600]	train's auc: 0.990786	valid's auc: 0.979336
Early stopping, best iteration is:
[833]	train's auc: 0.995019	valid's auc: 0.986037




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973255	valid's auc: 0.960177
[600]	train's auc: 0.990595	valid's auc: 0.979302
Early stopping, best iteration is:
[792]	train's auc: 0.994197	valid's auc: 0.985071




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973528	valid's auc: 0.959961
[600]	train's auc: 0.990843	valid's auc: 0.978761
Early stopping, best iteration is:
[825]	train's auc: 0.995143	valid's auc: 0.985869




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.974007	valid's auc: 0.960984
[600]	train's auc: 0.991061	valid's auc: 0.979368
Early stopping, best iteration is:
[808]	train's auc: 0.994933	valid's auc: 0.985656




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973325	valid's auc: 0.960495
[600]	train's auc: 0.990647	valid's auc: 0.978994
Early stopping, best iteration is:
[871]	train's auc: 0.99527	valid's auc: 0.986554




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.9738	valid's auc: 0.960477
[600]	train's auc: 0.991336	valid's auc: 0.979598
Early stopping, best iteration is:
[836]	train's auc: 0.995309	valid's auc: 0.986211




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973436	valid's auc: 0.960391
[600]	train's auc: 0.990575	valid's auc: 0.979213
[900]	train's auc: 0.995656	valid's auc: 0.987372
Early stopping, best iteration is:
[1013]	train's auc: 0.996528	valid's auc: 0.989313




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.972826	valid's auc: 0.959691
[600]	train's auc: 0.990373	valid's auc: 0.978766
[900]	train's auc: 0.995531	valid's auc: 0.987217
Early stopping, best iteration is:
[1129]	train's auc: 0.997035	valid's auc: 0.990877




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973927	valid's auc: 0.960326
[600]	train's auc: 0.991012	valid's auc: 0.978992
[900]	train's auc: 0.995982	valid's auc: 0.987272
Early stopping, best iteration is:
[919]	train's auc: 0.996115	valid's auc: 0.987554




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973358	valid's auc: 0.960484
[600]	train's auc: 0.991014	valid's auc: 0.979674
Early stopping, best iteration is:
[622]	train's auc: 0.991623	valid's auc: 0.980492




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.97382	valid's auc: 0.961054
[600]	train's auc: 0.990982	valid's auc: 0.979289
[900]	train's auc: 0.995658	valid's auc: 0.987446
Early stopping, best iteration is:
[920]	train's auc: 0.995808	valid's auc: 0.987785




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973453	valid's auc: 0.959922
[600]	train's auc: 0.991067	valid's auc: 0.978568
[900]	train's auc: 0.995841	valid's auc: 0.98695
Early stopping, best iteration is:
[926]	train's auc: 0.996064	valid's auc: 0.987484




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.973775	valid's auc: 0.96053
[600]	train's auc: 0.990687	valid's auc: 0.97908
Early stopping, best iteration is:
[867]	train's auc: 0.995354	valid's auc: 0.986798


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  8.3min finished


Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.976481	valid's auc: 0.96252
[600]	train's auc: 0.994454	valid's auc: 0.981473
[900]	train's auc: 0.998789	valid's auc: 0.989581
Early stopping, best iteration is:
[1191]	train's auc: 0.99978	valid's auc: 0.993439
Best score reached: 0.989471950269299 with params: {'max_depth': 13, 'max_bin': 1000} 
Wall time: 8min 49s


In [49]:
gs3.best_params_, gs3.best_score_

({'max_depth': 13, 'max_bin': 1000}, 0.989471950269299)

#### Based on previous results finding optimal values for 'reg_alpha' and 'reg_lambda'.

In [50]:
clf4 = lgb.LGBMClassifier(random_state=304, metric = 'roc_auc', cat_smooth = 32, max_cat_threshold = 75, min_data_per_group = 82, num_iterations=1200, learning_rate=0.08, max_depth=13, max_bin=800, n_jobs=-1)

In [51]:
params_4={
    'reg_alpha': [0.01, 0.1, 0.15],
    'reg_lambda': [15, 20, 25]
}

In [52]:
gs4 = RandomizedSearchCV(estimator=clf4, param_distributions=params_4, scoring='roc_auc',cv=3,refit=True,random_state=304,verbose=True)

In [53]:
%%time
gs4.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs3.best_score_, gs3.best_params_))

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.967316	valid's auc: 0.956606
[600]	train's auc: 0.986365	valid's auc: 0.974547
[900]	train's auc: 0.993131	valid's auc: 0.983161
[1200]	train's auc: 0.995879	valid's auc: 0.987831
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995879	valid's auc: 0.987831




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.967477	valid's auc: 0.956012
[600]	train's auc: 0.986731	valid's auc: 0.974435
Early stopping, best iteration is:
[889]	train's auc: 0.993334	valid's auc: 0.982725




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.967733	valid's auc: 0.956633
[600]	train's auc: 0.986597	valid's auc: 0.974579
[900]	train's auc: 0.993451	valid's auc: 0.983275
Early stopping, best iteration is:
[966]	train's auc: 0.994233	valid's auc: 0.984478




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966418	valid's auc: 0.955593
[600]	train's auc: 0.985233	valid's auc: 0.973179
[900]	train's auc: 0.992551	valid's auc: 0.98182
Early stopping, best iteration is:
[1168]	train's auc: 0.995302	valid's auc: 0.986273




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966027	valid's auc: 0.955065
[600]	train's auc: 0.985576	valid's auc: 0.973454
[900]	train's auc: 0.992973	valid's auc: 0.982314
[1200]	train's auc: 0.995909	valid's auc: 0.98707
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995909	valid's auc: 0.98707




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966627	valid's auc: 0.955687
[600]	train's auc: 0.985512	valid's auc: 0.973576
[900]	train's auc: 0.992904	valid's auc: 0.982523
Early stopping, best iteration is:
[1095]	train's auc: 0.995102	valid's auc: 0.985997




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965218	valid's auc: 0.954554
[600]	train's auc: 0.984471	valid's auc: 0.972442
[900]	train's auc: 0.992125	valid's auc: 0.981401
[1200]	train's auc: 0.995323	valid's auc: 0.986359
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995323	valid's auc: 0.986359




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965642	valid's auc: 0.954732
[600]	train's auc: 0.984677	valid's auc: 0.972217
[900]	train's auc: 0.992369	valid's auc: 0.981051
[1200]	train's auc: 0.995532	valid's auc: 0.985978
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995532	valid's auc: 0.985978




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965575	valid's auc: 0.955042
[600]	train's auc: 0.984715	valid's auc: 0.972858
[900]	train's auc: 0.99239	valid's auc: 0.981711
[1200]	train's auc: 0.995621	valid's auc: 0.986777
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995621	valid's auc: 0.986777




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966899	valid's auc: 0.955876
[600]	train's auc: 0.98593	valid's auc: 0.974016
[900]	train's auc: 0.99293	valid's auc: 0.98266
Early stopping, best iteration is:
[1120]	train's auc: 0.995279	valid's auc: 0.986544




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966968	valid's auc: 0.956036
[600]	train's auc: 0.986809	valid's auc: 0.974723
[900]	train's auc: 0.993748	valid's auc: 0.983465
Early stopping, best iteration is:
[1144]	train's auc: 0.995846	valid's auc: 0.987058




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.96709	valid's auc: 0.955723
[600]	train's auc: 0.986587	valid's auc: 0.974603
[900]	train's auc: 0.993356	valid's auc: 0.983261
[1200]	train's auc: 0.996054	valid's auc: 0.987841
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.996054	valid's auc: 0.987841




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966421	valid's auc: 0.955776
[600]	train's auc: 0.98508	valid's auc: 0.973446
[900]	train's auc: 0.992509	valid's auc: 0.982261
[1200]	train's auc: 0.995551	valid's auc: 0.987208
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995551	valid's auc: 0.987208




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965877	valid's auc: 0.955028
[600]	train's auc: 0.985625	valid's auc: 0.973432
[900]	train's auc: 0.993071	valid's auc: 0.982479
Early stopping, best iteration is:
[1172]	train's auc: 0.995758	valid's auc: 0.986755




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966257	valid's auc: 0.955617
[600]	train's auc: 0.985502	valid's auc: 0.973773
[900]	train's auc: 0.993019	valid's auc: 0.982818
[1200]	train's auc: 0.995832	valid's auc: 0.987618
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995832	valid's auc: 0.987618




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965173	valid's auc: 0.954452
[600]	train's auc: 0.984293	valid's auc: 0.972448
[900]	train's auc: 0.992076	valid's auc: 0.981541
[1200]	train's auc: 0.995297	valid's auc: 0.986578
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995297	valid's auc: 0.986578




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.96549	valid's auc: 0.954644
[600]	train's auc: 0.984639	valid's auc: 0.972126
[900]	train's auc: 0.992318	valid's auc: 0.980944
Early stopping, best iteration is:
[1157]	train's auc: 0.995204	valid's auc: 0.985248




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965546	valid's auc: 0.954887
[600]	train's auc: 0.985086	valid's auc: 0.973144
[900]	train's auc: 0.992562	valid's auc: 0.981929
Early stopping, best iteration is:
[1048]	train's auc: 0.994349	valid's auc: 0.984617




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.967068	valid's auc: 0.956399
[600]	train's auc: 0.986066	valid's auc: 0.974364
[900]	train's auc: 0.993163	valid's auc: 0.983286
Early stopping, best iteration is:
[1182]	train's auc: 0.99577	valid's auc: 0.987746




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.967031	valid's auc: 0.955846
[600]	train's auc: 0.986358	valid's auc: 0.974273
[900]	train's auc: 0.99338	valid's auc: 0.982866
Early stopping, best iteration is:
[1008]	train's auc: 0.994752	valid's auc: 0.985088




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.967542	valid's auc: 0.956453
[600]	train's auc: 0.986542	valid's auc: 0.974713
[900]	train's auc: 0.99331	valid's auc: 0.983394
Early stopping, best iteration is:
[1009]	train's auc: 0.994629	valid's auc: 0.98545




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.96636	valid's auc: 0.955836
[600]	train's auc: 0.985384	valid's auc: 0.973731
[900]	train's auc: 0.992532	valid's auc: 0.982161
Early stopping, best iteration is:
[1139]	train's auc: 0.995117	valid's auc: 0.986228




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966362	valid's auc: 0.955434
[600]	train's auc: 0.985701	valid's auc: 0.973657
[900]	train's auc: 0.993046	valid's auc: 0.982594
Early stopping, best iteration is:
[1135]	train's auc: 0.995516	valid's auc: 0.986462




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.966674	valid's auc: 0.955891
[600]	train's auc: 0.985572	valid's auc: 0.973692
[900]	train's auc: 0.993086	valid's auc: 0.98286
Early stopping, best iteration is:
[1095]	train's auc: 0.995183	valid's auc: 0.986225




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965284	valid's auc: 0.954947
[600]	train's auc: 0.984136	valid's auc: 0.972341
[900]	train's auc: 0.991958	valid's auc: 0.981334
[1200]	train's auc: 0.995206	valid's auc: 0.986312
Did not meet early stopping. Best iteration is:
[1200]	train's auc: 0.995206	valid's auc: 0.986312




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965341	valid's auc: 0.954542
[600]	train's auc: 0.984841	valid's auc: 0.972466
[900]	train's auc: 0.992517	valid's auc: 0.981594
Early stopping, best iteration is:
[969]	train's auc: 0.993487	valid's auc: 0.983034




Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.965616	valid's auc: 0.955301
[600]	train's auc: 0.984692	valid's auc: 0.973133
[900]	train's auc: 0.992408	valid's auc: 0.982222
Early stopping, best iteration is:
[1170]	train's auc: 0.995417	valid's auc: 0.98693


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 10.8min finished


Training until validation scores don't improve for 2 rounds
[300]	train's auc: 0.969652	valid's auc: 0.958185
[600]	train's auc: 0.989107	valid's auc: 0.97649
[900]	train's auc: 0.996451	valid's auc: 0.985671
Early stopping, best iteration is:
[1157]	train's auc: 0.998739	valid's auc: 0.98978
Best score reached: 0.989471950269299 with params: {'max_depth': 13, 'max_bin': 1000} 
Wall time: 11min 16s


In [54]:
gs4.best_params_, gs4.best_score_

({'reg_lambda': 20, 'reg_alpha': 0.1}, 0.9871681287343794)

## Final Model

#### Finalizing the LightGBM model with calculated parameters from previous results such as 'num_leaves' and 'sub_sample' etc.

In [55]:
params_final = {
 'bagging_fraction': 0.4,
 'boosting': 'dart',
 'num_iterations': 1200, 
 'learning_rate': 0.08,
 'colsample_bytree': 0.5,
 'cat_smooth': 32, 
 'max_cat_threshold':75, 
 'min_data_per_group': 82,
 'max_bin': 800,
 'max_depth': 13,
 'num_leaves': 8190,
 'min_child_samples': 407,
 'min_child_weight': 0.1,
 'min_data_in_leaf': 2420,
 'reg_alpha': 0.1,
 'reg_lambda': 15,
 'scale_pos_weight': 3,
 'subsample': 0.734,
 'subsample_for_bin': 512,
 'scoring': 'roc_auc',
 'metric': 'auc',
 'objective': 'binary'}

In [56]:
lgbm_train2 = lgb.Dataset(X_train, y_train, categorical_feature=category)
# lgbm_test = lgbm.Dataset(X_test, y_test, categorical_feature=cat_cols)
lgbm_val2 = lgb.Dataset(X_test, y_test, reference = lgbm_train2)

In [57]:
evals_result = {}  # to record eval results for plotting
model_lgbm_2 = lgb.train(params_final,
                lgbm_train2,
                num_boost_round=250,
                valid_sets=[lgbm_train2, lgbm_val2],
                feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
                categorical_feature= [182],
                evals_result=evals_result,
                verbose_eval=100)

New categorical_feature is [182]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[100]	training's auc: 0.909621	valid_1's auc: 0.906936
[200]	training's auc: 0.920641	valid_1's auc: 0.916543
[300]	training's auc: 0.931114	valid_1's auc: 0.925878
[400]	training's auc: 0.938745	valid_1's auc: 0.932354
[500]	training's auc: 0.94607	valid_1's auc: 0.938683
[600]	training's auc: 0.950091	valid_1's auc: 0.942345
[700]	training's auc: 0.954464	valid_1's auc: 0.946153
[800]	training's auc: 0.957595	valid_1's auc: 0.949012
[900]	training's auc: 0.96146	valid_1's auc: 0.952468
[1000]	training's auc: 0.964441	valid_1's auc: 0.95511
[1100]	training's auc: 0.96717	valid_1's auc: 0.957576
[1200]	training's auc: 0.968976	valid_1's auc: 0.959168


In [58]:
y_pred = model_lgbm_2.predict(test[X_train.columns])
newdata = pd.DataFrame({'encounter_id': test['encounter_id'],'hospital_death':y_pred})
newdata.to_csv('submissionLG.csv', index=False)

## Final AUC: 0.90721 after submission.