# Import datasets and libraries

In [1]:
import numpy as np
import pandas as pd
import os

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the raw training data. Missing readings in this file are written as
# the literal string 'na', which is not among pandas' default NA markers, so
# it is declared here: affected columns are parsed as numeric with NaN
# straight away instead of arriving as object (string) columns.
train_df = pd.read_csv('equip_failures_training_set.csv', na_values='na')

In [4]:
# Normalise missing values and dtypes of the training frame in one chain:
#   1. the literal string 'na' becomes a real NaN,
#   2. every column is cast to float (NaN has no integer representation),
#   3. the 'id' and 'target' columns are cast back to integers.
train_df = (
    train_df
    .replace('na', np.nan)
    .astype(float)
    .astype({'id': int, 'target': int})
)

In [27]:
# Remove five sensor columns from the training frame before modelling.
# NOTE(review): the reason these particular sensors are excluded is not
# recorded in this notebook - confirm with the author.
train_df = train_df.drop(
    columns=[
        'sensor68_measure',
        'sensor40_measure',
        'sensor41_measure',
        'sensor42_measure',
        'sensor43_measure',
    ]
)

In [5]:
# Load the test set. Missing readings are written as the literal string
# 'na', which is not among pandas' default NA markers; declaring it at parse
# time yields NaN directly and keeps the sensor columns numeric, so no
# separate string-replacement pass over the whole frame is needed.
test_df = pd.read_csv('equip_failures_test_set.csv', na_values='na')

# Cast every column to float (NaN has no integer representation), then
# restore 'id' to an integer column.
test_df = test_df.astype(float).astype({'id': int})

In [36]:
# Remove from the test frame the same five sensor columns that are removed
# from the training frame, so both frames expose identical feature columns.
test_df = test_df.drop(
    columns=[
        'sensor68_measure',
        'sensor40_measure',
        'sensor41_measure',
        'sensor42_measure',
        'sensor43_measure',
    ]
)

# Different Models

In [6]:
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [28]:
# Split the labelled data: 20% is held out (test_size=0.2) and the remaining
# 80% is used for fitting.
data_train_y = train_df['target']
# Neither 'id' nor the label column is used as a feature.
data_train_x = train_df.drop(columns=['id', 'target'])


from sklearn.model_selection import train_test_split

# A fixed random_state keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_train_x,
    data_train_y,
    test_size=0.2,
    random_state=314,
)

In [8]:
def _decayed_learning_rate(current_iter, base_learning_rate, decay, floor=1e-3):
    """Exponentially decayed learning rate with a lower bound.

    Computes ``base_learning_rate * decay ** current_iter`` and returns it
    while it is strictly greater than ``floor``; otherwise returns ``floor``.

    Parameters
    ----------
    current_iter : int
        Zero-based boosting iteration for which the rate is requested.
    base_learning_rate : float
        Learning rate used at iteration 0.
    decay : float
        Multiplicative factor applied once per iteration.
    floor : float, optional
        Smallest value the schedule may return (default ``1e-3``).

    Returns
    -------
    float
        The learning rate to use at ``current_iter``.
    """
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Schedule starting at 0.1, multiplied by 0.99 per iteration, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, 0.1, .99)


def learning_rate_010_decay_power_0995(current_iter):
    """Schedule starting at 0.1, multiplied by 0.995 per iteration, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, 0.1, .995)


def learning_rate_005_decay_power_099(current_iter):
    """Schedule starting at 0.05, multiplied by 0.99 per iteration, floored at 1e-3."""
    return _decayed_learning_rate(current_iter, 0.05, .99)

In [29]:
# Keyword arguments forwarded to the estimator's fit() call.
# Training is monitored with AUC on the held-out (x_test, y_test) split,
# reported under the name 'valid', and stops once that score has not
# improved for 30 rounds; progress is logged every 100 iterations.
# NOTE(review): the same hold-out split is the early-stopping set for every
# fit that receives these parameters, so scores on it are not an independent
# estimate of generalisation - confirm this is intended.
fit_params = dict(
    early_stopping_rounds=30,
    eval_metric='auc',
    eval_set=[(x_test, y_test)],
    eval_names=['valid'],
    verbose=100,
    categorical_feature='auto',
)

In [30]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space for the randomized hyper-parameter search.
#   * sp_randint(low, high) draws integers from low up to high - 1.
#   * sp_uniform(loc, scale) draws floats from [loc, loc + scale].
#   * plain lists are sampled uniformly from their elements.
param_test = dict(
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(100, 500),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
)

In [39]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Number of hyper-parameter combinations sampled by the search.
n_HP_points_to_test = 100

# Base estimator for the search. n_estimators is only an upper bound on the
# number of trees: the actual count is decided by early stopping, which is
# configured through the parameters passed to fit().
clf = lgb.LGBMClassifier(
    n_estimators=5000,
    max_depth=-1,
    metric='None',
    silent=True,
    n_jobs=4,
    random_state=314,
)

# Randomized search scored by ROC AUC with 10-fold cross-validation, i.e.
# n_HP_points_to_test * 10 fits in total, plus one refit of the best
# candidate on the full data passed to fit().
gs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=10,
    refit=True,
    random_state=314,
    verbose=True,
)

In [40]:
# Run the randomized search; fit_params is forwarded to every underlying
# estimator fit.
gs.fit(x_train, y_train, **fit_params)

# Report the best cross-validated score and the parameters that produced it.
best_summary_template = 'Best score reached: {} with params: {} '
print(best_summary_template.format(gs.best_score_, gs.best_params_))

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.984229
Early stopping, best iteration is:
[144]	valid's auc: 0.985548
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.981619
[200]	valid's auc: 0.984299
[300]	valid's auc: 0.985777
Early stopping, best iteration is:
[351]	valid's auc: 0.986049
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.984508
[200]	valid's auc: 0.98608
[300]	valid's auc: 0.986566
Early stopping, best iteration is:
[299]	valid's auc: 0.986604
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.984086
Early stopping, best iteration is:
[162]	valid's auc: 0.985233
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.984634
[200]	valid's auc: 0.986504
[300]	valid's auc: 0.987419
Early stopping, best iteration is:
[333]	valid's auc: 0.987703
Training until validation scores don't improve for 30 rounds.
Early 

[100]	valid's auc: 0.98645
Early stopping, best iteration is:
[120]	valid's auc: 0.988412
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.989611
Early stopping, best iteration is:
[97]	valid's auc: 0.989963
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.986965
Early stopping, best iteration is:
[80]	valid's auc: 0.988194
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.986752
Early stopping, best iteration is:
[73]	valid's auc: 0.987395
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.986889
Early stopping, best iteration is:
[102]	valid's auc: 0.98716
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.989147
Early stopping, best iteration is:
[107]	valid's auc: 0.989277
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[56]	valid's auc: 0.987736
Training until validation

Early stopping, best iteration is:
[54]	valid's auc: 0.986565
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985717
[200]	valid's auc: 0.986132
Early stopping, best iteration is:
[176]	valid's auc: 0.986597
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985537
Early stopping, best iteration is:
[93]	valid's auc: 0.985783
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.986695
Early stopping, best iteration is:
[129]	valid's auc: 0.987022
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.986791
Early stopping, best iteration is:
[97]	valid's auc: 0.986871
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.986919
Early stopping, best iteration is:
[85]	valid's auc: 0.987225
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985451
Early stopping, best iteration is:
[137]	valid's auc: 0.98

Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.984831
Early stopping, best iteration is:
[97]	valid's auc: 0.984949
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.984535
Early stopping, best iteration is:
[160]	valid's auc: 0.984927
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985918
Early stopping, best iteration is:
[128]	valid's auc: 0.987415
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[45]	valid's auc: 0.983551
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.987323
[200]	valid's auc: 0.988806
Early stopping, best iteration is:
[185]	valid's auc: 0.989002
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[29]	valid's auc: 0.983808
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.9867
Early stopping, best iter

[100]	valid's auc: 0.979961
Early stopping, best iteration is:
[84]	valid's auc: 0.979983
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.979453
Early stopping, best iteration is:
[75]	valid's auc: 0.979457
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.979406
Early stopping, best iteration is:
[76]	valid's auc: 0.979414
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985638
[200]	valid's auc: 0.98803
Early stopping, best iteration is:
[230]	valid's auc: 0.988093
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985859
[200]	valid's auc: 0.988338
Early stopping, best iteration is:
[249]	valid's auc: 0.988719
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.986701
[200]	valid's auc: 0.98855
Early stopping, best iteration is:
[234]	valid's auc: 0.989019
Training until validation scores don't improve for 30 rounds.
[10

Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985932
[200]	valid's auc: 0.987103
[300]	valid's auc: 0.988195
[400]	valid's auc: 0.988744
[500]	valid's auc: 0.98902
Early stopping, best iteration is:
[501]	valid's auc: 0.989021
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985148
[200]	valid's auc: 0.987202
Early stopping, best iteration is:
[207]	valid's auc: 0.987276
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.987759
Early stopping, best iteration is:
[139]	valid's auc: 0.988472
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.984306
Early stopping, best iteration is:
[160]	valid's auc: 0.985391
Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.985851
[200]	valid's auc: 0.98769
[300]	valid's auc: 0.988123
Early stopping, best iteration is:
[330]	valid's auc: 0.988372
Training until validation scores don't im

KeyboardInterrupt: 

In [33]:
# Hard-coded hyper-parameters applied to the final model.
# NOTE(review): 'application' and 'objective' are both set to 'binary';
# LightGBM documents 'application' as an alias of 'objective' - confirm and
# consider keeping only one of them.
opt_parameters = dict(
    application='binary',
    objective='binary',
    colsample_bytree=0.4211627356071123,
    min_child_samples=371,
    min_child_weight=0.001,
    num_leaves=15,
    reg_alpha=1,
    reg_lambda=0.1,
    subsample=0.9518946967698729,
)

In [34]:
# Build the final classifier with the same constructor settings as the
# search's base estimator `clf`.
clf_final = lgb.LGBMClassifier(**clf.get_params())
# Overlay the hard-coded hyper-parameters.
clf_final.set_params(**opt_parameters)

# Fit with a decaying learning rate: the callback resets learning_rate on
# each boosting iteration using the 0.1 * 0.995**iter schedule.
lr_decay_callback = lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)
clf_final.fit(x_train, y_train, callbacks=[lr_decay_callback], **fit_params)

LGBMClassifier(application='binary', boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.4211627356071123, importance_type='split',
        learning_rate=0.1, max_depth=-1, metric='None',
        min_child_samples=371, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=5000, n_jobs=4, num_leaves=15, objective='binary',
        random_state=314, reg_alpha=1, reg_lambda=0.1, silent=True,
        subsample=0.9518946967698729, subsample_for_bin=200000,
        subsample_freq=0)

Training until validation scores don't improve for 30 rounds.
[100]	valid's auc: 0.987745
Early stopping, best iteration is:
[76]	valid's auc: 0.988559


LGBMClassifier(application='binary', boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.4211627356071123, importance_type='split',
        learning_rate=0.1, max_depth=-1, metric='None',
        min_child_samples=371, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=5000, n_jobs=4, num_leaves=15, objective='binary',
        random_state=314, reg_alpha=1, reg_lambda=0.1, silent=True,
        subsample=0.9518946967698729, subsample_for_bin=200000,
        subsample_freq=0)

In [37]:
# Predict on the test set using every column except 'id'.
# NOTE(review): despite the variable name, LGBMClassifier.predict returns
# class labels; predict_proba is the call that returns probabilities.
# Confirm which of the two the submission format expects.
test_features = test_df.drop(columns=['id'])
probabilities = clf_final.predict(test_features)


In [38]:
# Assemble the two-column submission (row id and predicted target) and write
# it to submission.csv without the DataFrame index.
submission_columns = {
    'id': test_df['id'],
    'target': probabilities,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission.csv", index=False)