In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Modeling
import lightgbm as lgb

# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.size'] = 18
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Custom

from utils.preprocessing import one_hot_encoder
from utils.preprocessing import missing_values_table

### Set up training and test set

In [2]:
spec_feature = pd.read_csv('~/HomeCredit/feature_matrix_spec.csv')
missing_table = missing_values_table(spec_feature)

dump_feats = missing_table[missing_table['% of Total Values'] > 30].index.tolist()

spec_feature = spec_feature.drop(dump_feats, axis = 1)

spec_feature, _ = one_hot_encoder(spec_feature)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer

Target = spec_feature['TARGET']
ID = spec_feature['SK_ID_CURR']

dataset_temp = spec_feature.drop(['TARGET','SK_ID_CURR'], axis = 1)

imputer = Imputer(strategy = 'median')
scaler = MinMaxScaler(feature_range = (0,1))

dataset_preprocessed = imputer.fit_transform(dataset_temp)
dataset_preprocessed = scaler.fit_transform(dataset_preprocessed)
dataset_preprocessed = pd.DataFrame(dataset_preprocessed, columns = dataset_temp.columns)

dataset_preprocessed['TARGET'] = Target
dataset_preprocessed['SK_ID_CURR'] = ID

train_df = dataset_preprocessed[dataset_preprocessed['TARGET'] != -999]
test_df = dataset_preprocessed[dataset_preprocessed['TARGET'] == -999]

train_features = train_df.drop(['TARGET'], axis = 1)
train_labels = train_df['TARGET']

# Training set
train_set = lgb.Dataset(train_features, label = train_labels)

There are total 885 columns.
823 of them have missing values.


### Reduce dataset

In [3]:
test_features = test_df.drop(['TARGET'], axis = 1)
test_labels = test_df['TARGET']

In [4]:
N = 15000
T = 3000
X_train, Y_train = train_features[:N], train_labels[:N]
X_test, Y_test = test_features[:T], test_labels[:T]

In [5]:
print(X_train.shape)

(15000, 790)


## Random Search

### Baseline

In [6]:
# Define a baseline model
from sklearn.ensemble import RandomForestClassifier

clr_rf = RandomForestClassifier(n_estimators=500) 
clr_rf.fit(X_train, Y_train)
print("clr_rf trained")
preds = clr_rf.predict(X_test)


clr_rf trained


In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
# accuracy
acc_scores = cross_val_score(clr_rf, X_train, Y_train, cv = 5)

print('5 fold CV accuracy for random forest is %0.2f (+/- %f)' %(acc_scores.mean(), acc_scores.std() * 2))



5 fold CV accuracy for random forest is 0.79 (+/- 0.346235)


In [8]:
# roc auc score
roc_auc_scores = cross_val_score(clr_rf, X_train, Y_train, cv=5, scoring='roc_auc')
print('5 fold CV ROC_AUC is %0.2f (+/- %0.2f)' %(roc_auc_scores.mean(), roc_auc_scores.std() * 2))

5 fold CV ROC_AUC is 0.62 (+/- 0.17)


In [9]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
# parameters for GridSearchCV
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, 5],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False]
             }
# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(clr_rf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

In [10]:
random_search.fit(X_train, Y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'max_depth': [3, 5], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa6f03c67f0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa6f03c6f60>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa6f03c65c0>, 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
       

In [17]:
from operator import itemgetter
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [18]:
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.922 (std: 0.000)
Parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 10, 'min_samples_leaf': 5, 'min_samples_split': 6}

Model with rank: 1
Mean validation score: 0.922 (std: 0.000)
Parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 6, 'min_samples_leaf': 3, 'min_samples_split': 4}

Model with rank: 1
Mean validation score: 0.922 (std: 0.000)
Parameters: {'bootstrap': False, 'max_depth': 5, 'max_features': 6, 'min_samples_leaf': 6, 'min_samples_split': 5}

Model with rank: 1
Mean validation score: 0.922 (std: 0.000)
Parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 7, 'min_samples_leaf': 2, 'min_samples_split': 10}

Model with rank: 1
Mean validation score: 0.922 (std: 0.000)
Parameters: {'bootstrap': False, 'max_depth': 5, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 5}

Model with rank: 1
Mean validation score: 0.922 (std: 0.000)
Parameters: {'bootstrap': False, 'max_depth': 3, 'max_f

In [19]:
random_search.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_bootstrap', 'param_max_depth', 'param_max_features', 'param_min_samples_leaf', 'param_min_samples_split', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'mean_train_score', 'std_train_score'])

In [22]:
clr_rf_best = RandomForestClassifier(n_estimators=500, max_depth=5, min_samples_leaf=5, min_samples_split=6) 
clr_rf_best.fit(X_train, Y_train)
print("clr_rf trained")
preds = clr_rf_best.predict(X_test)

clr_rf trained


In [23]:
# roc auc score
roc_auc_scores_best_para = cross_val_score(clr_rf_best, X_train, Y_train, cv=5, scoring='roc_auc')
print('5 fold CV ROC_AUC is %0.5f (+/- %0.5f)' %(roc_auc_scores_best_para.mean(), roc_auc_scores_best_para.std() * 2))

5 fold CV ROC_AUC is 0.68559 (+/- 0.07158)
