In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 0)

from pathlib import Path

In [2]:
from category_encoders import OrdinalEncoder

In [3]:
SEED = 77

In [4]:
files = Path('.').glob('../data/input/application_*.ftr')
for f in files:
    print(f)
    globals()[f.stem] = pd.read_feather(f)

../data/input/application_test.ftr
../data/input/application_train.ftr


In [5]:
train = application_train
test = application_test

In [6]:
categorical_columns = [
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE'
]
enc = OrdinalEncoder(cols=categorical_columns, verbose=1)
train[categorical_columns] = enc.fit_transform(train[categorical_columns])
test[categorical_columns] = enc.transform(test[categorical_columns])

In [7]:
X_train = train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y_train = train.TARGET.values
X_test = test.drop(['SK_ID_CURR'], axis=1)

In [8]:
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
import lightgbm as lgb

In [10]:
params = {
    'metric': ['auc'],
    'learning_rate': [0.1],
    'num_leaves': [i*10 for i in range(2, 6)],
    'min_data_in_leaf': [5, 10, 15, 20],
    'random_state': [SEED],
    'colsample_bytree': [0.8],
    'subsample': [0.8],
    'verbose': [1]
}

cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
model = GridSearchCV(lgb.LGBMClassifier(), params, scoring='roc_auc', n_jobs=4, cv=cv.split(X_train, y_train), verbose=3)
model.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=20, random_state=77, verbose=1 
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=20, random_state=77, verbose=1 
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=20, random_state=77, verbose=1 
[CV]  learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=20, random_state=77, verbose=1, score=0.7542977792194173, total=  43.2s
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=30, random_state=77, verbose=1 
[CV]  learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=20, random_state=77, verbose=1, score=0.7538588921332902, total=  43.8s
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=30, random_state=77, verbose=1 
[CV]  learning_rate=0.1, metric=auc, min_data_in_leaf=5, num_leaves=20, random_state=77, verbose=1, score=0.7550425201224977, total=  45.7s
[CV] learning_rate=0.1, 

[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  7.6min


[CV]  learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=20, random_state=77, verbose=1, score=0.7560339432528955, total=  32.5s
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=30, random_state=77, verbose=1 
[CV]  learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=30, random_state=77, verbose=1, score=0.7552710974508045, total=  38.1s
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=40, random_state=77, verbose=1 
[CV]  learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=30, random_state=77, verbose=1, score=0.753652919661329, total=  39.1s
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=40, random_state=77, verbose=1 
[CV]  learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=30, random_state=77, verbose=1, score=0.755629591373601, total=  42.5s
[CV] learning_rate=0.1, metric=auc, min_data_in_leaf=15, num_leaves=40, random_state=77, verbose=1 
[CV]  learning_rate=0.1, metric=auc, m

KeyboardInterrupt: 

In [None]:
pred = model.predict_proba(X_test)[:, 1]
plt.hist(pred, bins=50)
plt.show()

In [None]:
feat_df = pd.DataFrame({'importance': model.best_estimator_.feature_importances_}, index=X_train.columns).sort_values('importance', ascending=False)
feat_df[:30].plot.bar(figsize=(20, 5))

In [None]:
import time
sample_submission.TARGET = pred
sample_submission.to_csv(f"{time.strftime('%y%m%d_%H%M%S')}_submission.csv.gz", index=None, compression='gzip')