# LightGBM

In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import lightgbm

from sklearn.externals import joblib
from sklearn.model_selection import StratifiedKFold

from IPython.display import display

from utils import *

In [None]:
# Raise both pandas display limits to 500 so wide/long frames are shown
# without truncation.
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, 500)
# IPython-only setup (not valid in a plain Python script).
# Enable the autoreload extension in mode 2 so edited modules (e.g. the local
# `utils` helpers) are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

### Load possible outliers

In [None]:
# Read the saved outlier indices through a read-only memory map and copy them
# straight into a regular in-memory array.
outliers_idx = np.array(
    np.load('isolation_forest_outliers.npy', mmap_mode='r')
)

### Load data

In [None]:
# Load the training table that was persisted with joblib and report its
# shape as a quick sanity check.
train = joblib.load('models/train.joblib')
print(train.shape)

In [None]:
# Load the test table that was persisted with joblib and report its shape
# as a quick sanity check.
test = joblib.load('models/test.joblib')
print(test.shape)

In [None]:
# Remove the rows flagged as outliers (their positions are translated to index
# labels first) and renumber the remaining rows from zero.
train = train.drop(index=train.index[outliers_idx]).reset_index(drop=True)

# Keep the label and identifier columns aside, then strip both from the
# feature table.
targets, train_ids = train['TARGET'], train['SK_ID_CURR']
train = train.drop(['SK_ID_CURR', 'TARGET'], axis=1)

In [None]:
# Keep the SK_ID_CURR identifier column aside (it is needed again when the
# submission is written) and remove it from the test feature table.
test_ids = test['SK_ID_CURR']
test = test.drop('SK_ID_CURR', axis=1)

### Drop redundant columns

In [None]:
# Combined list of columns to discard; both name lists are presumably provided
# by the `utils` star import -- confirm there.
cols_drop = appartment_mode_cols + appartment_medi_cols

# Drop the same columns from both tables, modifying each frame in place.
for _frame in (train, test):
    _frame.drop(columns=cols_drop, inplace=True)

In [None]:
# Show the train and test shapes, one per line.
print(train.shape, test.shape, sep='\n')

### Convert to NumPy arrays

In [None]:
# Materialise both tables as NumPy arrays for the cross-validation loop.
features, test_features = (np.array(frame) for frame in (train, test))

In [None]:
# 5-fold stratified cross-validation: fit one LightGBM classifier per fold,
# record the best train/validation AUC of each fold, and average the folds'
# predicted positive-class probabilities for the test set.
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

valid_scores = []
train_scores = []
test_predictions = np.zeros(test_features.shape[0])

for train_indices, valid_indices in k_fold.split(features, targets):
    # `split` yields positional row numbers, so pick the labels with `.iloc`
    # instead of label-based `[]`; this stays correct even when `targets`
    # does not carry a default 0..n-1 index.
    # Training data for the fold
    train_features, train_labels = features[train_indices], targets.iloc[train_indices]
    # Validation data for the fold
    valid_features, valid_labels = features[valid_indices], targets.iloc[valid_indices]

    # The 'balanced' mode uses the values of y to automatically adjust weights
    # inversely proportional to class frequencies in the input data as
    # n_samples / (n_classes * np.bincount(y)).
    #
    # `subsample_freq=1` is required for `subsample=0.8` to take effect:
    # LightGBM only performs row bagging when the bagging frequency is
    # positive, and the wrapper's default frequency is 0 in current releases.
    model = lightgbm.LGBMClassifier(
        n_estimators=10000, objective='binary',
        class_weight='balanced', learning_rate=0.001,
        reg_alpha=0.3, reg_lambda=0.3,
        subsample=0.8, subsample_freq=1,
        n_jobs=6, random_state=4242,
    )

    # Track AUC on both the validation and training folds; stop once the
    # metric has not improved for 100 rounds and log every 100 rounds.
    # NOTE(review): the fit-level `early_stopping_rounds` / `verbose`
    # arguments are gone in LightGBM 4.x -- switch to callbacks when upgrading.
    model.fit(train_features, train_labels, eval_metric='auc',
              eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
              eval_names=['valid', 'train'], early_stopping_rounds=100, verbose=100)

    best_iteration = model.best_iteration_

    # Each fold contributes 1/n_splits of the final test prediction, using
    # only the trees up to the best iteration found by early stopping.
    fold_probabilities = model.predict_proba(test_features, num_iteration=best_iteration)[:, 1]
    test_predictions += fold_probabilities / k_fold.n_splits

    valid_score = model.best_score_['valid']['auc']
    train_score = model.best_score_['train']['auc']

    valid_scores.append(valid_score)
    train_scores.append(train_score)


In [None]:
# Assemble the submission table (identifier column plus the averaged
# predictions) and write it to CSV without the index column.
submission = pd.DataFrame({'SK_ID_CURR': test_ids})
submission['TARGET'] = test_predictions
submission.to_csv('submissions/lightgbm.csv', index=False)