In [5]:
import pandas as pd
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import Imputer
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier
from utils import ftextraction
import numpy as np
import matplotlib.pyplot as plt
import gc
gc.enable()
%matplotlib inline

##### Preprocess Datasets

In [2]:
# Load the raw CSV tables. The files are read one after another, in the
# order listed, and bound to the names the later cells expect.
start = time.time()
app_train, app_test, buro, bb, ccb, ipay, pos, pa = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/application_train.csv',
        'data/application_test.csv',
        'data/bureau.csv',
        'data/bureau_balance.csv',
        'data/credit_card_balance.csv',
        'data/installments_payments.csv',
        'data/POS_CASH_balance.csv',
        'data/previous_application.csv',
    )
)
end = time.time()
# Report wall-clock load time in minutes.
print('Datasets loaded in: {:.2f} minutes'.format((end-start)/60))

Datasets loaded in: 1.10 minutes


In [None]:
# Run the project's feature-extraction pipeline over the raw tables.
# NOTE(review): process_datasets is defined in utils.ftextraction; presumably
# it returns one frame holding both train and test rows, since model() later
# splits `data` on whether 'TARGET' is null -- confirm against that module.
start = time.time()
fe = ftextraction.Extractor()
data = fe.process_datasets(
    app_train,
    app_test,
    buro,
    bb,
    pa,
    ipay,
    ccb,
    pos,
)
end = time.time()
# Report wall-clock processing time in minutes.
print('Datasets processed in: {:.2f} minutes'.format((end-start)/60))

##### Modelling 

In [8]:
def model(data):
    """Credit Risk Default Prediction Model

       Trains a LightGBM classifier with 10-fold stratified cross-validation,
       averaging the per-fold test-set predictions and feature importances.

       Parameters:
       -----------
       data(pandas dataframe): training and testing dataset. Rows with a
           non-null 'TARGET' are used for training; rows with a null 'TARGET'
           are scored as the test set. Must contain 'SK_ID_CURR' and 'TARGET'.

       Returns:
       -------
       submission(pandas dataframe): test set predictions
           (columns 'SK_ID_CURR' and 'TARGET')
       scores(pandas dataframe): model training and validation scores
           (columns 'training score' and 'cv score', one row)
       feature_importances(pandas dataframe): feature importance of the model,
           averaged over folds (columns 'features' and 'importance')
       fprs(dictionary): false positive rates across folds
           (keys 'fold_1'..'fold_10' plus 'OCV' for the pooled out-of-fold curve)
       tprs(dictionary): true positive rates across folds (same keys as fprs)
       aucs(list): roc auc score across folds, followed by the overall CV score

       Notes:
       ------
       Reads 'fi_lgbm_V0_fe.csv' from the working directory and drops every
       feature listed there with an importance of 0. A KeyError is raised by
       pandas if one of those features is missing from the data.
    """
    #preprocessing data
    train = data.loc[data['TARGET'].notnull(),:].copy()
    test = data.loc[data['TARGET'].isnull(),:].copy()
    # Only this function's reference is dropped; the caller's object is untouched.
    del data
    gc.collect()

    # save test IDs for final submission dataframe
    test_IDs = test['SK_ID_CURR']
    # save target labels as a plain array: the fold indices produced by
    # StratifiedKFold are positional, whereas indexing a Series with integer
    # arrays is label-based and would pick wrong rows (or raise) whenever the
    # training rows are not labelled 0..n-1.
    labels = train['TARGET'].values
    # drop ID columns
    train = train.drop(columns=['SK_ID_CURR','TARGET'])
    test = test.drop(columns=['SK_ID_CURR'])

    #feature selection from previous runs
    fi = pd.read_csv('fi_lgbm_V0_fe.csv')
    exclude = list(fi.loc[fi['importance']==0,'features'])

    # drop features with no importance
    train.drop(columns=exclude,inplace=True)
    test.drop(columns=exclude,inplace=True)

    # aligning dataframes (keeps only columns present in both, which also
    # removes the all-null 'TARGET' column still present in test)
    train,test = train.align(test,join='inner',axis=1)

    # for storing feature importances
    features = list(train.columns)
    ft_importances = np.zeros(len(features))

    # converting to numpy array for lgbm consumption
    train = np.array(train)
    test = np.array(test)

    #DATA STRUCTURES TO STORE PREDICTIONS AND METRICS

    #store cv predictions
    oof_predictions = np.zeros(train.shape[0])
    #store predictions on test dataset
    test_preds = np.zeros(test.shape[0])
    #store predictions on train dataset
    train_preds = np.zeros(train.shape[0])
    #per-fold roc curves and auc values
    fprs = {}
    tprs = {}
    aucs = []

    #SPLITTING AND TRAINING
    kfold = StratifiedKFold(n_splits=10,shuffle=True,random_state=40)
    start = time.time()
    print('Training started')
    for fold,(train_i,valid_i) in enumerate(kfold.split(train,labels),start=1):
        xtrain,ytrain = train[train_i],labels[train_i]
        xvalid,yvalid = train[valid_i],labels[valid_i]

        # creating the classifier
        clf = lgbm.LGBMClassifier(boosting_type='gbdt',num_leaves=40,
                                 learning_rate=0.02,reg_alpha=1,
                                 min_child_samples=45,reg_lambda=0.142857,
                                 colsample_bytree=0.5,subsample=0.915152,
                                 is_unbalance=True,
                                 n_estimators=10000,random_state=50)

        # fitting on the training set; training stops early on the validation AUC
        clf.fit(xtrain,ytrain,eval_set=[(xtrain, ytrain), (xvalid, yvalid)],eval_metric ='auc',
                verbose= 100, early_stopping_rounds= 100,eval_names = ['train','valid'])

        #best iteration
        best_iter = clf.best_iteration_

        # storing out of fold predictions:
        oof_predictions[valid_i] = clf.predict_proba(xvalid,num_iteration=best_iter)[:, 1]

        # storing test set predictions (running mean over folds):
        test_preds += clf.predict_proba(test,num_iteration=best_iter)[:, 1] /kfold.n_splits

        # storing training set predictions; each row belongs to the training
        # part of several folds, so a later fold overwrites an earlier value
        train_preds[train_i] = clf.predict_proba(xtrain,num_iteration=best_iter)[:, 1]

        # storing feature importances (running mean over folds)
        ft_importances += clf.feature_importances_ /kfold.n_splits

        #false and true positive rates for plotting roc curve
        fpr,tpr,_ = roc_curve(yvalid,oof_predictions[valid_i])
        fprs[f'fold_{fold}'] = fpr
        tprs[f'fold_{fold}'] = tpr
        aucs.append(auc(fpr,tpr))
        print('Fold %d done.'%fold)

        # freeing up memory
        del xtrain,ytrain,xvalid,yvalid,clf
        gc.collect()

    end = time.time()
    print('Training done in {:.2f} minutes'.format((end-start)/60))
    #SCORES

    feature_importances = pd.DataFrame({'features':features,'importance':ft_importances})

    training_score = roc_auc_score(labels,train_preds)
    cv = roc_auc_score(labels,oof_predictions)

    # roc curve over all out-of-fold predictions pooled together
    fpr,tpr,_ = roc_curve(labels,oof_predictions)
    fprs['OCV'] = fpr
    tprs['OCV'] = tpr
    aucs.append(cv)

    scores = pd.DataFrame({'training score':[training_score],'cv score':[cv]})
    print('Overall CV Score: %.4f' %cv)
    # submission dataframe:
    submission = pd.DataFrame({'SK_ID_CURR': test_IDs, 'TARGET': test_preds})

    return submission, scores, feature_importances,fprs,tprs,aucs

In [None]:
# Train the cross-validated model on the processed data and unpack its six
# results: submission frame, score table, feature importances, per-fold
# false/true positive rates, and the list of AUC values.
submission, scores, feature_importances,fprs,tprs,aucs = model(data)

In [10]:
# Write the test-set predictions to 'submission.csv' without the index column.
submission.to_csv('submission.csv',index=False)