In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
import catboost

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt.plots import plot_convergence
import time


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files the input directory (one level up) provides.
print(os.listdir(os.path.join(os.pardir, "input")))

# Any results you write to the current directory are saved as output.

['cleaned_ltfs.csv']


In [None]:
# Load the cleaned LTFS data set from the Kaggle input directory.
# All columns are kept; no feature is dropped at load time.
data = pd.read_csv("../input/cleaned_ltfs.csv")

In [None]:
# Quick look at the first five rows to sanity-check the load.
data.head(n=5)

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV
# by an earlier export - TODO confirm).
# errors="ignore" keeps the cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:

# ID and flag columns are cast to `object` so that the later
# `X_train.dtypes == 'object'` lookup reports them as categorical features.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
CATEGORICAL_ID_COLUMNS = [
    'manufacturer_id',
    'Aadhar_flag',
    'PAN_flag',
    'VoterID_flag',
    'Driving_flag',
    'Passport_flag',
    'branch_id',
    'supplier_id',
    'Employee_code_ID',
    'State_ID',
    'Current_pincode_ID',
]
data = data.astype({column: object for column in CATEGORICAL_ID_COLUMNS})

data.info()

In [None]:
# Missing employment types become their own 'Other' category.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: that form is chained assignment, which does not update
# `data` when pandas Copy-on-Write is active.
data['Employment_Type'] = data['Employment_Type'].fillna('Other')

In [None]:
# Separate the target from the features, then hold out 30% for validation.
Y = data['loan_default']
X = data.drop(columns='loan_default')
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2001,
)

In [None]:
class ModelOptimizer:
    """Tune a model's hyper-parameters with ``skopt.gp_minimize``.

    Subclasses implement :meth:`evaluate_model`, which must return the value
    to be *minimised* for the parameters currently set on ``self.model``.
    """

    # Lowest objective value found by the most recent `optimize` call.
    best_score = None
    # Result object returned by `gp_minimize` in the most recent `optimize` call.
    opt = None

    def __init__(self, model, X_train, y_train, categorical_columns_indices=None, n_fold=5, seed=2001, early_stopping_rounds=30, is_stratified=True, is_shuffle=True):
        """Store the model, the training data and the evaluation settings.

        Nothing is validated or copied here; the values are only kept on the
        instance for use by `evaluate_model` implementations.

        Args:
            model: Estimator to tune. `optimize` applies each candidate
                parameter set to it through `update_model`.
            X_train: Training features.
            y_train: Training target.
            categorical_columns_indices: Indices of the categorical columns,
                or None.
            n_fold: Number of cross-validation folds.
            seed: Random seed for the evaluation.
            early_stopping_rounds: Early-stopping patience for the evaluation.
            is_stratified: Whether the folds should be stratified.
            is_shuffle: Whether the data should be shuffled before folding.
        """
        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.categorical_columns_indices = categorical_columns_indices
        self.n_fold = n_fold
        self.seed = seed
        self.early_stopping_rounds = early_stopping_rounds
        self.is_stratified = is_stratified
        self.is_shuffle = is_shuffle

    def update_model(self, **kwargs):
        """Apply the given parameter values to ``self.model``.

        The model's own ``set_params`` is used when it exists, so the values
        really become the estimator's parameters. Setting plain attributes
        does not guarantee that, so it is only the fallback for models
        without ``set_params``.
        """
        set_params = getattr(self.model, 'set_params', None)
        if callable(set_params):
            set_params(**kwargs)
            return
        for name, value in kwargs.items():
            setattr(self.model, name, value)

    def evaluate_model(self):
        """Return the objective value for the current model parameters.

        Raises:
            NotImplementedError: Always; subclasses must override this hook.
        """
        raise NotImplementedError('evaluate_model must be implemented by a subclass')

    def optimize(self, param_space, max_evals=10, n_random_starts=2):
        """Search `param_space` for the parameters minimising `evaluate_model`.

        Prints a summary, applies the best values to the model and draws the
        convergence plot.

        Args:
            param_space: List of named skopt dimensions, one per parameter.
            max_evals: Total number of objective evaluations (``n_calls``).
            n_random_starts: Number of evaluations passed to ``gp_minimize``
                as ``n_random_starts``.

        Returns:
            dict mapping each dimension name to its best value found.
        """
        start_time = time.time()

        @use_named_args(param_space)
        def _minimize(**params):
            # Route through update_model so trials and the final update
            # apply parameters in exactly the same way.
            self.update_model(**params)
            return self.evaluate_model()

        opt = gp_minimize(_minimize, param_space, n_calls=max_evals, n_random_starts=n_random_starts, random_state=2405, n_jobs=-1)
        optimal_values = dict(zip([param.name for param in param_space], opt.x))
        best_score = opt.fun
        self.best_score = best_score
        self.opt = opt

        print('optimal_parameters: {}\noptimal score: {}\noptimization time: {}'.format(optimal_values, best_score, time.time() - start_time))
        print('updating model with optimal values')
        # After the search the model still holds the parameters of the last
        # trial; overwrite them with the best ones.
        self.update_model(**optimal_values)
        plot_convergence(opt)
        return optimal_values
    
class CatboostOptimizer(ModelOptimizer):
    """`ModelOptimizer` whose objective is a CatBoost cross-validation score."""

    @staticmethod
    def _select_test_scores(validation_scores, params):
        """Pick the per-iteration mean test score column from the cv result.

        The column is looked up by name (``test-<metric>-mean``) using the
        configured ``eval_metric``, or ``loss_function`` when no eval metric
        is set. A purely positional lookup depends on the column layout of
        the returned frame and can silently pick a std column or another
        metric. If the named column is absent, the original positional
        lookup (third column) is used.
        """
        metric = params.get('eval_metric') or params.get('loss_function')
        if metric is not None:
            column = 'test-{}-mean'.format(metric)
            if column in validation_scores.columns:
                return validation_scores[column]
        return validation_scores.iloc[:, 2]

    def evaluate_model(self):
        """Cross-validate the current model parameters with ``catboost.cv``.

        The full cv result frame is kept on ``self.scores``.

        Returns:
            ``1 - best`` where ``best`` is the maximum of the selected test
            score column, so that minimising the return value maximises the
            score. This assumes a higher-is-better metric such as the AUC
            configured in this notebook.
        """
        params = self.model.get_params()
        pool = catboost.Pool(
            self.X_train,
            self.y_train,
            cat_features=self.categorical_columns_indices,
        )
        validation_scores = catboost.cv(
            pool,
            params,
            nfold=self.n_fold,
            stratified=self.is_stratified,
            seed=self.seed,
            early_stopping_rounds=self.early_stopping_rounds,
            shuffle=self.is_shuffle,
            verbose=100,
            plot=False,
        )
        self.scores = validation_scores
        test_scores = self._select_test_scores(validation_scores, params)
        best_metric = test_scores.max()
        return 1 - best_metric

In [None]:
# Positions of the object-dtype columns of X_train.
is_object_column = (X_train.dtypes == 'object').values
categorical_features_indices = np.flatnonzero(is_object_column)
categorical_features_indices

In [None]:
# Base classifier for the search; only `learning_rate` is tuned below.
cb = catboost.CatBoostClassifier(
    n_estimators=4000,      # deliberately large so that early stopping picks the tree count
    loss_function='Logloss',
    eval_metric='AUC',
    boosting_type='Plain',  # classic boosting scheme; per the CatBoost docs 'Ordered' is the permutation-based one
    random_seed=1994,
    use_best_model=True,
)
cb_optimizer = CatboostOptimizer(
    cb,
    X_train,
    Y_train,
    categorical_columns_indices=categorical_features_indices,
)
# One-dimensional search space: learning rate between 0.01 and 0.8.
params_space = [Real(0.01, 0.8, name='learning_rate')]
cb_optimal_values = cb_optimizer.optimize(params_space)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
# Final classifier with fixed hyper-parameters; both the training and the
# validation split are passed as evaluation sets during fitting.
m = CatBoostClassifier(
    n_estimators=3000,
    random_state=1994,
    eval_metric='AUC',
    max_depth=6,
    learning_rate=0.029,
    od_wait=50,
    l2_leaf_reg=5,
    cat_features=categorical_features_indices,
    bagging_temperature=0.85,
    random_strength=100,
    use_best_model=True,
)
m.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_val, Y_val)],
    early_stopping_rounds=100,
    verbose=100,
)
# Keep the last column of the predicted class probabilities for the validation rows.
p2 = m.predict_proba(X_val)[:, -1]


In [None]:
# (importance, column name) pairs, most important first.
sorted(zip(m.feature_importances_, X_train.columns), reverse=True)



In [None]:
# Feature importances of the fitted model, indexed by column name and sorted
# from least to most important. Built as one chain so the cell neither
# rebinds the global X_train nor mutates a frame in place.
feature = (
    pd.DataFrame({
        'name': X_train.columns,
        'importance': m.feature_importances_,
    })
    .sort_values(by=['importance'], ascending=True)
    .set_index('name')
)

feature.plot(kind='bar', color='orange', figsize=(15, 5), fontsize=10)
