In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

# Prediction Algorithms

* Decision Tree

In [3]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

 * Random Forest

In [4]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

* Gradient Boosting - XGBoost

In [5]:
def create_XGB():
    '''Create a Gradient Boost Model'''
    return XGBClassifier()

# Prediction

In [6]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
dataset.rename(columns={'no. of municipalities with inhabitants < 499 ': 'no. of municipalities with inhabitants less than 499',
                         'no. of municipalities with inhabitants >10000 ': 'no. of municipalities with inhabitants more than 10000'}, 
                 inplace=True)
display(dataset.head())
list(dataset.columns)

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age,salary_over_payments,status
0,0.0,96396,12,8033,0.054011,0,1,12250.0,20100.0,1100.0,...,100,2985,2804,1,1,1,0.272076,105,1617,-1
1,0.262785,52128,24,2172,0.19301,1,0,33459.680282,59944.2,144.2,...,100,2985,2804,1,1,1,0.533465,264,7478,1
2,0.004721,165960,36,4610,0.024623,1,0,52083.859459,120512.8,700.0,...,117,2854,2618,1,1,0,0.730073,148,3759,1
3,0.018096,127080,60,2118,0.020651,1,0,30060.954167,49590.4,800.0,...,132,2080,2122,1,1,0,0.029255,170,6272,1
4,0.306845,74736,36,2076,0.130262,1,0,37912.998507,62084.0,700.0,...,132,2080,2122,1,1,0,0.670547,399,6314,1


['date',
 'amount',
 'duration',
 'payments',
 'account_creation_date',
 'frequency_MI',
 'frequency_WI',
 'balance_mean',
 'balance_max',
 'balance_min',
 'last_ballance',
 'credit_mean',
 'credit_count',
 'credit_max',
 'credit_min',
 'withdrawal_mean',
 'withdrawal_count',
 'withdrawal_max',
 'withdrawal_min',
 'operation_CC',
 'operation_CAB',
 'operation_WC',
 'operation_RAB',
 'operation_CCW',
 'mean_trans_profit',
 'total_ops',
 'total_trans',
 'name ',
 'region',
 'no. of inhabitants',
 'no. of municipalities with inhabitants less than 499',
 'no. of municipalities with inhabitants 500-1999',
 'no. of municipalities with inhabitants 2000-9999 ',
 'no. of municipalities with inhabitants more than 10000',
 'no. of cities ',
 'ratio of urban inhabitants ',
 'average salary ',
 "unemploymant rate '95 ",
 "unemploymant rate '96 ",
 'no. of enterpreneurs per 1000 inhabitants ',
 "no. of commited crimes '95 ",
 "no. of commited crimes '96 ",
 'owner_count',
 'disponent_count',
 'owner

In [7]:
# Useful MACROS
STATUS_COL = dataset.columns.get_loc("status")
K_FOLD_NUM_SPLITS = 5
SEED = 42

In [8]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

In [11]:
#Hyper Parameter Tuning for XGBOOST adapted from "https://towardsdatascience.com/what-is-xgboost-and-how-to-optimize-it-d3c24e0e41b4"
#RANDOMIZED SEARCH
#imports
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

xgb_pipeline =XGBClassifier()

# set the parameters you want tune 
params = {
        'learning_rate': [0.03, 0.01, 0.003, 0.001],
        'min_child_weight': [1,3, 5,7, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 2.5, 5],
        'subsample': [0.6, 0.8, 1.0, 1.2, 1.4],
        'colsample_bytree': [0.6, 0.8, 1.0, 1.2, 1.4],
        'max_depth': [3, 4, 5, 6, 7, 8, 9 ,10, 12, 14],
        'reg_lambda':np.array([0.4, 0.6, 0.8, 1, 1.2, 1.4])}


# let's run the optimization
random_search = RandomizedSearchCV(xgb_pipeline, param_distributions=params, n_iter=1000,
                                   scoring="roc_auc", n_jobs=-1,  verbose=3, random_state=42, cv=KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False))
# n_iter : number of iteration
# scoring : loss 
# n_jobs : parallel computation if -1 means use all the threads available
# cv : number of folds of the cross-validation 

# fit the model on training data with specific parameters
random_search.fit(X,y)


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 2936 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 4344 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   48.9s finished
  return f(**kwargs)


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=False),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=n...
                                           verbosity=None),
                   n_iter=1000, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.6, 0.8, 1.0, 1.2,
                                                             1.4],
                

In [12]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=2.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=0.6, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
#GRIDSEARCH
    
# param to be tested 
gbm_param_grid = {
    'learning_rate': np.array(range(int(100*random_search.best_params_["learning_rate"])-1, int(100*random_search.best_params_["learning_rate"])+2, 1))/100,
    'subsample': np.array(range(int(10*random_search.best_params_["subsample"])-1, int(10*random_search.best_params_["subsample"])+2, 1))/10,
    'reg_lambda': np.array(range(int(10*random_search.best_params_["reg_lambda"])-1, int(10*random_search.best_params_["reg_lambda"])+2, 1))/10,
    'max_depth':np.array(range(random_search.best_params_["max_depth"]-1, random_search.best_params_["max_depth"]+3, 1)),
    'colsample_bytree': np.array(range(int(10*random_search.best_params_["colsample_bytree"])-1, int(10*random_search.best_params_["colsample_bytree"])+2, 1))/10,
    'min_child_weight': np.array(range(int(10*random_search.best_params_["min_child_weight"])-1, int(10*random_search.best_params_["min_child_weight"])+2, 1))/10
}
#'gamma': np.array(range(int(10*random_search.best_params_["gamma"])-3, int(10*random_search.best_params_["gamma"])+3, 1))/10,


# configure the gridsearch 
grid_search = GridSearchCV(estimator=xgb_pipeline, param_grid=gbm_param_grid, n_jobs=-1, cv=KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False),
                         scoring='roc_auc', verbose=10 )
# n_jobs : number of thread in parallel if -1 means max thread used
# cv : number of fold in cross-validation

# train the models 
grid_search.fit(X,y)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1349s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 242 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 314 tas

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=False),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, mon...
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bytree': array([0.5, 0.6, 0.7]),
                

In [15]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.04, max_delta_step=0, max_depth=5,
              min_child_weight=1.1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=0.6, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
# Cross validation settings
auc_scores = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
# classifier = create_DT()
# classifier = create_RF()
classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.04, max_delta_step=0, max_depth=5,
              min_child_weight=1.1, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=0.6, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train,y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc_scores.append(metrics.auc(fpr, tpr))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())

In [26]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('AUC scores:', auc_scores)
print('> Average: ', sum(auc_scores)/len(auc_scores))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.04, max_delta_step=0, max_depth=5,
              min_child_weight=1.1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=0.6, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None) 

AUC scores: [0.5909090909090908, 0.6788793103448276, 0.7, 0.8333333333333334, 0.6153846153846154]
> Average:  0.6837012699943734
          	Predic NO	Predic YES
Actual NO 	2        	9         
Actual YES	0        	55        
          	Predic NO	Predic YES
Actual NO 	3        	5         
Actual YES	1        	57        
          	Predic NO	Predic YES
Actual NO 	2        	3         
Ac

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [None]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
ids = utils.read_csv_to_df('dataset/ids.csv')
display(test_dataset.head())

In [None]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

In [None]:
# Using the model to get the 'status' predictions
test_dataset.rename(columns={'no. of municipalities with inhabitants < 499 ': 'no. of municipalities with inhabitants less than 499',
                         'no. of municipalities with inhabitants >10000 ': 'no. of municipalities with inhabitants more than 10000'}, 
                 inplace=True)
display(test_dataset)
predictions_df = test_dataset.copy()
predictions_df['Predicted'] = classifier.predict(test_dataset)
predictions_df = ids.merge(predictions_df, on=['date', 'amount'])
predictions_df = predictions_df[['loan_id', 'Predicted']]\
                    .rename(columns={
                        'loan_id': 'Id'
                    })\
                    .drop_duplicates()

display(predictions_df)

In [None]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction2.csv')