#Classification problem using Happy Customer Bank Dataset:


###*Problem statement:* 
Happy Customer Bank is a mid-sized private bank that deals in all kinds of banking products, like savings accounts, current accounts, investment products, credit products, among other offerings.
 
The bank also cross-sells products to its existing customers and to do so they use different kinds of communication like tele-calling, e-mails, recommendations on net banking, mobile banking, etc.
 
In this case, Happy Customer Bank wants to cross-sell its credit cards to its existing customers. The bank has identified a set of customers that are eligible for taking these credit cards.
 
Now, the bank is looking for your help in identifying customers that could show higher intent towards a recommended credit card, given:
 
Customer details (gender, age, region, etc.)
Details of his/her relationship with the bank (Channel_Code, Vintage, 'Avg_Asset_Value, etc.)

##Pre-requisites:

In [10]:
import pandas as pd
import numpy as np

#Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

#For Missing Value and Feature Engineering
from sklearn.feature_selection import SelectKBest, chi2, f_classif, VarianceThreshold
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

import time

##Dataset:

In [2]:
train = pd.read_csv("/content/train_s3TEQDk.csv")
test = pd.read_csv("/content/test_mSzZ8RL.csv")

In [3]:
train.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [4]:
test.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
1,CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
2,VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
3,TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
4,SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No


In [16]:
X = train.drop(['ID', 'Is_Lead'], axis=1)
y = train['Is_Lead']

In [17]:
numerical_features = [c for c, dtype in zip(X.columns, X.dtypes)
                     if dtype.kind in ['i','f']]
categorical_features = [c for c, dtype in zip(X.columns, X.dtypes)
                     if dtype.kind not in ['i','f']]

In [7]:
numerical_features

['Age', 'Vintage', 'Avg_Account_Balance']

In [8]:
categorical_features

['Gender',
 'Region_Code',
 'Occupation',
 'Channel_Code',
 'Credit_Product',
 'Is_Active']

In [18]:
for i in categorical_features:
  X[i]=X[i].astype('category').cat.codes.values

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X,  y, test_size=0.3, random_state=0) 

In [22]:
X_train

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
42989,0,36,30,3,1,15,0,714598,0
233556,0,27,33,2,0,14,0,664082,1
196355,0,41,18,3,2,57,-1,625474,0
121798,1,26,24,2,0,14,0,1095999,0
230025,0,31,22,1,0,19,0,786030,0
...,...,...,...,...,...,...,...,...,...
176963,1,48,18,3,2,91,0,720463,1
117952,1,31,2,1,0,20,0,423969,0
173685,1,26,19,2,0,19,0,966095,0
43567,1,51,27,1,1,99,-1,1147334,1


In [23]:
preprocessor = make_column_transformer(
    
    (make_pipeline(
    KNNImputer(n_neighbors=2, weights="uniform")), numerical_features),
    
    (make_pipeline(
    SimpleImputer(strategy = 'most_frequent', fill_value = 'missing')),
     categorical_features)
    
)

In [24]:
preprocessor_bst = make_pipeline(preprocessor, 
                                  VarianceThreshold())
XGB_Model = make_pipeline(preprocessor_bst, XGBClassifier())

##Model:


In [13]:
param_grid = {
 "xgbclassifier__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "xgbclassifier__max_depth"        : [ 3, 4, 5, 6],
 "xgbclassifier__min_child_weight" : [ 1, 3, 5 ],
 "xgbclassifier__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "xgbclassifier__colsample_bytree" : [  0.5 , 0.7,1.0 ]   
}

In [25]:
from sklearn.model_selection import RandomizedSearchCV
xgb_RandomGrid = RandomizedSearchCV(estimator = XGB_Model, param_distributions = param_grid, cv = 5, verbose=3, n_jobs = -1, scoring = 'roc_auc', n_iter = 5)

In [26]:
xgb_RandomGrid.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  2.7min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('pipeline',
                                              Pipeline(memory=None,
                                                       steps=[('columntransformer',
                                                               ColumnTransformer(n_jobs=None,
                                                                                 remainder='drop',
                                                                                 sparse_threshold=0.3,
                                                                                 transformer_weights=None,
                                                                                 transformers=[('pipeline-1',
                                                                                                Pipeline(memory=None,
                                                                  

In [27]:
xgb_RandomGrid.best_estimator_

Pipeline(memory=None,
         steps=[('pipeline',
                 Pipeline(memory=None,
                          steps=[('columntransformer',
                                  ColumnTransformer(n_jobs=None,
                                                    remainder='drop',
                                                    sparse_threshold=0.3,
                                                    transformer_weights=None,
                                                    transformers=[('pipeline-1',
                                                                   Pipeline(memory=None,
                                                                            steps=[('knnimputer',
                                                                                    KNNImputer(add_indicator=False,
                                                                                               copy=True,
                                                                              

In [28]:
print(f'Train : {xgb_RandomGrid.score(X_train, y_train):.3f}')
print(f'Test : {xgb_RandomGrid.score(X_test, y_test):.3f}')

Train : 0.878
Test : 0.875


In [30]:
from sklearn.metrics import roc_auc_score
print(f'Train AUC : {roc_auc_score(y_train, xgb_RandomGrid.predict_proba(X_train)[:,1]):.3f}')
print(f'Test AUC : {roc_auc_score(y_test, xgb_RandomGrid.predict_proba(X_test)[:,1]):.3f}')

Train AUC : 0.878
Test AUC : 0.875


In [32]:
for i in categorical_features:
  test[i]=test[i].astype('category').cat.codes.values

In [33]:
test_pred = xgb_RandomGrid.predict_proba(test[X.columns])[:,1]

In [34]:
Sub = pd.DataFrame({ 'ID': test['ID'],
                       'Is_Lead' : test_pred
    
})

In [35]:
Sub.to_csv('XGB_HappyBank.csv', index = False)