In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import recall_score, precision_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PowerTransformer, PolynomialFeatures, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
import imblearn
from imblearn.over_sampling import SMOTE
pd.set_option('display.max_columns', None)

import joblib

https://www.kaggle.com/volodymyrgavrysh/bank-marketing-campaigns-dataset

__Conducted campaigns were based mostly on direct phone calls, offering bank client to place a term deposit.
If after all marking afforts client had agreed to place deposit - target variable marked 'yes', otherwise 'no'__

# Feature Desc

### bank client data:

1. age
    - (numeric)

2. job : type of job 
    - (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")

3. marital : marital status 
    - (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)

4. education 
    - (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")

5. default: has credit in default? (lalai bayar (?))
    - (categorical: "no","yes","unknown")

6. housing: has housing loan? 
    - (categorical: "no","yes","unknown")

7. loan: has personal loan?
    - (categorical: "no","yes","unknown")
    
### related with the last contact of the current campaign:
8. contact: contact communication type
    - (categorical: "cellular","telephone")
9. month: last contact month of year
    - (categorical: "jan", "feb", "mar", …, "nov", "dec")

10. dayofweek: last contact day of the week
    - (categorical: "mon","tue","wed","thu","fri")

11. duration: last contact duration, in seconds (numeric). ==> (obvious feature (?))
    - __Important note__: this attribute highly affects the output target (e.g., if duration=0 then y="no"). 
    - Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

### other attributes:
12. campaign: number of contacts performed during this campaign and for this client 
    - (numeric, includes last contact)

13. pdays: number of days that passed by after the client was last contacted from a previous campaign
    - (numeric; 999 means client was not previously contacted)

14. previous: number of contacts performed before this campaign and for this client
    - (numeric)

15. poutcome: outcome of the previous marketing campaign 
    - (categorical: "failure","nonexistent","success")

### social and economic context attributes
16. emp.var.rate: employment variation rate - quarterly indicator (???)
    - (numeric)

17. cons.price.idx: consumer price index - monthly indicator
    - changes in the price level of a weighted average __market basket__ of consumer goods and services purchased by households
    - affect inflation
    - (numeric)

18. cons.conf.idx: consumer confidence index - monthly indicator
    - degree of __consumers optimism__ are expressing through their activities of savings and spending.
    - affect consumer behavior
    - (numeric)

19. euribor3m: euribor 3 month rate - daily indicator
    - Euribor (euro interbank offered rate)
    - ??
    - (numeric)

20. nr.employed: number of employees - quarterly indicator
    - Number of employed persons for a quarter. (for the bank ??)
    - (numeric)

### Output variable (desired target):

21. y - has the client subscribed a term deposit?
    - (binary: "yes","no")

In [2]:
df = pd.read_csv('bank-additional-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df.education = df.education.apply(lambda x : x.replace(".", " "))

# ML

### GOALS 

__Nurunin jumlah False Positif__ 

__Naikin Nilai Precision kelas 1 (YES)__

### PR

1. coba label encoder untuk categorical

2. coba di bin age, duration(per 180/3 meni, yng diatas 1000 jadi 1000++)

In [4]:
df_1 = df.drop(columns=['duration', 'campaign'
#                         , 'emp.var.rate', 'nr.employed', 'day_of_week', 'housing'
                       ])

In [5]:
df_1.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic 4y,no,no,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high school,unknown,no,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high school,no,yes,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic 6y,no,no,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high school,no,no,yes,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [6]:
df_1['y'][df_1['y'] == 'no'] = 0
df_1['y'][df_1['y'] == 'yes'] = 1

In [7]:
df_1['y'].value_counts()

0    36548
1     4640
Name: y, dtype: int64

1 ==> Nasabah subscribe deposit (yes)

0 ==> Nasabah TIDAK subscribe deposit (no)

In [8]:
df_1['y'] = df_1['y'].astype('int')

## Drop beberapa

education = illiterate & age < 20 (teenager) karena sangat sedikit jadi di drop

In [9]:
# df_1 = df_1[df_1['education'] != 'illiterate']

In [10]:
# df_1['education'].value_counts()

In [11]:
# df_1 = df_1[df_1['age'] >= 20]

In [12]:
# df_1['age'][df_1['age']<20]

In [13]:
# df_1['y'].value_counts()

## Cek Imbalance Data

In [14]:
round(df_1['y'].value_counts()/len(df_1)*100, 2)

0    88.73
1    11.27
Name: y, dtype: float64

## Binning Data

Lakukan binning pada kolom age

__AGE__

In [15]:
# age_bin = [0,10,20,40,df_1['age'].max()]
# label_age = [1, 2, 3, 4]
# df_1['age_bin'] = pd.cut(df_1['age'], bins=age_bin, labels=label_age)
# df_1['age_bin'] = df_1['age_bin'].astype(int)

In [16]:
# df_1 = df_1.drop(columns='age')

## Label Encoding

Lakukan Label Encoding pada kolom education

In [17]:
df_1['education'] = df_1['education'].map({'basic 4y':1
                              , 'basic 6y':2
                              , 'basic 9y' :3
                              , "high school":4
                              , "university degree":5
                              , 'professional course':6
                              , 'illiterate':0
                              , 'unknown':0
                             })

## Splitting Data

In [18]:
X = df_1.drop(columns='y')
y = df_1['y']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.20, random_state=42)

## Over Sampling

Melakukan teknik Oversampling untuk mengatasa data imbalance sebelum modelling

In [20]:
df_train = pd.concat([X_train,y_train], axis=1)

In [21]:
df_train.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
25611,49,blue-collar,married,3,unknown,no,no,cellular,nov,wed,999,0,nonexistent,-0.1,93.2,-42.0,4.12,5195.8,0
26010,37,entrepreneur,married,5,no,no,no,telephone,nov,wed,999,1,failure,-0.1,93.2,-42.0,4.12,5195.8,0
40194,78,retired,married,1,no,no,no,cellular,jul,mon,999,0,nonexistent,-1.7,94.215,-40.3,0.87,4991.6,1
297,36,admin.,married,5,no,yes,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
36344,59,retired,divorced,5,no,no,no,cellular,jun,tue,999,0,nonexistent,-2.9,92.963,-40.8,1.262,5076.2,0


In [22]:
df_train['y'].value_counts()

0    29238
1     3712
Name: y, dtype: int64

In [23]:
non_default = df_train[df_train['y'] == 0] # kelas majority

In [24]:
default = df_train[df_train['y'] == 1] # kelas minority

In [25]:
default_oversample = resample(default, 
                           replace = True, 
                           n_samples = len(non_default),
                           random_state=42)

In [26]:
df_OverSample= pd.concat([non_default, default_oversample])

In [27]:
df_OverSample['y'].value_counts()

1    29238
0    29238
Name: y, dtype: int64

In [28]:
X_train_OS = df_OverSample.drop(columns='y')
y_train_OS = df_OverSample['y']

## Pipeline

In [29]:
df_1.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,1,no,no,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,4,unknown,no,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,4,no,yes,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,2,no,no,no,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,4,no,no,yes,telephone,may,mon,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [30]:
num_columns = ['age', 'education', 'pdays', 'previous', 'cons.price.idx', 'cons.conf.idx', 'euribor3m'
               , 'nr.employed', 'emp.var.rate'
              ]

cat_columns = [i for i in df_1.columns if (i not in num_columns) & (i!='y')]

In [31]:
numeric_pipeline = Pipeline([
    ('scaler', RobustScaler()),
#     ('poly', PolynomialFeatures(degree=2, include_bias=False)),
#     ('power', PowerTransformer(method='yeo-johnson'))
])

categoric_pipeline = Pipeline([
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('numeric', numeric_pipeline, num_columns),
    ('categorical', categoric_pipeline, cat_columns)
])

pipe_RF = Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestClassifier(random_state=42))
])

pipe_KNN = Pipeline([
    ('prep', preprocessor),
    ('algo', KNeighborsClassifier())
])

pipe_SVM = Pipeline([
    ('prep', preprocessor),
    ('algo', SVC(max_iter=600))
])

## Def Eva Metrix

Fungsi def yg isinya evaluation metrix.
Evaluation matrix yang dipakai kali ini adalah Cunfusion matrix dan Classification report

In [32]:
def conf_mat (Model, X_train, X_test, y_train, y_test,Nama):
    y_pred_test = Model.predict(X_test)
    cm_test = confusion_matrix(y_test, y_pred_test, labels=[1,0])
    df_test = pd.DataFrame(cm_test, index = ['Akt1', 'Akt0'], columns=['Pred1', 'Pred0'])
    print( 'Classification report data TEST ' + Nama + '\n\n', classification_report(y_test, y_pred_test))
    print('\nROC AUC test :', round(roc_auc_score(y_test, y_pred_test), 2), '\n')
    print('\nConfusion matrix data test ' + Nama + '\n\n')
    print(df_test)
    print('='*100)
    y_pred_train = Model.predict(X_train)
    cm_train = confusion_matrix(y_train, y_pred_train, labels=[1,0])
    df_train = pd.DataFrame(cm_train, index = ['Akt1', 'Akt0'], columns=['Pred1', 'Pred0'])
    print( 'Classification report data TRAIN ' + Nama + '\n\n', classification_report(y_train, y_pred_train))
    print('\nROC AUC train :', round(roc_auc_score(y_train, y_pred_train), 2), '\n')
    print('\nConfusion matrix data train ' + Nama + '\n\n')
    print(df_train)

In [33]:
def prec_rec (Model, X_test, y_test, Nama):
    data = {}
    prec = []
    rec = []
    for i in Model :
        y_pred_ts = i.predict(X_test)
        precision = precision_score(y_test, y_pred_ts)
        recall = recall_score(y_test, y_pred_ts)
        prec.append(precision)
        rec.append(recall)
    for j in range (len(Nama)):
        data[Nama[j]] = [prec[j], rec[j]]
    
    df = pd.DataFrame(data, index=['Precison', 'Recall'])
    return df

## Modelling

__RF__

In [34]:
pipe_RF.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [35]:
conf_mat(pipe_RF, X_train_OS, X_test, y_train_OS, y_test, 'RF Base')

Classification report data TEST RF Base

               precision    recall  f1-score   support

           0       0.93      0.94      0.93      7310
           1       0.45      0.41      0.43       928

    accuracy                           0.88      8238
   macro avg       0.69      0.67      0.68      8238
weighted avg       0.87      0.88      0.87      8238


ROC AUC test : 0.67 


Confusion matrix data test RF Base


      Pred1  Pred0
Akt1    382    546
Akt0    473   6837
Classification report data TRAIN RF Base

               precision    recall  f1-score   support

           0       1.00      0.98      0.99     29238
           1       0.99      1.00      0.99     29238

    accuracy                           0.99     58476
   macro avg       0.99      0.99      0.99     58476
weighted avg       0.99      0.99      0.99     58476


ROC AUC train : 0.99 


Confusion matrix data train RF Base


      Pred1  Pred0
Akt1  29228     10
Akt0    442  28796


__Default__
- 'algo__n_estimators': 100,
- 'algo__max_depth': None,
- 'algo__min_samples_leaf': 1,
- 'algo__class_weight': None

__KNN__

In [36]:
pipe_KNN.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [37]:
conf_mat(pipe_KNN, X_train_OS, X_test, y_train_OS, y_test, 'KNN Base')

Classification report data TEST KNN Base

               precision    recall  f1-score   support

           0       0.94      0.81      0.87      7310
           1       0.27      0.57      0.37       928

    accuracy                           0.78      8238
   macro avg       0.60      0.69      0.62      8238
weighted avg       0.86      0.78      0.81      8238


ROC AUC test : 0.69 


Confusion matrix data test KNN Base


      Pred1  Pred0
Akt1    531    397
Akt0   1424   5886
Classification report data TRAIN KNN Base

               precision    recall  f1-score   support

           0       1.00      0.86      0.92     29238
           1       0.87      1.00      0.93     29238

    accuracy                           0.93     58476
   macro avg       0.93      0.93      0.93     58476
weighted avg       0.93      0.93      0.93     58476


ROC AUC train : 0.93 


Confusion matrix data train KNN Base


      Pred1  Pred0
Akt1  29115    123
Akt0   4194  25044


__SVM__

In [38]:
pipe_SVM.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [39]:
conf_mat(pipe_SVM, X_train_OS, X_test, y_train_OS, y_test, 'SVM Base')

Classification report data TEST SVM Base

               precision    recall  f1-score   support

           0       0.35      0.01      0.03      7310
           1       0.09      0.80      0.17       928

    accuracy                           0.10      8238
   macro avg       0.22      0.41      0.10      8238
weighted avg       0.32      0.10      0.04      8238


ROC AUC test : 0.41 


Confusion matrix data test SVM Base


      Pred1  Pred0
Akt1    741    187
Akt0   7208    102
Classification report data TRAIN SVM Base

               precision    recall  f1-score   support

           0       0.07      0.02      0.02     29238
           1       0.45      0.79      0.57     29238

    accuracy                           0.40     58476
   macro avg       0.26      0.40      0.30     58476
weighted avg       0.26      0.40      0.30     58476


ROC AUC train : 0.4 


Confusion matrix data train SVM Base


      Pred1  Pred0
Akt1  23095   6143
Akt0  28792    446


__XGB__

In [101]:
pipe_XGB = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBClassifier())
])

In [102]:
pipe_XGB.fit(X_train_OS, y_train_OS)



Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [103]:
conf_mat(pipe_XGB, X_train_OS, X_test, y_train_OS, y_test, 'XGB Base')

Classification report data TEST XGB Base

               precision    recall  f1-score   support

           0       0.94      0.88      0.91      7310
           1       0.39      0.59      0.47       928

    accuracy                           0.85      8238
   macro avg       0.67      0.74      0.69      8238
weighted avg       0.88      0.85      0.86      8238


ROC AUC test : 0.74 


Confusion matrix data test XGB Base


      Pred1  Pred0
Akt1    552    376
Akt0    870   6440
Classification report data TRAIN XGB Base

               precision    recall  f1-score   support

           0       0.81      0.90      0.85     29238
           1       0.89      0.79      0.84     29238

    accuracy                           0.85     58476
   macro avg       0.85      0.85      0.85     58476
weighted avg       0.85      0.85      0.85     58476


ROC AUC train : 0.85 


Confusion matrix data train XGB Base


      Pred1  Pred0
Akt1  23185   6053
Akt0   2904  26334


### Perbandingan

In [104]:
prec_rec ([pipe_KNN, pipe_SVM, pipe_RF, pipe_XGB], X_test, y_test, ['KNN_Base', 'SVM_Base', 'RF_Base', 'XGB_Base'])

Unnamed: 0,KNN_Base,SVM_Base,RF_Base,XGB_Base
Precison,0.271611,0.093219,0.446784,0.388186
Recall,0.572198,0.798491,0.411638,0.594828


_____________

## HYPERPARAMETER TUNING RANDOM FOREST

In [42]:
skf = StratifiedKFold(n_splits=3)

https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65

__TUNING 1__

In [43]:
param_RF = {
    'algo__n_estimators' : np.arange(100, 1000, 100),
    'algo__max_depth' : list(np.arange(10, 100, 10)) + [None],
    'algo__min_samples_leaf' : list(np.arange(10, 100, 10)) + [1],
    'algo__class_weight' : [None, {0:.4, 1:.6}, {0:.3, 1:.7}, {0:.2, 1:.8}, {0:.15, 1: .85}, {0:.1, 1:.9}]
}

In [44]:
RF_RS = RandomizedSearchCV(pipe_RF, param_RF, cv=skf, n_iter=200, n_jobs=-1, verbose=1, random_state=42, scoring='precision')

In [64]:
RF_RS.fit(X_train_OS, y_train_OS)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 23.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 50.2min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 68.1min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('prep',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                RobustScaler())]),
                                                                               ['age',
                                                                                'education',
                                                                                'pdays',
                                                                                'previous',
                                                                                'cons.price.idx',
                                                                                'cons.conf.

In [65]:
RF_RS.best_params_

{'algo__n_estimators': 100,
 'algo__min_samples_leaf': 1,
 'algo__max_depth': 90,
 'algo__class_weight': {0: 0.2, 1: 0.8}}

In [72]:
# tab_hyper_RF = pd.DataFrame(RF_RS.cv_results_)[['param_algo__n_estimators', 'param_algo__max_depth', 'param_algo__min_samples_leaf', 'param_algo__class_weight', 'mean_test_score']]
# tab_hyper_RF = (tab_hyper_RF.sort_values(by='mean_test_score', ascending=False).reset_index()).drop(columns='index')
# tab_hyper_RF[(tab_hyper_RF['mean_test_score'] > 0.7) & (tab_hyper_RF['mean_test_score'] < 0.8)]

______________
_UNTUK yng best estimator_
_____________

In [69]:
RF_Tune = RF_RS.best_estimator_

In [73]:
RF_Tune.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [74]:
conf_mat(RF_Tune, X_train_OS, X_test, y_train_OS, y_test, 'RF Tuning best estimator')

Classification report data TEST RF Tuning best estimator

               precision    recall  f1-score   support

           0       0.93      0.94      0.93      7310
           1       0.45      0.41      0.43       928

    accuracy                           0.88      8238
   macro avg       0.69      0.67      0.68      8238
weighted avg       0.87      0.88      0.87      8238


ROC AUC test : 0.67 


Confusion matrix data test RF Tuning best estimator


      Pred1  Pred0
Akt1    379    549
Akt0    471   6839
Classification report data TRAIN RF Tuning best estimator

               precision    recall  f1-score   support

           0       1.00      0.98      0.99     29238
           1       0.98      1.00      0.99     29238

    accuracy                           0.99     58476
   macro avg       0.99      0.99      0.99     58476
weighted avg       0.99      0.99      0.99     58476


ROC AUC train : 0.99 


Confusion matrix data train RF Tuning best estimator


      Pred1 

______________
_{
    'algo__n_estimators' : 300,
    'algo__max_depth' : None,
    'algo__min_samples_leaf' : 10,
    'algo__class_weight' : None
}_
_____________

__TUNING 2__

In [75]:
param_RF2 = {
    'algo__n_estimators' : [10, 20, 30, 50, 80, 100],
    'algo__max_depth' : [None, 1, 3, 5, 8, 10],
    'algo__min_samples_leaf' : [1, 3, 5, 8, 10],
    'algo__class_weight' : [None, {0:.4, 1:.6}, {0:.3, 1:.7}, {0:.2, 1:.8}, {0:.1, 1:.9}]
}

In [76]:
RF_RS2 = RandomizedSearchCV(pipe_RF, param_RF2, cv=skf, n_iter=200, n_jobs=-1, verbose=1, random_state=42, scoring='precision')

In [77]:
RF_RS2.fit(X_train_OS, y_train_OS)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.8min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('prep',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                RobustScaler())]),
                                                                               ['age',
                                                                                'education',
                                                                                'pdays',
                                                                                'previous',
                                                                                'cons.price.idx',
                                                                                'cons.conf.

In [78]:
RF_RS2.best_params_

{'algo__n_estimators': 10,
 'algo__min_samples_leaf': 1,
 'algo__max_depth': None,
 'algo__class_weight': {0: 0.3, 1: 0.7}}

In [79]:
# tab_hyper_RF = pd.DataFrame(RF_RS.cv_results_)[['param_algo__n_estimators', 'param_algo__max_depth', 'param_algo__min_samples_leaf', 'param_algo__class_weight', 'mean_test_score']]
# tab_hyper_RF = (tab_hyper_RF.sort_values(by='mean_test_score', ascending=False).reset_index()).drop(columns='index')
# tab_hyper_RF[(tab_hyper_RF['mean_test_score'] > 0.8) & (tab_hyper_RF['mean_test_score'] < 0.95)]

______________
_UNTUK yng best estimator_
_____________

In [80]:
RF_Tune2 = RF_RS2.best_estimator_

In [81]:
RF_Tune2.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [82]:
conf_mat(RF_Tune2, X_train_OS, X_test, y_train_OS, y_test, 'RF Tuning 2 best estimator')

Classification report data TEST RF Tuning 2 best estimator

               precision    recall  f1-score   support

           0       0.92      0.94      0.93      7310
           1       0.41      0.36      0.39       928

    accuracy                           0.87      8238
   macro avg       0.67      0.65      0.66      8238
weighted avg       0.86      0.87      0.87      8238


ROC AUC test : 0.65 


Confusion matrix data test RF Tuning 2 best estimator


      Pred1  Pred0
Akt1    334    594
Akt0    473   6837
Classification report data TRAIN RF Tuning 2 best estimator

               precision    recall  f1-score   support

           0       1.00      0.98      0.99     29238
           1       0.98      1.00      0.99     29238

    accuracy                           0.99     58476
   macro avg       0.99      0.99      0.99     58476
weighted avg       0.99      0.99      0.99     58476


ROC AUC train : 0.99 


Confusion matrix data train RF Tuning 2 best estimator


    

__TUNING 3__

In [91]:
np.arange(50, 150, 20)

array([ 50,  70,  90, 110, 130])

In [92]:
param_RF3 = {
    'algo__n_estimators' : np.arange(60, 150, 20),
    'algo__max_depth' : list(np.arange(50, 150, 20)) + [None],
    'algo__min_samples_leaf' : np.arange(1, 10, 2),
    'algo__class_weight' : [None, {0:.4, 1:.6}, {0:.3, 1:.7}, {0:.2, 1:.8}, {0:.15, 1: .85}]
}

In [93]:
RF_RS3 = RandomizedSearchCV(pipe_RF, param_RF3, cv=skf, n_iter=200, n_jobs=-1, verbose=1, random_state=42, scoring='precision')

In [94]:
RF_RS3.fit(X_train_OS, y_train_OS)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 17.3min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('prep',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                RobustScaler())]),
                                                                               ['age',
                                                                                'education',
                                                                                'pdays',
                                                                                'previous',
                                                                                'cons.price.idx',
                                                                                'cons.conf.

In [95]:
RF_RS3.best_params_

{'algo__n_estimators': 60,
 'algo__min_samples_leaf': 1,
 'algo__max_depth': 90,
 'algo__class_weight': {0: 0.15, 1: 0.85}}

In [79]:
# tab_hyper_RF = pd.DataFrame(RF_RS.cv_results_)[['param_algo__n_estimators', 'param_algo__max_depth', 'param_algo__min_samples_leaf', 'param_algo__class_weight', 'mean_test_score']]
# tab_hyper_RF = (tab_hyper_RF.sort_values(by='mean_test_score', ascending=False).reset_index()).drop(columns='index')
# tab_hyper_RF[(tab_hyper_RF['mean_test_score'] > 0.8) & (tab_hyper_RF['mean_test_score'] < 0.95)]

______________
_UNTUK yng best estimator_
_____________

In [96]:
RF_Tune3 = RF_RS3.best_estimator_

In [97]:
RF_Tune3.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [98]:
conf_mat(RF_Tune3, X_train_OS, X_test, y_train_OS, y_test, 'RF Tuning 3 best estimator')

Classification report data TEST RF Tuning 3 best estimator

               precision    recall  f1-score   support

           0       0.92      0.93      0.93      7310
           1       0.43      0.39      0.41       928

    accuracy                           0.87      8238
   macro avg       0.68      0.66      0.67      8238
weighted avg       0.87      0.87      0.87      8238


ROC AUC test : 0.66 


Confusion matrix data test RF Tuning 3 best estimator


      Pred1  Pred0
Akt1    366    562
Akt0    479   6831
Classification report data TRAIN RF Tuning 3 best estimator

               precision    recall  f1-score   support

           0       1.00      0.98      0.99     29238
           1       0.98      1.00      0.99     29238

    accuracy                           0.99     58476
   macro avg       0.99      0.99      0.99     58476
weighted avg       0.99      0.99      0.99     58476


ROC AUC train : 0.99 


Confusion matrix data train RF Tuning 3 best estimator


    

### PERBANDINGAN PRECISION MODEL PASCA HYPER

================================================================================================

In [99]:
prec_rec ([pipe_KNN, pipe_SVM, pipe_RF, RF_Tune, RF_Tune2, RF_Tune3], X_test, y_test, ['KNN_Base', 'SVM_Base', 'RF_Base', 'RF_HPT_BE 1', 'RF_HPT_BE 2', 'RF_HPT_BE 3'])

Unnamed: 0,KNN_Base,SVM_Base,RF_Base,RF_HPT_BE 1,RF_HPT_BE 2,RF_HPT_BE 3
Precison,0.271611,0.093219,0.446784,0.445882,0.413879,0.433136
Recall,0.572198,0.798491,0.411638,0.408405,0.359914,0.394397


_____________

## HYPERPARAMETER TUNING XGB

__DEFAULT PARAMETER__

1. 'algo__n_estimators': 100,
2. 'algo__max_depth': 6,
3. 'algo__learning_rate': 0.300000012,
4. 'algo__gamma': 0,
5. 'algo__colsample_bytree': 1,
6. 'algo__subsample': 1,
7. 'algo__reg_alpha': 0,
8. 'algo__reg_lambda': 1

__TUNING 1__

In [119]:
param_XGB = {
    'algo__n_estimators' : np.arange(100, 600, 100),
    'algo__max_depth' : [2, 3, 5, 6, 8, 10],
    'algo__learning_rate' : list(np.logspace(-3, 0, 4)) + [0.300000012],
    'algo__gamma' : np.logspace(-3, 2, 6)
#     'algo__colsample_bytree' : [0.3, 0.5, 0.7, 0.8],
#     'algo__subsample' : [0.3, 0.5, 0.7, 0.8],
#     'algo__reg_alpha' : np.logspace(-3, 3, 7),
#     'algo__reg_lambda' : np.logspace(-3, 3, 7)
}

In [120]:
XGB_RS = RandomizedSearchCV(pipe_XGB, param_XGB, cv=skf, n_iter=200, n_jobs=-1, verbose=1, random_state=42, scoring='precision')

In [121]:
XGB_RS.fit(X_train_OS, y_train_OS)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 29.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 83.6min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 109.9min finished




RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('prep',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                RobustScaler())]),
                                                                               ['age',
                                                                                'education',
                                                                                'pdays',
                                                                                'previous',
                                                                                'cons.price.idx',
                                                                                'cons.conf.

In [122]:
XGB_RS.best_params_

{'algo__n_estimators': 500,
 'algo__max_depth': 10,
 'algo__learning_rate': 0.300000012,
 'algo__gamma': 0.01}

In [123]:
# tab_hyper_RF = pd.DataFrame(RF_RS.cv_results_)[['param_algo__n_estimators', 'param_algo__max_depth', 'param_algo__min_samples_leaf', 'param_algo__class_weight', 'mean_test_score']]
# tab_hyper_RF = (tab_hyper_RF.sort_values(by='mean_test_score', ascending=False).reset_index()).drop(columns='index')
# tab_hyper_RF[(tab_hyper_RF['mean_test_score'] > 0.8) & (tab_hyper_RF['mean_test_score'] < 0.95)]

______________
_UNTUK yng best estimator_
_____________

In [124]:
XGB_Tune = XGB_RS.best_estimator_

In [125]:
XGB_Tune.fit(X_train_OS, y_train_OS)



Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['age', 'education', 'pdays',
                                                   'previous', 'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed',
                                                   'emp.var.rate']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['job', 'marital', 'default',
                                                   'housing', 'l

In [126]:
conf_mat(XGB_Tune, X_train_OS, X_test, y_train_OS, y_test, 'XGB Tuning 1 best estimator')

Classification report data TEST XGB Tuning 1 best estimator

               precision    recall  f1-score   support

           0       0.92      0.93      0.93      7310
           1       0.41      0.40      0.40       928

    accuracy                           0.87      8238
   macro avg       0.67      0.66      0.66      8238
weighted avg       0.87      0.87      0.87      8238


ROC AUC test : 0.66 


Confusion matrix data test XGB Tuning 1 best estimator


      Pred1  Pred0
Akt1    370    558
Akt0    536   6774
Classification report data TRAIN XGB Tuning 1 best estimator

               precision    recall  f1-score   support

           0       1.00      0.98      0.99     29238
           1       0.98      1.00      0.99     29238

    accuracy                           0.99     58476
   macro avg       0.99      0.99      0.99     58476
weighted avg       0.99      0.99      0.99     58476


ROC AUC train : 0.99 


Confusion matrix data train XGB Tuning 1 best estimator




### PERBANDINGAN PRECISION MODEL PASCA HYPER

================================================================================================

In [127]:
prec_rec ([pipe_KNN, pipe_SVM, pipe_RF, RF_Tune, RF_Tune2, RF_Tune3, pipe_XGB, XGB_Tune], X_test, y_test, ['KNN_Base', 'SVM_Base', 'RF_Base', 'RF_HPT_BE 1', 'RF_HPT_BE 2', 'RF_HPT_BE 3', 'XGB_Base', 'XGB_HPT_BE 1'])

Unnamed: 0,KNN_Base,SVM_Base,RF_Base,RF_HPT_BE 1,RF_HPT_BE 2,RF_HPT_BE 3,XGB_Base,XGB_HPT_BE 1
Precison,0.271611,0.093219,0.446784,0.445882,0.413879,0.433136,0.388186,0.408389
Recall,0.572198,0.798491,0.411638,0.408405,0.359914,0.394397,0.594828,0.398707
