## Using all colunms as Features built model with Adabooster

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier

In [2]:
#Used Austin Caras, preprocessed data
starter_df = pd.read_csv(Path(".//Resources/BankChurnersPrimary.csv"))

starter_df = starter_df[starter_df != 'Unknown'].dropna()
starter_df.sample(5)


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
7942,772346358,Existing Customer,52,F,3,Uneducated,Single,Less than $40K,Blue,41,...,1801.0,0,1801.0,0.516,5047,79,0.756,0.0,5.9e-05,0.99994
8840,720474408,Existing Customer,52,M,2,Graduate,Single,$40K - $60K,Blue,41,...,9298.0,1898,7400.0,0.655,8427,87,0.776,0.204,0.000114,0.99989
1840,808273533,Attrited Customer,39,M,2,High School,Single,$60K - $80K,Blue,34,...,1818.0,0,1818.0,0.651,2054,55,0.774,0.0,0.9972,0.002805
888,713743158,Existing Customer,47,M,1,Graduate,Married,$60K - $80K,Blue,36,...,19063.0,1714,17349.0,0.786,1270,21,0.615,0.09,0.000169,0.99983
2380,713445183,Existing Customer,35,M,1,High School,Married,$60K - $80K,Blue,26,...,4008.0,1841,2167.0,1.504,2794,58,1.148,0.459,0.000164,0.99984


In [4]:
#Shorten column names so they display better
replacement_column_names = {'Attrition_Flag': 'Status',
        'Customer_Age': 'Age',
        'Education_Level': 'Education',
        'Dependent_count': 'Dependents',
        'Income_Category': 'Income',
        'Card_Category': 'Card Type',
        'Months_on_book': 'Tenure(month)'
        }
starter_df.rename(columns=replacement_column_names, inplace=True)

In [5]:
#Drop unknowns data
starter_df = starter_df[starter_df.Marital_Status != 'Unknown']
starter_df = starter_df[starter_df.Education != 'Unknown']
starter_df = starter_df[starter_df.Income != 'Unknown']

#Replace M/F with 0,1
starter_df = starter_df.replace({'Gender' : {'M': 0, 'F': 1}})

#Replace customer's status to 1 if they left and 0 if they stayed
starter_df = starter_df.replace({'Status' : {'Attrited Customer': 1, 'Existing Customer': 0}})


In [6]:
#reaplced the Martial status and calucluated the marital ratio
starter_df = starter_df.replace({'Marital_Status' : {'Divorced': 1, 'Single': 1, 'Married' : 2}})
starter_df['MaritalDependentRatio'] = ((starter_df['Marital_Status'] / (starter_df['Dependents'] + 1)) / 2).round(2)
starter_df.drop(columns=['Marital_Status', 'Dependents'], inplace=True)

#Replace card categories from Blue/Silver/Gold/Plat to 0/1/2/3/4
starter_df = starter_df.replace({'Card Type' : {'Blue': 0, 'Silver': 1, 'Gold' : 3, 'Platinum' : 4}})


In [7]:
#use feature creation to make a ratio between avg open to buy and credit limit
starter_df['CreditUsage'] = (starter_df['Avg_Open_To_Buy'] / starter_df['Credit_Limit']).round(2)
starter_df.drop(columns=['Avg_Open_To_Buy', 'Credit_Limit'], inplace=True)

In [8]:
#use feature creation to make a ratio between transaction amount and trans count
starter_df['AvgTransValue'] = (starter_df['Total_Trans_Amt'] / starter_df['Total_Trans_Ct']).round(2)
starter_df.drop(columns=['Total_Trans_Amt', 'Total_Trans_Ct'], inplace=True)

In [11]:
#use feature creation to make a ratio between age and tenure
starter_df['TenureByAge'] = (starter_df['Tenure(month)'] / starter_df['Age']).round(2)
starter_df.drop(columns=['Tenure(month)', 'Age'], inplace=True)

In [12]:
starter_df = starter_df.dropna()

In [13]:
#Rank the income as a 0-4 low-high
starter_df = starter_df.replace({'Income' : {'Less than $40K': 0, '$40K - $60K': 1, '$80K - $120K' : 2, '$60K - $80K' : 3, '$120K +' : 4}})

In [14]:
#Remove columns that are irrelevant 

starter_df.drop(columns=['CLIENTNUM',
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 
],
inplace=True
)
starter_df.sample(10)

Unnamed: 0,Status,Gender,Education,Income,Card Type,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,MaritalDependentRatio,CreditUsage,AvgTransValue,TenureByAge
8146,1,0,4,4,0,4,2,3,0,0.903,0.63,0.0,0.12,1.0,61.77,0.68
4001,0,0,3,1,0,6,1,2,1074,0.774,0.68,0.368,0.2,0.63,47.86,0.6
6074,0,1,1,1,0,3,3,3,0,0.738,0.762,0.0,0.33,1.0,58.35,0.67
764,1,1,0,0,0,6,3,1,0,0.701,0.562,0.0,0.12,1.0,32.8,0.66
4026,0,1,1,0,0,2,1,2,0,0.656,0.917,0.0,0.17,1.0,50.29,0.65
3580,0,1,0,1,0,3,3,3,1652,1.132,0.758,0.616,0.2,0.38,58.31,0.83
3991,0,0,3,4,0,6,1,3,2506,0.873,0.58,0.718,0.12,0.28,61.46,0.71
7014,1,1,3,0,0,6,2,2,2256,0.615,0.353,0.805,0.5,0.19,51.54,0.64
8916,0,0,0,2,0,1,3,2,2416,0.917,0.548,0.088,0.25,0.91,77.93,0.73
9446,0,0,0,3,0,2,3,1,1908,0.697,0.672,0.291,0.33,0.71,125.92,0.83


In [24]:
starter_df.shape

(4490, 16)

In [15]:
y = starter_df['Status']
X = starter_df.drop(columns=['Status'])

In [16]:
#Check the value counts
y.value_counts()

0    3783
1     707
Name: Status, dtype: int64

In [17]:
#Split features and target in to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [18]:
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [19]:
#Using the Adabooster model and fit the model with training data, predict the model with testing data 
ada_model = AdaBoostClassifier(n_estimators=1000)
ada_model.fit(X_train, y_train)
ada_y_pred = ada_model.predict(X_test)
ada_y_pred[:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [20]:
# print the accuracy score 
ada_accuracy = accuracy_score(y_test, ada_y_pred)
print("Ada Booster Accuracy: {:.2f}%".format(ada_accuracy * 100))

Ada Booster Accuracy: 87.98%


In [21]:
# create a dataframe and print the actual targets and testing predictions
results = pd.DataFrame({"Testing predictions": ada_y_pred,
    "Actual targets" : y_test})
results.tail()

Unnamed: 0,Testing predictions,Actual targets
3635,0,0
5954,0,0
7676,1,1
7453,0,0
7558,0,0


In [22]:
#generate confusion matrix
ada_matrix= confusion_matrix(y_test, ada_y_pred)
ada_matrix

array([[872,  59],
       [ 76, 116]], dtype=int64)

In [23]:
#create Classification report 
print(classification_report(y_test, ada_y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       931
           1       0.66      0.60      0.63       192

    accuracy                           0.88      1123
   macro avg       0.79      0.77      0.78      1123
weighted avg       0.88      0.88      0.88      1123

