This notebook is using data prep/encoding that we decided upon for group project.  We have divided algorithm analysis up between members and want to have uniform data prep/encoding for comparison.

In [142]:
# Important Statements
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [143]:
dataset = pd.read_csv("bank-additional-full.csv")
#inital EDA showed strong correlation for emp.var.rate/nr.employed with euribor3m, duration can't know ahead of time, also drop
dataset = dataset.drop(['emp.var.rate','nr.employed', 'duration'], axis=1)

In [144]:
dataset.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'y'],
      dtype='object')

In [145]:
dataset.shape

(41188, 18)

In [146]:
dataset.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,cons.price.idx,cons.conf.idx,euribor3m,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no


In [147]:
#Initial data prep
#check for null values
#dataset.isnull().values.any()
#remove rows with >= 4 unkown values
#dataset = dataset.replace(to_replace='unknown', value=np.nan).dropna(thresh=17)
#dataset = dataset.replace(to_replace='unknown', value=np.nan).dropna()
#dataset = dataset.replace(to_replace=np.nan, value='unknown')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 18 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
y                 41188 non-null object
dtypes: float64(3), int64(4), object(11)
memory usage: 5.7+ MB


In [148]:
dataset['education'].unique()

array(['basic.4y', 'high.school', 'basic.6y', 'basic.9y',
       'professional.course', 'unknown', 'university.degree',
       'illiterate'], dtype=object)

In [149]:
dataset['education']=np.where(dataset['education'] =='basic.9y', 'Basic', dataset['education'])
dataset['education']=np.where(dataset['education'] =='basic.6y', 'Basic', dataset['education'])
dataset['education']=np.where(dataset['education'] =='basic.4y', 'Basic', dataset['education'])
dataset['education'].unique()

array(['Basic', 'high.school', 'professional.course', 'unknown',
       'university.degree', 'illiterate'], dtype=object)

In [150]:
#One Hot Encoding all Catergorical Variables without Order
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['job','marital','education','default','housing','loan','contact','poutcome','month','day_of_week'])
#encoder = ce.BinaryEncoder(cols=['job','marital','default','housing','loan','contact','poutcome','month','day_of_week'])
dataset = encoder.fit_transform(dataset)

In [151]:
# Encoding Outcome Row to Binary
#dataset['outcome'] = dataset['outcome'].map({'yes': 1, 'no': 0})
dataset['y'] = dataset['y'].map({'yes': 1, 'no': 0})

In [152]:
#dataset.info()

In [153]:
dataset.shape

(41188, 43)

In [170]:
#balance using SMOTE
X = dataset.loc[:, dataset.columns != 'y']
y = dataset.loc[:, dataset.columns == 'y']
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X,os_data_y = os.fit_sample(X_train, y_train.values.ravel())
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no term in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of term",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no term data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of term data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))
X = os_data_X
y = os_data_y

length of oversampled data is  51158
Number of no term in oversampled data 25579
Number of term 25579
Proportion of no term data in oversampled data is  0.5
Proportion of term data in oversampled data is  0.5


In [155]:
#Splitting the data into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [156]:
# Scaling
from sklearn.preprocessing import StandardScaler 
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [165]:
#Applying PCA
#First we want to including all principle components by specifying None
from sklearn.decomposition import PCA
#start with None, then eval
pcaObj= PCA(n_components=31)
X_train= pcaObj.fit_transform(X_train)
X_test= pcaObj.transform(X_test)
#show components and what is most important, then decid on n_components
components_variance = pcaObj.explained_variance_ratio_
cumulative_variance = pcaObj.explained_variance_ratio_.cumsum()
print(components_variance)
print(cumulative_variance)

[0.12856838 0.07256526 0.06849373 0.06163101 0.05756351 0.05612562
 0.05355602 0.05161025 0.04877781 0.0465348  0.03949361 0.03358003
 0.03201337 0.03065358 0.02941269 0.02733616 0.02650904 0.02515889
 0.01779195 0.01743848 0.01575864 0.01317584 0.00803633 0.00726025
 0.00708338 0.00596055 0.00526197 0.0049535  0.00364978 0.00358439
 0.00045829]
[0.12856838 0.20113364 0.26962737 0.33125838 0.38882189 0.44494751
 0.49850353 0.55011378 0.59889159 0.64542639 0.68492001 0.71850004
 0.75051341 0.78116699 0.81057968 0.83791584 0.86442488 0.88958377
 0.90737572 0.92481419 0.94057284 0.95374868 0.96178502 0.96904526
 0.97612864 0.98208919 0.98735116 0.99230466 0.99595444 0.99953884
 0.99999712]


In [166]:
#from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
#max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#max_depth.append(None)
# Minimum number of samples required to split a node
#min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
#min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
#bootstrap = [True, False]
# Create the random grid
#random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}
#print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
#classifierObj = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
#classifierObj_random = RandomizedSearchCV(estimator = classifierObj, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#classifierObj_random.fit(X_train,y_train)

In [167]:
#classifierObj_random.best_params_

In [168]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.ensemble import RandomForestClassifier

#classifierObj= RandomForestClassifier()
#grid_param= {  
#    'n_estimators': [10, 15, 20, 25, 30, 40, 50],
#    'criterion': ['gini', 'entropy'],
#    'bootstrap': [True, False],
#    'max_depth': [5, 10, 20, 30]
#}
#from sklearn.model_selection import GridSearchCV
#gd_sr= GridSearchCV(estimator=classifierObj, param_grid=grid_param, scoring='accuracy', cv=5, n_jobs=-1)
#gd_sr.fit(X_train, y_train) 
#print(gd_sr.best_params_) #print(gd_sr.best_score_)

#grid search results {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'n_estimators': 40}
#0.8942943854324734
classifierObj= RandomForestClassifier(bootstrap=True, criterion='gini', max_depth= 10, n_estimators=40)
classifierObj.fit(X_train,y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [169]:
#K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
modelAccuracies = cross_val_score(estimator=classifierObj, X=X_train, y=y_train.values.ravel(), cv=10)
print(modelAccuracies.mean())
print(modelAccuracies.std())

0.7859794250174938
0.007847028490305875


In [161]:
#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[4736  369]
 [ 550 4577]]


In [163]:
#score = classifierObj.score(X_test, y_test)
#print(score)