[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bluebottle66/Practical-Machine-Learning-Northwestern-/blob/master/Predict422Week2_Kun_Yang.ipynb)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from google.colab import files


In [6]:
# Upload the CSV file from the local machine into the Google Colab session.
# The returned object is keyed by file name; the next cell iterates over its
# keys and reads the length of each entry.
uploaded = files.upload()

Saving bank.csv to bank.csv


In [7]:
# Report the name and size (in bytes) of every file that was uploaded.
for fn, contents in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(contents)))

User uploaded file "bank.csv" with length 461481 bytes


In [0]:
# Load the uploaded file; the fields in bank.csv are read with ';' as the
# separator instead of the default comma.
bank=pd.read_csv('bank.csv', sep = ';')

In [9]:
# (rows, columns) of the data as loaded
print(bank.shape)

(4521, 17)


In [14]:
# DataFrame.dropna() returns a new frame and leaves the original untouched,
# so the result has to be assigned for rows with missing values to actually
# be removed before the shape is reported.
bank = bank.dropna()
print(bank.shape)

(4521, 17)


In [0]:


# Recode the yes/no columns as 0/1 numpy arrays.
# All four columns share the same coding, but each keeps its own lookup
# dictionary under its original name.
response_to_binary = {'no': 0, 'yes': 1}
default_to_binary = dict(response_to_binary)
housing_to_binary = dict(response_to_binary)
loan_to_binary = dict(response_to_binary)

# response variable
YESresponse = np.array(bank['response'].map(response_to_binary))

# explanatory variables
YESdefault = np.array(bank['default'].map(default_to_binary))
YEShousing = np.array(bank['housing'].map(housing_to_binary))
YESloan = np.array(bank['loan'].map(loan_to_binary))

In [0]:

from sklearn.metrics import roc_auc_score

# specify the two classifiers being evaluated
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

# Each label is kept next to its estimator so the two lists cannot drift apart.
# Naive Bayes is given equal class priors instead of priors learned from the data.
_named_classifiers = [
    ("Naive_Bayes",
     BernoulliNB(alpha=1.0, binarize=0.5,
                 class_prior=[0.5, 0.5], fit_prior=False)),
    ("Logistic_Regression", LogisticRegression()),
]
names = [label for label, _ in _named_classifiers]
classifiers = [estimator for _, estimator in _named_classifiers]

In [0]:
# Assemble the modelling matrix: one row per observation, with the three
# binary explanatory variables in the first columns and the binary response
# in the last column.
model_data = np.stack([YESdefault, YEShousing, YESloan, YESresponse], axis=1)

In [0]:
# Seed numpy's global random generator so the shuffle below is repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
# Shuffle model_data in place (along its first axis, i.e. the observations)
# before the cross-validation folds are formed.
np.random.shuffle(model_data)

In [18]:
# (observations, columns) of the modelling matrix: three explanatory
# variables plus the response
print('\nData dimensions:', model_data.shape)


Data dimensions: (4521, 4)


In [0]:
# specify the k-fold cross-validation design
from sklearn.model_selection import KFold

# ten-fold cross-validation employed here; N_FOLDS sets both the number of
# KFold splits and the number of rows in the results array
N_FOLDS = 10

In [0]:
# set up numpy array for storing results:
# one row per fold, one column per classifier listed in names
cv_results = np.zeros((N_FOLDS, len(names)))

In [21]:
# sanity check: (number of folds, number of classifiers)
print(cv_results.shape)

(10, 2)


In [0]:
# Contiguous, unshuffled folds: the rows of model_data have already been
# shuffled with a fixed seed, so the splitter needs no randomness of its own.
# random_state has no effect when shuffle=False, and recent scikit-learn
# releases raise a ValueError if it is supplied in that case, so it is omitted.
kf = KFold(n_splits = N_FOLDS, shuffle=False)

In [23]:
# Evaluate every classifier on every fold and store the test-set area under
# the ROC curve in cv_results[fold, classifier].
# The last column of model_data is the response variable; all columns before
# it are the explanatory variables.
for fold_number, (train_index, test_index) in enumerate(kf.split(model_data)):
    print('\nFold index:', fold_number,
          '------------------------------------------')
    X_train = model_data[train_index, :-1]
    X_test = model_data[test_index, :-1]
    y_train = model_data[train_index, -1]
    y_test = model_data[test_index, -1]
    print('\nShape of input data for this fold:',
          '\nData Set: (Observations, Variables)')
    print('X_train:', X_train.shape)
    print('X_test:', X_test.shape)
    print('y_train:', y_train.shape)
    print('y_test:', y_test.shape)

    for method_number, (name, clf) in enumerate(zip(names, classifiers)):
        print('\nClassifier evaluation for:', name)
        print('  Scikit Learn method:', clf)
        # fit on the train set for this fold
        clf.fit(X_train, y_train)
        # evaluate on the test set for this fold; the second probability
        # column is the one scored against the 0/1 response
        y_test_predict = clf.predict_proba(X_test)
        fold_method_result = roc_auc_score(y_test, y_test_predict[:, 1])
        print('Area under ROC curve:', fold_method_result)
        cv_results[fold_number, method_number] = fold_method_result


Fold index: 0 ------------------------------------------

Shape of input data for this fold: 
Data Set: (Observations, Variables)
X_train: (4068, 3)
X_test: (453, 3)
y_train: (4068,)
y_test: (453,)

Classifier evaluation for: Naive_Bayes
  Scikit Learn method: BernoulliNB(alpha=1.0, binarize=0.5, class_prior=[0.5, 0.5], fit_prior=False)
Area under ROC curve: 0.6103395870453832

Classifier evaluation for: Logistic_Regression
  Scikit Learn method: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Area under ROC curve: 0.6103395870453832

Fold index: 1 ------------------------------------------

Shape of input data for this fold: 
Data Set: (Observations, Variables)
X_train: (4069, 3)
X_test: (452, 3)
y_train: (4069,)
y_test: (452,)

Classifier evaluation for: Naive_Bayes
  S

In [0]:
# Wrap the fold-by-classifier results in a DataFrame whose columns carry the
# classifier names.
cv_results_df = pd.DataFrame(cv_results, columns=names)

In [25]:
# Report the mean area under the ROC curve for each classifier (column)
# across all cross-validation folds.
print('\n----------------------------------------------')
print('Average results from ', N_FOLDS, '-fold cross-validation\n',
      '\nMethod                 Area under ROC Curve', sep = '')     
print(cv_results_df.mean())


----------------------------------------------
Average results from 10-fold cross-validation

Method                 Area under ROC Curve
Naive_Bayes            0.605224
Logistic_Regression    0.606980
dtype: float64


In [26]:
# Fold-by-fold results. Averaged over the 10 folds, logistic regression has a
# slightly larger area under the ROC curve than naive Bayes, so logistic
# regression is the method I recommend.
print(cv_results_df)

   Naive_Bayes  Logistic_Regression
0     0.610340             0.610340
1     0.574080             0.574080
2     0.654569             0.654569
3     0.587235             0.587235
4     0.584457             0.584457
5     0.653991             0.662918
6     0.625050             0.628870
7     0.529094             0.533909
8     0.603422             0.603422
9     0.630000             0.630000


In [0]:
# Test all 8 combinations of the three binary variables (default, housing,
# loan) so predictions can be compared across every customer profile,
# starting with the recommended model, logistic regression.
my_default = np.array([1, 1, 1, 1, 0, 0, 0, 0], dtype=np.int32)
my_housing = np.array([1, 1, 0, 0, 1, 1, 0, 0], dtype=np.int32)
my_loan = np.array([1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int32)

# one row per profile, columns ordered (default, housing, loan)
my_X_test = np.column_stack((my_default, my_housing, my_loan))

In [28]:
# Refit logistic regression on the full data set (no hold-out):
# every column except the last is an explanatory variable, and the last
# column is the response.
X_train = model_data[:, :-1]
y_train = model_data[:, -1]

clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [0]:
# predict specific test cases covering all situations;
# the result is later placed next to the test cases under the column labels
# predict_NO and predict_YES
y_my_test_predict = clf.predict_proba(my_X_test)


In [30]:
# Show each test case side by side with its predicted probabilities.
# Stacking the integer test cases with the float probabilities yields a float
# array, which is why the 0/1 indicators display as 0.0/1.0.
my_targeting_df = pd.DataFrame(
    np.hstack([my_X_test, y_my_test_predict]),
    columns=['default', 'housing', 'loan', 'predict_NO', 'predict_YES'],
)
print('\n\nLogistic regression model predictions for test cases:')
print(my_targeting_df)



Logistic regression model predictions for test cases:
   default  housing  loan  predict_NO  predict_YES
0      1.0      1.0   1.0    0.945729     0.054271
1      1.0      1.0   0.0    0.892349     0.107651
2      1.0      0.0   1.0    0.900786     0.099214
3      1.0      0.0   0.0    0.811988     0.188012
4      0.0      1.0   1.0    0.953277     0.046723
5      0.0      1.0   0.0    0.906588     0.093412
6      0.0      0.0   1.0    0.914016     0.085984
7      0.0      0.0   0.0    0.834890     0.165110


Groups 3 and 7 have the largest predict_YES probabilities, so they are the best targets for marketing efforts.

Group 3: default = true, housing = false, loan = false

Group 7: same as group 3, except default = false

So, basically, if a customer has neither a housing loan (mortgage) nor a personal loan, then regardless of credit default history the model predicts a probability of roughly 17–19% that the customer will subscribe to a term deposit.


Out of curiosity, I want to take a look at the less preferred model: Naive Bayes classification to see what results it gets here...

In [0]:
# Naive Bayes with the same settings used in the cross-validation comparison:
# equal class priors [0.5, 0.5] are imposed (fit_prior=False) rather than
# learned from the data.
clf2=BernoulliNB(alpha=1.0, binarize=0.5, 
                           class_prior = [0.5, 0.5], fit_prior=False)

In [33]:
# fit naive Bayes on the same full-data X_train / y_train used for the
# logistic regression fit
clf2.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.5, class_prior=[0.5, 0.5], fit_prior=False)

In [0]:
# naive Bayes predicted probabilities for the same eight test cases
y_my_test_predict2 = clf2.predict_proba(my_X_test)

In [35]:
# Show each test case side by side with the naive Bayes predicted
# probabilities, using the same layout as the logistic regression table.
my_targeting_df2 = pd.DataFrame(
    np.hstack([my_X_test, y_my_test_predict2]),
    columns=['default', 'housing', 'loan', 'predict_NO', 'predict_YES'],
)
print('\n\nNaive Bayes model predictions for test cases:')
print(my_targeting_df2)



Naive Bayes model predictions for test cases:
   default  housing  loan  predict_NO  predict_YES
0      1.0      1.0   1.0    0.703288     0.296712
1      1.0      1.0   0.0    0.529384     0.470616
2      1.0      0.0   1.0    0.551965     0.448035
3      1.0      0.0   0.0    0.368951     0.631049
4      0.0      1.0   1.0    0.727746     0.272254
5      0.0      1.0   0.0    0.559193     0.440807
6      0.0      0.0   1.0    0.581473     0.418527
7      0.0      0.0   0.0    0.397353     0.602647


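OK, so the Naive Bayes model gives the same suggestion (groups 3 and 7); however, its predict_YES probabilities are greatly overstated. This is expected: the model was specified with equal class priors (class_prior = [0.5, 0.5], fit_prior=False), whereas only about 11.5% of customers (521 of 4,521) actually responded yes.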

Below is just some practice. I want to see what kind of performance and confusion matrix I get if I use only one binary variable, default, to predict whether a client will subscribe to a term deposit.

In [39]:
from sklearn.linear_model import SGDClassifier

# Single-predictor experiment: column 0 (default) is the only feature and
# column 3 (response) is the target. Indexing with [0] keeps the feature
# two-dimensional (one column) for the estimator.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(model_data[:, [0]], model_data[:, 3])



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [40]:
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions (3 folds) using default as the only predictor
# of the response.
y_train_pred = cross_val_predict(
    sgd_clf,
    model_data[:, [0]],
    model_data[:, 3],
    cv=3,
)



In [41]:
from sklearn.metrics import confusion_matrix
# compare the actual response (column 3 of model_data) with the
# cross-validated predictions from the single-variable classifier
confusion_matrix(model_data[:,3],y_train_pred)

array([[4000,    0],
       [ 521,    0]])

  'precision', 'predicted', average, warn_for)


0.0

I think that because customers with a positive response (subscribing to a term deposit) are a very small part of the total population, the classification model (with the three factors we chose) has very limited predictive accuracy; the single-variable classifier above never predicts a positive response at all. We need to introduce more variables in order to improve the model.