# Bank Churners
**ML Project 1 - Spring 2022**

Collaborators: Ben DeSollar and Matt McDonnell

### Step 1
Import the necessary packages and frameworks.

In [3]:
import numpy as np
from mlwpy import *
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
from sklearn import (datasets, neighbors,
                     naive_bayes,
                     model_selection as skms,
                     linear_model, dummy,
                     metrics,
                     pipeline,
                     preprocessing as skpre) 
import csv
from sklearn import tree

### Step 2
Read in .csv files and save as DataFrames.

In [4]:
# Load the raw training and test tables from the working directory.
data_train_df = pd.read_csv("BankChurners.train.csv")
data_test_df = pd.read_csv("BankChurners.test.csv")

# Split the training table into the label column and everything else.
data_train_tgt = data_train_df.loc[:, "Target"]
data_train_ft = data_train_df.drop(columns=["Target"])

### Step 3
Work with only the necessary features.

In [5]:
# Columns selected as model inputs; later cells index the frames with this
# list, so its order is the column order the estimators see.
features = [
    'Dependent_count',
    'Months_on_book',
    'Total_Trans_Amt',
    'Income_Category',
    'Credit_Limit',
    'Total_Relationship_Count',
    'Avg_Utilization_Ratio',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Revolving_Bal',
    'Total_Amt_Chng_Q4_Q1',
]

### Step 4
Work with a practice estimate (median age) and convert features to numerical values as necessary. For now, let's use a simple approach: when a customer's age is missing, estimate it with the median age of the other samples (we may revisit this estimate later).

In [6]:
# Median of the known training ages; Series.median() skips NA/null values by default.
median_age = data_train_df['Customer_Age'].median()
# The test-set median is computed for reporting only; it is not used for imputation.
median_age_test = data_test_df['Customer_Age'].median()
print(f'Median age: {median_age:.2f}')
print(f'Median age (test): {median_age_test:.2f}')

# Fill missing ages in BOTH splits with the training median, so that no
# statistic derived from the test set leaks into preprocessing.
data_train_df['Customer_Age'] = data_train_df['Customer_Age'].fillna(median_age)
data_test_df['Customer_Age'] = data_test_df['Customer_Age'].fillna(median_age)

# Encode gender as M -> 0, F -> 1. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on `df['Gender']`: that form
# mutates an intermediate Series, which pandas warns about and which no longer
# updates the parent frame under Copy-on-Write.
gender_codes = {'M': 0, 'F': 1}
data_train_df['Gender'] = data_train_df['Gender'].replace(gender_codes)
data_test_df['Gender'] = data_test_df['Gender'].replace(gender_codes)

Median age: 46.00
Median age (test): 46.00


Here is an updated table after sorting through these features.

In [7]:
# Show the first ten rows of the training frame as a rich table.
print("data_train_df:")
display(data_train_df.head(10))

# DataFrame.info() prints its report directly and returns None, so it is
# called bare; wrapping it in display() would additionally render "None".
print("data_train_df.info():")
data_train_df.info()

data_train_df:


Unnamed: 0,id,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Target
0,1,51,1,2,Graduate,Single,40000,Blue,39,3,...,2,2581.0,1722,859.0,0.765,4431,79,0.717,0.667,1
1,2,50,0,2,Unknown,Single,120000,Blue,38,4,...,2,2123.0,995,1128.0,0.626,4516,78,0.625,0.469,1
2,3,44,0,5,Doctorate,Single,120000,Blue,31,5,...,2,7567.0,2496,5071.0,0.709,4076,60,0.579,0.33,1
3,4,38,0,2,High School,Married,120000,Blue,29,4,...,2,2818.0,1656,1162.0,1.404,2916,45,0.957,0.588,1
4,5,32,0,1,Graduate,Married,50000,Blue,24,1,...,2,9711.0,972,8739.0,0.647,14926,115,0.742,0.1,1
5,6,40,1,3,Uneducated,Single,40000,Blue,27,5,...,2,2035.0,0,2035.0,0.779,4107,82,0.783,0.0,1
6,7,40,0,3,High School,Single,100000,Blue,36,2,...,2,4620.0,0,4620.0,0.654,6644,61,0.694,0.0,0
7,8,50,1,4,Graduate,Married,40000,Blue,31,4,...,1,2080.0,1799,281.0,0.651,5130,82,0.864,0.865,1
8,9,65,1,0,Graduate,Married,40000,Blue,53,5,...,5,4161.0,2491,1670.0,0.41,1203,30,0.579,0.599,1
9,10,56,1,2,Graduate,Single,40000,Blue,46,4,...,3,1438.3,0,1438.3,0.622,4333,84,0.826,0.0,1


data_train_df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6750 entries, 0 to 6749
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        6750 non-null   int64  
 1   Customer_Age              6750 non-null   int64  
 2   Gender                    6750 non-null   int64  
 3   Dependent_count           6750 non-null   int64  
 4   Education_Level           6750 non-null   object 
 5   Marital_Status            6750 non-null   object 
 6   Income_Category           6750 non-null   int64  
 7   Card_Category             6750 non-null   object 
 8   Months_on_book            6750 non-null   int64  
 9   Total_Relationship_Count  6750 non-null   int64  
 10  Months_Inactive_12_mon    6750 non-null   int64  
 11  Contacts_Count_12_mon     6750 non-null   int64  
 12  Credit_Limit              6750 non-null   float64
 13  Total_Revolving_Bal       6750 non-null  

None

### Step 5
Keep only the selected features and convert their values to floating-point numbers, so that every column has the same numeric type when the models are tested later.

In [8]:
# Narrow each frame to the columns listed in `features` and cast them to
# 64-bit floats. After this cell both names refer to feature-only frames.
data_train_df = data_train_df.loc[:, features].astype('float64')
data_test_df = data_test_df.loc[:, features].astype('float64')

### Step 6
Create a dictionary of different models to train, so that their accuracy can be tracked later.

In [9]:
# Candidate estimators keyed by a short label: Gaussian naive Bayes first,
# followed by k-NN classifiers for every odd k from 1 through 41.
odd_neighbor_counts = range(1, 42, 2)
models_to_try = {
    'nb': naive_bayes.GaussianNB(),
    **{f'{k}-NN': neighbors.KNeighborsClassifier(n_neighbors=k)
       for k in odd_neighbor_counts},
}

### Step 7
Create a library of standardization/scaler pipelines to use for analyzing the data.

In [10]:
# Fixed seed so the stochastic estimators below (decision trees, the uniform
# dummy baseline) give repeatable results from run to run.
SEED = 42

# Kept so that any other cell referring to `scaler` still works; the pipelines
# built below each receive their own StandardScaler instead of sharing this one.
scaler = skpre.StandardScaler()

# Unscaled candidates: naive Bayes, decision trees of several depths, and
# k-NN with the default (5) and 10 neighbours.
pipelines_to_try = {
    'GNB0': naive_bayes.GaussianNB(),
    'DTC': tree.DecisionTreeClassifier(random_state=SEED),
    'DTC-5': tree.DecisionTreeClassifier(max_depth=5, random_state=SEED),
    'DTC-10': tree.DecisionTreeClassifier(max_depth=10, random_state=SEED),
    '5NN-C': neighbors.KNeighborsClassifier(),
    '10NN-C': neighbors.KNeighborsClassifier(n_neighbors=10),
}

# Random-guess reference classifier.
baseline = dummy.DummyClassifier(strategy="uniform", random_state=SEED)

# Standardized variant of every model in `models_to_try`. A fresh scaler per
# pipeline means fitting one pipeline can never change the scaling state seen
# by another.
for model_name, candidate in models_to_try.items():
    pipelines_to_try[f'std_{model_name}_pipe'] = pipeline.make_pipeline(
        skpre.StandardScaler(), candidate)

# NOTE(review): `svm` is not imported explicitly in this notebook; presumably
# it arrives through `from mlwpy import *` - confirm.
sv_classifiers = {"SVC(Linear)": svm.SVC(kernel='linear'),
                  "NuSVC(Linear)": svm.NuSVC(kernel='linear', nu=.9)}

### Step 8
Use cross-validation to test the accuracy of the models.

In [11]:
# Score every candidate with cross-validation (cv=10, accuracy metric) on the
# selected training columns, recording and printing the mean score per name.
train_inputs = data_train_ft[features]

accuracy_scores = {}
for name, model in pipelines_to_try.items():
    fold_scores = skms.cross_val_score(
        model, train_inputs, data_train_tgt, cv=10, scoring='accuracy')
    accuracy_scores[name] = fold_scores.mean()
    print(f'{name}: {accuracy_scores[name]:.3f}')

GNB0: 0.885
DTC: 0.934
DTC-5: 0.934
DTC-10: 0.937
5NN-C: 0.884
10NN-C: 0.883
std_nb_pipe: 0.871
std_1-NN_pipe: 0.893
std_3-NN_pipe: 0.912
std_5-NN_pipe: 0.914
std_7-NN_pipe: 0.911
std_9-NN_pipe: 0.910
std_11-NN_pipe: 0.909
std_13-NN_pipe: 0.909
std_15-NN_pipe: 0.908
std_17-NN_pipe: 0.907
std_19-NN_pipe: 0.906
std_21-NN_pipe: 0.904
std_23-NN_pipe: 0.903
std_25-NN_pipe: 0.900
std_27-NN_pipe: 0.900
std_29-NN_pipe: 0.899
std_31-NN_pipe: 0.898
std_33-NN_pipe: 0.898
std_35-NN_pipe: 0.898
std_37-NN_pipe: 0.898
std_39-NN_pipe: 0.897
std_41-NN_pipe: 0.896


### Step 9
Find the pipeline with the highest accuracy and save it for testing.

In [12]:
# Pick the entry with the largest mean accuracy; on a tie the earliest entry
# in `accuracy_scores` wins, as max() returns the first maximal item.
best_pipeline_name, best_accuracy = max(accuracy_scores.items(),
                                        key=lambda entry: entry[1])
print(f'\nBest pipeline: {best_pipeline_name} (accuracy = {accuracy_scores[best_pipeline_name]:.3f})')

# Keep the winning estimator for the final fit.
final_pipeline = pipelines_to_try[best_pipeline_name]


Best pipeline: DTC-10 (accuracy = 0.937)


### Step 10
Fit and test the highest-scoring model against the testing data. Save the results in submission.csv for uploading to Kaggle.

In [13]:
# Train the selected estimator on the chosen training columns, then predict
# labels for the same columns of the test frame.
train_inputs = data_train_ft[features]
test_inputs = data_test_df[features]

fit = final_pipeline.fit(train_inputs, data_train_tgt)
predictions = fit.predict(test_inputs)

def writeSubmission(predictions, start_id=6751, path='submission.csv'):
    """Write predictions to a two-column CSV file with header ``id,Target``.

    Each prediction is paired with a consecutive integer id, beginning at
    ``start_id``, and both values are written as strings.

    Parameters
    ----------
    predictions : iterable
        Predicted labels, in the order the ids should be assigned.
    start_id : int, optional
        Id given to the first prediction. Defaults to 6751, the value that was
        previously hard-coded (presumably the first test-set id, since the
        training data has 6750 rows - confirm against the test file).
    path : str, optional
        Destination file; it is overwritten if it already exists.
        Defaults to 'submission.csv'.
    """
    rows = [[str(row_id), str(prediction)]
            for row_id, prediction in enumerate(predictions, start=start_id)]
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='') as submission:
        writer = csv.writer(submission)
        writer.writerow(['id', 'Target'])
        writer.writerows(rows)

# Write the test-set predictions to submission.csv.
writeSubmission(predictions)