# Bank Churners
**ML Project 1 - Spring 2022**

Collaborators: Ben DeSollar and Matt McDonnell

### Step 1
Import the necessary packages and frameworks.

In [438]:
import numpy as np
from mlwpy import *
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
from sklearn import (datasets, neighbors,
                     naive_bayes,
                     model_selection as skms,
                     linear_model, dummy,
                     metrics,
                     pipeline,
                     preprocessing as skpre) 
import csv
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

### Step 2
Read in .csv files and save as DataFrames.

In [439]:
data_train_df = pd.read_csv("BankChurners.train.csv") 
data_test_df = pd.read_csv("BankChurners.test.csv")
data_train_ft = data_train_df.drop('Target', axis=1)
data_train_tgt = data_train_df["Target"]

### Step 3
Work with only the necessary features. We chose features that best paint the picture of each customer's financial world; what about their current financial status makes them different than other people? 

In [440]:
features = ['Dependent_count',
            # 'Months_on_book',
            'Total_Trans_Amt',
            'Income_Category',
            'Credit_Limit',
            # 'Total_Relationship_Count',
            'Avg_Utilization_Ratio',
            'Total_Trans_Ct',
            'Total_Ct_Chng_Q4_Q1', 
            'Total_Revolving_Bal',
            'Total_Amt_Chng_Q4_Q1',
            'Marital_Status',
            'Avg_Open_To_Buy',
            #'Gender',
            'Card_Category',
            #'Months_Inactive_12_mon',
            'Education_Level',
            'Customer_Age',
            'Contacts_Count_12_mon'
            ] 

### Step 4
Work with a practice estimate (median age). Convert features to numerical values as necessary. 

For now, let's use a simple approach to estimate the age (and consider revisiting this estimate later) when it is missing by using the median from the other samples.

We didn't encounter any missing values in this dataset, and that made the sorting of data much easier for us.

In [441]:
median_age = data_train_ft['Customer_Age'].median() # note: by default, this will skip NA/null values
median_age_test = data_test_df['Customer_Age'].median() 
print(f'Median age: {median_age:.2f}')
print(f'Median age (test): {median_age_test:.2f}')
data_train_ft['Customer_Age'] = data_train_ft['Customer_Age'].fillna(median_age)
data_test_df['Customer_Age'] = data_test_df['Customer_Age'].fillna(median_age_test)

data_train_ft['Gender'].replace(['M','F'],[0,1],inplace=True)
data_test_df['Gender'].replace(['M','F'],[0,1],inplace=True)

data_train_ft['Marital_Status'].replace(['Single','Married', 'Unknown', 'Divorced'],[0,1,0,0],inplace=True)
data_test_df['Marital_Status'].replace(['Single','Married', 'Unknown', 'Divorced'],[0,1,0,0],inplace=True)

data_train_ft['Avg_Open_To_Buy'] = data_train_ft['Avg_Open_To_Buy'].fillna(0)
data_test_df['Avg_Open_To_Buy'] = data_test_df['Avg_Open_To_Buy'].fillna(0)
data_train_ft['Avg_Open_To_Buy'].replace(['Unknown'],[0],inplace=True)
data_test_df['Avg_Open_To_Buy'].replace(['Unknown'],[0],inplace=True)

data_train_ft['Card_Category'] = data_train_ft['Card_Category'].fillna(0)
data_test_df['Card_Category'] = data_test_df['Card_Category'].fillna(0)
data_train_ft['Card_Category'].replace(['Platinum', 'Gold', 'Silver', 'Blue'],[0, 1, 2, 3],inplace=True)
data_test_df['Card_Category'].replace(['Platinum', 'Gold', 'Silver', 'Blue'],[0, 1, 2, 3],inplace=True)


data_train_ft['Education_Level'] = data_train_ft['Education_Level'].fillna(0)
data_test_df['Education_Level'] = data_test_df['Education_Level'].fillna(0)
data_train_ft['Education_Level'].replace(['Doctorate', 'Post-Graduate', 'Graduate', 'College', 'High School', 'Uneducated', 'Unknown'],[0, 1, 2, 3, 4, 5, 5],inplace=True)
data_test_df['Education_Level'].replace(['Doctorate', 'Post-Graduate', 'Graduate', 'College', 'High School', 'Uneducated', 'Unknown'],[0, 1, 2, 3, 4, 5, 5],inplace=True)



Median age: 46.00
Median age (test): 46.00


Here is an updated table after sorting through these features.

In [442]:
print("data_train_df:")
display(data_train_df.head(10))
print("data_train_df.info():")
display(data_train_df.info())
# print(data_train_df['Marital_Status'].head(100))
display(data_train_df['Card_Category'].head(25))

data_train_df:


Unnamed: 0,id,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Target
0,1,51,F,2,Graduate,Single,40000,Blue,39,3,...,2,2581.0,1722,859.0,0.765,4431,79,0.717,0.667,1
1,2,50,M,2,Unknown,Single,120000,Blue,38,4,...,2,2123.0,995,1128.0,0.626,4516,78,0.625,0.469,1
2,3,44,M,5,Doctorate,Single,120000,Blue,31,5,...,2,7567.0,2496,5071.0,0.709,4076,60,0.579,0.33,1
3,4,38,M,2,High School,Married,120000,Blue,29,4,...,2,2818.0,1656,1162.0,1.404,2916,45,0.957,0.588,1
4,5,32,M,1,Graduate,Married,50000,Blue,24,1,...,2,9711.0,972,8739.0,0.647,14926,115,0.742,0.1,1
5,6,40,F,3,Uneducated,Single,40000,Blue,27,5,...,2,2035.0,0,2035.0,0.779,4107,82,0.783,0.0,1
6,7,40,M,3,High School,Single,100000,Blue,36,2,...,2,4620.0,0,4620.0,0.654,6644,61,0.694,0.0,0
7,8,50,F,4,Graduate,Married,40000,Blue,31,4,...,1,2080.0,1799,281.0,0.651,5130,82,0.864,0.865,1
8,9,65,F,0,Graduate,Married,40000,Blue,53,5,...,5,4161.0,2491,1670.0,0.41,1203,30,0.579,0.599,1
9,10,56,F,2,Graduate,Single,40000,Blue,46,4,...,3,1438.3,0,1438.3,0.622,4333,84,0.826,0.0,1


data_train_df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6750 entries, 0 to 6749
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        6750 non-null   int64  
 1   Customer_Age              6750 non-null   int64  
 2   Gender                    6750 non-null   object 
 3   Dependent_count           6750 non-null   int64  
 4   Education_Level           6750 non-null   object 
 5   Marital_Status            6750 non-null   object 
 6   Income_Category           6750 non-null   int64  
 7   Card_Category             6750 non-null   object 
 8   Months_on_book            6750 non-null   int64  
 9   Total_Relationship_Count  6750 non-null   int64  
 10  Months_Inactive_12_mon    6750 non-null   int64  
 11  Contacts_Count_12_mon     6750 non-null   int64  
 12  Credit_Limit              6750 non-null   float64
 13  Total_Revolving_Bal       6750 non-null  

None

0     Blue
1     Blue
2     Blue
3     Blue
4     Blue
5     Blue
6     Blue
7     Blue
8     Blue
9     Blue
10    Blue
11    Blue
12    Blue
13    Blue
14    Blue
15    Blue
16    Blue
17    Blue
18    Blue
19    Blue
20    Blue
21    Blue
22    Blue
23    Blue
24    Blue
Name: Card_Category, dtype: object

### Step 5
Convert all feature values to floating-point numbers for better accuracy when performing tests later.

In [443]:
data_train_ft = data_train_ft[features].astype(float) 
data_test_df = data_test_df[features].astype(float) 
data_train_tgt = data_train_tgt.astype(float)
print(data_train_ft[features].info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6750 entries, 0 to 6749
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Dependent_count        6750 non-null   float64
 1   Total_Trans_Amt        6750 non-null   float64
 2   Income_Category        6750 non-null   float64
 3   Credit_Limit           6750 non-null   float64
 4   Avg_Utilization_Ratio  6750 non-null   float64
 5   Total_Trans_Ct         6750 non-null   float64
 6   Total_Ct_Chng_Q4_Q1    6750 non-null   float64
 7   Total_Revolving_Bal    6750 non-null   float64
 8   Total_Amt_Chng_Q4_Q1   6750 non-null   float64
 9   Marital_Status         6750 non-null   float64
 10  Avg_Open_To_Buy        6750 non-null   float64
 11  Card_Category          6750 non-null   float64
 12  Education_Level        6750 non-null   float64
 13  Customer_Age           6750 non-null   float64
 14  Contacts_Count_12_mon  6750 non-null   float64
dtypes: f

### Step 6
Create a library of different models to train and track the accuracy of later. Like the feature selection, having a wide variety of models to test out helps us ensure that our predictions are as accurate as possible.

In [444]:
models_to_try = {'nb': naive_bayes.GaussianNB()}

# add k-NN models with various values of k to models_to_try
for k in range(1,15,3):
    models_to_try[f'{k}-NN'] = neighbors.KNeighborsClassifier(n_neighbors=k)

### Step 7
Create a library of standardization/scaler pipelines to use for analyzing the data. Like with features and models, classifiers also play a role in our success in the competition, so we tried as many as possible!

In [445]:
scaler = skpre.StandardScaler()
pipelines_to_try = \
    {'GNB0' : naive_bayes.GaussianNB(),
    'RandomForestClassifier' : RandomForestClassifier()}
for k in range(1,15,2):
    models_to_try[f'DTC-{k}'] =  tree.DecisionTreeClassifier(max_depth=k)

baseline = dummy.DummyClassifier(strategy="uniform")
for model_name in models_to_try:
    pipelines_to_try[f'std_{model_name}_pipe'] = pipeline.make_pipeline(scaler, 
                                                      models_to_try[model_name])


sv_classifiers = {"SVC(Linear)"   : svm.SVC(kernel='linear'),
                  "NuSVC(Linear)" : svm.NuSVC(kernel='linear', nu=.9)} 

### Step 8
Use cross-validation to test the accuracy of the models.

In [446]:
accuracy_scores = {}
for name, model in pipelines_to_try.items():
    #loo = skms.LeaveOneOut()
    scores = skms.cross_val_score(model,
                                  data_train_ft,
                                  data_train_tgt,
                                  #cv=loo,
                                  cv=10,
                                  scoring='accuracy')
    mean_accuracy = scores.mean()
    accuracy_scores[name] = mean_accuracy
    print(f'{name}: {mean_accuracy:.3f}')

GNB0: 0.876
RandomForestClassifier: 0.953
std_nb_pipe: 0.860
std_1-NN_pipe: 0.885
std_4-NN_pipe: 0.892
std_7-NN_pipe: 0.892
std_10-NN_pipe: 0.895
std_13-NN_pipe: 0.890
std_DTC-1_pipe: 0.835
std_DTC-3_pipe: 0.901
std_DTC-5_pipe: 0.930
std_DTC-7_pipe: 0.934
std_DTC-9_pipe: 0.932
std_DTC-11_pipe: 0.936
std_DTC-13_pipe: 0.931


### Step 9
Find the pipeline with the highest accuracy and save it for testing.

In [447]:
best_pipeline_name = max(accuracy_scores,key=accuracy_scores.get)
print(f'\nBest pipeline: {best_pipeline_name} (accuracy = {accuracy_scores[best_pipeline_name]:.3f})')
final_pipeline = pipelines_to_try[best_pipeline_name]


Best pipeline: RandomForestClassifier (accuracy = 0.953)


### Step 10
Fit and test the highest-scoring model against the testing data. Save the results in submission.csv for uploading to Kaggle.

In [448]:
fit = final_pipeline.fit(data_train_ft, data_train_tgt)
predictions = fit.predict(data_test_df[features])
predictions = predictions.astype(int)
print(predictions)

def writeSubmission(predictions):
   i=6751
   submissionList = []
   for prediction in predictions:
       submissionList.append([str(i), str(prediction)])
       i+=1
   with open('submission.csv', 'w', newline='') as submission:
       writer = csv.writer(submission)
       writer.writerow(['id', 'Target'])
       for row in submissionList:
           writer.writerow(row)

writeSubmission(predictions)

[1 0 1 ... 0 1 1]
