In [1]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# load data from the original source on the web 
data = pd.read_csv('https://docs.google.com/spreadsheets/d/18Uxxd5YAfGLn4qWe5crLplKU19zPXPziaomAZLWpDyI/gviz/tq?tqx=out:csv&sheet=clean_dataset')

# wrangling and cleaning the data from it's original (downloaded) format to the format necessary for our analysis
data = data.drop('DriversLicense', axis=1)
data = data.drop('Gender', axis=1)
data = data.drop('Married', axis=1)
data = data.drop('BankCustomer', axis=1)
data = data.drop('Industry', axis=1)
data = data.drop('Ethnicity', axis=1)
data = data.drop('YearsEmployed', axis=1)
data = data.drop('PriorDefault', axis=1)
data = data.drop('Employed', axis=1)
data = data.drop('Citizen', axis=1)
data = data.drop('ZipCode', axis=1)
data["Approved"] = data["Approved"].replace({
     0 : "No",
     1 : "Yes"
 })

data

Unnamed: 0,Age,Debt,CreditScore,Income,Approved
0,30.83,0.000,1,0,Yes
1,58.67,4.460,6,560,Yes
2,24.50,0.500,0,824,Yes
3,27.83,1.540,5,3,Yes
4,20.17,5.625,0,0,Yes
...,...,...,...,...,...
685,21.08,10.085,0,0,No
686,22.67,0.750,2,394,No
687,25.25,13.500,1,1,No
688,17.92,0.205,0,750,No


In [2]:
data_train, data_test = train_test_split(data, test_size=0.25, random_state=123)
data_train

Unnamed: 0,Age,Debt,CreditScore,Income,Approved
618,29.58,4.750,1,68,No
121,25.67,12.500,67,258,Yes
352,22.50,11.500,0,4000,No
210,39.33,5.875,14,0,Yes
299,22.17,12.125,2,173,No
...,...,...,...,...,...
98,22.50,11.000,0,0,No
322,33.67,0.375,0,44,Yes
382,24.33,2.500,0,456,No
365,42.83,1.250,1,112,No


In [3]:
approval_count_data = alt.Chart(data_train).mark_bar().encode(
    x="Approved",
    y="count()",
    color="Approved"
)
approval_count_data

In [4]:
# data summarization
# bar charts of every predictor variable, maxbins = 30

data_vis_cs = alt.Chart(data_train).mark_bar().encode(
    x = alt.X('CreditScore').scale(domain=[0,40],clamp= True).bin(maxbins=30),
    y = alt.Y('count()'),
    color = alt.Color('Approved')
).properties(
    height=100
).facet(
    "Approved:N",)
data_vis_cs

In [5]:
data_vis_i = alt.Chart(data_train).mark_bar().encode(
    x = alt.X('Income').scale(clamp= True).bin(maxbins=20),
    y = alt.Y('count()'),
    color = alt.Color('Approved')
).properties(
    height=100
).facet(
    "Approved:N",)
data_vis_i

In [6]:
data_vis_age = alt.Chart(data_train).mark_bar().encode(
    x = alt.X('Age').scale(clamp= True).bin(maxbins=45),
    y = alt.Y('count()'),
    color = alt.Color('Approved')
).properties(
    height=100
).facet(
    "Approved:N",)
data_vis_age

In [7]:
data_vis_d = alt.Chart(data_train).mark_bar().encode(
    x = alt.X('Debt').scale(clamp= True).bin(maxbins=30),
    y = alt.Y('count()'),
    color = alt.Color('Approved')
).properties(
    height=100
).facet(
    "Approved:N",)
data_vis_d

In [8]:
# Exploratory data analysis 
# Data Vizualisation (Preliminary)

In [9]:
#Debt vs Credit Score
scatterplot_debt_creditscore = alt.Chart(data_train, title = "Debt vs Credit Score").mark_point().encode(
    y=alt.Y("Debt").title("Debt").scale(domain=[0,40],clamp=True),
    x=alt.X("CreditScore").scale(domain=[0,20],clamp= True),
    color=alt.Color("Approved")
)
scatterplot_debt_creditscore

In [10]:
#Income vs Credit Score
scatterplot_income = alt.Chart(data_train, title = "Income vs Credit Score").mark_point().encode(
    x=alt.X("CreditScore").title("Credit Score").scale(domain=[0,20],clamp=True),
    y=alt.Y("Income").scale(domain=[0,10000],clamp= True),
    color=alt.Color("Approved")
)
scatterplot_income

In [11]:
#Age vs Credit Score
scatterplot_age = alt.Chart(data_train, title = "Age vs Credit Score").mark_point().encode(
    x=alt.X("CreditScore").title("Credit Score").scale(domain=[0,20],clamp=True),
    y=alt.Y("Age").scale(domain=[0,90],clamp= True),
    color=alt.Color("Approved")
)
scatterplot_age

In [12]:
# point 4: knn classifier
# Model: knn Neighbours Classification
knn = KNeighborsClassifier(n_neighbors=15)
# we increased k from 7, to 11, to 15
# the accuracy of the "No" column increased with every increase in k, but the "Yes" column either decreased in accuracy or didnt change

# create the preprocessor
preprocessor = make_column_transformer(
    (StandardScaler(), ["Age", "Debt","Income","CreditScore"]),
    remainder='passthrough'
)
data_fit = preprocessor.fit(data_train)
data_fit

In [13]:
# Make pipeline and fit it to our data
knn_fit = make_pipeline(preprocessor, knn).fit(
    X=data_train.drop(columns=['Approved']), 
    y=data_train["Approved"]
)

knn_fit

In [14]:
data_test_predictions = data_test.assign(
    predicted = knn_fit.predict(data_test[["Age", "Debt","Income", "Approved","CreditScore"]])
)
data_test_predictions[["Age", "Debt","Income","CreditScore", "Approved", 'predicted']]

Unnamed: 0,Age,Debt,Income,CreditScore,Approved,predicted
399,31.00,2.085,0,0,No,No
250,40.25,21.500,1200,11,Yes,Yes
396,29.83,2.040,1,0,No,No
192,41.75,0.960,600,0,Yes,No
602,29.83,1.250,0,0,No,No
...,...,...,...,...,...,...
100,37.50,1.750,400,0,No,No
572,21.92,0.540,59,1,Yes,No
101,35.25,16.500,0,0,No,No
195,28.25,5.040,7,8,Yes,Yes


In [15]:
data_preds = data_test_predictions[
    data_test_predictions['Approved'] == data_test_predictions['predicted']
]

data_preds.shape[0] / data_test_predictions.shape[0]

0.7572254335260116

In [16]:
# miss/ false negative/ type II error for "Approval" as our signal
# for the paper lets frame it in the way that correctly predicting rejections is more important for a bank
pd.crosstab(
    data_test_predictions["Approved"],
    data_test_predictions["predicted"]
)

predicted,No,Yes
Approved,Unnamed: 1_level_1,Unnamed: 2_level_1
No,90,5
Yes,37,41


In [17]:
data_pipe = make_pipeline(preprocessor, knn)

X=data_train.drop(columns=['Approved'])
y=data_train["Approved"]

cv_5_df = pd.DataFrame(
    cross_validate(
        estimator=data_pipe,
        cv=5,
        X=X,
        y=y
    )
)

cv_5_df

Unnamed: 0,fit_time,score_time,test_score
0,0.008204,0.009024,0.836538
1,0.006537,0.009001,0.701923
2,0.00673,0.008997,0.757282
3,0.006676,0.008886,0.757282
4,0.006573,0.010739,0.699029


In [18]:
cv_5_metrics = cv_5_df.agg(['mean', 'sem'])
cv_5_metrics

Unnamed: 0,fit_time,score_time,test_score
mean,0.006944,0.009329,0.750411
sem,0.000317,0.000353,0.025004


In [19]:
knn_best_k = KNeighborsClassifier()
parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 100, 4),
}
parameter_grid

{'kneighborsclassifier__n_neighbors': range(1, 100, 4)}

In [20]:
data_tune_grid = GridSearchCV(
    estimator=data_pipe,
    param_grid=parameter_grid,
    cv=5
)
data_tune_grid

In [21]:
accuracies_grid = pd.DataFrame(
    data_tune_grid.fit(
        data_train[["Age", "Debt","Income","CreditScore"]],
        data_train["Approved"]
    ).cv_results_
)
accuracies_grid = (
    accuracies_grid[[
        "param_kneighborsclassifier__n_neighbors",
        "mean_test_score",
        "std_test_score"
    ]]
    .assign(sem_test_score=accuracies_grid["std_test_score"] / 10**(1/2))
    .rename(columns={"param_kneighborsclassifier__n_neighbors": "n_neighbors"})
    .drop(columns=["std_test_score"])
)
accuracies_grid

Unnamed: 0,n_neighbors,mean_test_score,sem_test_score
0,1,0.680695,0.011028
1,5,0.719436,0.012293
2,9,0.736875,0.015468
3,13,0.744604,0.015378
4,17,0.7427,0.016288
5,21,0.748506,0.015391
6,25,0.752427,0.016217
7,29,0.746583,0.015437
8,33,0.746583,0.01471
9,37,0.750429,0.014847


In [22]:
accuracy_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("n_neighbors").title("Neighbors"),
    y=alt.Y("mean_test_score")
        .scale(domain=(0.65, 0.80))
        .title("Accuracy estimate")
)

accuracy_vs_k

In [23]:
data_summary_means = pd.DataFrame()
data_summary_means['Approved'] = (data[data['Approved'] == 'Yes'].mean(numeric_only=True))
data_summary_means['Not Approved'] = (data[data['Approved'] == 'No'].mean(numeric_only=True))
data_summary_means

Unnamed: 0,Approved,Not Approved
Age,33.686221,29.773029
Debt,5.904951,3.839948
CreditScore,4.605863,0.631854
Income,2038.859935,198.605744


In [24]:
data_summary_means = pd.DataFrame()
data_summary_means['Approved'] = (data_train[data_train['Approved'] == 'Yes'].mean(numeric_only=True))
data_summary_means['Not Approved'] = (data_train[data_train['Approved'] == 'No'].mean(numeric_only=True))
data_summary_means

Unnamed: 0,Approved,Not Approved
Age,33.311485,30.129826
Debt,5.694476,3.682552
CreditScore,4.820961,0.722222
Income,1752.209607,195.069444


In [25]:
# Add 2 new observations based on the data summary means above, 1 where we expect approval, 1 where we don't
# Observation 1: Using Approved data means
new_observation_1 = pd.DataFrame({"Age": [33], "Debt": [6], "CreditScore": [5], "Income":[2039]})

# Prediction 1
prediction_1 = knn_fit.predict(new_observation_1)
prediction_1


array(['Yes'], dtype=object)

In [26]:
# Observation 2: Using Not Approved data means
new_observation_2 = pd.DataFrame({"Age": [29], "Debt": [4], "CreditScore": [1], "Income":[198]})


# Prediction 2
prediction_2 = knn_fit.predict(new_observation_2)
prediction_2

array(['No'], dtype=object)

In [27]:
new_obs = pd.DataFrame(new_observation_1)
new_obs = new_obs.append(new_observation_2)
new_obs

  new_obs = new_obs.append(new_observation_2)


Unnamed: 0,Age,Debt,CreditScore,Income
0,33,6,5,2039
0,29,4,1,198


In [28]:


debt_pred = scatterplot_debt_creditscore + (
    # Standardize the new data point with transformer fitted on the original data
    alt.Chart(new_obs)
    .mark_point(size=80, color='black', clip=True).encode(
        y=alt.Y("Debt").title("Debt").scale(domain=[0,40],clamp=True),
    x=alt.X("CreditScore").scale(domain=[0,20],clamp= True),
    )
)
age_pred = scatterplot_age + (
    alt.Chart(new_obs)
    .mark_point(size=80, color='black', clip=True).encode(
        x=alt.X("CreditScore").scale(domain=[0,20],clamp=True),
        y=alt.Y("Age").scale(domain=[0,90],clamp= True),
    )
)
income_pred = scatterplot_income + (
    alt.Chart(new_obs)
    .mark_point(size=80, color='black', clip=True).encode(
        x=alt.X("CreditScore").title("Credit Score").scale(domain=[0,20],clamp=True),
        y=alt.Y("Income").scale(domain=[0,10000],clamp= True),
    )
)
final_plot = debt_pred&age_pred&income_pred
final_plot