Changes (TODO: add to final report as well):
1. Removed the train/test data, since we decided against using it for our model. This can be reversed pretty easily if we do decide to implement training and testing data.
2. Removed all the scaled data: we discussed yesterday on call that scaling our data or our charts did not seem worthwhile to us. Instead, we decided to stick to cutting outliers from our graphs and focusing on the main "domain". If we decide to reverse this it will be a little tricky, since we would have to figure out an alternative way to represent the graphs, but it's definitely doable.
3. Added histograms of each predictor variable, faceted on approval and colour-coded by approval as well. This is to support our claim in the project of focusing all our visualizations around Credit Score. (Quick note: I exercised my own judgement for the maxbins and I am open to more suggestions. It does get rid of that little quirk of our data where there was a weirdly high number of rejections at age 29, but I think overall it looks better.)
4. Added an infographic of the summary means of the Approved/Not Approved data. I did this to add merit to my decision of choosing 2 observations (1 approval, 1 non-approval) to show our model "in use". I know that yesterday we discussed maybe having a few more observations to show our model working, so I would be happy to take another route as well, or to add more observations based on other points of importance we can think of. I also added both observations to our charts.
5. Removed the debt/income and age/income charts, as we decided to keep the visualizations consistent by focusing on credit score. This can be reversed easily as well.

In [1]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Fetch the cleaned credit-approval sheet as CSV directly from its web source.
data = pd.read_csv('https://docs.google.com/spreadsheets/d/18Uxxd5YAfGLn4qWe5crLplKU19zPXPziaomAZLWpDyI/gviz/tq?tqx=out:csv&sheet=clean_dataset')

# Wrangle the downloaded table into the shape used by the analysis: discard
# every column we do not use, in a single call.
unused_columns = [
    'DriversLicense',
    'Gender',
    'Married',
    'BankCustomer',
    'Industry',
    'Ethnicity',
    'YearsEmployed',
    'PriorDefault',
    'Employed',
    'Citizen',
    'ZipCode',
]
data = data.drop(columns=unused_columns)

# Recode the 0/1 approval flag as readable "No"/"Yes" labels.
approval_labels = {0: "No", 1: "Yes"}
data["Approved"] = data["Approved"].replace(approval_labels)

data

Unnamed: 0,Age,Debt,CreditScore,Income,Approved
0,30.83,0.000,1,0,Yes
1,58.67,4.460,6,560,Yes
2,24.50,0.500,0,824,Yes
3,27.83,1.540,5,3,Yes
4,20.17,5.625,0,0,Yes
...,...,...,...,...,...
685,21.08,10.085,0,0,No
686,22.67,0.750,2,394,No
687,25.25,13.500,1,1,No
688,17.92,0.205,0,750,No


In [2]:
# data summarization
# Histogram of CreditScore (at most 30 bins, x-axis restricted to 0-40),
# coloured by approval outcome and split into one facet per outcome.
data_vis_cs = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X(
            'CreditScore',
            bin=alt.Bin(maxbins=30),
            scale=alt.Scale(domain=[0, 40], clamp=True),
        ),
        y=alt.Y('count()'),
        color=alt.Color('Approved'),
    )
    .properties(height=100)
    .facet("Approved:N")
)
data_vis_cs

In [3]:
# Histogram of Income (at most 20 bins), coloured by approval outcome and
# split into one facet per outcome.
data_vis_i = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X(
            'Income',
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(clamp=True),
        ),
        y=alt.Y('count()'),
        color=alt.Color('Approved'),
    )
    .properties(height=100)
    .facet("Approved:N")
)
data_vis_i

In [4]:
# Histogram of Age (at most 45 bins), coloured by approval outcome and
# split into one facet per outcome.
data_vis_age = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X(
            'Age',
            bin=alt.Bin(maxbins=45),
            scale=alt.Scale(clamp=True),
        ),
        y=alt.Y('count()'),
        color=alt.Color('Approved'),
    )
    .properties(height=100)
    .facet("Approved:N")
)
data_vis_age

In [5]:
# Histogram of Debt (at most 30 bins), coloured by approval outcome and
# split into one facet per outcome.
data_vis_d = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X(
            'Debt',
            bin=alt.Bin(maxbins=30),
            scale=alt.Scale(clamp=True),
        ),
        y=alt.Y('count()'),
        color=alt.Color('Approved'),
    )
    .properties(height=100)
    .facet("Approved:N")
)
data_vis_d

In [6]:
# Exploratory data analysis
# Data Visualization (Preliminary)

In [7]:
# Debt vs Credit Score, one point per application, coloured by approval outcome.
# Axes are limited to CreditScore 0-20 and Debt 0-40; clamp=True pins values
# outside those domains to the domain edge.
scatterplot_debt_creditscore = (
    alt.Chart(data)
    .mark_point()
    .encode(
        x=alt.X("CreditScore", scale=alt.Scale(domain=[0, 20], clamp=True)),
        y=alt.Y("Debt", title="Debt", scale=alt.Scale(domain=[0, 40], clamp=True)),
        color=alt.Color("Approved"),
    )
    .properties(title="Debt vs Credit Score")
)
scatterplot_debt_creditscore

In [8]:
# Income vs Credit Score, one point per application, coloured by approval outcome.
# Axes are limited to CreditScore 0-20 and Income 0-10000; clamp=True pins values
# outside those domains to the domain edge.
scatterplot_income = (
    alt.Chart(data)
    .mark_point()
    .encode(
        x=alt.X(
            "CreditScore",
            title="Credit Score",
            scale=alt.Scale(domain=[0, 20], clamp=True),
        ),
        y=alt.Y("Income", scale=alt.Scale(domain=[0, 10000], clamp=True)),
        color=alt.Color("Approved"),
    )
    .properties(title="Income vs Credit Score")
)
scatterplot_income

In [9]:
# Age vs Credit Score, one point per application, coloured by approval outcome.
# Axes are limited to CreditScore 0-20 and Age 0-90; clamp=True pins values
# outside those domains to the domain edge.
scatterplot_age = (
    alt.Chart(data)
    .mark_point()
    .encode(
        x=alt.X(
            "CreditScore",
            title="Credit Score",
            scale=alt.Scale(domain=[0, 20], clamp=True),
        ),
        y=alt.Y("Age", scale=alt.Scale(domain=[0, 90], clamp=True)),
        color=alt.Color("Approved"),
    )
    .properties(title="Age vs Credit Score")
)
scatterplot_age

In [10]:
# point 4: knn classifier
# Model: K-nearest-neighbours classification using 7 neighbours.
knn = KNeighborsClassifier(n_neighbors=7)

# Preprocessor: standardize the four numeric predictors and pass any other
# column through unchanged.
numeric_predictors = ["Age", "Debt", "Income", "CreditScore"]
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_predictors),
    remainder='passthrough',
)

# Fit the preprocessor on the full table and display the fitted transformer.
data_fit = preprocessor.fit(data)
data_fit

In [11]:
# Chain the preprocessor and the classifier into one pipeline and train it on
# every row of the data: all columns except "Approved" are the predictors,
# "Approved" is the label.
predictor_columns = data.drop(columns=['Approved'])
approval_column = data["Approved"]
knn_fit = make_pipeline(preprocessor, knn).fit(X=predictor_columns, y=approval_column)
knn_fit

In [12]:
# Mean of every numeric column, computed separately for approved ("Yes") and
# rejected ("No") applications; one summary column per outcome.
approved_rows = data[data['Approved'] == 'Yes']
rejected_rows = data[data['Approved'] == 'No']
data_summary_means = pd.DataFrame({
    'Approved': approved_rows.mean(numeric_only=True),
    'Not Approved': rejected_rows.mean(numeric_only=True),
})
data_summary_means

Unnamed: 0,Approved,Not Approved
Age,33.686221,29.773029
Debt,5.904951,3.839948
CreditScore,4.605863,0.631854
Income,2038.859935,198.605744


In [13]:
# Two hypothetical applicants are built from the summary means above: one where
# we expect approval and one where we don't.
# Observation 1 uses whole-number values close to the "Approved" means.
new_observation_1 = pd.DataFrame(
    [{"Age": 33, "Debt": 6, "CreditScore": 5, "Income": 2039}]
)

# Prediction 1: classify observation 1 with the fitted pipeline.
prediction_1 = knn_fit.predict(new_observation_1)
prediction_1


array(['Yes'], dtype=object)

In [14]:
# Observation 2 uses whole-number values close to the "Not Approved" means.
new_observation_2 = pd.DataFrame(
    [{"Age": 29, "Debt": 4, "CreditScore": 1, "Income": 198}]
)

# Prediction 2: classify observation 2 with the fitted pipeline.
prediction_2 = knn_fit.predict(new_observation_2)
prediction_2

array(['No'], dtype=object)

In [15]:
# Stack the two hypothetical applicants into a single frame for plotting.
# DataFrame.append is deprecated (and removed in pandas 2.0), so pd.concat is
# used instead. Like append, it keeps each row's original index label, and it
# returns a new frame without modifying either input.
new_obs = pd.concat([new_observation_1, new_observation_2])
new_obs

  new_obs = new_obs.append(new_observation_2)


Unnamed: 0,Age,Debt,CreditScore,Income
0,33,6,5,2039
0,29,4,1,198


In [16]:


# Overlay the two hypothetical applicants (large black points) on each of the
# three scatterplots. The points are plotted in the original, unscaled units.
# Each overlay repeats the axis title and scale domain of the chart it is
# layered onto, so the shared axes are not given a second, differing title.
debt_pred = scatterplot_debt_creditscore + (
    alt.Chart(new_obs)
    .mark_point(size=80, color='black', clip=True)
    .encode(
        x=alt.X("CreditScore").scale(domain=[0, 20], clamp=True),
        y=alt.Y("Debt").title("Debt").scale(domain=[0, 40], clamp=True),
    )
)
age_pred = scatterplot_age + (
    alt.Chart(new_obs)
    .mark_point(size=80, color='black', clip=True)
    .encode(
        # Title matches scatterplot_age's x axis ("Credit Score").
        x=alt.X("CreditScore").title("Credit Score").scale(domain=[0, 20], clamp=True),
        y=alt.Y("Age").scale(domain=[0, 90], clamp=True),
    )
)
income_pred = scatterplot_income + (
    alt.Chart(new_obs)
    .mark_point(size=80, color='black', clip=True)
    .encode(
        x=alt.X("CreditScore").title("Credit Score").scale(domain=[0, 20], clamp=True),
        y=alt.Y("Income").scale(domain=[0, 10000], clamp=True),
    )
)

# Stack the three layered charts vertically.
final_plot = debt_pred & age_pred & income_pred
final_plot