In [2]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

data = pd.read_csv('https://docs.google.com/spreadsheets/d/18Uxxd5YAfGLn4qWe5crLplKU19zPXPziaomAZLWpDyI/gviz/tq?tqx=out:csv&sheet=clean_dataset')

# Wrangling: remove the columns this analysis does not use, in a single call.
data = data.drop(
    columns=[
        'DriversLicense',
        'Gender',
        'Married',
        'BankCustomer',
        'Industry',
        'Ethnicity',
        'YearsEmployed',
        'PriorDefault',
        'Employed',
        'Citizen',
        'ZipCode',
    ]
)

# Recode the 0/1 target as "No"/"Yes" labels.
approval_labels = {0: "No", 1: "Yes"}
data["Approved"] = data["Approved"].replace(approval_labels)


# point 3: Exploratory data analysis 
# TODO

# point 4: knn classifier
# data_train, data_test = train_test_split(data, test_size=0.25, random_state=123)
# # preliminary look at training data
# data_train
# # preliminary look at testing data
# # data_test

# X_train = data_train[["Age", "Debt","Income","CreditScore"]]  # A four-column data frame
# y_train = data_train["Approved"]  # A series

# X_test = data_test[["Age", "Debt", "Income" , "CreditScore"]]  # A four-column data frame
# y_test = data_test["Approved"] 


# data.info()


# plot = alt.Chart(data).mark_point().encode(
#     x=alt.X("Debt").title("Debt (USD)"),
#     y=alt.Y("CreditScore").title("Credit score"),
#     color=alt.Color("Approved")
# )


In [3]:
knn = KNeighborsClassifier(n_neighbors=7)

# Columns to centre and scale, in the order they are handed to the transformer.
scaled_features = ["Age", "Debt", "Income", "CreditScore"]

# create the centering / scaling preprocessor
preprocessor = make_column_transformer(
    (StandardScaler(), scaled_features),
    remainder='passthrough'
)

# The transformer returns the scaled columns first, in the order listed above,
# followed by the passthrough columns in their original order. The labels are
# built from that same list so each name stays attached to the right values
# (labelling by hard-coded position would swap Income and CreditScore).
output_columns = scaled_features + [
    col for col in data.columns if col not in scaled_features
]
data_scaled = pd.DataFrame(
    np.asarray(preprocessor.fit_transform(data)),
    columns=output_columns,
)
# Stacking the string "Approved" column next to the scaled values produces an
# object array, so restore a float dtype on the numeric columns.
data_scaled[scaled_features] = data_scaled[scaled_features].astype(float)

# Scatter plot of standardized Debt against standardized CreditScore,
# coloured by approval outcome.
data_plot = alt.Chart(data_scaled).mark_point().encode(
    x=alt.X('Debt').title('Debt in USD').scale(zero=True),
    y=alt.Y('CreditScore').title('Credit Score').scale(zero=False),
    color=alt.Color('Approved')
)
data_plot


# data_plot = alt.Chart(data).mark_point().encode(
#     x=alt.X('Debt').title('Debt in USD'),
#     y=alt.Y('CreditScore').title('Credit Score'),
#     color=alt.Color('Approved')
# )
# data_hist = alt.Chart(data).mark_bar().encode(
#     x=alt.X('Debt').title('Debt in USD').bin(maxbins= 30),
#     y='count()',
# )
# data_hist
# data_scaled_hist = alt.Chart(data_scaled).mark_bar().encode(
#     x=alt.X('Debt').title('Debt in USD').bin(maxbins = 30),
#     y='count()',
# )
# data_hist & data_scaled_hist

In [4]:
# Standard deviation of each numeric column. numeric_only=True skips the
# string-valued "Approved" column, which cannot be reduced numerically
# (pandas 2.x raises TypeError without it).
data.std(numeric_only=True)

  data.std()


Age              11.860245
Debt              4.978163
CreditScore       4.862940
Income         5210.102598
dtype: float64

In [5]:

# Fit scaling + KNN as one pipeline. Only the predictors go through the
# preprocessor; the target is supplied separately.
predictors = data.drop(columns=['Approved'])
knn_fit = make_pipeline(preprocessor, knn).fit(
    X=predictors,
    y=data["Approved"]
)

# Classify a single new applicant.
new_observation = pd.DataFrame({"Age": [31], "Debt": [5], "CreditScore": [5], "Income":[3395]})
prediction = knn_fit.predict(new_observation)

# Take the fitted preprocessing step from the pipeline itself, so the
# standardization applied here is exactly the one the classifier was trained on.
fitted_preprocessor = knn_fit[0]

# The transformer returns the scaled columns in this order, followed by any
# passthrough predictors in their original order. Labels are built from that
# order so Income and CreditScore keep their own values.
scaled_features = ["Age", "Debt", "Income", "CreditScore"]
output_columns = scaled_features + [
    col for col in predictors.columns if col not in scaled_features
]

# Standardize the new data point with the transformer fitted on the original data
test = np.asarray(fitted_preprocessor.transform(new_observation))
testdf = pd.DataFrame(test)
diffcols = pd.DataFrame(test, columns=output_columns)

# Standardize the training predictors the same way and re-attach the target,
# so the background layer and the new point share one coordinate system.
train_scaled = pd.DataFrame(
    np.asarray(fitted_preprocessor.transform(predictors)),
    columns=output_columns,
)
train_scaled["Approved"] = data["Approved"].to_numpy()

test_plot = data_plot.properties(data=train_scaled) + (
    alt.Chart(diffcols)
    .mark_point(size=80, color='black', clip=True).encode(
        x="Debt",
        # Let Vega-Lite derive the domain from the data so that no
        # standardized score falls outside the axis.
        y=alt.Y("CreditScore").scale(type="symlog")
    )
)

test_plot