In [38]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [39]:
# read data from csv file and create a DataFrame
credit_df = pd.read_csv("./datasets/credit_data.csv")
credit_df.head()

Unnamed: 0,clientid,income,age,loan,LTI,default
0,1,66155.925095,59.017015,8106.532131,0.122537,0
1,2,34415.153966,48.117153,6564.745018,0.190752,0
2,3,57317.170063,63.108049,8020.953296,0.13994,0
3,4,42709.534201,45.751972,6103.64226,0.142911,0
4,5,66952.688845,18.584336,8770.099235,0.130989,1


In [40]:
# select dependent and independent variables
features = credit_df[["income", "age", "loan"]]
target = credit_df["default"]

In [41]:
# machine learning handle arrays, not DataFrames
X = np.array(features).reshape(-1,3)
y = np.array(target)

X

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [42]:
# apply min-max normalization to data
X = preprocessing.MinMaxScaler().fit_transform(X)
X

array([[0.9231759 , 0.89209175, 0.58883739],
       [0.28812165, 0.65470788, 0.47682695],
       [0.74633429, 0.9811888 , 0.58262011],
       ...,
       [0.48612202, 0.21695807, 0.40112895],
       [0.47500998, 1.        , 0.1177903 ],
       [0.98881367, 0.82970913, 0.53597028]])

In [43]:
# split dataset fot training and testing. %70 for training, %30 for testing with test_size=0.3 parameter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [44]:
# create the KNN model and start training
model = KNeighborsClassifier(n_neighbors=20)
fitted_model = model.fit(X_train, y_train)

In [45]:
# take predictions of test dataset
predictions = fitted_model.predict(X_test)

In [46]:
print(confusion_matrix(y_test, predictions))

[[520   5]
 [  4  71]]


In [47]:
print(accuracy_score(y_test, predictions))

0.985


In [48]:
# make cross validation to find optimum k value
cross_val_scores = []
for i in range(1, 100):
    model = KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(model, X, y, cv=10, scoring="accuracy")
    cross_val_scores.append(scores.mean())

print(f"Optimal k with cross validation: {np.argmax(cross_val_scores)}")

Optimal k with cross validation: 32


In [49]:
# create the optimized KNN model and start training
optimized_model = KNeighborsClassifier(n_neighbors=32)
fitted_optimized_model = model.fit(X_train, y_train)

In [50]:
# take predictions of test dataset
predictions = fitted_model.predict(X_test)

In [51]:
print(accuracy_score(y_test, predictions))

0.985
