# Classification using k-Nearest Neighbors

## Dataset: Credit Approval Decisions

## Import packages

In [105]:
#Data preparation
import numpy as np
import pandas as pd

#Classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Load dataset

In [106]:
df_train = pd.read_csv('credit_approval_decisions_train.csv')
df_predict = pd.read_csv('credit_approval_decisions_test.csv')

In [3]:
df_train.head()

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization,Decision
0,Y,725,20,"$11,320",25%,Approve
1,Y,573,9,"$7,200",70%,Reject
2,Y,677,11,"$20,000",55%,Approve
3,N,625,15,"$12,800",65%,Reject
4,N,527,12,"$5,700",75%,Reject


## Explore dataset
### Data types

The data types of the different variables in the dataset are:

In [72]:
df_train.dtypes

Homeowner                   object
Credit_Score                 int64
Years_of_Credit_History      int64
Revolving_Balance          float64
Revolving_Utilization      float64
Decision                    object
dtype: object

In [54]:
df_predict.dtypes

Homeowner                   object
Credit_Score                 int64
Years_of_Credit_History      int64
Revolving_Balance           object
Revolving_Utilization       object
Decision                   float64
dtype: object

For machine learning purposes, all variables have to be numerical. 
'Homeowner', 'Revolving Balance', 'Revolving Utilization' and 'Decision' must therefore be converted to numerical.

### Shape
The number of rows and columns in the dataset can be found with DataFrame.shape:

In [13]:
df_train.shape

(51, 6)

In [19]:
df_predict.shape

(7, 6)

As seen above, the training dataset has 51 rows and 6 columns.
The testing dataset has 7 rows and 6 columns.

## Data preparation

* Change data type of 'Revolving Balance' and 'Revolving Utilization' to numerical
* Convert categorical variables 'Homeowner' and 'Decision' to numerical

### Change data type to numerical

In [107]:
df_train['Revolving_Balance'] = df_train['Revolving_Balance'].str.lstrip('$')
df_train['Revolving_Balance'] = df_train['Revolving_Balance'].str.replace(',', '')
df_train['Revolving_Balance'] = df_train['Revolving_Balance'].astype(float)

df_train['Revolving_Utilization'] = df_train['Revolving_Utilization'].str.rstrip('%')
df_train['Revolving_Utilization'] = df_train['Revolving_Utilization'].astype(float)/100

df_predict['Revolving_Balance'] = df_predict['Revolving_Balance'].str.lstrip('$')
df_predict['Revolving_Balance'] = df_predict['Revolving_Balance'].str.replace(',', '')
df_predict['Revolving_Balance'] = df_predict['Revolving_Balance'].astype(float)

df_predict['Revolving_Utilization'] = df_predict['Revolving_Utilization'].str.rstrip('%')
df_predict['Revolving_Utilization'] = df_predict['Revolving_Utilization'].astype(float)/100

In [57]:
df_train.head()
#df_predict.head()

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization,Decision
0,Y,725,20,11320.0,0.25,Approve
1,Y,573,9,7200.0,0.7,Reject
2,Y,677,11,20000.0,0.55,Approve
3,N,625,15,12800.0,0.65,Reject
4,N,527,12,5700.0,0.75,Reject


 ### Convert categorical variables to numerical

In [108]:
cat_to_num = {'N': 0, 
             'Y': 1,
             'Reject': 0,
             'Approve': 1}

df_train.replace(cat_to_num, inplace=True)
df_predict.replace(cat_to_num, inplace=True)

In [59]:
df_train.head(10)
#df_predict.head()

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization,Decision
0,1,725,20,11320.0,0.25,1
1,1,573,9,7200.0,0.7,0
2,1,677,11,20000.0,0.55,1
3,0,625,15,12800.0,0.65,0
4,0,527,12,5700.0,0.75,0
5,1,795,22,9000.0,0.12,1
6,0,733,7,35200.0,0.2,1
7,0,620,5,22800.0,0.62,0
8,1,591,17,16500.0,0.5,0
9,1,660,24,9200.0,0.35,1


In [109]:
df_train['Decision'] = df_train['Decision'].astype(float)
df_train['Homeowner'] = df_train['Homeowner'].astype(float)

df_predict= df_predict.drop('Decision', axis=1)
df_predict['Homeowner'] = df_predict['Homeowner'].astype(float)

In [66]:
df_predict.head()

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization
0,1.0,700,8,21000.0,0.15
1,0.0,520,1,4000.0,0.9
2,1.0,650,10,8500.0,0.25
3,0.0,602,7,16300.0,0.7
4,0.0,549,2,2500.0,0.9


All variables are now numerical.

## Normalizing and scaling the data

When the range of values in a feature is very different from the range in another, normalizing or scaling the values are necessary to improve prediction accuracy. Without scaling, the features with greater numeric range could have more impact than those with a smaller range.

### Min-Max Normalization

Min-Max Normalization is a technique often used with KNN classification problems.
<br>The equation for Min-Max Normalization is:

\begin{equation}
\frac{x_i - min(x)}{max(x)-min(x)}
\end{equation}

In [110]:
scaler = MinMaxScaler()
df_train_minmax = scaler.fit_transform(df_train)
df_train_minmax = pd.DataFrame(df_train_minmax, columns = ['Homeowner', 'Credit_Score', 'Years_of_Credit_History', 'Revolving_Balance', 'Revolving_Utilization', 'Decision'])

In [111]:
df_predict_minmax = scaler.fit_transform(df_predict)
df_predict_minmax = pd.DataFrame(df_predict_minmax, columns = ['Homeowner', 'Credit_Score', 'Years_of_Credit_History', 'Revolving_Balance', 'Revolving_Utilization'])


df_predict_minmax.head()

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization
0,1.0,0.810811,0.5,1.0,0.0
1,0.0,0.0,0.0,0.081081,1.0
2,1.0,0.585586,0.642857,0.324324,0.133333
3,0.0,0.369369,0.428571,0.745946,0.733333
4,0.0,0.130631,0.071429,0.0,1.0


## Machine learning: KNN Classification

In [112]:
X =  df_train_minmax.drop(['Decision'], axis=1)
y = df_train_minmax.Decision

#Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [113]:
#instantiate learning model
knn = KNeighborsClassifier(n_neighbors=14)

#fitting the model
knn.fit(X_train, y_train)

#Accuracy score on test data
knn.score(X_test, y_test)

0.9230769230769231

In [114]:
#df before prediction
df_predict_minmax.head()

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization
0,1.0,0.810811,0.5,1.0,0.0
1,0.0,0.0,0.0,0.081081,1.0
2,1.0,0.585586,0.642857,0.324324,0.133333
3,0.0,0.369369,0.428571,0.745946,0.733333
4,0.0,0.130631,0.071429,0.0,1.0


In [115]:
#Predict
X_new =  df_predict_minmax
Y_pred = knn.predict(X_new)

In [116]:
df_predict_minmax['Predicted_Decision'] = Y_pred
df_predict_minmax.head(10)

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization,Predicted_Decision
0,1.0,0.810811,0.5,1.0,0.0,1.0
1,0.0,0.0,0.0,0.081081,1.0,0.0
2,1.0,0.585586,0.642857,0.324324,0.133333,1.0
3,0.0,0.369369,0.428571,0.745946,0.733333,0.0
4,0.0,0.130631,0.071429,0.0,1.0,0.0
5,1.0,1.0,1.0,0.767568,0.04,1.0


### Change numerical prediction to categorical

In [118]:
num_to_cat = {0: 'Reject',
              1: 'Approve'}
                      
df_predict_minmax['Predicted_Decision'].replace(num_to_cat, inplace=True)
df_predict_minmax.head(10)

Unnamed: 0,Homeowner,Credit_Score,Years_of_Credit_History,Revolving_Balance,Revolving_Utilization,Predicted_Decision
0,1.0,0.810811,0.5,1.0,0.0,Approve
1,0.0,0.0,0.0,0.081081,1.0,Reject
2,1.0,0.585586,0.642857,0.324324,0.133333,Approve
3,0.0,0.369369,0.428571,0.745946,0.733333,Reject
4,0.0,0.130631,0.071429,0.0,1.0,Reject
5,1.0,1.0,1.0,0.767568,0.04,Approve


## Results

The results suggest that the first, third and last credit applications should be approved, while the three others should be rejected.