# Credit Risks

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Import Data and Explore

In [2]:
from sklearn.datasets import fetch_openml
dataset = fetch_openml(name='credit-g', version='1')
dataset

{'data': array([[ 0.,  6.,  4., ...,  1.,  1.,  0.],
        [ 1., 48.,  2., ...,  1.,  0.,  0.],
        [ 3., 12.,  4., ...,  2.,  0.,  0.],
        ...,
        [ 3., 12.,  2., ...,  1.,  0.,  0.],
        [ 0., 45.,  2., ...,  1.,  1.,  0.],
        [ 1., 45.,  4., ...,  1.,  0.,  0.]]),
 'target': array(['good', 'bad', 'good', 'good', 'bad', 'good', 'good', 'good',
        'good', 'bad', 'bad', 'bad', 'good', 'bad', 'good', 'bad', 'good',
        'good', 'bad', 'good', 'good', 'good', 'good', 'good', 'good',
        'good', 'good', 'good', 'good', 'bad', 'good', 'good', 'good',
        'good', 'good', 'bad', 'good', 'bad', 'good', 'good', 'good',
        'good', 'good', 'good', 'bad', 'good', 'good', 'good', 'good',
        'good', 'good', 'good', 'good', 'good', 'bad', 'good', 'bad',
        'good', 'good', 'bad', 'good', 'good', 'bad', 'bad', 'good',
        'good', 'good', 'good', 'bad', 'good', 'good', 'good', 'good',
        'good', 'bad', 'good', 'bad', 'good', 'good', 'good

In [3]:
cr = pd.DataFrame(dataset.data)
cr.columns = dataset.feature_names
target = pd.Series(dataset.target)
target.columns = ['class']

In [4]:
cr.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,0.0,6.0,4.0,3.0,1169.0,4.0,4.0,4.0,2.0,0.0,4.0,0.0,67.0,2.0,1.0,2.0,2.0,1.0,1.0,0.0
1,1.0,48.0,2.0,3.0,5951.0,0.0,2.0,2.0,1.0,0.0,2.0,0.0,22.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0
2,3.0,12.0,4.0,6.0,2096.0,0.0,3.0,2.0,2.0,0.0,3.0,0.0,49.0,2.0,1.0,1.0,1.0,2.0,0.0,0.0
3,0.0,42.0,2.0,2.0,7882.0,0.0,3.0,2.0,2.0,2.0,4.0,1.0,45.0,2.0,2.0,1.0,2.0,2.0,0.0,0.0
4,0.0,24.0,3.0,0.0,4870.0,0.0,2.0,3.0,2.0,0.0,4.0,3.0,53.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0


In [5]:
cr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
checking_status           1000 non-null float64
duration                  1000 non-null float64
credit_history            1000 non-null float64
purpose                   1000 non-null float64
credit_amount             1000 non-null float64
savings_status            1000 non-null float64
employment                1000 non-null float64
installment_commitment    1000 non-null float64
personal_status           1000 non-null float64
other_parties             1000 non-null float64
residence_since           1000 non-null float64
property_magnitude        1000 non-null float64
age                       1000 non-null float64
other_payment_plans       1000 non-null float64
housing                   1000 non-null float64
existing_credits          1000 non-null float64
job                       1000 non-null float64
num_dependents            1000 non-null float64
own_telephone             1000

In [6]:
# Check for missing values
cr.isnull().sum(axis = 0)

checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
age                       0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
dtype: int64

### Preprocessing

In [7]:
# Data already encoded from openml

In [8]:
# Scaling
# All data is categorical in nature with the exception on credit amount
x = cr['credit_amount'].values.reshape(-1, 1)
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
cr['credit_amount'] = x_scaled

In [9]:
# Split the data, 80% training, 10% test, 10% validation
X = cr
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=1)

### KNN Model

In [10]:
# Find the optimal value of K for KNN
k_range = range(1, 15)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

print(k_scores)
print('best k:',(np.asarray(k_scores,dtype=float)).argmax()-1)

[0.692, 0.617, 0.695, 0.6759999999999999, 0.7100000000000001, 0.6829999999999999, 0.713, 0.704, 0.709, 0.712, 0.717, 0.71, 0.7170000000000001, 0.7190000000000001]
best k: 12


In [11]:
# Using k=12
model = KNeighborsClassifier(n_neighbors=12).fit(X_train, y_train)
score = model.score(X_test, y_test)
print('score ',score)
y_pred=model.predict(X_test)
print('accuracy_score: '+ str(accuracy_score(y_test, y_pred)))

y_pred = model.predict(X_test)
conf = confusion_matrix(y_test, y_pred)
print(conf)

score  0.725
accuracy_score: 0.725
[[ 42  86]
 [ 24 248]]
