# CPSC 4970 AI + ML: Module 4 -- K nearest neighbors classifier

New stuff:
- [LabelEncoder](https://scikit-learn.org/stable/modules/preprocessing_targets.html#preprocessing-targets)
- [KNeighborsClassifier](https://scikit-learn.org/stable/modules/neighbors.html#classification)
- [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from IPython.display import display
import pandas as pd

# Data from https://www.openml.org/d/31
# Data originally from https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
# but the openml version is cleaned up a bit.
df = pd.read_csv('data/dataset_31_credit-g.csv')
display(df.info())
display(df.describe())
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   checking_status         1000 non-null   object
 1   duration                1000 non-null   int64 
 2   credit_history          1000 non-null   object
 3   purpose                 1000 non-null   object
 4   credit_amount           1000 non-null   int64 
 5   savings_status          1000 non-null   object
 6   employment              1000 non-null   object
 7   installment_commitment  1000 non-null   int64 
 8   personal_status         1000 non-null   object
 9   other_parties           1000 non-null   object
 10  residence_since         1000 non-null   int64 
 11  property_magnitude      1000 non-null   object
 12  age                     1000 non-null   int64 
 13  other_payment_plans     1000 non-null   object
 14  housing                 1000 non-null   object
 15  exist

None

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,35.546,1.407,1.155
std,12.058814,2822.736876,1.118715,1.103718,11.375469,0.577654,0.362086
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',radio/tv,1169,'no known savings','>=7',4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',radio/tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',furniture/equipment,7882,'<100','4<=X<7',2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously','new car',4870,'<100','1<=X<4',3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,'no checking',12,'existing paid',furniture/equipment,1736,'<100','4<=X<7',3,'female div/dep/mar',none,...,'real estate',31,none,own,1,'unskilled resident',1,none,yes,good
996,'<0',30,'existing paid','used car',3857,'<100','1<=X<4',4,'male div/sep',none,...,'life insurance',40,none,own,1,'high qualif/self emp/mgmt',1,yes,yes,good
997,'no checking',12,'existing paid',radio/tv,804,'<100','>=7',4,'male single',none,...,car,38,none,own,1,skilled,1,none,yes,good
998,'<0',45,'existing paid',radio/tv,1845,'<100','1<=X<4',4,'male single',none,...,'no known property',23,none,'for free',1,skilled,1,yes,yes,bad


In [29]:
ct = ColumnTransformer(
    [('one-hot', OneHotEncoder(), ['checking_status', 'credit_history', 'purpose',
                                  'savings_status', 'employment', 'personal_status',
                                  'other_parties', 'property_magnitude', 'other_payment_plans',
                                  'housing', 'job', 'own_telephone', 'foreign_worker'])],
    remainder='passthrough'
)

X = df.iloc[:, :-1]
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_transformed = ct.fit_transform(X_train)
X_test_transformed = ct.transform(X_test)

label_encoder = LabelEncoder()
y_train_transformed = label_encoder.fit_transform(y_train)
display(label_encoder.classes_)
y_test_transformed = label_encoder.transform(y_test)

array(['bad', 'good'], dtype=object)

# KNN without resampling

In [30]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_transformed, y_train_transformed)

print("Percentage in positive class: ", sum(y_train_transformed)/len(y_train_transformed))
print("Training accuracy: ", knn.score(X_train_transformed, y_train_transformed))
print("Testing accuracy: ", knn.score(X_test_transformed, y_test_transformed))

Percentage in positive class:  0.696
Training accuracy:  0.7306666666666667
Testing accuracy:  0.692


# KNN with SMOTE

In [31]:
from imblearn.over_sampling import SMOTE

oversampler = SMOTE()
X_train_transformed, y_train_transformed = oversampler.fit_resample(X_train_transformed, y_train_transformed)

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_transformed, y_train_transformed)

print("Percentage in positive class: ", sum(y_train_transformed)/len(y_train_transformed))
print("Training accuracy: ", knn.score(X_train_transformed, y_train_transformed))
print("Testing accuracy: ", knn.score(X_test_transformed, y_test_transformed))

Percentage in positive class:  0.5
Training accuracy:  0.7452107279693486
Testing accuracy:  0.564
