In [289]:
# K-Nearest Neighbors (KNN) - Fake bills classification 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [290]:
df = pd.read_csv("datasets/fake_bills.csv", sep=';')
df

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,True,171.81,104.86,104.95,4.52,2.89,112.83
1,True,171.46,103.36,103.66,3.77,2.99,113.09
2,True,172.69,104.48,103.50,4.40,2.94,113.16
3,True,171.36,103.91,103.94,3.62,3.01,113.51
4,True,171.73,104.28,103.46,4.04,3.48,112.54
...,...,...,...,...,...,...,...
1495,False,171.75,104.38,104.17,4.42,3.09,111.28
1496,False,172.19,104.63,104.44,5.27,3.37,110.97
1497,False,171.80,104.01,104.12,5.51,3.36,111.95
1498,False,172.06,104.28,104.06,5.17,3.46,112.25


In [291]:
# Count null or nan values
# Found 37 null values in margin_low feature
df.isnull().sum()

is_genuine       0
diagonal         0
height_left      0
height_right     0
margin_low      37
margin_up        0
length           0
dtype: int64

In [292]:
df.margin_low

0       4.52
1       3.77
2       4.40
3       3.62
4       4.04
        ... 
1495    4.42
1496    5.27
1497    5.51
1498    5.17
1499    4.63
Name: margin_low, Length: 1500, dtype: float64

In [293]:
# Calculate mean of the margin_low feature then 
margin_mean = df.margin_low.mean()
margin_mean

np.float64(4.485967190704033)

In [None]:
# Eliminate nulls using mean method 
# replace existing values with mean
df.margin_low = df.margin_low.fillna(margin_mean)

In [295]:
df.isnull().sum()

is_genuine      0
diagonal        0
height_left     0
height_right    0
margin_low      0
margin_up       0
length          0
dtype: int64

In [296]:
# Count True and False classes
class_counts = df.is_genuine.value_counts()
class_counts

is_genuine
True     1000
False     500
Name: count, dtype: int64

In [297]:
# majority and minority class count
major_class = class_counts.max()
minor_class = class_counts.min()
print(f'major class : {major_class}')
print(f'minor class : {minor_class}')

major class : 1000
minor class : 500


In [298]:
# Calculate the Imbalance ratio of the majority and minority classes
imbalance_ratio = major_class/minor_class
print(f'Class imbalance ratio = {int(imbalance_ratio)}:1')

Class imbalance ratio = 2:1


In [299]:
# Check duplicate values
dupli = df.duplicated().sum()
dupli

np.int64(0)

In [300]:
# Tranform True and False classes into 1 and 0 through LabelEncoder
lbl_enc = LabelEncoder()
df.is_genuine = lbl_enc.fit_transform(df['is_genuine'])
df.is_genuine.head()

0    1
1    1
2    1
3    1
4    1
Name: is_genuine, dtype: int64

In [301]:
df.head()

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,1,171.81,104.86,104.95,4.52,2.89,112.83
1,1,171.46,103.36,103.66,3.77,2.99,113.09
2,1,172.69,104.48,103.5,4.4,2.94,113.16
3,1,171.36,103.91,103.94,3.62,3.01,113.51
4,1,171.73,104.28,103.46,4.04,3.48,112.54


In [302]:
df.columns

Index(['is_genuine', 'diagonal', 'height_left', 'height_right', 'margin_low',
       'margin_up', 'length'],
      dtype='object')

In [303]:
x = df.iloc[:, 1:]
x.head()

Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length
0,171.81,104.86,104.95,4.52,2.89,112.83
1,171.46,103.36,103.66,3.77,2.99,113.09
2,172.69,104.48,103.5,4.4,2.94,113.16
3,171.36,103.91,103.94,3.62,3.01,113.51
4,171.73,104.28,103.46,4.04,3.48,112.54


In [304]:
y = df.is_genuine

In [305]:
# Train and Test split
xTrain, xTest, yTrain, yTest = train_test_split(x, y, 
                                                test_size=0.30, 
                                                random_state=2)


In [306]:
xTrain.shape

(1050, 6)

In [307]:
xTest.shape

(450, 6)

In [308]:
# Define object for KNN Class
knn_model = KNeighborsClassifier(n_neighbors=5)

In [309]:
# Training 
training = knn_model.fit(xTrain, yTrain)
training

In [310]:
# Testing
testing = knn_model.fit(xTest, yTest)
testing

In [311]:
# R Squared score of Training
score_training = knn_model.score(xTrain, yTrain)
score_training

0.9885714285714285

In [312]:
# R Squared score of Testing
score_testing = knn_model.score(xTest, yTest)
score_testing

0.9933333333333333

In [313]:
xTrain.head()

Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length
1480,171.64,103.92,104.66,5.71,3.38,112.33
399,171.52,104.08,104.06,4.41,3.36,113.32
1122,172.09,104.15,104.17,4.15,3.4,113.85
279,172.29,103.73,103.55,3.84,3.23,113.9
301,171.46,103.83,103.95,4.21,3.15,113.59


In [314]:
# Normalization for max Training valued features
normalize = MinMaxScaler(feature_range=(0,1))
xTrainNorm = normalize.fit_transform(xTrain)
pd.DataFrame(xTrainNorm)


Unnamed: 0,0,1,2,3,4,5
0,0.304569,0.448276,0.863850,0.685185,0.607407,0.587992
1,0.243655,0.540230,0.582160,0.341270,0.592593,0.792961
2,0.532995,0.580460,0.633803,0.272487,0.622222,0.902692
3,0.634518,0.339080,0.342723,0.190476,0.496296,0.913043
4,0.213198,0.396552,0.530516,0.288360,0.437037,0.848861
...,...,...,...,...,...,...
1045,0.477157,0.362069,0.511737,0.431217,0.362963,0.805383
1046,0.324873,0.362069,0.469484,0.296296,0.222222,0.850932
1047,0.766497,0.477011,0.572770,0.298942,0.577778,0.801242
1048,0.446701,0.373563,0.244131,0.190476,0.177778,0.726708


In [315]:
# Training with normalized features
training_norm = knn_model.fit(xTrainNorm, yTrain)
training_norm

In [316]:
# R squared score for nomalized training
score_training_norm = knn_model.score(xTrainNorm, yTrain)
score_training_norm

0.9885714285714285

In [317]:
# Normalization for max Testing valued features
xTestNorm = normalize.fit_transform(xTest)
pd.DataFrame(xTestNorm)

Unnamed: 0,0,1,2,3,4,5
0,0.405405,0.300699,0.311475,0.339888,0.526667,0.800948
1,0.389189,0.566434,0.442623,0.789326,0.733333,0.255924
2,0.772973,0.734266,0.579235,0.533708,0.706667,0.109005
3,0.508108,0.657343,0.393443,0.308989,0.446667,0.677725
4,0.302703,0.699301,0.584699,0.356742,0.680000,0.753555
...,...,...,...,...,...,...
445,0.432432,0.769231,0.797814,0.926966,0.486667,0.199052
446,0.648649,0.566434,0.448087,0.205056,0.493333,0.665877
447,0.362162,0.734266,0.628415,0.162921,0.586667,0.590047
448,0.600000,0.559441,0.491803,0.275281,0.400000,0.511848


In [318]:
# Testing with normalized features
testing_norm = knn_model.fit(xTestNorm, yTest)
testing_norm

In [319]:
# R squared score for nomalized testing
score_testing_norm = knn_model.score(xTestNorm, yTest)
score_testing_norm

0.9977777777777778

In [321]:
# KNN Model evaluation report
print(f'Training Score: {score_training}')
print(f'Test Score : {score_testing}')
print(f'Training Score (Normalize) : {score_training_norm}')
print(f'Testing Score (Normalize) : {score_testing_norm}')

Training Score: 0.9885714285714285
Test Score : 0.9933333333333333
Training Score (Normalize) : 0.9885714285714285
Testing Score (Normalize) : 0.9977777777777778


In [None]:
test_in = input()