In [50]:
# Importing Libraries

import pandas as pd
import sklearn.model_selection as sk_ms
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as sk_m
import numpy as np
import matplotlib.pyplot as plt

In [51]:
# Load the dataset from SteelPlateFaults-2class.csv into a DataFrame and
# print it (pandas abbreviates long frames when printing).

faults_df = pd.read_csv(filepath_or_buffer="SteelPlateFaults-2class.csv")
print(faults_df)

      X_Minimum  X_Maximum  Y_Minimum  Y_Maximum  Pixels_Areas  X_Perimeter  \
0          1325       1339      30207      30238           268           29   
1             1         16      55572      55629           370           48   
2          1323       1333      68445      68506           330           48   
3          1324       1333      75642      75681           207           25   
4          1324       1335      97132      97213           594           55   
...         ...        ...        ...        ...           ...          ...   
1114        250        373    3629947    3630060          6114          320   
1115        243        370    3658370    3658511          7639          462   
1116        241        360    3711661    3711800          7080          492   
1117        836        878    2150529    2150756          5390          297   
1118        414        438    2887460    2887476           263           29   

      Y_Perimeter  Sum_of_Luminosity  Minimum_of_Lu

In [52]:
# Hold out 30% of the rows as a test set (shuffled, seeded for repeatability).
# Features are every column except the last one; the target is the "Class" column.
# NOTE(review): this presumes "Class" is the last column of the file - confirm.

[X_train, X_test, X_label_train, X_label_test] = sk_ms.train_test_split(
    faults_df.iloc[:, :-1],
    faults_df["Class"],
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Show each of the four pieces, followed by a blank line
for split_part in (X_train, X_test, X_label_train, X_label_test):
    print(split_part)
    print()

# Write the feature splits to disk (the label splits are not saved;
# they stay available only as the in-memory X_label_* variables).

X_train.to_csv("SteelPlateFaults-train.csv", index=False)
X_test.to_csv("SteelPlateFaults-test.csv", index=False)

      X_Minimum  X_Maximum  Y_Minimum  Y_Maximum  Pixels_Areas  X_Perimeter  \
1114        250        373    3629947    3630060          6114          320   
1009         37        217    1825287    1825355          6605          310   
993          41        213    1637218    1637282          6206          257   
254        1202       1216     289529     289543           137           15   
820          43        215    1253069    1253240         13975          795   
...         ...        ...        ...        ...           ...          ...   
466         535        578     149538     149552           265           74   
121         357        373     929482     929513           213           42   
1044         41        213    2290146    2290212          5753          291   
1095         86        215    7325485    7325536          4346          163   
860          41        190    1799909    1800071         11388          705   

      Y_Perimeter  Sum_of_Luminosity  Minimum_of_Lu

In [53]:
# 1-nearest-neighbour classifier on the un-normalised features:
# fit on the train split, predict the test split, then report the
# confusion matrix and the accuracy.

knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, X_label_train)
knn_predict = knn.predict(X_test)

conf_mat = sk_m.confusion_matrix(X_label_test, knn_predict)
acc = sk_m.accuracy_score(X_label_test, knn_predict)

print(
    "Confusion Matrix for k = 1 is:",
    conf_mat,
    "",
    "Accuracy Score is:",
    acc,
    sep="\n",
)

Confusion Matrix for k = 1 is:
[[ 81  27]
 [ 27 201]]

Accuracy Score is:
0.8392857142857143


In [54]:
# Accuracy per k value, seeded with the k = 1 accuracy held in `acc`

acc_score = dict([(1, acc)])
print(acc_score)

{1: 0.8392857142857143}


In [55]:
# Repeat the fit / predict / score cycle for k = 3 and k = 5 on the
# un-normalised features, recording each accuracy in acc_score.

for k in (3, 5):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, X_label_train)
    knn_predict = knn.predict(X_test)

    conf_mat = sk_m.confusion_matrix(X_label_test, knn_predict)
    acc = sk_m.accuracy_score(X_label_test, knn_predict)
    acc_score[k] = acc

    # One report per k, closed by a dashed separator and a blank line
    print(
        f"Confusion Matrix for k ={k} is:",
        conf_mat,
        "",
        "Accuracy Score is:",
        acc,
        "",
        "------",
        "",
        sep="\n",
    )


Confusion Matrix for k =3 is:
[[ 83  25]
 [ 12 216]]

Accuracy Score is:
0.8898809523809523

------

Confusion Matrix for k =5 is:
[[ 82  26]
 [  9 219]]

Accuracy Score is:
0.8958333333333334

------



In [56]:
# Report the k with the highest recorded accuracy.
# The search starts from (k=0, accuracy=0) and only replaces the current
# best on a strictly higher accuracy, so the first k wins on ties.

print(acc_score, end="\n\n")

key1, val1 = 0, 0
for k, score in acc_score.items():
    if score > val1:
        key1, val1 = k, score

print("Max accuracy for k =", key1)


{1: 0.8392857142857143, 3: 0.8898809523809523, 5: 0.8958333333333334}

Max accuracy for k = 5


In [57]:
# Read the saved train and test feature files back into their own
# DataFrames and preview the first rows of each.

faults_train, faults_test = (
    pd.read_csv(csv_name)
    for csv_name in ('SteelPlateFaults-train.csv', 'SteelPlateFaults-test.csv')
)

print(faults_train.head(), faults_test.head(), sep="\n")

   X_Minimum  X_Maximum  Y_Minimum  Y_Maximum  Pixels_Areas  X_Perimeter  \
0        250        373    3629947    3630060          6114          320   
1         37        217    1825287    1825355          6605          310   
2         41        213    1637218    1637282          6206          257   
3       1202       1216     289529     289543           137           15   
4         43        215    1253069    1253240         13975          795   

   Y_Perimeter  Sum_of_Luminosity  Minimum_of_Luminosity  \
0          197             633741                     40   
1          151             684731                     38   
2          128             641502                     39   
3           15              16692                    109   
4          441            1630309                     29   

   Maximum_of_Luminosity  ...  Outside_X_Index  Edges_X_Index  Edges_Y_Index  \
0                    134  ...           0.0738         0.3844         0.5736   
1                    1

In [62]:
# Min-Max Normalization in range 0-1 for train and test data
#
# Each attribute is rescaled with the minimum and maximum taken from the
# TRAIN split only; the same parameters are then applied to the test split.
# Test values lying outside the train range therefore map outside
# [normal_min, normal_max].

normalized_train = pd.DataFrame()
normalized_test = pd.DataFrame()
normal_max = 1
normal_min = 0
normal_range = normal_max - normal_min

for attribute in faults_train:

    # NOTE(review): the split cell refers to the label column as "Class"
    # (capital C), so this lower-case check does not match it. It looks like
    # the label column is absent from the saved CSVs anyway - confirm.
    if attribute == 'class':
        normalized_train[attribute] = faults_train[attribute]
        continue

    attribute_max = faults_train[attribute].max()
    attribute_min = faults_train[attribute].min()
    attribute_range = attribute_max - attribute_min

    # A constant train column has zero range; dividing by it would fill the
    # column with NaN/inf. Use a zero scale so every value maps to normal_min.
    if attribute_range == 0:
        scale = 0.0
    else:
        scale = normal_range / attribute_range

    normalized_train[attribute] = ((faults_train[attribute] - attribute_min) * scale) + normal_min
    normalized_test[attribute] = ((faults_test[attribute] - attribute_min) * scale) + normal_min

normalized_train.to_csv("SteelPlateFaults-train-Normalised.csv", index = False)
print(normalized_train.head())
print()
# Fixed: the test file used to be written from normalized_train, so it
# contained a copy of the train data instead of the normalised test data.
normalized_test.to_csv("SteelPlateFaults-test-normalised.csv", index = False)
print(normalized_test.head())

   X_Minimum  X_Maximum  Y_Minimum  Y_Maximum  Pixels_Areas  X_Perimeter  \
0   0.146628   0.215916   0.291950   0.291958      0.163720     0.249804   
1   0.021701   0.124634   0.146524   0.146528      0.176872     0.241948   
2   0.024047   0.122294   0.131369   0.131373      0.166185     0.200314   
3   0.704985   0.709187   0.022767   0.022767      0.003616     0.010212   
4   0.025220   0.123464   0.100413   0.100425      0.374290     0.622938   

   Y_Perimeter  Sum_of_Luminosity  Minimum_of_Luminosity  \
0     0.217295           0.161689               0.204082   
1     0.166297           0.174703               0.193878   
2     0.140798           0.163670               0.198980   
3     0.015521           0.004197               0.556122   
4     0.487805           0.416048               0.147959   

   Maximum_of_Luminosity  ...  Outside_X_Index  Edges_X_Index  Edges_Y_Index  \
0               0.449074  ...         0.082685       0.332176       0.519658   
1               0.4120

In [64]:
# 1-nearest-neighbour classifier on the min-max normalised features.
# The labels are the in-memory X_label_* splits from the earlier split cell.

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(normalized_train, X_label_train)
knn_predict = knn.predict(normalized_test)

conf_mat = sk_m.confusion_matrix(X_label_test, knn_predict)
acc = sk_m.accuracy_score(X_label_test, knn_predict)

report_lines = ["Confusion Matrix for k = 1 is:", conf_mat, "", "Accuracy Score is:", acc]
print(*report_lines, sep="\n")

Confusion Matrix for k = 1 is:
[[104   4]
 [  9 219]]

Accuracy Score is:
0.9613095238095238


In [65]:
# Start a fresh accuracy-per-k table (replacing any earlier acc_score)
# with the k = 1 accuracy currently held in `acc`

acc_score = {}
acc_score[1] = acc
print(acc_score)

{1: 0.9613095238095238}


In [69]:
# Repeat the fit / predict / score cycle for k = 3 and k = 5 on the
# normalised features, recording each accuracy in acc_score.

for k in (3, 5):
    knn = KNeighborsClassifier(n_neighbors=k).fit(normalized_train, X_label_train)
    knn_predict = knn.predict(normalized_test)

    conf_mat = sk_m.confusion_matrix(X_label_test, knn_predict)
    acc = sk_m.accuracy_score(X_label_test, knn_predict)
    acc_score[k] = acc

    # One report per k, closed by a dashed separator and a blank line
    print(
        f"Confusion Matrix for k ={k} is:",
        conf_mat,
        "",
        "Accuracy Score is:",
        acc,
        "",
        "------",
        "",
        sep="\n",
    )


Confusion Matrix for k =3 is:
[[105   3]
 [  7 221]]

Accuracy Score is:
0.9702380952380952

------

Confusion Matrix for k =5 is:
[[104   4]
 [  7 221]]

Accuracy Score is:
0.9672619047619048

------



In [70]:
# Report the k with the highest recorded accuracy on the normalised data.
# The search starts from (k=0, accuracy=0); a candidate replaces the current
# best only when its accuracy is strictly higher, so the first k wins on ties.

print(acc_score, end="\n\n")

key1, val1 = 0, 0
for k in acc_score:
    candidate = acc_score[k]
    if candidate > val1:
        key1, val1 = k, candidate

print("Max accuracy for k =", key1)

{1: 0.9613095238095238, 3: 0.9702380952380952, 5: 0.9672619047619048}

Max accuracy for k = 3
