# Bank Note Authentication using K-Means Clustering

In [2]:
import pandas as pd

#### Using iframe render so that plots show up in both Jupyter Lab and nbviewer

In [5]:
import plotly.io as pio
pio.renderers.default = "iframe"

#### Reading the bank note data file downloaded from opneml

In [8]:
data = pd.read_csv("bank_note_dataset.csv")
display(data)

Unnamed: 0,V1,V2,V3,V4,Class
0,3.62160,8.66610,-2.8073,-0.44699,1
1,4.54590,8.16740,-2.4586,-1.46210,1
2,3.86600,-2.63830,1.9242,0.10645,1
3,3.45660,9.52280,-4.0112,-3.59440,1
4,0.32924,-4.45520,4.5718,-0.98880,1
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,2
1368,-1.38870,-4.87730,6.4774,0.34179,2
1369,-3.75030,-13.45860,17.5932,-2.77710,2
1370,-3.56370,-8.38270,12.3930,-1.28230,2


#### Describing 4 features from the dataset extracted from the bank note images using a Wavelet Transform tool.
##### V1. variance of Wavelet Transformed image (continuous)
##### V2. skewness of Wavelet Transformed image (continuous)
##### V3. curtosis of Wavelet Transformed image (continuous)
##### V4. entropy of image (continuous)

In [11]:
data[["V1","V2","V3","V4"]].describe()

Unnamed: 0,V1,V2,V3,V4
count,1372.0,1372.0,1372.0,1372.0
mean,0.433735,1.922353,1.397627,-1.191657
std,2.842763,5.869047,4.31003,2.101013
min,-7.0421,-13.7731,-5.2861,-8.5482
25%,-1.773,-1.7082,-1.574975,-2.41345
50%,0.49618,2.31965,0.61663,-0.58665
75%,2.821475,6.814625,3.17925,0.39481
max,6.8248,12.9516,17.9274,2.4495


#### Printing correlation matrix to find significant features.
##### From the below result we can clearly see the V1 and V2 columns has higher correlation, thus have higher significance on determining the Class of banknote

In [14]:
data.corr()

Unnamed: 0,V1,V2,V3,V4,Class
V1,1.0,0.264026,-0.38085,0.276817,-0.724843
V2,0.264026,1.0,-0.786895,-0.526321,-0.444688
V3,-0.38085,-0.786895,1.0,0.318841,0.155883
V4,0.276817,-0.526321,0.318841,1.0,-0.023424
Class,-0.724843,-0.444688,0.155883,-0.023424,1.0


#### Selecting V1 and V2 columns for further analysis

In [17]:
significant_feature_data = data[["V1","V2"]]

#### Plotting V2 against V1

In [20]:
import plotly.express as px
fig = px.scatter(significant_feature_data,x="V1",y="V2",width=1000,height=1000)
fig.update_layout(title="Bank Note Authentication Dataset")
fig.show()

#### Normalizing the data to remove measurement scale bias

In [23]:
from sklearn.preprocessing import StandardScaler

normalized_data = pd.DataFrame(StandardScaler().fit_transform(significant_feature_data),columns=significant_feature_data.columns)
display(normalized_data)

Unnamed: 0,V1,V2
0,1.121806,1.149455
1,1.447066,1.064453
2,1.207810,-0.777352
3,1.063742,1.295478
4,-0.036772,-1.087038
...,...,...
1367,-0.009711,-0.097693
1368,-0.641313,-1.158984
1369,-1.472357,-2.621646
1370,-1.406693,-1.756471


#### Post Normalization the data have 0 Mean and 1 Standard Deviation

In [26]:
normalized_data.mean()

V1   -8.286213e-17
V2   -4.143106e-17
dtype: float64

In [28]:
normalized_data.std()

V1    1.000365
V2    1.000365
dtype: float64

In [30]:
from sklearn.cluster import KMeans
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import accuracy_score

#### K-Means clustering results in 0,1 cluster label but the real data has 1,2 class label. Thus a mapping dict to map labels to proper class.

In [33]:
openml_map_dict = {0:1,1:2}

#### Ran the K-Means Model accross 100 random states to find the best random state with highest accuracy

In [36]:
accuracy_data = list()
for i in range(100):
    kmeans_result = KMeans(n_clusters=2,n_init=12,random_state=i).fit(normalized_data)
    accuracy_data.append(accuracy_score(data["Class"],list(map(lambda x: openml_map_dict[x],kmeans_result.labels_))))

In [37]:
max_accuracy = max(accuracy_data)
best_random_state = accuracy_data.index(max_accuracy)
print(f"Max Accuracy is {max_accuracy} at random state {best_random_state}")

Max Accuracy is 0.8782798833819242 at random state 0


#### Ran the K-Means Model for 10 more times with the best random state identified at previous step to check the stability of the model
##### The results are exactly same providing exact same accuracy everytime. Thus we can safely say that the model is stable.

In [39]:
fig = make_subplots(2,5,shared_xaxes=True,shared_yaxes=True,x_title="V1 (Normalized)",y_title="V2 (Normalized)")
for i in range(2):
    for j in range(5):
        kmeans_result = KMeans(n_clusters=2,n_init=12,random_state=best_random_state).fit(normalized_data)
        fig.add_trace(go.Scatter(x=normalized_data["V1"],y=normalized_data["V2"],mode="markers",marker_color=kmeans_result.labels_,name=f"Iteration {i*5+j+1}"),row=i+1,col=j+1)
        fig.add_trace(go.Scatter(x=kmeans_result.cluster_centers_[:,0],y=kmeans_result.cluster_centers_[:,1],mode="markers",marker_symbol="star",marker_size=20,marker_color="red",showlegend=False),row=i+1,col=j+1)
        fig.update_xaxes(title=f"Accuracy Score = {accuracy_score(data['Class'],list(map(lambda x: openml_map_dict[x],kmeans_result.labels_)))}",row=i+1,col=j+1, title_standoff=50)
fig.update_layout(height=1000,legend_title_text="KMeans Iterations",title="Running KMeans Clustering mutiple times for stability check")
fig.show()

#### Performing K-Means Clustering to identify the 2 clusters of Genuine and Forged Bank Note

In [44]:
kmeans_result_final = KMeans(n_clusters=2,random_state=best_random_state,n_init=12).fit(normalized_data)
kmeans_data_final = normalized_data.copy()
kmeans_data_final["lable"] = kmeans_result_final.labels_

fig = go.Figure()
fig.update_layout(width=1200,height=1000,xaxis_title="V1 (Normalized)",yaxis_title="V2 (Normalized)",title="KMeans Clustering on Bank Note Authentication Dataset")
for label in kmeans_data_final["lable"].unique():
    current_label_data = kmeans_data_final[kmeans_data_final["lable"] == label]
    fig.add_trace(go.Scatter(x=current_label_data["V1"],y=current_label_data["V2"],mode="markers",name=f"Cluster {label}"))
fig.add_trace(go.Scatter(x=kmeans_result_final.cluster_centers_[:,0],y=kmeans_result_final.cluster_centers_[:,1],mode="markers",marker_symbol="star",marker_size=20,marker_color="black",name="KMeans Cluster Centroids"))
fig.show()

#### Finding the accuracy of the model
##### Seeing the result we can confidently conclude that the model can authenticate bank notes with ≈ 88% accuracy

In [47]:
print(accuracy_score(data["Class"],list(map(lambda x: openml_map_dict[x],kmeans_result_final.labels_))))

0.8782798833819242


In [49]:
data_with_accuracy = significant_feature_data.copy()
data_with_accuracy["accurate"] = [True if openml_map_dict[kmeans_data_final.at[i,"lable"]] == data.at[i,"Class"] else False for i in range(kmeans_data_final.shape[0])]
display(data_with_accuracy)

Unnamed: 0,V1,V2,accurate
0,3.62160,8.66610,True
1,4.54590,8.16740,True
2,3.86600,-2.63830,True
3,3.45660,9.52280,True
4,0.32924,-4.45520,False
...,...,...,...
1367,0.40614,1.34920,False
1368,-1.38870,-4.87730,True
1369,-3.75030,-13.45860,True
1370,-3.56370,-8.38270,True


#### Pie Chart of the Bank Note Correctly Identified or not data

In [52]:
accurate_val_counts = data_with_accuracy["accurate"].value_counts()
fig = px.pie(values = accurate_val_counts, names=accurate_val_counts.index,title="Bank Note Correctly Identified")
fig.update_layout(width = 500)
fig.show()

#### Plotting the data set with accuracy lables for data point on whether the K-Means model was able to correctly cluster that data point or not
##### The plot gives us a pretty good idea of the confidence level of the model. We can clearly see that except a few point on the borderline, most data points are correctly clustered

In [55]:
fig = px.scatter(data_with_accuracy,x="V1",y="V2",color="accurate",width=1000,height=1000)
fig.update_layout(legend_title_text="Correctly Clustered",title="Clustering Accuracy Plot")
fig.show()

#### Final Recommendation -
The outcome of the project is very positive. I achieved an accuracy score of ≈ 88% which will help in identifying the forged notes more easily. Also, the model is very stable and thus an ideal candidate for a finance-related sensitive environment. The KMmeans model was able to clearly identify all forge dnotese having less variance. It struggled only a bit for forged bank notes having moderate variance. Thus my recommendation will be to use this model as a first level filter for identifying forged bank notes. Thoughfurther validationg will still be required for some bank notes with moderate variance. But still, it can automate almost 90% of the process.