# A simple classification test using k-Nearest-Neighbours (kNN)

*Costas Andreopoulos \<c.andreopoulos@cern.ch\>*

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from IPython.display import display, HTML

**Generate synthetic data and split into train and test samples**

In [3]:
# Generate a synthetic classification dataset: 1000 samples with 10 features each.
# The fixed random_state makes the generated data reproducible.
X, Y = make_classification(n_samples=1000, n_features=10, random_state=42)

print(X.shape)
print(Y.shape)

# Hold out 20% of the samples as the test set; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Collect the training features in a DataFrame and append the labels as a 'target' column
df_train = pd.DataFrame(X_train)
df_train['target'] = Y_train
# Render the full training table as HTML inside a scrollable box: the injected CSS rule
# caps the height of elements with class "dataframe" (the default class used by to_html())
display(HTML('<style> .dataframe {max-height: 300px; overflow-y: scroll; display: block; } </style>'))
display(HTML(df_train.to_html()))
# display(df_train) # simpler option if the above doesn't work

(1000, 10)
(1000,)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,-0.063926,-0.221042,0.189654,0.06937,0.564842,1.635798,-0.671868,2.39211,0.874895,0.192597,1
1,-1.884649,0.092055,-1.619239,0.47624,2.441222,0.437127,2.214412,-0.993593,2.234279,2.529928,1
2,0.531307,0.265362,0.371714,-0.582759,0.390465,-0.103222,-0.365368,-0.134279,-0.906845,-2.438817,0
3,0.292668,-0.707552,0.230376,-0.53879,-1.244486,0.449827,-0.279508,-1.125655,-0.415822,-0.720212,1
4,-0.101944,-0.2216,0.250126,-1.440982,-0.346772,-0.585793,-0.911652,1.127686,1.22427,0.958816,0
5,0.385308,0.175211,0.248494,2.985259,-0.2298,-0.851406,-0.200598,-0.31353,-0.726511,0.367482,0
6,-0.395859,-0.132169,-0.596864,0.072279,0.595491,-0.589895,1.24929,-0.778837,-0.369595,-1.500222,1
7,-0.657912,0.532783,-0.57936,-0.805527,-0.70129,-0.875815,0.816094,1.542277,0.733894,-0.215114,1
8,-0.386023,1.181891,-1.191119,0.480908,-1.176173,0.48756,3.078496,-0.528297,-2.350479,-0.202524,1
9,-0.931213,-0.608494,-0.772027,-1.007868,0.158756,1.156007,1.008493,-1.793076,1.195602,0.253737,1


**Scale features**

Feature scaling puts all features on a comparable scale, which is crucial for many machine learning algorithms that rely on distance measurements (e.g., kNN, SVM, K-means clustering): without it, features with large numeric ranges would dominate the distance.

The StandardScaler() transforms features to have:
- Mean = 0
- Standard deviation = 1 (unit variance)

It uses the formula $\displaystyle X_{scaled} = \frac{X-\mu}{\sigma}$, where
- $X$ is the original feature,
- $\mu$ is the mean of the feature, and
- $\sigma$ is the standard deviation of the feature.

This transformation is called Z-score normalization (or standardization).

In [4]:
# Learn the per-feature mean and standard deviation from the training data only
scaler = StandardScaler().fit(X_train)
# Apply the Z-score normalisation to the training data using those statistics
X_train_scaled = scaler.transform(X_train)
# Normalise the test data with the same training-set mean and std (no refitting)
X_test_scaled = scaler.transform(X_test)

**Create and train the kNN classifier**

In [5]:
# Build a kNN classifier that uses the 3 nearest neighbours and fit it on the scaled training set
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, Y_train)
# fit() returns the estimator itself, so ending the cell with it shows the fitted classifier
knn

**Build predictions for the test set and evaluate the accuracy**

In [6]:
# Predict the class of every (scaled) test sample
Y_pred = knn.predict(X_test_scaled)

# Evaluate: fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy * 100:.4f}%")

# Show the test set alongside the true and predicted target values.
# The table holds the original (unscaled) features from X_test.
df_test = pd.DataFrame(X_test)
df_test['true target'] = Y_test
df_test['pred target'] = Y_pred
# Render the full test table as HTML inside a scrollable box: the injected CSS rule
# caps the height of elements with class "dataframe" (the default class used by to_html())
display(HTML('<style> .dataframe {max-height: 300px; overflow-y: scroll; display: block; } </style>'))
display(HTML(df_test.to_html()))
# display(df_test) # simpler option if the above doesn't work

Accuracy: 79.0000%


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,true target,pred target
0,0.740306,0.42549,0.603912,1.197247,1.159074,-0.336255,-0.771684,-0.709441,-0.98265,-1.371674,0,0
1,-1.495778,0.381353,-1.276334,1.365526,1.388665,0.142378,1.73063,0.574042,1.802012,-0.630143,1,1
2,-0.144079,-0.009289,0.023662,0.306981,-0.175378,0.491506,-0.281049,-0.342879,0.652572,1.813408,1,1
3,-0.287854,0.627606,-0.443368,-0.279507,1.320414,-0.874436,0.936993,-0.183854,-0.299303,-0.158305,1,1
4,-0.209246,-0.177596,0.384766,0.655892,0.925975,-0.091076,-1.478354,-0.020099,2.092603,0.963907,0,0
5,-1.26835,0.703492,-1.080516,-0.077821,0.217718,-0.82465,1.462131,-0.361724,1.533758,-0.275951,1,1
6,0.308346,1.084198,0.747883,0.210303,0.223281,-1.198392,-1.837338,-0.009182,1.212428,0.198394,0,0
7,-0.491701,0.906544,0.169073,0.072252,0.357661,0.064506,-1.228893,-1.438411,2.515626,2.057811,0,0
8,0.725886,-1.207022,0.572361,-1.075312,1.276965,0.338023,-0.696216,-0.945507,-1.028164,1.676393,0,0
9,-0.069296,0.113394,0.085278,0.458994,0.409439,-0.276672,-0.360869,-0.103587,0.555308,-0.188202,0,1
