# Classification using K-Nearest Neighbors


## Importing libraries:

In [24]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error, mean_absolute_error
    

## Data preprocessing:

In [26]:
# Load the raw dataset; the file has no header row, so columns are named below
data = pd.read_csv("magic04.data", header=None)

# Name the ten feature columns and the trailing label column
data.columns = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long',
                'fM3Trans', 'fAlpha', 'fDist', 'class']

# Encode class labels as integers ('g' -> 1, 'h' -> 0).
# Series.map is used instead of Series.replace: replacing strings with ints relies on
# pandas silently downcasting the object column to int, which is deprecated and emits a
# FutureWarning. map() turns any unmapped label into NaN, so fail loudly before casting
# rather than letting a bad label leak into the model.
class_codes = data['class'].map({'g': 1, 'h': 0})
if class_codes.isna().any():
    unexpected = sorted(data.loc[class_codes.isna(), 'class'].astype(str).unique())
    raise ValueError(f"Unexpected class labels in dataset: {unexpected}")
data['class'] = class_codes.astype('int64')

# Balance the dataset by down-sampling gammas to the number of hadron samples
num_hadrons = int((data['class'] == 0).sum())
gamma_samples = data[data['class'] == 1].sample(n=num_hadrons, random_state=42)
hadron_samples = data[data['class'] == 0]

# Merge and shuffle the balanced dataset (fixed seed keeps the notebook reproducible)
data_balanced = pd.concat([gamma_samples, hadron_samples]).sample(frac=1, random_state=42).reset_index(drop=True)

# Sanity check: both classes must be equally represented after balancing
assert (data_balanced['class'] == 1).sum() == (data_balanced['class'] == 0).sum()


  data['class'] = data['class'].replace({'g': 1, 'h': 0})


## Splitting and standardizing the data:

In [28]:
# The label is the 'class' column; every remaining column is a feature
y = data_balanced['class']
X = data_balanced.drop(columns=['class'])

# Hold out 30% of the rows; the remaining 70% form the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Halve the hold-out set into validation and test partitions
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Learn the scaling statistics from the training partition only, then apply the
# same fitted scaler to all three partitions
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(partition) for partition in (X_train, X_val, X_test)
)


## K-NN evaluation:

In [30]:
# Sweep k = 1..20 and score each fitted K-NN model on the validation set
k_range = range(1, 21)
performance_metrics = []

for k in k_range:
    # fit() returns the estimator itself, so construction and fitting can be chained
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predictions = model.predict(X_val)

    # One record per k; the keys are looked up later when choosing the best k
    performance_metrics.append(
        dict(
            k=k,
            accuracy=accuracy_score(y_val, predictions),
            precision=precision_score(y_val, predictions),
            recall=recall_score(y_val, predictions),
            f1_score=f1_score(y_val, predictions),
        )
    )


## Selecting the best k:

In [32]:
# Choose the k with the highest validation F1-score.
# max() returns the first maximal entry, so ties go to the earliest record in the list.
best_k = max(performance_metrics, key=lambda entry: entry["f1_score"])["k"]
print(f"Optimal k value based on F1-score: {best_k}")


Optimal k value based on F1-score: 17


## Training and evaluating K-NN:

In [34]:
# Refit K-NN on the training set using the selected k, then predict the held-out test set
optimal_knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
y_pred_test = optimal_knn.predict(X_test)

# Every scorer takes (y_true, y_pred), so the report is built from a label -> scorer table
test_metrics = {
    label: scorer(y_test, y_pred_test)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
        ("Confusion Matrix", confusion_matrix),
        ("Mean Squared Error", mean_squared_error),
        ("Mean Absolute Error", mean_absolute_error),
    )
}

# Print the report, one metric per line, in the order the table lists them
print("Test Set Evaluation:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value}")


Test Set Evaluation:
Accuracy: 0.804185351270553
Precision: 0.7537117903930131
Recall: 0.8860369609856262
F1 Score: 0.8145351580934403
Confusion Matrix: [[751 282]
 [111 863]]
Mean Squared Error: 0.19581464872944693
Mean Absolute Error: 0.19581464872944693


## Comments on results:

# Accuracy (80.4%) – The model correctly predicts about 80% of the test samples.
# Precision (75.4%) – When the model predicts a positive class (gamma), it's correct 75.4% of the time.
# Recall (88.6%) – The model successfully detects 88.6% of the actual positive cases, meaning it has a strong ability to find gamma samples.
# F1 Score (81.5%) – A balanced measure of precision and recall, indicating solid performance.
# Confusion Matrix – Shows 282 false positives (hadrons predicted as gamma) and 111 false negatives (gammas predicted as hadron), suggesting the model is slightly biased toward predicting the gamma class.
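# MSE & MAE (~0.196) – With 0/1 labels, both equal the misclassification rate (1 − accuracy = 0.196), so they restate the accuracy rather than add independent evidence; the error rate is relatively low, consistent with fairly accurate predictions.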