## Importing libraries

In [198]:
import numpy as np
import pandas as pd

## Preparing arrays of dependent and independent variables

In [199]:
dataset = pd.read_csv('magic04.data')
X = dataset.iloc[:, :-1].values # Matrix of features
y = dataset.iloc[:, -1].values # Vector of predicted values

In [200]:
print(X)

[[ 31.6036  11.7235   2.5185 ...  -9.9574   6.3609 205.261 ]
 [162.052  136.031    4.0612 ... -45.216   76.96   256.788 ]
 [ 23.8172   9.5728   2.3385 ...  -7.1513  10.449  116.737 ]
 ...
 [ 75.4455  47.5305   3.4483 ...  -9.4662  30.2987 256.5166]
 [120.5135  76.9018   3.9939 ... -63.8389  84.6874 408.3166]
 [187.1814  53.0014   3.2093 ...  31.4755  52.731  272.3174]]


In [201]:
print(y)

['g' 'g' 'g' ... 'h' 'h' 'h']


## Balancing out the data

In [202]:
# We count the appearance of each class in our dataset

g_counter = 0
h_counter = 0

for c in y:
  if c == 'g':
    g_counter += 1
  else:
    h_counter += 1

In [203]:
print(g_counter)

12331


In [204]:
print(h_counter)

6688


In [205]:
# We balance the dataset by randomly removing extra data points
# that result in the g class until both classes are approximately equal

i = 0
indices = []

while g_counter > h_counter:
  if y[i] == 'g':
    indices.append(i)
    g_counter -= 1
  i += 1

# We remove the same rows from both X and y

X = np.delete(X, indices, axis=0)
y = np.delete(y, indices, axis=0)

## Splitting the data to training, validation, and test sets

In [206]:
from sklearn.model_selection import train_test_split

# training set will be 70% of dataset
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.3)

# the remaining 30% is divided equally among validation and test sets
X_validation, X_test, y_validation, y_test = train_test_split(X_rest, y_rest, test_size=0.5)

## Performing feature scaling

In [207]:
# Performing feature scaling is necessary, so that no features dominate
# the model due to their large range

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_validation = sc.transform(X_validation)
X_test = sc.transform(X_test)

In [208]:
print(X_train)

[[-0.55938563 -0.01786021  0.29032034 ...  0.52812729  0.44791916
  -1.08881334]
 [-0.45646009 -0.72100486 -1.131196   ...  0.03609442 -1.04990346
  -1.58512567]
 [ 0.97688509  1.55259087  0.89538996 ...  2.25657366 -0.9544033
   1.35794931]
 ...
 [ 0.48523742  0.0872283   1.32755267 ...  0.68950018 -1.05033688
   0.8139234 ]
 [-0.15282417 -0.65202693 -0.57045077 ...  0.36690332 -1.08818872
  -0.14491771]
 [-0.67376763 -0.34836803 -0.08706412 ... -0.46364414 -1.02963283
  -0.18647343]]


## Training model on training set

In [209]:
from sklearn.neighbors import KNeighborsClassifier

# Standard is using minkowski metric, which uses manhattan distance if
# p=1, and uses euclidean distance if p=2
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)

# Training model on training set
classifier.fit(X_train, y_train)

## Choosing the best k value to use

In [210]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

suggested_neighbors = range(1, 21)

# We will store these metrics for analysing performance the of model with
# different K values later

accuracy_scores_on_validation = []
precision_scores_on_validation = []
recall_scores_on_validation = []
f_scores_on_validation = []
confusion_matrices_on_validation = []

accuracy_scores_on_test = []
precision_scores_on_test = []
recall_scores_on_test = []
f_scores_on_test = []
confusion_matrices_on_test = []

# Getting scores by comparing predictions and true values from validation set
for k in suggested_neighbors:
  classifier.n_neighbors = k
  y_pred = classifier.predict(X_validation)

  accuracy_scores_on_validation.append(accuracy_score(y_validation, y_pred))

  precision = precision_score(y_validation, y_pred, pos_label='g')
  precision_scores_on_validation.append(precision)

  recall = recall_score(y_validation, y_pred, pos_label='g')
  recall_scores_on_validation.append(recall)

  f_scores_on_validation.append(2 * (precision * recall) / (precision + recall))

  confusion_matrices_on_validation.append(confusion_matrix(y_validation, y_pred))

# Getting scores by comparing predictions and true values from test set
for k in suggested_neighbors:
  classifier.n_neighbors = k
  y_pred = classifier.predict(X_test)

  accuracy_scores_on_test.append(accuracy_score(y_test, y_pred))

  precision = precision_score(y_test, y_pred, pos_label='g')
  precision_scores_on_test.append(precision)

  recall = recall_score(y_test, y_pred, pos_label='g')
  recall_scores_on_test.append(recall)

  f_scores_on_test.append(2 * (precision * recall) / (precision + recall))

  confusion_matrices_on_test.append(confusion_matrix(y_test, y_pred))

# Selecting most efficient K
best_k = pd.Series(accuracy_scores_on_validation).idxmax() + 1
classifier.n_neighbors = best_k

In [211]:
print(f"Hypothetically, model works best at n_neighbors = {best_k}")

Hypothetically, model works best at n_neighbors = 15


## Comparing performance metrics for different models

In [212]:
from prettytable import PrettyTable, ALL
report_table_on_validation = PrettyTable(["K value", "Accuracy", "Precision", "Recall", "F-score", "Confusion matrix"])
report_table_on_validation.hrules = ALL

report_table_on_test = PrettyTable(["K value", "Accuracy", "Precision", "Recall", "F-score", "Confusion matrix"])
report_table_on_test.hrules = ALL

for i in range(0, 20):
  accuracy = float("{:.4f}".format(accuracy_scores_on_validation[i]))
  precision = float("{:.4f}".format(precision_scores_on_validation[i]))
  recall = float("{:.4f}".format(recall_scores_on_validation[i]))
  f_score = float("{:.4f}".format(f_scores_on_validation[i]))
  cm = confusion_matrices_on_validation[i]

  report_table_on_validation.add_row([i+1, accuracy, precision, recall, f_score, cm])

for i in range(0, 20):
  accuracy = float("{:.4f}".format(accuracy_scores_on_test[i]))
  precision = float("{:.4f}".format(precision_scores_on_test[i]))
  recall = float("{:.4f}".format(recall_scores_on_test[i]))
  f_score = float("{:.4f}".format(f_scores_on_test[i]))
  cm = confusion_matrices_on_test[i]

  report_table_on_test.add_row([i+1, accuracy, precision, recall, f_score, cm])



print("                       Results on Validation set")
print(report_table_on_validation)

print("\n\n                       Results on Test set")
print(report_table_on_test)

print(f"\nBest accuracy on test data = {max(accuracy_scores_on_test)}, using k = {pd.Series(accuracy_scores_on_validation).idxmax() + 1}")



                       Results on Validation set
+---------+----------+-----------+--------+---------+------------------+
| K value | Accuracy | Precision | Recall | F-score | Confusion matrix |
+---------+----------+-----------+--------+---------+------------------+
|    1    |  0.7856  |   0.7496  | 0.8547 |  0.7987 |    [[853 145]    |
|         |          |           |        |         |    [285 723]]    |
+---------+----------+-----------+--------+---------+------------------+
|    2    |  0.7537  |   0.6839  | 0.9389 |  0.7914 |    [[937  61]    |
|         |          |           |        |         |    [433 575]]    |
+---------+----------+-----------+--------+---------+------------------+
|    3    |  0.7926  |   0.7496  | 0.8758 |  0.8078 |    [[874 124]    |
|         |          |           |        |         |    [292 716]]    |
+---------+----------+-----------+--------+---------+------------------+
|    4    |  0.7827  |   0.7165  | 0.9319 |  0.8101 |    [[930  68]    |
| 