# 1.1 Data Preparation

In [None]:
import numpy as np
from pandas import read_csv

In [None]:
def parse_csv(filepath: str) -> tuple:
    """Load a CSV file and split it into input and target arrays.

    The last column of the file is treated as the target; every other
    column is treated as an input feature. Both parts are converted to
    float.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(inputs, targets)`` tuple of NumPy float arrays: ``inputs`` is
        2-D (one row per CSV row, one column per feature) and ``targets``
        is 1-D (one value per CSV row).

    Raises:
        ValueError: If a cell cannot be converted to float.
    """
    # Materialise the frame once; the original built ``.values`` twice.
    table = read_csv(filepath).to_numpy()

    # ``astype`` already returns a fresh ndarray, so wrapping the results in
    # ``np.array`` again (as before) only added a redundant copy.
    inputs = table[:, :-1].astype(float)
    targets = table[:, -1].astype(float)

    return inputs, targets

In [None]:
# Load the glass dataset: every column but the last goes into `inputs`,
# the last column goes into `targets`.
inputs, targets = parse_csv(filepath='glass-dataset.csv')

In [None]:
# Sanity check: show the first five input rows to confirm the CSV was parsed.
print(inputs[:5])

# 1.2 Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [None]:
def pca(inputs):
    """Reduce ``inputs`` to two dimensions with principal component analysis.

    A new scikit-learn ``PCA(n_components=2)`` is fit on ``inputs`` and the
    transformed data is returned.

    Args:
        inputs: Feature matrix to compress.

    Returns:
        The result of ``PCA.fit_transform`` on ``inputs``.
    """
    # Named ``projector`` so the local does not shadow this function's name.
    projector = PCA(n_components=2)
    projected = projector.fit_transform(inputs)
    return projected

In [None]:
# Two-component PCA projection of the inputs; used for kNN and for plotting.
compressed = pca(inputs=inputs)

In [None]:
def knn(inputs, targets, k: int):
    """Fit a k-nearest-neighbours classifier and predict the data it saw.

    Args:
        inputs: Feature matrix used both for fitting and for prediction.
        targets: Class label for each row of ``inputs``.
        k: Number of neighbours (``n_neighbors``) the classifier uses.

    Returns:
        The classifier's predictions for ``inputs``.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    model = KNeighborsClassifier(n_neighbors=k).fit(inputs, targets)
    return model.predict(inputs)

In [None]:
def how_good_is_k(inputs, targets, k: int):
    """Return the fraction of samples kNN labels correctly for a given ``k``.

    The predictions come from ``knn``, which fits and predicts on the same
    ``inputs``, so the value is accuracy on the fitting data rather than on
    held-out data.

    Args:
        inputs: Feature matrix.
        targets: Expected class label for each row of ``inputs``.
        k: Number of neighbours to use.

    Returns:
        Accuracy as a float between 0 and 1.

    Raises:
        ZeroDivisionError: If ``targets`` is empty.
    """
    predictions = knn(inputs, targets, k)

    # Element-wise comparison replaces the manual index loop; count_nonzero
    # returns a plain int, so the division still yields a Python float.
    matches = np.count_nonzero(np.asarray(predictions) == np.asarray(targets))

    return matches / len(targets)

In [None]:
def classify_knn(inputs, targets, max_k: int = 19):
    """Sweep k, plot accuracy against k, and classify with the best k.

    Every k from 1 to ``max_k`` (inclusive) is scored with
    ``how_good_is_k``. The scores are plotted, and the data is then
    classified once more with the highest-scoring k.

    NOTE(review): ``how_good_is_k`` scores on the data the model was fit
    on, so the best k is chosen by accuracy on the fitting data -- confirm
    this is the intended selection criterion.

    Args:
        inputs: Feature matrix.
        targets: Class label for each row of ``inputs``.
        max_k: Largest k to try. The default of 19 matches the previously
            hard-coded ``range(1, 20)``.

    Returns:
        The kNN predictions for ``inputs`` using the most accurate k.

    Raises:
        ValueError: If ``max_k`` is smaller than 1.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    # A single sequence drives the sweep, the x-axis and the best-k lookup,
    # so the three can no longer drift apart.
    k_values = range(1, max_k + 1)
    accuracies = [how_good_is_k(inputs, targets, k) for k in k_values]

    plt.figure()
    plt.plot(k_values, accuracies)
    plt.xlabel('k')
    plt.ylabel('Accuracy')
    plt.title("Accuracy of kNN with different k values")
    plt.show()

    # ``list.index`` finds the first maximum, so ties go to the smallest k.
    best_k = k_values[accuracies.index(max(accuracies))]

    return knn(inputs, targets, best_k)

In [None]:
# kNN predictions on the PCA-compressed features, using the best-scoring k.
classifiedData = classify_knn(inputs=compressed, targets=targets)

In [None]:
# Scatter the PCA projection, coloured by the label stored in classifiedData.
plt.figure()
plt.scatter(compressed[:, 0], compressed[:, 1], c=classifiedData)
# Label the figure like the accuracy plot so it can be told apart from the
# SVM scatter, which draws the same points.
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title("kNN classification on the PCA projection")
plt.show()

In [None]:
def svm(inputs, targets, kernel: str, degree: int):
    """Fit a support vector classifier and predict the data it was fit on.

    Args:
        inputs: Feature matrix used both for fitting and for prediction.
        targets: Class label for each row of ``inputs``.
        kernel: Kernel name; can be "linear", "poly" or "rbf".
        degree: Polynomial degree; can be 2 or 3.

    Returns:
        The classifier's predictions for ``inputs``.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    model = SVC(kernel=kernel, degree=degree).fit(inputs, targets)
    return model.predict(inputs)

In [None]:
def how_good_is_svm(inputs, targets, kernel, degree):
    """Return the fraction of samples an SVM configuration labels correctly.

    The predictions come from ``svm``, which fits and predicts on the same
    ``inputs``, so the value is accuracy on the fitting data rather than on
    held-out data.

    Args:
        inputs: Feature matrix.
        targets: Expected class label for each row of ``inputs``.
        kernel: Kernel name forwarded to ``svm``.
        degree: Polynomial degree forwarded to ``svm``.

    Returns:
        Accuracy as a float between 0 and 1.

    Raises:
        ZeroDivisionError: If ``targets`` is empty.
    """
    predictions = svm(inputs, targets, kernel, degree)

    # Element-wise comparison replaces the manual index loop; count_nonzero
    # returns a plain int, so the division still yields a Python float.
    matches = np.count_nonzero(np.asarray(predictions) == np.asarray(targets))

    return matches / len(targets)

In [None]:
def classify_svm(inputs, targets):
    """Try every kernel/degree pair and classify with the most accurate one.

    The grid is the kernels "linear", "poly" and "rbf" crossed with degrees
    2 and 3. Each pair is scored with ``how_good_is_svm``, and the data is
    then classified once more with the highest-scoring pair.

    NOTE(review): per the scikit-learn documentation ``degree`` only affects
    the "poly" kernel, so the linear and rbf rows of the grid are presumably
    duplicates -- confirm before relying on the degree chosen for them.

    Args:
        inputs: Feature matrix.
        targets: Class label for each row of ``inputs``.

    Returns:
        The SVM predictions for ``inputs`` using the best configuration.
    """
    # Kernel-major order, matching the original nested loops.
    candidates = [
        (kernel, degree)
        for kernel in ("linear", "poly", "rbf")
        for degree in (2, 3)
    ]
    scores = [
        how_good_is_svm(inputs, targets, kernel, degree)
        for kernel, degree in candidates
    ]

    # First maximum wins on ties, as ``max(..., key=...)`` did.
    best_kernel, best_degree = candidates[scores.index(max(scores))]

    return svm(inputs, targets, best_kernel, best_degree)

In [None]:
# SVM predictions on the original (uncompressed) features, using the
# best-scoring kernel/degree pair.
classifiedData = classify_svm(inputs=inputs, targets=targets)

In [None]:
# Scatter the PCA projection, coloured by the label stored in classifiedData.
plt.figure()
plt.scatter(compressed[:, 0], compressed[:, 1], c=classifiedData)
# Label the figure like the accuracy plot so it can be told apart from the
# kNN scatter, which draws the same points.
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title("SVM classification on the PCA projection")
plt.show()

# 1.3 Assessment of Classification

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def svm_cross_val(inputs, targets, kernel: str, degree: int):
    """Cross-validate an SVM configuration with ``cv=10``.

    Args:
        inputs: Feature matrix.
        targets: Class label for each row of ``inputs``.
        kernel: Kernel name passed to ``SVC``.
        degree: Polynomial degree passed to ``SVC``.

    Returns:
        The scores returned by ``cross_val_score`` for an unfitted
        ``SVC(kernel=kernel, degree=degree)``.
    """
    return cross_val_score(
        SVC(kernel=kernel, degree=degree),
        inputs,
        targets,
        cv=10,
    )

In [None]:
def knn_cross_val(inputs, targets, k: int):
    """Cross-validate a k-nearest-neighbours classifier with ``cv=10``.

    Args:
        inputs: Feature matrix.
        targets: Class label for each row of ``inputs``.
        k: Number of neighbours (``n_neighbors``) the classifier uses.

    Returns:
        The scores returned by ``cross_val_score`` for an unfitted
        ``KNeighborsClassifier(n_neighbors=k)``.
    """
    return cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        inputs,
        targets,
        cv=10,
    )

In [None]:
# Cross-validation scores for the two models being compared: a degree-3
# polynomial SVM on the original features and kNN (k=3) on the PCA projection.
svm_accuracies = svm_cross_val(inputs, targets, kernel="poly", degree=3)
knn_accuracies = knn_cross_val(compressed, targets, k=3)

In [None]:
# Compare the per-fold cross-validation scores of kNN and SVM.
plt.figure()
# Take the fold numbers from the score arrays themselves instead of a
# hard-coded range(1, 11), so the plot still works if the fold count changes.
plt.plot(range(1, len(knn_accuracies) + 1), knn_accuracies, label="KNN")
plt.plot(range(1, len(svm_accuracies) + 1), svm_accuracies, label="SVM")
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title("Cross-validation accuracy per fold")
plt.legend()
plt.show()