# Part 1

- ## Loading the dataset

In [None]:
import numpy as np
from numpy.lib.recfunctions import structured_to_unstructured

In [None]:
# Parse the comma-separated file into a structured array; with dtype=None the
# column types are inferred and the fields get the default names f0..f4.
data = np.genfromtxt('iris.data', delimiter=",", encoding="utf8", dtype=None)
# Fields f0-f3 are converted into a plain 2-D array of features.
features = structured_to_unstructured(data[["f0", "f1", "f2", "f3"]])
# Field f4 is kept separately; later cells use it as the class label of each row.
names = data["f4"]
# Combined size of the two arrays' data buffers, shown as the cell output.
f"Memory size: {features.nbytes + names.nbytes} bytes"

In [None]:
# Show the feature matrix built from fields f0-f3.
features

- ## Normalizing

In [None]:
# Min-max scale every column independently: subtract the column minimum and
# divide by the column range (np.ptp is max - min along the given axis).
norm_features = (features - features.min(axis=0)) / np.ptp(features, axis=0)
norm_features

- ## Defining fourth feature as a categorical variable

In [None]:
# Work on the fourth feature, i.e. column index 3 of the normalized data.
fc = norm_features[:,3]

# The quartile cut-offs must be computed from that same column. The previous
# `norm_features[3]` selected row 3 (a single sample's four features) instead.
first_q = np.quantile(fc, 0.25)
second_q = np.quantile(fc, 0.75)  # upper cut-off: the 0.75 quantile

# Below the lower cut-off -> "small", above the upper one -> "big", else "medium".
# NOTE: the variable keeps its original name although it holds the fourth feature.
third_feature_column = np.where(fc < first_q, "small", np.where(fc > second_q, "big", "medium"))
third_feature_column

- ## Splitting dataset into two

In [None]:
def get_sets(f, n, train_fraction=0.8):
    """Randomly split features and labels into a training and a test set.

    Rows are shuffled with ``np.random.permutation``, so the split follows the
    global NumPy random state; call ``np.random.seed`` beforehand to get a
    reproducible split.

    Parameters
    ----------
    f : array
        Features, one sample per row (indexed along axis 0).
    n : array
        Labels, one per row of ``f``.
    train_fraction : float, default 0.8
        Share of the rows that goes to the training set. The training row
        count is ``int(f.shape[0] * train_fraction)``; the rest is the test set.

    Returns
    -------
    tuple
        ``(training_features, training_names, test_features, test_names)``.

    Raises
    ------
    ValueError
        If ``f`` and ``n`` have different lengths, or ``train_fraction`` is
        outside ``[0, 1]``.
    """
    n_samples = f.shape[0]
    if len(n) != n_samples:
        raise ValueError(
            f"features and names must have the same length, got {n_samples} and {len(n)}"
        )
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction}")

    # One shared permutation keeps every feature row aligned with its label.
    indices = np.random.permutation(n_samples)
    n_train = int(n_samples * train_fraction)
    training_idx, test_idx = indices[:n_train], indices[n_train:]

    training_features, training_names = f[training_idx], n[training_idx]
    test_features, test_names = f[test_idx], n[test_idx]
    return training_features, training_names, test_features, test_names

In [None]:
# Demonstration split of the raw features and their labels.
# NOTE(review): no np.random.seed call precedes this in the cell, so the split
# presumably differs between runs - confirm whether that is intended.
tr_f, tr_n, te_f, te_n = get_sets(features, names)
# Row counts of the two parts, shown as the cell output.
f"{tr_f.shape[0]} rows in training set, {te_f.shape[0]} rows in testing set"

# Part 2

- ## Data classification (testing classifier SVC on various selections)

In [None]:
import sklearn as sk
from sklearn import svm

In [None]:
def get_stats(expected, actual):
    """Compare predicted labels with the expected ones, sample by sample.

    Parameters
    ----------
    expected : sequence
        True labels.
    actual : sequence
        Predicted labels, in the same order as ``expected``.

    Returns
    -------
    wrong_predictions : list of str
        One ``"Expected: <true>, actual <predicted>"`` message per
        misclassified sample.
    accuracy : float
        Share of samples whose prediction equals the expected label.

    Raises
    ------
    ZeroDivisionError
        If ``expected`` is empty.
    """
    # Iterate over the pairs directly. Building a dict keyed by the expected
    # label would keep only one pair per class and hide most of the samples.
    wrong_predictions = [
        f"Expected: {w}, actual {c}" for w, c in zip(expected, actual) if w != c
    ]
    accuracy = (len(expected) - len(wrong_predictions)) / len(expected)
    return wrong_predictions, accuracy

In [None]:
def test_classifier(clf, num_of_tests=20):
    """Fit and evaluate copies of ``clf`` on ``num_of_tests`` random splits.

    The NumPy seed is set to 42 first. For every split of ``features`` and
    ``names`` a fresh clone of the estimator is fitted on the training part,
    and the iteration number, the list of wrong predictions and the score on
    the test part are printed.
    """
    np.random.seed(42)
    model = clf
    for iteration in range(num_of_tests):
        x_train, y_train, x_test, y_test = get_sets(features, names)
        # Start each round from an unfitted copy carrying the same parameters.
        model = sk.base.clone(model)
        model.fit(x_train, y_train)
        mistakes, accuracy = get_stats(y_test, model.predict(x_test))
        print(f"Iteration: {iteration}\n Wrong predictions: {mistakes}\n Score: {accuracy}\n")

# Evaluate an SVC with default parameters on 100 random splits.
test_classifier(svm.SVC(), 100)

**The classifier was tested on several samples of the same dataset. Average accuracy: 0.9 - 1. Most often the classifier confuses "Iris-virginica" and "Iris-versicolor".**

- ## Experiments with hyperparameters
  Testing different kernels with different parameters

In [None]:
def get_average_score(clf, num_of_tests=1000, normalized=True):
    """Return the mean test score of ``clf`` over ``num_of_tests`` random splits.

    The NumPy seed is set to 42 first. A clone of ``clf`` is refitted on every
    split and scored on the corresponding test part. ``normalized`` selects
    ``norm_features`` (True) or the raw ``features`` (False) as input data.
    """
    np.random.seed(42)
    model = sk.base.clone(clf)
    dataset = norm_features if normalized else features

    def score_one_split():
        # Draw a new split, refit the same clone and score it on the test part.
        x_train, y_train, x_test, y_test = get_sets(dataset, names)
        model.fit(x_train, y_train)
        return model.score(x_test, y_test)

    return sum(score_one_split() for _ in range(num_of_tests)) / num_of_tests

### 1. Testing rbf kernel

In [None]:
# RBF kernel, C=1, min-max normalized features.
get_average_score(svm.SVC(C=1, kernel='rbf'), normalized=True)

In [None]:
# RBF kernel, C=1, raw (non-normalized) features.
get_average_score(svm.SVC(C=1, kernel="rbf"), normalized=False) 

In [None]:
# RBF kernel, C=0.7, min-max normalized features.
get_average_score(svm.SVC(C=0.7, kernel="rbf"), normalized=True)

In [None]:
# RBF kernel, C=0.7, raw (non-normalized) features.
get_average_score(svm.SVC(C=0.7, kernel="rbf"), normalized=False)

In [None]:
# RBF kernel, C=0.5, min-max normalized features.
get_average_score(svm.SVC(C=0.5, kernel="rbf"), normalized=True)

In [None]:
# RBF kernel, C=0.5, raw (non-normalized) features.
get_average_score(svm.SVC(C=0.5, kernel="rbf"), normalized=False)

**Conclusion**: Best result without normalization with higher C

### 2. Testing linear kernel

In [None]:
# Linear kernel, C=1, min-max normalized features.
get_average_score(svm.SVC(C=1, kernel="linear"), normalized=True)

In [None]:
# Linear kernel, C=1, raw (non-normalized) features.
get_average_score(svm.SVC(C=1, kernel="linear"), normalized=False)

In [None]:
# Linear kernel, C=0.5, min-max normalized features.
get_average_score(svm.SVC(C=0.5, kernel="linear"), normalized=True)

In [None]:
# Linear kernel, C=0.5, raw (non-normalized) features.
get_average_score(svm.SVC(C=0.5, kernel="linear"), normalized=False)

**Conclusion:** Best result without normalization with C near 0.5

### 3. Testing poly kernel

In [None]:
# Polynomial kernel, C=1, min-max normalized features.
get_average_score(svm.SVC(C=1, kernel="poly"), normalized=True)

In [None]:
# Polynomial kernel, C=1, raw (non-normalized) features.
get_average_score(svm.SVC(C=1, kernel="poly"), normalized=False)

In [None]:
# Polynomial kernel, C=0.5, min-max normalized features.
get_average_score(svm.SVC(C=0.5, kernel="poly"), normalized=True)

In [None]:
# Polynomial kernel, C=0.5, raw (non-normalized) features.
get_average_score(svm.SVC(C=0.5, kernel="poly"), normalized=False)

**Conclusion**: Best results with normalization with higher C

- ## Visualizing dataset

In [None]:
from sklearn import decomposition
import matplotlib.pyplot as plt

In [None]:
# Reduce the raw (non-normalized) features to two PCA components; the cells
# below use them as x/y coordinates of the scatter plots.
pca = decomposition.PCA(n_components=2)
pca.fit(features)
dec_features = pca.transform(features)
dec_features

In [None]:
# Seed the global NumPy RNG so get_sets draws the same split on every run.
np.random.seed(42)
train_f, train_n, test_f, test_n = get_sets(dec_features, names)

# Linear-kernel SVC trained on the two PCA components only.
clf = svm.SVC(C=0.5, kernel="linear")
clf.fit(train_f, train_n)
# Score on the held-out rows, shown as the cell output.
clf.score(test_f, test_n)

In [None]:
# A single np.unique call returns the sorted class names and, for every test
# row, the index of its class in that list; the index serves as colour code.
labels, colours = np.unique(test_n, return_inverse=True)
labels = labels.tolist()

sc = plt.scatter(test_f[:,0], test_f[:,1], c=colours, edgecolors="k")
plt.legend(handles=sc.legend_elements()[0], labels=labels)
plt.title("Original dataset classification")

# Fixed axis limits, identical to the prediction plot, for easy comparison.
plt.xlim(-3.5, 4)
plt.ylim(-1.5, 1.5)

plt.show()

**Now it's clear why the classifier confuses Iris-versicolor and Iris-virginica. Let's see how the classifier predicts labels for this data.**

In [None]:
# Encode every prediction as its index in clf.classes_ (the sorted labels seen
# during fitting). Re-running np.unique on the predictions would renumber the
# classes whenever one of them is never predicted, so colours and legend
# entries could end up attached to the wrong species.
predicted = np.searchsorted(clf.classes_, clf.predict(test_f))

# vmin/vmax pin the colour scale to the full class range, so a class keeps its
# colour even if some other class is missing from the predictions.
pr_sc = plt.scatter(test_f[:,0], test_f[:,1], c=predicted, edgecolors="k",
                    vmin=0, vmax=len(clf.classes_) - 1)
plt.title("Predicted values classification")
# legend_elements() returns one handle per distinct code in ascending order,
# so label exactly the classes that occur among the predictions.
plt.legend(handles=pr_sc.legend_elements()[0], labels=clf.classes_[np.unique(predicted)].tolist())

plt.xlim([-3.5, 4])
plt.ylim([-1.5, 1.5])
plt.show()