# Mandatory assignment 2

In [414]:
import math
import numpy as np
from math import fsum, sqrt
import pandas as pd

### Preparing the data
The dataset is downloaded from a URL and loaded into a pandas DataFrame.

In [415]:
# Location of the raw Iris data file that is read with pd.read_csv.
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

In [416]:
# The file is read without a header row (header=None), so the column names
# are supplied explicitly: four measurements followed by the class label.
iris_columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
df_iris = pd.read_csv(URL, names=iris_columns, header=None)

The class labels are strings, so they are converted to integer codes, which are easier and more efficient to work with.
This is done with the `pd.factorize` function.
The DataFrame is then converted to a NumPy array.

In [417]:
# pd.factorize returns (codes, uniques); only the integer codes are kept
# as the new content of the 'class' column.
class_codes, _class_names = pd.factorize(df_iris['class'])
df_iris['class'] = class_codes

# Plain NumPy view of the table: feature columns first, class code last.
np_iris = df_iris.to_numpy()
np_iris

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [4.8, 3. , 1.4, 0.1, 0. ],
       [4.3, 3. , 1.1, 0.1, 0. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [5.7, 3.8, 1.7, 0.3, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [5.4, 3.4, 1.7, 0.2, 0. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [5.1, 3.3, 1.7, 0.5, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5. , 3. , 1.6, 0.2, 0. ],
       [5. , 3.4, 1.6, 0.4, 0. ],
       [5.2, 3.5, 1.5, 0.2, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.7, 3

In [418]:
# NOTE: despite the name, shape[1] is the number of COLUMNS of np_iris
# (four feature columns plus the class column), not the number of rows.
n_rows = np_iris.shape[1]
n_rows

5


### Defining a function for the Euclidean distance
${d(p,q) = \sqrt{ \sum_{i=1}^{N} (q_i -p_i)^2}}$

In [428]:
def euclidian_distance(q, p, N=None):
    """Return the Euclidean distance between the points ``q`` and ``p``.

    Parameters
    ----------
    q, p : sequence of numbers
        Coordinates of the two points. Anything indexable whose elements
        can be converted with ``float`` works (lists, tuples, 1-D NumPy arrays).
    N : int, optional
        Number of leading coordinates to include in the distance.
        When omitted, all coordinates are used and ``q`` and ``p`` must
        have the same length.

    Returns
    -------
    float
        ``sqrt(sum((q[i] - p[i]) ** 2 for i in range(N)))``.

    Raises
    ------
    ValueError
        If ``N`` is omitted and ``q`` and ``p`` differ in length.
    IndexError
        If ``N`` is larger than the length of ``q`` or ``p``.
    """
    if N is None:
        if len(q) != len(p):
            raise ValueError("q and p must have the same number of coordinates")
        N = len(q)

    squared_sum = 0.0
    for i in range(N):
        # Convert to plain Python floats so NumPy scalars and built-in
        # numbers are handled the same way.
        difference = float(q[i]) - float(p[i])
        squared_sum += math.pow(difference, 2)
    return sqrt(squared_sum)

### Control calculation of the distance between two rows
By computing the distance between the first two rows manually, we can verify that the function yields the correct output.


In [420]:
# Hand-computed squared differences between row 0 and row 1, one term per feature.
sepal_length_term = math.pow(5.1 - 4.9, 2)
sepal_width_term = math.pow(3.5 - 3.0, 2)
petal_length_term = math.pow(0, 2)  # both rows have petal length 1.4
petal_width_term = math.pow(0, 2)   # both rows have petal width 0.2

manual_first_distance = math.sqrt(
    sepal_length_term + sepal_width_term + petal_length_term + petal_width_term
)
print("first distance should be:", manual_first_distance)

first distance should be: 0.5385164807134502


Extract the first two rows from the dataset and compute the distance between them with the `euclidian_distance` function.

In [421]:
# Feature columns (0-3) of the first two samples; column 4 is the class code.
row0 = np_iris[0, 0:4]
row1 = np_iris[1, 0:4]

# n_rows holds the column count of np_iris, so n_rows-1 is the number of
# feature columns once the class column is left out.
first_distance = euclidian_distance(row0, row1, n_rows-1)

# Compare with a tolerance: exact == on floats can fail because of rounding
# differences even when both values are mathematically equal.
print("first distance is calculated correctly using euclidian_distance method:",
      math.isclose(manual_first_distance, first_distance))

first distance is calculated correctly using euclidian_distance method: True


### Predicting new value

In [422]:
# Feature columns (0-3) of the first row; column 4 (the class code) is left out.
np_iris[0,0:4]

array([5.1, 3.5, 1.4, 0.2])

In [423]:
# New, unlabeled data point whose class is to be predicted.
new_dp = np.array([7.0, 3.1, 1.3, 0.7])

# One [distance, label] pair per sample: the distance from new_dp to the
# sample's four feature values, together with the sample's class code.
distances = [
    [euclidian_distance(new_dp, sample[0:4], n_rows-1), sample[4]]
    for sample in np_iris
]
distances

[[2.0074859899884734, 0.0],
 [2.1633307652783933, 0.0],
 [2.355843797877949, 0.0],
 [2.459674775249769, 0.0],
 [2.12367605815953, 0.0],
 [1.8574175621006705, 0.0],
 [2.4535688292770597, 0.0],
 [2.092844953645635, 0.0],
 [2.657066051117284, 0.0],
 [2.1931712199461306, 0.0],
 [1.7916472867168916, 0.0],
 [2.2956480566497994, 0.0],
 [2.2847319317591728, 0.0],
 [2.7748873851023217, 0.0],
 [1.584297951775486, 0.0],
 [1.8734993995195195, 0.0],
 [1.813835714721705, 0.0],
 [1.984943324127921, 0.0],
 [1.5811388300841893, 0.0],
 [2.0736441353327724, 0.0],
 [1.7492855684535897, 0.0],
 [2.024845673131659, 0.0],
 [2.519920633670831, 0.0],
 [1.9621416870348587, 0.0],
 [2.353720459187964, 0.0],
 [2.085665361461421, 0.0],
 [2.0663978319771825, 0.0],
 [1.9209372712298545, 0.0],
 [1.8947295321496413, 0.0],
 [2.3748684174075834, 0.0],
 [2.2759613353482084, 0.0],
 [1.6673332000533063, 0.0],
 [2.1540659228538015, 0.0],
 [1.928730152198591, 0.0],
 [2.1931712199461306, 0.0],
 [2.0663978319771825, 0.0],
 [1.63

Sort distances

In [424]:
# Order the [distance, label] pairs by ascending distance; pairs with equal
# distance are ordered by label, exactly as plain list comparison would do.
distances = sorted(distances, key=lambda pair: (pair[0], pair[1]))
distances

[[1.5811388300841893, 0.0],
 [1.584297951775486, 0.0],
 [1.6309506430300091, 0.0],
 [1.6673332000533063, 0.0],
 [1.7492855684535897, 0.0],
 [1.7916472867168916, 0.0],
 [1.813835714721705, 0.0],
 [1.8574175621006705, 0.0],
 [1.8734993995195195, 0.0],
 [1.881488772222678, 0.0],
 [1.8947295321496413, 0.0],
 [1.9209372712298545, 0.0],
 [1.928730152198591, 0.0],
 [1.9621416870348587, 0.0],
 [1.984943324127921, 0.0],
 [1.9974984355438181, 0.0],
 [2.0074859899884734, 0.0],
 [2.024845673131659, 0.0],
 [2.0639767440550294, 0.0],
 [2.0663978319771825, 0.0],
 [2.0663978319771825, 0.0],
 [2.073644135332772, 0.0],
 [2.0736441353327724, 0.0],
 [2.078460969082653, 0.0],
 [2.085665361461421, 0.0],
 [2.092844953645635, 0.0],
 [2.1071307505705477, 0.0],
 [2.12367605815953, 0.0],
 [2.1330729007701543, 0.0],
 [2.1540659228538015, 0.0],
 [2.1633307652783933, 0.0],
 [2.1931712199461306, 0.0],
 [2.1931712199461306, 0.0],
 [2.1931712199461306, 0.0],
 [2.2405356502408083, 0.0],
 [2.2759613353482084, 0.0],
 [2.

Defining K value

In [425]:
# Number of nearest neighbours whose labels are collected for the prediction.
k = 4

Extracting top k nearest neighbours

In [426]:
# distances is sorted by ascending distance, so the first k entries are the
# k nearest neighbours. Slice with k (not a hard-coded 4) so changing k works.
k_nearest_neighbours = distances[:k]

# Take the label of EACH neighbour (second element of every [distance, label] pair).
k_nearest_labels = [label for _distance, label in k_nearest_neighbours]
k_nearest_labels

[0.0, 0.0, 0.0, 0.0]

Extracting the label with the most occurrences among the k nearest neighbours as the prediction

In [427]:
# Majority vote: pick the label that occurs most often among the neighbours.
# (A plain max() would return the numerically largest label instead.)
# Candidates are sorted so that a tie is resolved deterministically in favour
# of the smallest label, since max() keeps the first maximal element.
candidate_labels = sorted(set(k_nearest_labels))
pred = max(candidate_labels, key=k_nearest_labels.count)
print("prediction is for label: ", pred)

prediction is for label:  0.0
