In [1]:
# DO NOT CHANGE THIS CELL

import csv
import numpy

In [2]:
# DO NOT CHANGE THIS CELL

def accuracy(t, tp):
    """Return the fraction of predictions that match the true labels.

    Parameters
    ----------
    t : array-like
        True labels.
    tp : array-like
        Predicted labels; must have the same shape as ``t``.

    Returns
    -------
    float
        ``1.0`` minus the share of positions where ``t`` and ``tp`` differ.

    Raises
    ------
    ValueError
        If ``t`` and ``tp`` do not have the same shape.
    """
    # Coerce to arrays so plain Python lists work too; comparing two lists
    # with != yields a single bool, which has no .sum().
    t = numpy.asarray(t)
    tp = numpy.asarray(tp)

    # Report a shape mismatch explicitly instead of relying on how the
    # element-wise comparison below reacts to incompatible shapes.
    if t.shape != tp.shape:
        raise ValueError(
            "Shape mismatch between true labels %s and predictions %s"
            % (t.shape, tp.shape)
        )

    return 1.0 - ((t != tp).sum() / len(t))

def load_iris(fname, seed=1):
    """Load an Iris CSV file and return shuffled features and integer labels.

    Each non-blank row is expected to hold four numeric feature values
    followed by the class name as the last column.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read.
    seed : int, optional
        Seed passed to ``numpy.random.seed`` before the shuffle permutation
        is drawn. Note that this re-seeds NumPy's global random state.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix of shape ``(n_rows, 4)``, in shuffled order.
    t : numpy.ndarray
        Integer class labels (0, 1 or 2), shuffled with the same permutation.

    Raises
    ------
    KeyError
        If the last column of a row is not one of the known class names.
    ValueError
        If a feature cannot be converted to ``float`` or the rows do not
        reshape to four feature columns.
    """
    class_mappings = {'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}

    X = []
    t = []

    # newline='' leaves newline handling to the csv module, as the csv
    # documentation recommends for file objects handed to csv.reader.
    with open(fname, 'r', newline='') as csvfile:

        for row in csv.reader(csvfile):

            # Blank lines (e.g. trailing newlines at the end of the file)
            # come back as empty or whitespace-only rows; skip them instead
            # of failing on row[-1].
            if not any(cell.strip() for cell in row):
                continue

            # convert the features (given as strings) to floats
            X.append([float(v) for v in row[:-1]])

            # convert the string label to an integer
            t.append(class_mappings[row[-1].strip()])

    X = numpy.array(X).reshape((len(X), 4))
    t = numpy.array(t)

    # The datapoints are given in sorted order w.r.t. the classes, so
    # shuffle features and labels with one shared, seeded permutation.
    numpy.random.seed(seed)
    permutation = numpy.random.permutation(len(X))

    return X[permutation], t[permutation]

In [3]:
# DO NOT CHANGE THIS CELL

# Load the (already shuffled) dataset and report its dimensions.
X, t = load_iris('iris.data')
n_points, n_features = X.shape[0], X.shape[1]
print("Number of data points: %i" % n_points)
print("Number of features per datapoint: %i" % n_features)

# The first 75% of the rows form the training set, the rest the test set.
n_train = int(0.75 * len(X))
X_train, X_test = X[:n_train], X[n_train:]
t_train, t_test = t[:n_train], t[n_train:]
print("Number of training points: %i" % X_train.shape[0])
print("Number of test points: %i" % X_test.shape[0])

Number of data points: 150
Number of features per datapoint: 4
Number of training points: 112
Number of test points: 38


In [4]:
class OneNearestNeighborClassifier:
    """Classifier that predicts the label of the single closest training point.

    Parameters
    ----------
    dist_measure : str, optional
        Name of the distance used to compare points: ``"euclidean"``
        (default) or ``"manhattan"``. The value is checked when ``predict``
        is called.
    """

    def __init__(self, dist_measure="euclidean"):

        self.dist_measure = dist_measure

    def fit(self, X, t):
        """Store the training points and their labels.

        Parameters
        ----------
        X : array-like
            Training points, one per row.
        t : array-like
            Labels, one per training point.

        Raises
        ------
        ValueError
            If ``X`` and ``t`` do not contain the same number of entries.
        """
        # Coerce to arrays so list inputs support the arithmetic in predict.
        X = numpy.asarray(X)
        t = numpy.asarray(t)

        if len(X) != len(t):
            raise ValueError(
                "X and t must have the same length, got %i and %i"
                % (len(X), len(t))
            )

        self.X_train = X
        self.t_train = t

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like
            Query points, one per row.

        Returns
        -------
        numpy.ndarray
            For each query point, the label of the training point with the
            smallest distance (the first one if several are tied).

        Raises
        ------
        Exception
            If ``dist_measure`` is neither ``"euclidean"`` nor
            ``"manhattan"``.
        """
        # Resolve the distance function once, up front, instead of once per
        # (query, training point) pair; an unknown measure fails immediately.
        if self.dist_measure == "euclidean":
            distance = self._euclideanDistance
        elif self.dist_measure == "manhattan":
            distance = self._manhattenDistance
        else:
            raise Exception("Unknown distance measure!")

        predictions = []

        for x in numpy.asarray(X):

            # Broadcasting yields the distances from x to all training
            # points in a single call.
            dists = distance(x, self.X_train)

            best_idx = numpy.argmin(dists)
            predictions.append(self.t_train[best_idx])

        return numpy.array(predictions)

    def _euclideanDistance(self, p, q):
        """Return the Euclidean distance between ``p`` and ``q``.

        The sum runs over the last axis, so ``q`` may be a single point or
        an array of points (one per row); in the latter case one distance
        per row is returned.
        """
        return numpy.sqrt(((q - p)**2).sum(axis=-1))

    def _manhattenDistance(self, p, q):
        """Return the Manhattan (L1) distance between ``p`` and ``q``.

        The sum runs over the last axis, so ``q`` may be a single point or
        an array of points (one per row); in the latter case one distance
        per row is returned.
        """
        return numpy.abs(q - p).sum(axis=-1)
                

In [5]:
# DO NOT CHANGE THIS CELL

# fit model on training instances
# Train a nearest-neighbour model that compares points by Euclidean distance.
model = OneNearestNeighborClassifier("euclidean")
model.fit(X_train, t_train)

# Predict the test points and report the share of correct labels.
preds_euclidean = model.predict(X_test)
acc_euclidean = accuracy(t_test, preds_euclidean)
print("Final accuracy on test set (Euclidean distance): %f" % acc_euclidean)

Final accuracy on test set (Euclidean distance): 0.894737


In [6]:
# DO NOT CHANGE THIS CELL

# fit model on training instances
# Train a nearest-neighbour model that compares points by Manhattan distance.
model = OneNearestNeighborClassifier("manhattan")
model.fit(X_train, t_train)

# Predict the test points and report the share of correct labels.
preds_manhattan = model.predict(X_test)
acc_manhattan = accuracy(t_test, preds_manhattan)
print("Final accuracy on test set (Manhattan distance): %f" % acc_manhattan)

Final accuracy on test set (Manhattan distance): 0.868421
