# Assignment 2

## Imports

In [2]:
from IPython.display import display, HTML, Image

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from sklearn import metrics
from scipy.spatial import distance 

%matplotlib inline
#%qtconsole

## Define TemplateMatchClassifier

The TemplateMatchClassifier algorithm is a simple template matching predictor. TemplateMatchClassifier only works for continuous descriptive features and categorical target features. TemplateMatchClassifier works very simply as follows:

* **Training:** For each target feature level calculate the average value of all descriptive features for instances that have that target level. Store these average vectors as templates for each target level.
* **Prediction:** When a new prediction needs to be made compare the descriptive feature values of the new query instance to each template and return the target feature level that belongs to the template that is cloesest (based on Euclidean distance) to the query case.

In [3]:
# Create a new classifier which is based on the sckit-learn BaseEstimator and ClassifierMixin classes
class TemplateMatchClassifier(BaseEstimator, ClassifierMixin):
    '''
    Parameters
    ----------
    distance_metric string, optional (default = 'euclidean')
    It can be manhattan or cosine or ceuclidean

    Attributes
    ----------
    classes_ : array of shape = [n_classes] 
               The class labels (single output problem).
    templates_: dict
               A dictionary of the templates used for each class 
              {classes:templates}
    Notes
    -----


    See also
    --------
    
    
    ----------

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = TemplateMatchClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)
    
    '''
    # Constructor for the classifier object
    def __init__(self, distance_metric = 'euclidean'): #default is euclidean
        self.distance_metric = distance_metric

    # The fit function to train a classifier
    def fit(self, X, y):
        """Build a decision tree classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        """
        
        #get the distance_metric..
        if self.distance_metric == 'euclidean':
            self.distance_fn = distance.euclidean
        elif self.distance_metric == 'cosine':
            self.distance_fn = distance.cosine
        elif self.distance_metric == 'minkowski':
            self.distance_fn = distance.minkowski
        elif self.distance_metric == 'manhattan':
            self.distance_fn = distance.cityblock
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # Count the number of occurrences of each class in the target vector (uses mupy unique function that returns a list of unique values and their counts)
        #unique, counts = np.unique(y, return_counts=True)
        # Store the classes seen during fit
        unique = np.unique(y)
        self.classes_ = unique
        print(self.classes_)
        
        #Create dict to store template for each class..
        self.templates_ = dict()
        for clas in self.classes_:
                self.templates_[clas] = X[y == clas].mean(axis = 0)
        print(self.templates_)
        return self
        
        '''
        # Normalise the counts to sum to 1
        #dist = counts/sum(counts)
            
        # If the add_noise attribute is true add a little noise to the distribution
        #if(self.add_noise):
        #    for i in  range(len(dist)):
        #        dist[i] = dist[i] + dist[i]*random.uniform(-0.25, 0.25)
            # Renormalise the distribution
        #    dist = dist/sum(dist)
            
        # Create a new dictionary of classes and their normalised frequencies (the distribution)
        #self.distribution_ = dict(zip(unique, dist))
        
        # Return the classifier
        '''
        
    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        # Check is fit had been called by confirming that the distributions_ dictionary has been set up
        check_is_fitted(self, ['templates_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        # Initialise an empty list to store the predictions made
        predictions = list()
        
        #go through each instance of X..
        for instance in X:

            #Let best match be first template...
            globalDistance = self.distance_fn(instance, self.templates_[self.classes_[0]]) #global minimum
            #Generate a random class according to the learned distribution
            globalClass = self.classes_[0] #global solution
            
            for template in self.templates_:
                currDistance = self.distance_fn(instance, self.templates_[template])
                if currDistance < globalDistance:
                    globalDistance = currDistance
                    globalClass = template
            
            predictions.append(globalClass)
        return np.array(predictions)

## Test the TemplateMatchClassifier against custom datapoints

In [4]:
X = np.array([[10,22,3,4], [15,6,2,89], [1,5,6,12], [4,19,2,33]])
y = np.array([1, 2, 2, 1])

In [5]:
model = TemplateMatchClassifier(distance_metric='manhattan')

In [6]:
model.fit(X,y)
model.templates_

[1 2]
{1: array([ 7. , 20.5,  2.5, 18.5]), 2: array([ 8. ,  5.5,  4. , 50.5])}


{1: array([ 7. , 20.5,  2.5, 18.5]), 2: array([ 8. ,  5.5,  4. , 50.5])}

In [7]:
new_query = np.array([[2,15,6,21], [8,9,7,6]])

In [8]:
model.predict(new_query)

array([1, 1])

## Test the TemplateMatchClassifier on Fashion MNIST

In [10]:
data_sampling_rate = 0.1
cv_folds = 10

In [11]:
dataset = pd.read_csv('data/fashion-mnist_train.csv')
dataset = dataset.sample(frac=data_sampling_rate) #take a sample from the dataset so everyhting runs smoothly
num_classes = 10
classes = {0: "T-shirt/top", 1:"Trouser", 2: "Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle boot"}
display(dataset.head())

X = dataset[dataset.columns.difference(["label"])]
Y = np.array(dataset["label"])
X = X/255
X_train_plus_valid, X_test, y_train_plus_valid, y_test \
    = train_test_split(X, Y, random_state=0, \
                                    train_size = 0.7)

X_train, X_valid, y_train, y_valid \
    = train_test_split(X_train_plus_valid, \
                                        y_train_plus_valid, \
                                        random_state=0, \
                                        train_size = 0.5/0.7)

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
7386,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51637,2,0,0,0,0,0,0,0,0,1,...,0,0,0,132,146,56,0,0,0,0
36092,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20330,6,0,0,0,0,0,0,0,0,0,...,0,0,8,98,56,0,0,0,0,0
15149,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
model = TemplateMatchClassifier()
model.fit(X_train, y_train)

[0 1 2 3 4 5 6 7 8 9]
{0: array([1.28998968e-05, 1.38660991e-01, 5.58888029e-01, 5.66550568e-01,
       5.60152219e-01, 5.76354489e-01, 5.54695562e-01, 5.02386481e-01,
       4.80430857e-01, 4.31798246e-01, 2.14254386e-01, 4.04798762e-02,
       2.44117647e-01, 6.69504644e-03, 2.74767802e-03, 0.00000000e+00,
       3.86996904e-05, 2.61867905e-03, 8.03663571e-03, 4.67621259e-02,
       2.22587719e-01, 4.43975748e-01, 5.00283798e-01, 2.91989164e-01,
       5.07185243e-01, 5.46839525e-01, 5.74754902e-01, 5.70910733e-01,
       5.64112487e-01, 5.75374097e-01, 5.73013416e-01, 5.67079463e-01,
       5.68588751e-01, 5.64538184e-01, 2.44969040e-01, 5.76547988e-01,
       5.75464396e-01, 5.59894221e-01, 5.11803406e-01, 4.91279670e-01,
       4.72652219e-01, 3.46852425e-01, 9.18214654e-02, 1.64086687e-02,
       3.70227038e-03, 1.89112487e-01, 0.00000000e+00, 1.22549020e-03,
       5.49535604e-03, 1.40866873e-02, 8.13725490e-02, 3.20136739e-01,
       4.91344169e-01, 5.15028380e-01, 5.17930857e-

TemplateMatchClassifier(distance_metric='euclidean')

In [15]:
# Make a set of predictions for the training data
y_pred = model.predict(X_train)

# Print performance details
accuracy = metrics.accuracy_score(y_train, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_train, y_pred))

# Print confusion matrix
# print(metrics.confusion_matrix(y_train, y_pred))

# Print nicer homemade confusion matrix
print("Confusion Matrix")
display(pd.crosstab(np.array(y_train), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

Accuracy: 0.6943333333333334
              precision    recall  f1-score   support

           0       0.72      0.68      0.70       304
           1       0.98      0.91      0.94       303
           2       0.58      0.49      0.53       269
           3       0.71      0.83      0.77       311
           4       0.52      0.61      0.56       288
           5       0.50      0.76      0.61       335
           6       0.42      0.19      0.26       307
           7       0.76      0.82      0.79       290
           8       0.96      0.77      0.85       323
           9       0.82      0.86      0.84       270

    accuracy                           0.69      3000
   macro avg       0.70      0.69      0.69      3000
weighted avg       0.70      0.69      0.69      3000

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,208,3,2,33,4,42,9,0,3,0,304
1,7,275,4,9,2,5,1,0,0,0,303
2,1,0,133,2,61,39,30,0,3,0,269
3,10,1,0,259,14,15,12,0,0,0,311
4,0,0,44,30,176,14,24,0,0,0,288
5,0,0,0,0,0,256,0,49,0,30,335
6,60,0,41,17,80,46,59,0,4,0,307
7,0,0,0,0,0,33,0,238,0,19,290
8,1,1,7,15,3,38,1,9,248,0,323
9,0,0,0,0,1,19,3,16,0,231,270


In [16]:
# Make a set of predictions for the training data
y_pred = model.predict(X_test)

# Print performance details
accuracy = metrics.accuracy_score(y_test, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

# Print confusion matrix
# print(metrics.confusion_matrix(y_train, y_pred))

# Print nicer homemade confusion matrix
print("Confusion Matrix")
pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Accuracy: 0.675
              precision    recall  f1-score   support

           0       0.69      0.69      0.69       168
           1       0.98      0.91      0.94       174
           2       0.50      0.40      0.44       180
           3       0.67      0.80      0.73       174
           4       0.57      0.58      0.57       202
           5       0.47      0.77      0.58       175
           6       0.32      0.19      0.23       188
           7       0.78      0.83      0.80       204
           8       0.93      0.76      0.84       168
           9       0.91      0.87      0.89       167

    accuracy                           0.68      1800
   macro avg       0.68      0.68      0.67      1800
weighted avg       0.68      0.68      0.67      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,116,2,2,15,0,21,12,0,0,0,168
1,2,158,0,10,0,2,2,0,0,0,174
2,2,0,72,3,42,37,21,0,3,0,180
3,9,1,1,139,7,11,6,0,0,0,174
4,0,0,25,22,117,9,29,0,0,0,202
5,0,0,0,0,0,135,1,31,3,5,175
6,40,1,38,12,34,25,35,0,3,0,188
7,0,0,0,0,0,26,0,169,0,9,204
8,0,0,7,6,4,14,3,6,128,0,168
9,0,0,0,0,1,7,1,12,0,146,167


In [19]:
#perform crossvalidation 
my_model = TemplateMatchClassifier()
scores = cross_val_score(my_model, X_train_plus_valid, y_train_plus_valid, cv=cv_folds, n_jobs=-1)
print(scores)

[0.70952381 0.6952381  0.68571429 0.69285714 0.68095238 0.68809524
 0.71190476 0.66190476 0.65952381 0.70238095]


In [21]:
# Set up the parameter grid to seaerch
param_grid = [
 {'distance_metric': ['euclidean', 'cosine', 'manhattan']}
]

# Perform the search
tuned_model = GridSearchCV(TemplateMatchClassifier(), param_grid, cv=cv_folds, verbose = 2, n_jobs=-1)
tuned_model.fit(X_train_plus_valid, y_train_plus_valid)

# Print details
print("Best parameters set found on development set:")
print(tuned_model.best_params_)
print(tuned_model.best_score_)


Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.1s finished


[0 1 2 3 4 5 6 7 8 9]
{0: array([8.95335303e-06, 1.46306742e-01, 5.74178530e-01, 5.82021667e-01,
       5.76318381e-01, 5.85164294e-01, 5.70776256e-01, 5.23914406e-01,
       5.02766586e-01, 4.50649118e-01, 2.15328140e-01, 5.19563076e-02,
       2.51159459e-01, 8.41615185e-03, 1.93392425e-03, 6.26734712e-05,
       5.37201182e-05, 1.89811084e-03, 9.23090697e-03, 4.80436924e-02,
       2.35553765e-01, 4.58044588e-01, 5.17154624e-01, 3.04360283e-01,
       5.26654132e-01, 5.59888978e-01, 5.92667204e-01, 5.84689766e-01,
       5.78073238e-01, 5.89041096e-01, 5.90957113e-01, 5.85692542e-01,
       5.85844749e-01, 5.77661384e-01, 2.60533620e-01, 5.84385352e-01,
       5.83928731e-01, 5.75655833e-01, 5.35473185e-01, 5.11424478e-01,
       4.96803653e-01, 3.51240039e-01, 9.90867580e-02, 2.37084788e-02,
       2.65914585e-03, 1.98316770e-01, 8.95335303e-05, 8.59521891e-04,
       4.42295640e-03, 1.66532366e-02, 8.77070463e-02, 3.36735607e-01,
       5.06464321e-01, 5.33763094e-01, 5.32858806e-

In [22]:
# Make a set of predictions for the test data
y_pred = tuned_model.predict(X_test)

# Print performance details
accuracy = metrics.accuracy_score(y_test, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix")
pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Accuracy: 0.6722222222222223
              precision    recall  f1-score   support

           0       0.70      0.68      0.69       168
           1       0.96      0.91      0.94       174
           2       0.49      0.39      0.43       180
           3       0.67      0.79      0.73       174
           4       0.57      0.57      0.57       202
           5       0.47      0.77      0.58       175
           6       0.32      0.20      0.24       188
           7       0.77      0.83      0.80       204
           8       0.94      0.75      0.83       168
           9       0.91      0.87      0.89       167

    accuracy                           0.67      1800
   macro avg       0.68      0.68      0.67      1800
weighted avg       0.67      0.67      0.67      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,114,3,3,15,1,21,11,0,0,0,168
1,1,159,0,10,0,2,2,0,0,0,174
2,2,0,70,3,41,37,24,0,3,0,180
3,9,2,1,138,7,10,7,0,0,0,174
4,0,0,26,21,116,9,30,0,0,0,202
5,0,0,0,0,0,134,1,33,2,5,175
6,38,1,36,13,35,25,37,0,3,0,188
7,0,0,0,0,0,25,0,170,0,9,204
8,0,0,8,6,4,16,2,6,126,0,168
9,0,0,0,0,1,7,1,12,0,146,167
