# Machine Learning in Python - Roll your Own Estimator SOLUTION

## Import Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
from random import randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn import metrics
from scipy.spatial import distance


## Define TemplateMatchClassifier

The TemplateMatchClassifier algorithm is a simple template matching predictor. TemplateMatchClassifier only works for continuous descriptive features and categorical target features. TemplateMatchClassifier works very simply as follows:
* **Training:** For each target feature level calculate the average value of all descriptive features for instances that have that target level. Store these average vectors as templates for each target level.
* **Prediction:** When a new prediction needs to be made, compare the descriptive feature values of the new query instance to each template and return the target feature level belonging to the template that is closest (based on Euclidean distance by default) to the query instance.

### Define the TemplateMatchClassifier Class

In [2]:
# Create a new classifier which is based on the scikit-learn BaseEstimator and ClassifierMixin classes
class TemplateMatchClassifier(BaseEstimator, ClassifierMixin):
    """A very simple template match (nearest class-mean) classifier.

    TemplateMatchClassifier only works for continuous descriptive features and
    categorical target features. It works as follows:

    - Training: for each target feature level, calculate the average value of
      all descriptive features over the instances that have that target level.
      Store these average vectors as the template for that target level.
    - Prediction: compare the descriptive feature values of each query
      instance to every template and return the target level whose template
      is closest to the query instance under the configured distance metric.

    Parameters
    ----------
    distance_metric : string, optional (default = 'euclidean')
        The distance metric to use; one of 'euclidean', 'cosine', or
        'manhattan'. Any other value causes ``fit`` to emit a ``UserWarning``
        and fall back to 'euclidean'.
    demo_param : string, optional (default = 'demo')
        Placeholder parameter. It is stored on the estimator but is not read
        by ``fit`` or ``predict``.

    Attributes
    ----------
    classes_ : array of shape = [n_classes]
        The class labels seen during ``fit``.
    templates_ : dict
        The template for each class. The class labels are the dictionary
        keys and the mean feature vectors are the values.
    distance_function_ : callable
        The ``scipy.spatial.distance`` function selected by
        ``distance_metric`` during ``fit``.
    n_features_in_ : int
        The number of descriptive features seen during ``fit``.

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = TemplateMatchClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)  # doctest: +SKIP

    """

    # Constructor for the classifier object
    def __init__(self, distance_metric = 'euclidean', demo_param='demo'):
        # Only store the arguments unchanged here; the metric string is
        # resolved in fit so that get_params / set_params / clone round-trip.
        self.demo_param = demo_param
        self.distance_metric = distance_metric

    # The fit function to train a classifier
    def fit(self, X, y):
        """Compute one template (mean feature vector) per class from (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Validated with ``check_X_y`` using
            its default settings.
        y : array-like, shape = [n_samples]
            The target values (class labels) as integers or strings.

        Returns
        -------
        self : object
            The fitted classifier.

        Warns
        -----
        UserWarning
            If ``distance_metric`` is not one of the supported names; the
            Euclidean distance is used in that case.
        """
        # Function-scope import: the notebook's import cell does not import warnings
        import warnings

        # Map each supported metric name to its scipy distance function
        metric_functions = {
            'euclidean': distance.euclidean,
            'cosine': distance.cosine,
            'manhattan': distance.cityblock,
        }

        metric = self.distance_metric
        if not (isinstance(metric, str) and metric in metric_functions):
            # Keep the historical Euclidean fallback, but make it visible: a
            # misspelt metric would otherwise be reported under the wrong name.
            warnings.warn(
                "Unknown distance_metric %r; falling back to 'euclidean'. "
                "Supported values are 'euclidean', 'cosine' and 'manhattan'."
                % (metric,),
                UserWarning,
                stacklevel=2,
            )
            metric = 'euclidean'
        self.distance_function_ = metric_functions[metric]

        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        # Remember the feature count so predict can reject mismatched queries
        self.n_features_in_ = X.shape[1]

        # Build one template per class: the column-wise mean of that class's rows
        self.templates_ = {c: X[y == c].mean(axis = 0) for c in self.classes_}

        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The query instances. Validated with ``check_array`` using its
            default settings.

        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class label for each query instance. When several
            templates are equally close, the class that appears first in
            ``classes_`` is returned.

        Raises
        ------
        ValueError
            If the number of features in X differs from the number seen
            during ``fit``.
        """
        # Check that fit has been called by confirming that the templates_ dictionary has been set up
        check_is_fitted(self, ['templates_'])

        # Check that the input is a well-formed 2D numeric array
        X = check_array(X)

        # Reject queries whose width differs from the training data. Without
        # this, a mismatched query could be silently broadcast against the
        # templates or fail with an opaque error from the distance function.
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                "X has %d features, but TemplateMatchClassifier was fitted "
                "with %d features." % (X.shape[1], self.n_features_in_)
            )

        # The first class seeds the search; only the remaining classes need
        # to be compared against it.
        first_class = self.classes_[0]
        remaining_classes = self.classes_[1:]

        # Initialise an empty list to store the predictions made
        closest_classes = list()

        # Iterate through the query instances in the query dataset
        for instance in X:
            # Initialise best match to be the first template
            min_dist = self.distance_function_(instance, self.templates_[first_class])
            closest_class = first_class

            # Strict '<' keeps the earliest class on ties
            for c in remaining_classes:
                dist = self.distance_function_(instance, self.templates_[c])
                if dist < min_dist:
                    min_dist = dist
                    closest_class = c
            closest_classes.append(closest_class)

        return np.array(closest_classes)

### Test the TemplateMatchClassifier

Do a simple test of the TemplateMatchClassifier

In [3]:
# Toy training set: four instances, each with four descriptive features
a = np.array([
    [1, 23, 3, 4],
    [5, 6, 7, 8],
    [7, 5, 6, 2],
    [4, 9, 12, 43],
])
# Target level for each row of `a`
y = np.array([1, 2, 2, 1])

In [4]:
# Instantiate the classifier with the cosine distance metric for the toy example
my_model = TemplateMatchClassifier(distance_metric='cosine')

In [5]:
# Fit on the toy data: builds one mean-vector template per distinct label in y
my_model.fit(a, y)

TemplateMatchClassifier(demo_param='demo', distance_metric='cosine')

In [6]:
# Inspect the learned templates (dict: class label -> mean feature vector)
my_model.templates_

{1: array([ 2.5, 16. ,  7.5, 23.5]), 2: array([6. , 5.5, 6.5, 5. ])}

In [7]:
# Two query instances with the same four features as the toy training set
q = np.array([
    [2, 15, 6, 21],
    [8, 9, 7, 6],
])

In [8]:
# Predict a class label for each row of q using the fitted templates
my_model.predict(q)

array([1, 2])

Do a simple cross-validation experiment on the Iris dataset

In [9]:
from sklearn.datasets import load_iris

# Load the Iris data, then score a default (Euclidean) template matcher
# with 10-fold cross validation; the array of fold scores is the cell output
iris = load_iris()
clf = TemplateMatchClassifier()
cross_val_score(clf, X=iris.data, y=iris.target, cv=10)

array([0.86666667, 0.93333333, 0.93333333, 0.93333333, 0.93333333,
       0.8       , 1.        , 0.93333333, 1.        , 1.        ])

## Load & Partition Data

### Setup - IMPORTANT

Take only a sample of the dataset for fast testing

In [10]:
# Fraction of the loaded rows kept by dataset.sample(frac=...) when the data is loaded below
data_sampling_rate = 0.1

Setup the number of folds for all grid searches (should be 5 - 10)

In [11]:
# Passed as cv= to cross_val_score and GridSearchCV further down
cv_folds = 10

### Load & Partition Data

Load the dataset and explore it.

In [12]:
# Load the Fashion-MNIST training CSV (read from the current working directory)
dataset = pd.read_csv('fashion-mnist_train.csv')

# Take a sample from the dataset so everything runs smoothly. The fixed
# random_state makes the sample, and therefore every score reported below,
# reproducible when the notebook is restarted and re-run.
dataset = dataset.sample(frac=data_sampling_rate, random_state=0)

# Human-readable name for each integer target level
classes = {0: "T-shirt/top", 1:"Trouser", 2: "Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle boot"}
# Derived from the label dictionary so the two cannot drift apart (still 10)
num_classes = len(classes)

display(dataset.head())

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
40764,4,0,0,0,0,0,1,0,0,95,...,2,1,0,0,90,104,78,0,0,0
53761,3,0,0,0,0,0,0,0,0,0,...,5,92,48,0,0,0,0,0,0,0
46313,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59986,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,20,126,124,27,0,0
58395,6,0,0,0,2,0,0,0,74,109,...,147,39,87,53,0,0,1,0,0,0


Isolate the descriptive features we are interested in

In [13]:
# Descriptive features: every column after the first
# (assumes 'label' is the first column, as in the head() output above)
X = dataset.loc[:, dataset.columns[1:]]
# Target levels as a standalone NumPy array
Y = dataset["label"].to_numpy(copy=True)

In [14]:
# Scale the pixel columns by 1/255 (assumes 8-bit intensities in 0-255).
# The features are re-selected from `dataset` instead of rescaling X in place,
# so re-running this cell cannot divide an already-scaled X by 255 again.
X = dataset[dataset.columns[1:]] / 255

In [15]:
# First split: 70% of the data for training + validation, the rest held out as test
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=0,
)

# Second split: 0.5/0.7 of the training + validation pool becomes the training
# set (i.e. 50% of all data); the remainder is the validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_plus_valid,
    y_train_plus_valid,
    train_size=0.5 / 0.7,
    random_state=0,
)

## Train and Evaluate a Simple Model

In [16]:
# Fit a template matcher with default settings (Euclidean distance) on the training partition
my_model = TemplateMatchClassifier()
my_model.fit(X_train, y_train)

TemplateMatchClassifier(demo_param='demo', distance_metric='euclidean')

In [17]:
# Predict on the same partition the model was fitted on (training-set performance)
y_pred = my_model.predict(X_train)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = metrics.accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy}")
print(metrics.classification_report(y_train, y_pred))

# Confusion matrix as a labelled cross-tabulation with row and column totals
print("Confusion Matrix")
display(
    pd.crosstab(
        index=np.array(y_train),
        columns=y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
)

Accuracy: 0.695
              precision    recall  f1-score   support

           0       0.73      0.70      0.72       301
           1       0.98      0.90      0.94       313
           2       0.61      0.46      0.53       337
           3       0.75      0.76      0.76       316
           4       0.57      0.61      0.59       323
           5       0.48      0.83      0.61       275
           6       0.32      0.24      0.27       287
           7       0.79      0.84      0.82       281
           8       0.97      0.79      0.87       288
           9       0.86      0.84      0.85       279

    accuracy                           0.69      3000
   macro avg       0.71      0.70      0.69      3000
weighted avg       0.71      0.69      0.69      3000

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,211,0,6,30,1,39,13,0,1,0,301
1,8,283,3,9,2,5,2,0,1,0,313
2,1,0,156,2,79,34,64,0,1,0,337
3,14,7,1,241,11,26,16,0,0,0,316
4,0,0,43,21,197,17,45,0,0,0,323
5,0,0,0,0,0,228,0,26,1,20,275
6,55,0,41,9,56,53,69,0,4,0,287
7,0,0,0,0,0,26,0,237,0,18,281
8,0,0,6,7,2,26,6,13,228,0,288
9,0,0,0,1,0,17,2,24,0,235,279


In [18]:
# Predict on the held-out test partition
y_pred = my_model.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(metrics.classification_report(y_test, y_pred))

# Confusion matrix as a labelled cross-tabulation with row and column totals;
# left as the last expression so the notebook renders it as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=np.array(y_test),
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Accuracy: 0.6733333333333333
              precision    recall  f1-score   support

           0       0.70      0.72      0.71       156
           1       0.97      0.86      0.91       202
           2       0.58      0.45      0.51       197
           3       0.67      0.78      0.72       169
           4       0.49      0.56      0.52       171
           5       0.49      0.76      0.60       188
           6       0.29      0.17      0.21       184
           7       0.71      0.82      0.76       158
           8       0.95      0.76      0.84       187
           9       0.88      0.87      0.88       188

    accuracy                           0.67      1800
   macro avg       0.67      0.68      0.67      1800
weighted avg       0.68      0.67      0.67      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,112,2,6,11,5,14,4,1,1,0,156
1,4,173,3,12,0,7,3,0,0,0,202
2,1,0,89,2,40,29,32,0,4,0,197
3,7,4,0,132,8,11,7,0,0,0,169
4,0,0,25,18,96,7,25,0,0,0,171
5,0,0,0,1,0,143,1,28,0,15,188
6,35,0,24,11,44,35,31,1,3,0,184
7,0,0,0,0,0,21,0,130,0,7,158
8,0,0,7,10,4,15,2,7,142,0,187
9,0,0,0,0,0,8,1,15,0,164,188


## Do a Cross Validation Experiment With Our Model

In [19]:
# Score a fresh default model with cv_folds-fold cross validation on the
# combined training + validation data, using all available cores
my_model = TemplateMatchClassifier()
scores = cross_val_score(
    my_model,
    X=X_train_plus_valid,
    y=y_train_plus_valid,
    cv=cv_folds,
    n_jobs=-1,
)
print(scores)

[0.66666667 0.71394799 0.6855792  0.73696682 0.68720379 0.68809524
 0.64200477 0.67625899 0.70023981 0.74879227]


## Do a Grid Search Through Distance Metrics

In [20]:
# Candidate values for the classifier's distance_metric parameter
param_grid = [
    {'distance_metric': ['euclidean', 'cosine', 'manhattan']},
]

# Cross-validated search over every candidate, using all available cores
my_tuned_model = GridSearchCV(
    estimator=TemplateMatchClassifier(),
    param_grid=param_grid,
    cv=cv_folds,
    verbose=2,
    n_jobs=-1,
)
my_tuned_model.fit(X_train_plus_valid, y_train_plus_valid)

# Report the winning setting and its mean cross-validated score
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)


Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best parameters set found on development set:
{'distance_metric': 'cosine'}
0.695952380952381


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    5.1s finished


In [21]:
# Predict on the held-out test partition with the model selected by the grid search
y_pred = my_tuned_model.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(metrics.classification_report(y_test, y_pred))

# Confusion matrix as a labelled cross-tabulation with row and column totals;
# left as the last expression so the notebook renders it as the cell output
print("Confusion Matrix")
pd.crosstab(
    index=np.array(y_test),
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Accuracy: 0.6716666666666666
              precision    recall  f1-score   support

           0       0.66      0.79      0.72       156
           1       0.98      0.88      0.93       202
           2       0.58      0.63      0.60       197
           3       0.69      0.84      0.76       169
           4       0.51      0.54      0.52       171
           5       0.51      0.28      0.36       188
           6       0.42      0.27      0.33       184
           7       0.63      0.89      0.74       158
           8       0.91      0.83      0.87       187
           9       0.71      0.80      0.75       188

    accuracy                           0.67      1800
   macro avg       0.66      0.68      0.66      1800
weighted avg       0.66      0.67      0.66      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,123,0,1,12,0,1,14,0,5,0,156
1,4,178,4,11,3,0,2,0,0,0,202
2,2,0,124,2,45,0,19,0,5,0,197
3,9,3,0,142,1,0,14,0,0,0,169
4,0,0,50,20,93,0,8,0,0,0,171
5,0,0,1,2,0,52,3,72,0,58,188
6,47,0,32,8,41,0,50,0,6,0,184
7,0,0,0,0,0,12,0,141,0,5,158
8,1,0,3,8,1,4,8,7,155,0,187
9,0,0,0,0,0,32,1,4,0,151,188
