In [1]:
from IPython.display import display, HTML, Image

from TAS_Python_Utilities import data_viz
from TAS_Python_Utilities import data_viz_target
from TAS_Python_Utilities import visualize_tree

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import random

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn import metrics
from scipy.spatial import distance


%matplotlib inline
#%qtconsole

In [2]:
# Experiment configuration
data_sampling_rate = 0.1  # fraction of the training CSV kept for the experiments below
cv_folds = 10  # NOTE(review): not referenced by any cell visible here — confirm whether it is still needed

dataset = pd.read_csv('fashion-mnist_train.csv')
# Take a sample from the dataset so everything runs smoothly. The fixed seed
# (the same value the train/test splits use) makes the sample, and therefore
# every score reported below, reproducible across kernel restarts.
dataset = dataset.sample(frac=data_sampling_rate, random_state=0)

test_dataset = pd.read_csv('fashion-mnist_test.csv')

# Define TemplateMatchClassifier

Define and test out the `TemplateMatchClassifier` class. To build a scikit-learn classifier we extend the `BaseEstimator` and `ClassifierMixin` classes and implement the `__init__`, `fit`, and `predict` methods. (This classifier does not implement `predict_proba`.)

In [3]:
# Create a new classifier which is based on the scikit-learn BaseEstimator and ClassifierMixin classes
class TemplateMatchClassifier(BaseEstimator, ClassifierMixin):
    """Nearest-template (nearest class mean) classifier.

    ``fit`` stores one template per class: the mean of every descriptive
    feature over the training rows carrying that class label. ``predict``
    assigns each query row the label of the template closest to it under the
    configured distance.

    Parameters
    ----------
    dst_type : {'euclidean', 'minkowski'}, default='euclidean'
        Name of the distance used to compare a query row with the templates.
        'minkowski' is evaluated with SciPy's default order (p=2), so it
        produces the same distances as 'euclidean'.
        link: https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    Attributes
    ----------
    classes_ : ndarray, shape = [n_classes]
        Sorted unique class labels seen during ``fit``.
    avg_all : ndarray, shape = [n_classes, n_features]
        Row ``i`` is the per-feature mean (template) of class ``classes_[i]``.
    """

    # Distance names accepted for ``dst_type``. 'mahalanobis' is not offered
    # because it needs an inverse covariance matrix that is never estimated.
    _SUPPORTED_DST_TYPES = ('euclidean', 'minkowski')

    # Constructor for the classifier object
    def __init__(self, dst_type = 'euclidean'):
        # Stored verbatim (no validation here) so that get_params/clone work
        self.dst_type = dst_type

    def _check_dst_type(self):
        """Return ``self.dst_type`` if it is supported, else raise ValueError."""
        if self.dst_type not in self._SUPPORTED_DST_TYPES:
            raise ValueError(
                "Unsupported dst_type %r; expected one of %r"
                % (self.dst_type, self._SUPPORTED_DST_TYPES)
            )
        return self.dst_type

    # The fit function to train a classifier
    def fit(self, X, y):
        """Compute one mean template per class from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples (dense and numeric; validated with
            ``check_X_y``).
        y : array-like, shape = [n_samples]
            The target values (class labels).

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``dst_type`` is not a supported distance name, or if
            ``check_X_y`` rejects the inputs.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # Fail early on a bad distance name instead of at prediction time
        self._check_dst_type()

        # Store the classes seen during fit (sorted unique labels)
        self.classes_ = np.unique(y)

        # For each target level, average every descriptive feature over the
        # rows that have that level. A boolean mask always yields a 2-D slice,
        # so classes with a single training row are handled, and vstack keeps
        # the result 2-D even when only one class is present.
        self.avg_all = np.vstack(
            [X[y == label].mean(axis=0) for label in self.classes_]
        )

        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict the class label of every row of X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Query instances, with the same features (in the same order) as
            the data passed to ``fit``.

        Returns
        -------
        y : ndarray, shape = [n_samples]
            For each row, the label from ``classes_`` whose template is
            nearest. When several templates are equally near, the one whose
            label comes first in ``classes_`` wins.

        Raises
        ------
        ValueError
            If ``dst_type`` is not a supported distance name.
        """
        # Check that fit has been called by confirming the templates exist
        check_is_fitted(self, ['avg_all'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        # dst_type may have been changed through set_params after fitting
        metric = self._check_dst_type()

        # distances_[j, i] is the distance from query row j to the template of class i
        distances_ = distance.cdist(X, self.avg_all, metric=metric)

        # Translate the position of the nearest template back into its class
        # label, so the result is right for any label set, not only 0..n-1
        nearest = np.argmin(distances_, axis=1)
        return self.classes_[nearest]

In [4]:
# Every column except the target is a descriptive feature
# (Index.difference returns the remaining column names in sorted order by default)
feature_columns = dataset.columns.difference(["label"])
X = dataset[feature_columns]

# Target labels as a plain NumPy array
Y = np.array(dataset["label"])

In [18]:
# Instantiate the template matcher with the 'minkowski' distance option
my_model = TemplateMatchClassifier(dst_type='minkowski')

In [19]:
# Fit on the sampled features/labels; fit returns the estimator, which is echoed as the cell output
my_model.fit(X, Y)

TemplateMatchClassifier(dst_type='minkowski')

In [20]:
# Draw 10 rows from the test file as query instances; the fixed seed makes
# the predictions displayed below repeatable across kernel restarts
q_first = test_dataset.sample(10, random_state=0)

# Keep only the feature columns, selected the same way as for X so the column order matches
q = q_first[q_first.columns.difference(["label"])]

In [21]:
# Predict a label for each sampled query row; the resulting array is shown as the cell output
my_model.predict(q)

array([4, 4, 4, 8, 4, 4, 4, 4, 9, 4], dtype=int64)

In [22]:
# True labels of the sampled query rows, displayed for comparison with the predictions above
q_first_label = q_first["label"]
q_first_label

6840    8
1312    4
2858    0
4898    7
3376    6
9556    3
3722    3
8525    4
4031    9
6567    6
Name: label, dtype: int64

# Trying with the Fashion-MNIST dataset

In [11]:
# Divide every feature by 255 (presumably the maximum 8-bit pixel intensity — TODO confirm).
# X is recomputed from `dataset` rather than from itself so that re-running this
# cell, or running cells out of order, can never scale the features twice.
X = dataset[dataset.columns.difference(["label"])] / 255

In [12]:
# First split: 70% of the rows go to train+validation, the remaining 30% are held out for testing
(X_train_plus_valid, X_test,
 y_train_plus_valid, y_test) = train_test_split(
    X, Y,
    train_size=0.7,
    random_state=0,
)

# Second split: 0.5/0.7 of the train+validation rows become the training set,
# i.e. 50% of all rows for training and 20% for validation
(X_train, X_valid,
 y_train, y_valid) = train_test_split(
    X_train_plus_valid, y_train_plus_valid,
    train_size=0.5/0.7,
    random_state=0,
)



In [13]:
# Rebuild the classifier with its default distance ('euclidean') and fit it on the training split only;
# the fitted estimator returned by fit is echoed as the cell output
my_model = TemplateMatchClassifier()
my_model.fit(X_train, y_train)

TemplateMatchClassifier(dst_type='euclidean')

In [14]:
# Score the fitted model on the same rows it was trained on
y_pred = my_model.predict(X_train)

# Overall accuracy, then the per-class precision / recall / F1 table
accuracy = metrics.accuracy_score(y_train, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_train, y_pred))

# Confusion matrix as a cross-tabulation of true vs. predicted labels, including row/column totals
print("Confusion Matrix")
display(
    pd.crosstab(
        y_train,
        y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
)

Accuracy: 0.689
             precision    recall  f1-score   support

          0       0.72      0.68      0.70       303
          1       0.98      0.89      0.93       313
          2       0.55      0.53      0.54       285
          3       0.66      0.77      0.71       285
          4       0.59      0.53      0.56       322
          5       0.48      0.77      0.59       305
          6       0.36      0.24      0.29       286
          7       0.77      0.82      0.79       284
          8       0.97      0.76      0.86       333
          9       0.88      0.88      0.88       284

avg / total       0.70      0.69      0.69      3000

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,206,1,9,32,3,40,10,0,2,0,303
1,10,278,2,12,2,5,4,0,0,0,313
2,3,0,150,3,44,38,47,0,0,0,285
3,17,5,0,220,15,19,9,0,0,0,285
4,2,0,58,36,172,17,36,0,1,0,322
5,0,0,0,0,0,234,1,49,2,19,305
6,48,1,46,11,56,53,69,0,2,0,286
7,0,0,0,0,0,37,0,233,0,14,284
8,0,0,8,15,1,33,16,5,254,1,333
9,0,0,0,2,0,12,2,17,0,251,284


In [15]:
# Score the fitted model on the held-out test split
y_pred = my_model.predict(X_test)

# Overall accuracy, then the per-class precision / recall / F1 table
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

# Confusion matrix as a cross-tabulation of true vs. predicted labels, including row/column totals;
# left as the last expression so the notebook renders it
print("Confusion Matrix")
pd.crosstab(
    np.array(y_test),
    y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Accuracy: 0.6866666666666666
             precision    recall  f1-score   support

          0       0.68      0.70      0.69       173
          1       0.97      0.91      0.94       190
          2       0.53      0.49      0.51       169
          3       0.77      0.77      0.77       197
          4       0.52      0.58      0.55       180
          5       0.50      0.77      0.60       175
          6       0.31      0.19      0.24       181
          7       0.78      0.84      0.81       186
          8       0.93      0.73      0.82       176
          9       0.88      0.86      0.87       173

avg / total       0.69      0.69      0.68      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,121,1,8,13,2,19,8,1,0,0,173
1,5,173,0,5,1,4,1,0,1,0,190
2,0,0,82,2,36,15,33,0,1,0,169
3,13,3,1,152,9,16,3,0,0,0,197
4,0,0,35,8,105,11,19,0,2,0,180
5,0,0,0,1,0,134,0,27,1,12,175
6,34,1,27,7,46,27,35,0,4,0,181
7,0,0,0,0,0,20,0,157,0,9,186
8,3,0,0,10,3,14,13,4,129,0,176
9,1,0,1,0,0,9,1,13,0,148,173


In [23]:
# Candidate values of the distance option explored by the grid search
param_grid = [
    {'dst_type': ['euclidean', 'minkowski']},
]

# Exhaustive search over the grid using 2-fold cross-validation on the train+validation rows
# NOTE(review): cv is hard-coded to 2 while a cv_folds setting exists in the configuration cell — confirm which is intended
my_tuned_model = GridSearchCV(
    TemplateMatchClassifier(),
    param_grid,
    cv=2,
    verbose=2,
)
my_tuned_model.fit(X_train_plus_valid, y_train_plus_valid)

# Report the winning parameter setting and its cross-validation score
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] dst_type=euclidean ..............................................
[CV] ............................... dst_type=euclidean, total=   6.3s
[CV] dst_type=euclidean ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.8s remaining:    0.0s


[CV] ............................... dst_type=euclidean, total=   5.7s
[CV] dst_type=minkowski ..............................................
[CV] ............................... dst_type=minkowski, total=   5.4s
[CV] dst_type=minkowski ..............................................
[CV] ............................... dst_type=minkowski, total=   5.5s
Best parameters set found on development set:
{'dst_type': 'euclidean'}
0.69


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   46.1s finished


In [27]:
# Score the grid-searched model on the held-out test split
y_pred = my_tuned_model.predict(X_test)

# Overall accuracy, then the per-class precision / recall / F1 table
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

# Confusion matrix as a cross-tabulation of true vs. predicted labels, including row/column totals;
# left as the last expression so the notebook renders it
print("Confusion Matrix")
pd.crosstab(
    np.array(y_test),
    y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Accuracy: 0.6933333333333334
             precision    recall  f1-score   support

          0       0.78      0.70      0.74       196
          1       0.98      0.92      0.95       169
          2       0.54      0.51      0.52       174
          3       0.72      0.80      0.76       178
          4       0.56      0.53      0.55       184
          5       0.55      0.74      0.63       211
          6       0.30      0.22      0.26       175
          7       0.70      0.84      0.76       161
          8       0.94      0.78      0.85       169
          9       0.89      0.90      0.89       183

avg / total       0.69      0.69      0.69      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,137,0,2,18,6,21,11,1,0,0,196
1,2,156,2,3,2,2,1,0,1,0,169
2,1,0,88,2,26,19,38,0,0,0,174
3,4,3,1,142,7,12,9,0,0,0,178
4,1,0,28,19,98,11,24,0,3,0,184
5,0,0,0,0,0,157,0,37,3,14,211
6,30,0,36,8,33,27,39,0,2,0,175
7,0,0,0,0,0,20,0,135,0,6,161
8,1,0,6,5,2,11,5,7,132,0,169
9,0,0,0,0,0,4,1,14,0,164,183
