## Imports

In [1]:

import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import time
import cv2

from numpy import genfromtxt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score, mean_absolute_error, balanced_accuracy_score
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import *
from sklearn.ensemble import *
from sklearn.preprocessing import normalize
import graphviz 
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import SVG

from IPython.display import Image  

#----------------------------------------------------------------

import numpy as np
import matplotlib.pyplot as plt
# Set image size: side length (in pixels) of the square images after downscaling.
# Used further down as the cv2.resize target and to size the flattened rows.
num_pixels = 24


In [2]:
# Load the training features and labels from CSV, skipping the header row of each file.
csv_options = dict(delimiter=',', skip_header=1)
x_train_import = genfromtxt('x_train_gr_smpl.csv', **csv_options)
y_train_import = genfromtxt('y_train_smpl.csv', **csv_options)

In [3]:
# Load the testing features and labels from CSV (first row of each file is a header).
x_test_import = genfromtxt(
    'x_test_gr_smpl.csv',
    delimiter=',',
    skip_header=1,
)
y_test_import = genfromtxt(
    'y_test_smpl.csv',
    delimiter=',',
    skip_header=1,
)


In [4]:
# Check data has been imported correctly: print the shape of each imported array,
# in the order train features, train labels, test features, test labels.
for imported in (x_train_import, y_train_import, x_test_import, y_test_import):
    print(imported.shape)

(12660, 2304)
(12660,)
(4170, 2304)
(4170,)


## Preprocessing

In [5]:
# Combining data with labels: the label vector becomes the last column of its
# feature matrix, so rows and labels stay aligned through later shuffling.

# reshape(-1, 1) derives the row count from the array itself instead of
# hard-coding the dataset size, so the cell works for any number of samples.
y_train_import = y_train_import.reshape(-1, 1)
train_data_combined = np.append(x_train_import, y_train_import, axis=1)

y_test_import = y_test_import.reshape(-1, 1)
test_data_combined = np.append(x_test_import, y_test_import, axis=1)


In [6]:
# Randomises row order in a reproducible way: seed NumPy's global RNG once, then
# shuffle each combined array in place (train first, then test).
np.random.seed(0)
for combined in (train_data_combined, test_data_combined):
    np.random.shuffle(combined)

In [7]:
#equalize the data so all classes have the same amount of instances - smaller classes take their max

def equilize(data, max_per_class=240, num_classes=10):
    """Return a shuffled subset of ``data`` holding at most ``max_per_class`` rows per class.

    Parameters
    ----------
    data : array-like, 2-D
        One sample per row; the last column holds the class label.
    max_per_class : int, default 240
        Upper bound on the rows kept for each label. A class with fewer rows
        contributes all of them, so the result is only perfectly balanced when
        every class has at least this many rows.
    num_classes : int, default 10
        Labels ``0 .. num_classes - 1`` are collected; rows with any other
        label are dropped.

    Returns
    -------
    numpy.ndarray
        A new array with the selected rows (label column included), shuffled
        with ``np.random.shuffle`` and therefore driven by NumPy's global
        random state. ``data`` itself is not modified.
    """
    data = np.asarray(data)
    if data.size == 0 or num_classes <= 0:
        # Nothing to select from (or nothing to select): return an empty copy.
        return data[:0].copy()

    labels = data[:, -1]
    # Boolean masking keeps the original row order inside each class, so the
    # rows chosen are the first ``max_per_class`` occurrences of every label.
    per_class = [data[labels == label][:max_per_class] for label in range(num_classes)]

    train_combined_equal = np.concatenate(per_class, axis=0)
    np.random.shuffle(train_combined_equal)

    return train_combined_equal

# Build the class-capped training set and report how many rows it contains.
train_combined_equal = equilize(train_data_combined)
n_balanced_rows = len(train_combined_equal)
print("length:", n_balanced_rows)

length: 2400


In [8]:
# Split data and labels that have undergone the same randomization process.
# In every combined array the last column is the label; all columns before it
# are pixel values. Slicing by position avoids hard-coding the row counts.
x_train = train_combined_equal[:, :-1]
y_train = train_combined_equal[:, -1].copy()   # 1-D label vector


x_train_all = train_data_combined[:, :-1]
y_train_all = train_data_combined[:, -1].copy()

x_test = test_data_combined[:, :-1]
y_test = test_data_combined[:, -1].copy()

# What is the mean pixel value of each set?
print("train mean:",np.mean(x_train))
print("test mean:",np.mean(x_test))
# The differing means show that the test set is offset relative to the training set.

train mean: 80.40806694878472
test mean: 85.48841426858513


In [9]:
#  Reducing the number of attributes using opencv's resize functionality as shown in the second python Lecture
#  As seen in python lectures: https://vision.hw.ac.uk/webapps/blackboard/content/listContent.jsp?course_id=_94419_1&content_id=_3391865_1
def downscale_images(flat_images, source_side=48, target_side=num_pixels):
    """Resize every flattened square image in ``flat_images`` to a smaller square.

    Parameters
    ----------
    flat_images : numpy.ndarray, 2-D
        One image per row, flattened to ``source_side * source_side`` values.
    source_side : int, default 48
        Side length of the square each row is reshaped to before resizing.
    target_side : int, default ``num_pixels``
        Side length passed to ``cv2.resize`` as the output size.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_images, target_side * target_side)``.
    """
    resized = np.apply_along_axis(
        func1d=lambda img: cv2.resize(img.reshape((source_side, source_side)),
                                      (target_side, target_side)),
        axis=1, arr=flat_images)
    # apply_along_axis yields one 2-D image per row; flatten each back to a vector.
    return resized.reshape((-1, target_side * target_side))


# NOTE: each assignment overwrites its input, so this cell must run exactly once
# after the split cell; a second run would try to reshape already-reduced rows.
x_train_all = downscale_images(x_train_all)

x_train = downscale_images(x_train)

x_test = downscale_images(x_test)

In [10]:
# Stack the (balanced) training set on top of the test set, features and labels alike.
x_train_test = np.concatenate((x_train, x_test), axis=0)
y_train_test = np.concatenate((y_train, y_test), axis=0)

## Test Kappa score vs MAE

In [11]:
# Try a standard decision tree with no limits (all hyper-parameters left at their defaults).
clf = tree.DecisionTreeClassifier()
clf.fit(X=x_train, y=y_train)
# Disabled: export of the fitted tree to a PNG via graphviz.
# graph = graphviz.Source(tree.export_graphviz(clf, out_file=None,filled=True,rounded=True,class_names=True))  
# png_bytes = graph.pipe(format='png')  
# name = "overfitting_base_tree.png"
# with open(name,'wb') as f:
#     f.write(png_bytes)

# Image(name)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [12]:
# Predict on the test set and print the per-class precision / recall / F1 report.
y_pred_test_data = clf.predict(x_test)
test_report = classification_report(y_test, y_pred_test_data)
print(test_report)

              precision    recall  f1-score   support

         0.0       0.47      0.44      0.45       450
         1.0       0.47      0.53      0.50       630
         2.0       0.37      0.29      0.32       150
         3.0       0.80      0.81      0.81       420
         4.0       0.66      0.53      0.59       690
         5.0       0.80      0.80      0.80       720
         6.0       0.75      0.77      0.76       270
         7.0       0.05      0.13      0.07        60
         8.0       0.70      0.66      0.68       690
         9.0       0.22      0.37      0.28        90

    accuracy                           0.61      4170
   macro avg       0.53      0.53      0.53      4170
weighted avg       0.63      0.61      0.62      4170



In [13]:
# Score the same test predictions with Cohen's kappa and with mean absolute error.
# For kappa, a greater value is better.
# NOTE(review): MAE is computed on the numeric class labels, so it depends on how
# the classes happen to be numbered - confirm this comparison is intended.
kappa_value = cohen_kappa_score(y_test, y_pred_test_data)
mae_value = mean_absolute_error(y_test, y_pred_test_data)
print("kappa:", kappa_value)
print("MAE:", mae_value)

kappa: 0.5558382686842063
MAE: 1.1995203836930455


## The same procedure for up to 1320 images of each class

In [14]:
#equalize the data so all classes have the same amount of instances - smaller classes take their max

def equilize(data, max_per_class=1320, num_classes=10):
    """Cap every class of ``data`` at ``max_per_class`` rows and shuffle the result.

    Parameters
    ----------
    data : array-like, 2-D
        Samples as rows, with the class label stored in the last column.
    max_per_class : int, default 1320
        Maximum number of rows taken per label; smaller classes keep all of
        their rows, so class sizes may still differ.
    num_classes : int, default 10
        Only labels ``0 .. num_classes - 1`` are kept.

    Returns
    -------
    numpy.ndarray
        New array of the retained rows (labels included) in an order produced
        by ``np.random.shuffle``, i.e. governed by NumPy's global random state.
        The input is left untouched.
    """
    data = np.asarray(data)
    if data.size == 0 or num_classes <= 0:
        # No rows or no classes requested: hand back an empty copy.
        return data[:0].copy()

    labels = data[:, -1]
    kept = []
    for label in range(num_classes):
        # Masking preserves row order, so this takes the first occurrences of the label.
        kept.append(data[labels == label][:max_per_class])

    train_combined_equal = np.concatenate(kept, axis=0)
    np.random.shuffle(train_combined_equal)

    return train_combined_equal

# Rebuild the training set with the larger per-class cap and print its row count.
train_combined_equal = equilize(train_data_combined)
row_total = len(train_combined_equal)
print("length:", row_total)

length: 9660


In [15]:
# Split data and labels that have undergone the same randomization process.
# Features are every column except the last; the last column is the label.
# Positional slicing replaces the hard-coded row counts of the dataset.
x_train = train_combined_equal[:, :-1]
y_train = train_combined_equal[:, -1].copy()   # 1-D label vector


x_train_all = train_data_combined[:, :-1]
y_train_all = train_data_combined[:, -1].copy()

x_test = test_data_combined[:, :-1]
y_test = test_data_combined[:, -1].copy()

# What is the mean pixel value of each set?
print("train mean:",np.mean(x_train))
print("test mean:",np.mean(x_test))
# The differing means show that the test set is offset relative to the training set.

train mean: 80.58911520337301
test mean: 85.48841426858513


In [16]:
#  Reducing the number of attributes using opencv's resize functionality as shown in the second python Lecture
#  As seen in python lectures: https://vision.hw.ac.uk/webapps/blackboard/content/listContent.jsp?course_id=_94419_1&content_id=_3391865_1

# Each row is viewed as a 48x48 image, resized to num_pixels x num_pixels, then flattened again.
shrink_row = lambda img: cv2.resize(img.reshape((48,48)), (num_pixels,num_pixels))
flat_length = num_pixels*num_pixels

x_train_all = np.apply_along_axis(shrink_row, 1, x_train_all).reshape((-1, flat_length))

x_train = np.apply_along_axis(shrink_row, 1, x_train).reshape((-1, flat_length))

x_test = np.apply_along_axis(shrink_row, 1, x_test).reshape((-1, flat_length))

In [17]:
# Join training and test rows (and their labels) into single arrays, train first.
feature_parts = [x_train, x_test]
label_parts = [y_train, y_test]
x_train_test = np.concatenate(feature_parts, axis=0)
y_train_test = np.concatenate(label_parts, axis=0)

In [18]:
# Try a standard decision tree with no limits on the larger training set.
clf = tree.DecisionTreeClassifier()
clf.fit(X=x_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [19]:
# Predict on the test set, then show the per-class classification report.
y_pred_test_data = clf.predict(x_test)
report_text = classification_report(y_test, y_pred_test_data)
print(report_text)

              precision    recall  f1-score   support

         0.0       0.68      0.61      0.64       450
         1.0       0.59      0.73      0.65       630
         2.0       0.57      0.47      0.51       150
         3.0       0.81      0.82      0.82       420
         4.0       0.79      0.75      0.77       690
         5.0       0.89      0.85      0.87       720
         6.0       0.90      0.71      0.80       270
         7.0       0.06      0.08      0.07        60
         8.0       0.83      0.88      0.85       690
         9.0       0.56      0.43      0.49        90

    accuracy                           0.75      4170
   macro avg       0.67      0.63      0.65      4170
weighted avg       0.76      0.75      0.75      4170



In [20]:
# Cohen's kappa and mean absolute error for the same test predictions.
# For kappa, a greater value is better.
scores = (
    ("kappa:", cohen_kappa_score(y_test, y_pred_test_data)),
    ("MAE:", mean_absolute_error(y_test, y_pred_test_data)),
)
for score_label, score_value in scores:
    print(score_label, score_value)

kappa: 0.7085977195439088
MAE: 0.779136690647482


## The same procedure for up to 2160 images of each class (the whole dataset)

In [21]:
#equalize the data so all classes have the same amount of instances - smaller classes take their max

def equilize(data, max_per_class=2160, num_classes=10):
    """Select up to ``max_per_class`` rows for each class label, then shuffle them.

    Parameters
    ----------
    data : array-like, 2-D
        Rows are samples; the final column is the class label.
    max_per_class : int, default 2160
        Per-label row limit. Classes below the limit are kept in full, so the
        output is balanced only if every class reaches the limit.
    num_classes : int, default 10
        Labels ``0 .. num_classes - 1`` are considered; other rows are discarded.

    Returns
    -------
    numpy.ndarray
        Freshly allocated array of the chosen rows (label column included),
        shuffled via ``np.random.shuffle`` (NumPy's global random state).
    """
    data = np.asarray(data)
    if data.size == 0 or num_classes <= 0:
        # Degenerate input: return an empty copy rather than failing on indexing.
        return data[:0].copy()

    labels = data[:, -1]
    # Row order within each class is preserved by the mask, so the slice keeps
    # the earliest rows of every label.
    chunks = [data[labels == label][:max_per_class] for label in range(num_classes)]

    train_combined_equal = np.concatenate(chunks, axis=0)
    np.random.shuffle(train_combined_equal)

    return train_combined_equal

# Rebuild the training set with the highest per-class cap and print its row count.
train_combined_equal = equilize(train_data_combined)
selected_rows = len(train_combined_equal)
print("length:", selected_rows)

length: 12660


In [22]:
# Split data and labels that have undergone the same randomization process.
# The label lives in the last column of each combined array; slicing by
# position removes the dependency on hard-coded dataset sizes.
x_train = train_combined_equal[:, :-1]
y_train = train_combined_equal[:, -1].copy()   # 1-D label vector


x_train_all = train_data_combined[:, :-1]
y_train_all = train_data_combined[:, -1].copy()

x_test = test_data_combined[:, :-1]
y_test = test_data_combined[:, -1].copy()

# What is the mean pixel value of each set?
print("train mean:",np.mean(x_train))
print("test mean:",np.mean(x_test))
# The differing means show that the test set is offset relative to the training set.

train mean: 80.957739030685
test mean: 85.48841426858513


In [23]:
#  Reducing the number of attributes using opencv's resize functionality as shown in the second python Lecture
#  As seen in python lectures: https://vision.hw.ac.uk/webapps/blackboard/content/listContent.jsp?course_id=_94419_1&content_id=_3391865_1

def _shrink(img):
    """Resize one flattened 48x48 image to num_pixels x num_pixels with cv2.resize."""
    square = img.reshape((48,48))
    return cv2.resize(square, (num_pixels,num_pixels))

x_train_all = np.apply_along_axis(func1d=_shrink, axis=1, arr=x_train_all)
x_train_all = x_train_all.reshape((-1,num_pixels*num_pixels))

x_train = np.apply_along_axis(func1d=_shrink, axis=1, arr=x_train)
x_train = x_train.reshape((-1,num_pixels*num_pixels))

x_test = np.apply_along_axis(func1d=_shrink, axis=1, arr=x_test)
x_test = x_test.reshape((-1,num_pixels*num_pixels))

In [24]:
# Append the test rows and test labels after the training rows and labels.
x_train_test, y_train_test = (
    np.concatenate((x_train, x_test), axis=0),
    np.concatenate((y_train, y_test), axis=0),
)

In [25]:
# Try a standard decision tree with no limits on the full training set.
clf = tree.DecisionTreeClassifier()
clf.fit(X=x_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [26]:
# Predict on the test set and print precision / recall / F1 for every class.
y_pred_test_data = clf.predict(x_test)
full_report = classification_report(y_test, y_pred_test_data)
print(full_report)

              precision    recall  f1-score   support

         0.0       0.64      0.68      0.66       450
         1.0       0.70      0.78      0.73       630
         2.0       0.83      0.47      0.60       150
         3.0       0.87      0.85      0.86       420
         4.0       0.78      0.78      0.78       690
         5.0       0.88      0.95      0.91       720
         6.0       0.81      0.83      0.82       270
         7.0       0.26      0.23      0.25        60
         8.0       0.90      0.85      0.88       690
         9.0       0.65      0.33      0.44        90

    accuracy                           0.79      4170
   macro avg       0.73      0.68      0.69      4170
weighted avg       0.79      0.79      0.79      4170



In [27]:
# Report Cohen's kappa and mean absolute error for the test predictions.
# For kappa, a greater value is better.
kappa_full = cohen_kappa_score(y_test, y_pred_test_data)
print("kappa:", kappa_full)
mae_full = mean_absolute_error(y_test, y_pred_test_data)
print("MAE:", mae_full)

kappa: 0.7586583250892167
MAE: 0.6345323741007194
