# Initialisation

In [1]:
%config Completer.use_jedi = False
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import os
# Export the machine's CPU count to every threading-related environment
# variable. This runs before the numeric libraries are imported below.
_ThreadCount = str(os.cpu_count())
for _ThreadVar in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_ThreadVar] = _ThreadCount

import re
import pickle
import numpy as np
import seaborn as sns
import matplotlib.cm as cmap #Importing colormap
sns.set()

from pytz import timezone
from datetime import datetime
from collections import namedtuple
from sklearn.preprocessing import OrdinalEncoder

from sklearn.cluster import KMeans
from sklearn.neighbors import DistanceMetric


def SaveVariable(Variable, FileName):
    """Pickle `Variable` into the file at `FileName`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(FileName, mode="wb") as Handle:
        pickle.dump(Variable, Handle)
    
def LoadVariable(FileName):
    """Read the file at `FileName` and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(FileName, mode="rb") as Handle:
        return pickle.load(Handle)

# Report the CPU count used for the thread settings and stamp the run with
# the current time in the Europe/Athens timezone.
print(f"Num threads set to: {os.cpu_count()}")
RunStamp = datetime.now(timezone('Europe/Athens')).strftime("%a, %Y-%m-%d %H:%M %Z %z")
print(f"Ran on ({RunStamp})")

from performance_function import calculate_performance_numpy

Num threads set to: 48
Ran on (Sat, 2021-05-15 16:53 EEST +0300)


# Data

In [14]:
# Task configuration.
SupervisedType = "Classification"
RandomState = 1337

# Load the pickled training pair and the test features; only the first
# element of the test pickle is used.
DataDir = os.getcwd()
X_Train, Y_Train = LoadVariable(f"{DataDir}/raw_data.pkl")
X_Test = LoadVariable(f"{DataDir}/test_data (2).pkl")[0]

# Ordinal-encode the training labels. The encoder expects 2-D input, so each
# label is wrapped in a one-element row and the result is squeezed back.
IntEncoder = OrdinalEncoder()
Y_Train = IntEncoder.fit_transform([[Label] for Label in Y_Train]).squeeze()
# Y_Test = IntEncoder.transform([[x] for x in Y_Test]).squeeze()

N = X_Train.shape[0]
Y_TrainUnique = set(Y_Train)
# Y_TestUnique = set(Y_Test)
print(f"Unique Values: Y_Train ({len(Y_TrainUnique)})") #, Y_Test ({len(Y_TestUnique)}), all values are the same: {Y_TrainUnique == Y_TestUnique}")


# Type, shape and value range of each feature matrix.
for ArrayTag, ArrayData in (("X_Train:", X_Train), ("X_Test :", X_Test)):
    print(ArrayTag, type(ArrayData), ArrayData.shape, ArrayData.min(), ArrayData.max())

# Same summary for the encoded labels, plus the number of distinct values.
print("Y_Train:", type(Y_Train), Y_Train.shape, min(Y_Train), max(Y_Train), len(Y_TrainUnique))
# print("Y_Test: ", type(Y_Test), Y_Test.shape, min(Y_Test), max(Y_Test), len(set(Y_Test)))

print("\nY_Train[:10]:", Y_Train[:10])

# Output size K. An output size > 1 can be either multiclass or
# multivariate regression (like Lat/Lon coordinates):
#   - classification with a label count other than 2 -> number of distinct labels
#   - "multivariateregression"                       -> second dimension of Y_Train
#   - everything else (incl. 2-label classification) -> 1
TaskType = SupervisedType.lower()
if TaskType == "classification":
    # The label set is only built for classification, as in the one-line form.
    NumClasses = len(set(Y_Train))
    K = NumClasses if NumClasses != 2 else 1
elif TaskType == "multivariateregression":
    K = Y_Train.shape[1]
else:
    K = 1

def _SplitDims(Dims):
    """Translate a sequence of dimension sizes into ``(N, H1, W1, D)``.

    * 2 sizes -> ``(N, 0, 0, D)``
    * 3 sizes -> ``(N, H1, W1, 0)``
    * 4 sizes -> ``(N, H1, W1, D)``

    Raises:
        ValueError: for any other number of dimensions, instead of leaving
            N/H1/W1/D unset and failing later with a NameError.
    """
    Dims = tuple(Dims)
    if len(Dims) == 2:
        return Dims[0], 0, 0, Dims[1]
    if len(Dims) == 3:
        return Dims[0], Dims[1], Dims[2], 0
    if len(Dims) == 4:
        return Dims
    raise ValueError(f"Cannot derive N/H1/W1/D from shape {Dims}: expected 2, 3 or 4 dimensions")


# The loader branch reads from `train_loader`, so require it as well as
# `train_dataset`; checking only the latter could raise a NameError.
UsingLoader = 'train_dataset' in globals() and 'train_loader' in globals()

if UsingLoader:
    tmpX, tmpY = next(iter(train_loader))
    # Size-1 dimensions are ignored when interpreting the batch shape.
    # With 3 remaining dimensions this is RNN-style NxTxD.
    N, H1, W1, D = _SplitDims(DimVal for DimVal in tmpX.shape if DimVal > 1)
else:
    # A 3-D X_Train is a picture with no colour channel, not an RNN input.
    N, H1, W1, D = _SplitDims(X_Train.shape)

print()
print("X_Train.shape", X_Train.shape, " Y_Train.shape", Y_Train.shape)
print("X_Test.shape ", X_Test.shape)#, " Y_Test.shape ", Y_Test.shape)
print("K         ", K)
print("N:", N, "H1:", H1, "W1:", W1, "D:", D)
if UsingLoader:
    # `batch_size` is not defined in this cell; if it is missing, report the
    # leading dimension of the batch that was just fetched.
    BatchSizeShown = globals().get('batch_size', tmpX.shape[0])
    print(f"\nData after transformation with batch size = {BatchSizeShown}:")
    print("X.shape", tuple(tmpX.shape), "\tY.shape", tuple(tmpY.shape))

Unique Values: Y_Train (600)
X_Train: <class 'numpy.ndarray'> (9600, 11025) -1.0 1.0
X_Test : <class 'numpy.ndarray'> (320, 11025) -1.0 1.0
Y_Train: <class 'numpy.ndarray'> (9600,) 0.0 599.0 600

Y_Train[:10]: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

X_Train.shape (9600, 11025)  Y_Train.shape (9600,)
X_Test.shape  (320, 11025)
K          600
N: 9600 H1: 0 W1: 0 D: 11025


# Model

Euclidean Distance: sqrt(sum((x - y)^2))

In [26]:
# Euclidean distance between every pair of training rows.
Euclidean = DistanceMetric.get_metric('euclidean')
EucDist = Euclidean.pairwise(X_Train)

# Score the distance matrix against the training labels and report.
cmc_scores, acc = calculate_performance_numpy(EucDist, Y_Train)
print("{:.2f}% accuracy with the (Euclidean) Metric".format(acc * 100))
print("The cmc scores are:", cmc_scores)

0.34% accuracy with the (Euclidean) Metric
The cmc scores are: [0.00572917 0.0109375  0.01479167 0.02489583]


Manhattan Distance: sum(|x - y|)

In [33]:
# Manhattan distance between every pair of training rows.
Manhattan = DistanceMetric.get_metric('manhattan')
ManhDist = Manhattan.pairwise(X_Train)

# Score the distance matrix against the training labels and report.
cmc_scores, acc = calculate_performance_numpy(ManhDist, Y_Train)
print("{:.2f}% accuracy with the (Manhattan) Metric".format(acc * 100))
print("The cmc scores are:", cmc_scores)

0.43% accuracy with the (Manhattan) Metric
The cmc scores are: [0.00854167 0.0159375  0.02       0.02989583]


Chebyshev Distance: max(|x - y|)

In [34]:
# Chebyshev distance between every pair of training rows.
Chebyshev = DistanceMetric.get_metric('chebyshev')
ChebDist = Chebyshev.pairwise(X_Train)

# Score the distance matrix against the training labels and report.
cmc_scores, acc = calculate_performance_numpy(ChebDist, Y_Train)
print("{:.2f}% accuracy with the (Chebyshev) Metric".format(acc * 100))
print("The cmc scores are:", cmc_scores)

0.29% accuracy with the (Chebyshev) Metric
The cmc scores are: [0.00375    0.01135417 0.01625    0.0278125 ]


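Hamming Distance: N_unequal(x, y) / N_tot (the fraction of coordinates that differ exactly, so it is mainly meaningful for discrete-valued features)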

In [39]:
# Hamming distance between every pair of training rows.
Hamming = DistanceMetric.get_metric('hamming')
HammDist = Hamming.pairwise(X_Train)

# Score the distance matrix against the training labels and report.
cmc_scores, acc = calculate_performance_numpy(HammDist, Y_Train)
print("{:.2f}% accuracy with the (Hamming) Metric".format(acc * 100))
print("The cmc scores are:", cmc_scores)

0.16% accuracy with the (Hamming) Metric
The cmc scores are: [0.0025     0.00541667 0.00614583 0.0078125 ]


Canberra Distance: sum(|x - y| / (|x| + |y|))

In [41]:
# Canberra distance between every pair of training rows.
Canberra = DistanceMetric.get_metric('canberra')
CanbDist = Canberra.pairwise(X_Train)

# Score the distance matrix against the training labels and report.
cmc_scores, acc = calculate_performance_numpy(CanbDist, Y_Train)
print("{:.2f}% accuracy with the (Canberra) Metric".format(acc * 100))
print("The cmc scores are:", cmc_scores)

2.36% accuracy with the (Canberra) Metric
The cmc scores are: [0.04645833 0.0846875  0.11072917 0.15354167]


BrayCurtis Distance: sum(|x - y|) / (sum(|x|) + sum(|y|))

In [42]:
# Bray-Curtis distance between every pair of training rows.
Braycurtis = DistanceMetric.get_metric('braycurtis')
BraycDist = Braycurtis.pairwise(X_Train)

# Score the distance matrix against the training labels and report.
cmc_scores, acc = calculate_performance_numpy(BraycDist, Y_Train)
print("{:.2f}% accuracy with the (Braycurtis) Metric".format(acc * 100))
print("The cmc scores are:", cmc_scores)

1.77% accuracy with the (Braycurtis) Metric
The cmc scores are: [0.03354167 0.06427083 0.08520833 0.12208333]


# Saving the Results

In [43]:
# Recompute every pairwise distance matrix on the test features. The metric
# and matrix names from the training cells are rebound to the test-set
# versions here.
Euclidean = DistanceMetric.get_metric('euclidean')
Manhattan = DistanceMetric.get_metric('manhattan')
Chebyshev = DistanceMetric.get_metric('chebyshev')
Hamming = DistanceMetric.get_metric('hamming')
Canberra = DistanceMetric.get_metric('canberra')
Braycurtis = DistanceMetric.get_metric('braycurtis')

EucDist = Euclidean.pairwise(X_Test)
ManhDist = Manhattan.pairwise(X_Test)
ChebDist = Chebyshev.pairwise(X_Test)
HammDist = Hamming.pairwise(X_Test)
CanbDist = Canberra.pairwise(X_Test)
BraycDist = Braycurtis.pairwise(X_Test)

# Persist each matrix as test_distances_<metric>.pkl, in the same order as
# the metrics were evaluated above.
for MetricTag, TestMatrix in (
    ("euclidean", EucDist),
    ("manhattan", ManhDist),
    ("chebyshev", ChebDist),
    ("hamming", HammDist),
    ("canberra", CanbDist),
    ("braycurtis", BraycDist),
):
    SaveVariable(TestMatrix, f"test_distances_{MetricTag}.pkl")