In [1]:
import pandas as pd
import numpy as np
import numpy as np
from collections import defaultdict
from dtaidistance import dtw
from src.utils import error
from operator import itemgetter
import sys
from time import time
import heapq

# Analysing results from OPFython

Next, we check whether our own OPF implementation produces the same predictions as the OPFython library.

In [2]:
"""Union Find structure with Path Compression + Merge by Size"""
class DSU:
    """Disjoint-set (union-find) over the integers ``0 .. n-1``.

    Uses path compression in ``find`` and merge-by-size in ``merge``.

    Attributes:
        par: ``par[x]`` is the parent of ``x``; a root is its own parent.
        siz: ``siz[r]`` is the size of the set rooted at ``r`` (only kept
            up to date for roots).
    """

    def __init__(self, n):
        """Start with ``n`` singleton sets."""
        self.par = np.arange(n)
        self.siz = np.ones(n)

    def find(self, x):
        """Return the root of the set containing ``x``, compressing the path."""
        # First pass: walk up to the root.
        root = x
        while self.par[root] != root:
            root = self.par[root]
        # Second pass: point every node visited on the way directly at the root.
        node = x
        while node != root:
            parent = self.par[node]
            self.par[node] = root
            node = parent
        return root

    def same(self, u, v):
        """Tell whether ``u`` and ``v`` currently belong to the same set."""
        root_u = self.find(u)
        root_v = self.find(v)
        return root_u == root_v

    def merge(self, u, v):
        """Union the sets of ``u`` and ``v``; the smaller set is attached to the larger."""
        small, large = self.find(u), self.find(v)
        if small == large:
            return
        # Make sure ``small`` really is the smaller tree before re-parenting it.
        if self.siz[small] > self.siz[large]:
            small, large = large, small
        self.par[small] = large
        self.siz[large] += self.siz[small]

"""Optimum Path Forest Classifier"""
class OptimumPathForestClassifier:
    """Supervised Optimum-Path Forest (OPF) classifier on a complete graph.

    ``fit`` works in three stages:

    1. Build the complete weighted graph over the training samples, using the
       selected cost function as edge weight.
    2. Run Kruskal's minimum spanning tree; the endpoints of every MST edge
       that joins two differently-labelled samples become *prototypes*.
    3. Run a multi-source Dijkstra from the prototypes where the cost of a
       path is its largest edge weight; every sample inherits the label of
       the prototype that reaches it with the lowest such cost.

    Attributes set by ``fit``:
        X: float copy of the training samples.
        label: label assigned to each training sample by the forest.
        adj: ``adj[u]`` is a list of ``(v, weight)`` for every sample ``v``.
        edges: list of ``(weight, u, v)`` tuples, sorted ascending.
        prototypes: sorted array of prototype sample indices.
        cost: optimum path cost of each training sample.
        ordered_nodes: ``(index, cost)`` pairs sorted by ascending cost.
    """

    def __init__(self, cost='euclidean-distance'):
        """Select the cost (distance) function used as edge weight.

        Args:
            cost: one of ``'euclidean-distance'``, ``'manhattan-distance'``
                or ``'dtw-distance'``.

        Raises:
            AssertionError: if ``cost`` is not one of the names above.
        """
        available_cost_functions = {
            'euclidean-distance': lambda x, y: np.linalg.norm(x - y),
            'manhattan-distance': lambda x, y: np.sum(np.abs(x - y)),
            'dtw-distance': lambda x, y: dtw.distance_fast(x, y, window=100, use_pruning=True)
        }
        assert cost in available_cost_functions.keys(),\
            f"Invalid cost function. Should be one of {available_cost_functions.keys()}"
        self.F = available_cost_functions[cost]

    def fit(self, X_, Y_):
        """Train the forest on samples ``X_`` with integer labels ``Y_``.

        Args:
            X_: array-like of shape (n, ...) convertible to a float array.
            Y_: array-like of ``n`` labels convertible to ``int``.

        Raises:
            ValueError: if the training set is empty.
        """
        n = len(Y_)
        if n == 0:
            raise ValueError("Cannot fit an OPF classifier on an empty training set.")
        self.X = np.array(X_, copy=True, dtype=float)
        self.label = np.full(n, -1, dtype=int)  # -1 marks "not labelled yet"
        Y = np.array(Y_, copy=True, dtype=int)

        self._build_graph(n)
        self._find_prototypes(Y)
        self._propagate_costs(Y)

    def _build_graph(self, n):
        """Fill ``self.adj`` and ``self.edges`` with the complete graph on ``n`` samples."""
        # The built-in costs are symmetric distances, so each unordered pair
        # is evaluated once and mirrored instead of being computed twice.
        dist = np.zeros((n, n))
        for u in range(n):
            for v in range(u, n):
                dist[u, v] = dist[v, u] = self.F(self.X[u], self.X[v])

        self.adj = defaultdict(list)
        self.edges = []
        for u in range(n):
            self.adj[u] = list(zip(range(n), dist[u].tolist()))
            self.edges += [(w, u, v) for v, w in self.adj[u]]

    def _find_prototypes(self, Y):
        """Run Kruskal's MST and store the class-boundary endpoints in ``self.prototypes``."""
        n = len(Y)
        seeds = []
        self.edges.sort()
        dsu = DSU(n)
        remaining = n - 1  # a spanning tree on n nodes has exactly n - 1 edges
        for w, u, v in self.edges:
            if remaining == 0:
                break  # tree complete: no later edge can be accepted
            if dsu.same(u, v):
                continue
            dsu.merge(u, v)
            remaining -= 1
            if Y[u] != Y[v]:
                seeds += [u, v]
        if not seeds:
            # Only one class is present, so the MST has no class boundary.
            # Seed the forest with the first sample so that every node still
            # receives that single label instead of crashing / staying at -1.
            seeds = [0]
        # Explicit integer dtype: the array is used for fancy indexing below.
        self.prototypes = np.unique(np.asarray(seeds, dtype=int))

    def _propagate_costs(self, Y):
        """Multi-source Dijkstra from the prototypes with ``max`` as path cost."""
        n = len(Y)
        self.cost = np.full(n, np.inf)
        self.cost[self.prototypes] = 0
        self.label[self.prototypes] = Y[self.prototypes]

        pq = [[0., u] for u in self.prototypes]
        heapq.heapify(pq)
        while pq:
            u_w, u = heapq.heappop(pq)
            if self.cost[u] < u_w:
                continue  # stale entry: a cheaper path to u was already found
            for v, w in self.adj[u]:
                # Cost of a path is its largest edge weight.
                candidate = max(u_w, w)
                if self.cost[v] > candidate:
                    self.cost[v] = candidate
                    self.label[v] = self.label[u]
                    heapq.heappush(pq, [self.cost[v], v])
        self.ordered_nodes = [(u, self.cost[u]) for u in range(n)]
        self.ordered_nodes.sort(key=itemgetter(1))

    def _classify_one_vertex(self, x):
        """Return the label of the training node offering ``x`` the cheapest path.

        The cost offered by node ``i`` is ``max(cost[i], F(X[i], x))``. Nodes
        are visited in ascending ``cost`` order, so the scan stops as soon as
        a node's own cost exceeds the best offer found so far.
        """
        best_index, best_cost = self.ordered_nodes[0]
        best_cost = max(best_cost, self.F(self.X[best_index], x))
        best_label = self.label[best_index]

        for cur_index, cur_cost in self.ordered_nodes[1:]:
            if cur_cost > best_cost:
                break
            cur_cost = max(cur_cost, self.F(self.X[cur_index], x))
            if cur_cost < best_cost:
                best_cost = cur_cost
                best_label = self.label[cur_index]
        return best_label

    def classify(self, X_):
        """Predict a label for every sample in ``X_``.

        Args:
            X_: array-like of samples with the same layout as the training data.

        Returns:
            A list with one predicted label per sample.
        """
        # Same float conversion as in ``fit`` so the cost function always
        # receives operands of matching dtype.
        X_test = np.array(X_, copy=True, dtype=float)
        return [self._classify_one_vertex(x) for x in X_test]

In [12]:
from opfython.models import SupervisedOPF

# Compare the hand-written OPF classifier against OPFython on several UCR datasets.
df_names = ['WordSynonyms', 'ChlorineConcentration', 'ShapesAll', 'EthanolLevel', 'FordA']

# The summary table is the same for every dataset, so it is read once up front.
datasets_df = pd.read_csv('data/DataSummary.csv')

for df_name in df_names:
    df = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TRAIN.tsv', header=None)
    df_test = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TEST.tsv', header=None)

    # Column 0 is used as the class label, the remaining columns as the series.
    # Labels are shifted by one into new Series rather than mutating a slice of
    # the loaded frame in place.
    # NOTE(review): the shift presumably keeps labels clear of the -1 sentinel
    # and/or satisfies OPFython's label requirements -- confirm.
    X, Y = df.iloc[:, 1:], df.iloc[:, 0] + 1
    X_test, Y_test = df_test.iloc[:, 1:], df_test.iloc[:, 0] + 1

    # NOTE(review): positional column 7 of DataSummary.csv is printed below as
    # the "ED (w=0)" reference error -- confirm the column if the file changes.
    dataset_error = datasets_df.loc[datasets_df['Name'] == df_name].iloc[:, 7].values[0]

    # Coded OPF
    opf = OptimumPathForestClassifier('euclidean-distance')
    opf.fit(X, Y)
    preds = opf.classify(X_test)

    # Lib OPF (OPFython)
    opf_lib = SupervisedOPF(distance="euclidean", pre_computed_distance=None)
    opf_lib.fit(np.array(X, copy=True), np.array(Y, copy=True))
    preds_lib = opf_lib.predict(np.array(X_test, copy=True))

    print()
    print(df_name)
    print(">> OPF (coded) error: %.3f" % error(preds, Y_test))
    print(">> OPF (lib) error: %.3f" % error(preds_lib, Y_test))
    print(">> ED (w=0) error: %.3f" % dataset_error)
    # array_equal compares shape and every element, regardless of whether the
    # two prediction containers are lists or arrays.
    print(f"  > same predictions? {np.array_equal(preds_lib, preds)}")
    print()

2022-09-12 21:12:21,929 - opfython.models.supervised — INFO — Overriding class: OPF -> SupervisedOPF.
2022-09-12 21:12:21,930 - opfython.core.opf — INFO — Creating class: OPF.
2022-09-12 21:12:21,930 - opfython.core.opf — DEBUG — Distance: euclidean | Pre-computed distance: False.
2022-09-12 21:12:21,931 - opfython.core.opf — INFO — Class created.
2022-09-12 21:12:21,931 - opfython.models.supervised — INFO — Class overrided.
2022-09-12 21:12:21,932 - opfython.models.supervised — INFO — Fitting classifier ...
2022-09-12 21:12:21,937 - opfython.models.supervised — DEBUG — Finding prototypes ...
2022-09-12 21:12:22,033 - opfython.models.supervised — DEBUG — Prototypes: [124, 0, 174, 118, 157, 186, 133, 126, 161, 10, 86, 199, 52, 37, 208, 88, 18, 262, 104, 218, 213, 45, 30, 65, 247, 108, 28, 230, 235, 100, 64, 85, 3, 142, 96, 31, 227, 200, 55, 111, 202, 212, 106, 251, 99, 180, 17, 183, 39, 229, 19, 146, 256, 63, 91, 128, 34, 169, 207, 40, 232, 259, 260, 264, 78, 46, 149, 22, 56, 210, 204, 

2022-09-12 21:12:59,328 - opfython.models.supervised — DEBUG — Prototypes: [408, 0, 74, 480, 104, 494, 249, 55, 484, 376, 456, 411, 475, 290, 240, 307, 98, 263, 501, 86, 57, 48, 223, 123, 149, 451, 399, 211, 219, 193, 413, 165, 232, 422, 225, 154, 364, 239, 220, 209, 366, 365, 378, 340, 500, 147, 199, 186, 446, 208, 368, 250, 402, 379, 152, 482, 112, 124, 167, 142, 421, 166, 400, 251, 150, 412, 401, 431, 409, 138, 132, 131, 109, 338, 330, 141, 415, 452, 429, 347, 105, 417, 430, 110, 375, 385, 327, 121, 151, 302, 291, 437, 122, 436, 465, 278, 140, 386, 383, 377, 395, 95, 453, 367, 373, 242, 113, 489, 426, 343, 296, 301, 241, 230, 72, 94, 103, 403, 345, 463, 185, 329, 178, 448, 171, 308, 187, 115, 498, 137, 116, 231, 160, 161, 77, 236, 344, 305, 75, 96, 248, 39, 342, 389, 97, 369, 145, 196, 214, 212, 192, 447, 237, 469, 56, 179, 234, 410, 40, 62, 156, 351, 176, 164, 174, 200, 114, 188, 69, 158, 202, 235, 157, 180, 70, 45, 227, 10, 53, 471, 334, 414, 107, 134, 438, 102, 382, 88, 78, 89, 4

2022-09-12 21:16:36,193 - opfython.models.supervised — INFO — Classifier has been fitted.
2022-09-12 21:16:36,194 - opfython.models.supervised — INFO — Training time: 55.3963623046875 seconds.
2022-09-12 21:16:36,195 - opfython.models.supervised — INFO — Predicting data ...
2022-09-12 21:17:10,721 - opfython.models.supervised — INFO — Data has been predicted.
2022-09-12 21:17:10,722 - opfython.models.supervised — INFO — Prediction time: 34.52512001991272 seconds.

FordA
>> OPF (coded) error: 0.341
>> OPF (lib) error: 0.341
>> ED (w=0) error: 0.335
  > same predictions? True



As the output above shows, our implementation and OPFython produce identical predictions when the Euclidean distance is used.