In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from src.OPFClassifier import DSU
import numpy as np
from collections import defaultdict
from dtaidistance import dtw
from src.utils import error
from operator import itemgetter
import sys
import heapq

In [2]:
# Dataset to evaluate; used both as the folder name and the file prefix below.
df_name = 'WordSynonyms'

# Train / test splits are tab-separated and are read without a header row.
df = pd.read_csv(
    f'data/UCRArchive_2018/{df_name}/{df_name}_TRAIN.tsv', sep='\t', header=None
)
df_test = pd.read_csv(
    f'data/UCRArchive_2018/{df_name}/{df_name}_TEST.tsv', sep='\t', header=None
)
# Per-dataset summary table, used only to look up a reference error rate.
datasets_df = pd.read_csv('data/DataSummary.csv')

# Column 0 is used as the class label; every remaining column is a feature.
Y = df.iloc[:, 0]
X = df.iloc[:, 1:]
Y_test = df_test.iloc[:, 0]
X_test = df_test.iloc[:, 1:]

# Reference error for this dataset, taken from positional column 7 of the summary.
# NOTE(review): presumably the "ED (w=0)" column, judging by the label printed
# next to this value further down -- confirm against DataSummary.csv.
dataset_error = datasets_df[datasets_df['Name'] == df_name].iloc[:, 7].values[0]

df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,261,262,263,264,265,266,267,268,269,270
count,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,...,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0
mean,9.116105,-1.06481,-1.037813,-1.005432,-0.967361,-0.926633,-0.885112,-0.84326,-0.802281,-0.765167,...,-0.004913,-0.107153,-0.218689,-0.330607,-0.446418,-0.567254,-0.688438,-0.805846,-0.909509,-0.99073
std,7.003598,0.297771,0.314203,0.34031,0.360485,0.366483,0.367091,0.375067,0.389692,0.399434,...,0.977561,0.877289,0.775271,0.685158,0.5935,0.502324,0.422849,0.367574,0.342001,0.326981
min,1.0,-2.260736,-2.258724,-2.226218,-2.221851,-2.222243,-2.220001,-2.214589,-2.206155,-2.195612,...,-1.750738,-1.750414,-1.750161,-1.750388,-1.751637,-1.76179,-1.781287,-1.813242,-1.989937,-2.189689
25%,3.0,-1.187217,-1.18315,-1.17287,-1.12612,-1.101117,-1.057204,-1.016442,-0.984648,-0.955151,...,-0.778291,-0.792408,-0.821748,-0.838169,-0.856748,-0.895335,-0.932915,-1.008489,-1.071877,-1.133381
50%,8.0,-1.00066,-0.978911,-0.957183,-0.931947,-0.90169,-0.866264,-0.836568,-0.805765,-0.781656,...,-0.292878,-0.359844,-0.435772,-0.50678,-0.589579,-0.670793,-0.743282,-0.817052,-0.878975,-0.942861
75%,14.0,-0.861169,-0.818737,-0.807341,-0.785858,-0.761313,-0.73476,-0.709592,-0.677477,-0.647337,...,0.544046,0.402501,0.223014,0.036733,-0.163714,-0.368679,-0.509985,-0.639358,-0.730808,-0.788201
max,25.0,-0.529065,0.392315,1.499541,2.117332,2.164945,1.832347,1.43409,1.212975,1.225051,...,3.410901,2.592988,2.246816,2.166496,1.96485,1.563155,1.576978,1.954937,2.061931,1.65353


In [3]:
class OptimumPathForestClassifier:
    """Optimum Path Forest Classifier.

    Training builds a complete weighted graph over the training samples,
    picks *prototypes* (the endpoints of every minimum-spanning-tree edge that
    joins two samples of different classes) and then propagates labels from
    the prototypes with a multi-source Dijkstra whose path cost is the largest
    edge weight along the path.  A new sample receives the label of the
    training sample offering it the cheapest such path.

    Attributes set by ``fit``:
        X              -- float copy of the training samples.
        label          -- label assigned to every training sample.
        adj            -- adjacency lists ``{u: [(v, weight), ...]}``.
        edges          -- undirected edges ``(weight, u, v)`` with ``u < v``, sorted.
        prototypes     -- sorted array of prototype (seed) indices.
        cost           -- optimum path cost of every training sample.
        ordered_nodes  -- ``(index, cost)`` pairs sorted by ascending cost.
    """

    def __init__(self, cost='euclidean-distance'):
        """Select the pairwise cost (distance) function.

        Args:
            cost: one of 'euclidean-distance', 'manhattan-distance' or
                'dtw-distance'.

        Raises:
            ValueError: if ``cost`` is not one of the supported names.
        """
        available_cost_functions = {
            'euclidean-distance': lambda x, y: np.linalg.norm(x - y),
            'manhattan-distance': lambda x, y: np.sum(np.abs(x - y)),
            # `dtw` is only looked up when this function is actually called.
            'dtw-distance': lambda x, y: dtw.distance_fast(x, y, use_pruning=True)
        }
        # Explicit exception instead of `assert`: assertions vanish under
        # `python -O`, which would turn a bad name into an obscure KeyError.
        if cost not in available_cost_functions:
            raise ValueError(
                f"Invalid cost function. Should be one of {available_cost_functions.keys()}"
            )
        self.F = available_cost_functions[cost]

    def fit(self, X_, Y_):
        """Train the classifier.

        Args:
            X_: array-like of training samples, one row per sample
                (converted to a float array; the input is not modified).
            Y_: array-like of integer class labels, one per sample.

        Raises:
            ValueError: if the training set is empty or ``X_`` and ``Y_``
                have different lengths.
        """
        self.X = np.array(X_, copy=True, dtype=float)
        Y = np.array(Y_, copy=True, dtype=int)
        n = len(Y)
        if n == 0:
            raise ValueError("Cannot fit on an empty training set.")
        if len(self.X) != n:
            raise ValueError(
                f"X_ and Y_ must have the same length, got {len(self.X)} and {n}."
            )
        self.label = np.full(n, -1, dtype=int)

        # Build the complete graph.  Every supported cost function is
        # symmetric, so each unordered pair is evaluated once and mirrored;
        # self-loops are omitted because they can never relax a path or join
        # two components.
        self.adj = defaultdict(list)
        self.edges = []
        for u in range(n):
            for v in range(u + 1, n):
                w = self.F(self.X[u], self.X[v])
                self.adj[u].append((v, w))
                self.adj[v].append((u, w))
                self.edges.append((w, u, v))

        # Kruskal's MST: both endpoints of every tree edge that connects two
        # different classes become prototypes (seed vertices).
        # DSU comes from src.OPFClassifier; it is used here as a disjoint-set
        # structure exposing same(u, v) and merge(u, v).
        self.edges.sort()
        dsu = DSU(n)
        seeds = []
        for w, u, v in self.edges:
            if not dsu.same(u, v):
                dsu.merge(u, v)
                if Y[u] != Y[v]:
                    seeds += [u, v]
        if not seeds:
            # Only one class is present, so no tree edge crosses a class
            # boundary.  Seed from an arbitrary sample so that every training
            # sample still receives the (single) label.
            seeds = [0]
        # Force an integer dtype: np.unique([]) would yield a float array,
        # which cannot be used as an index.
        self.prototypes = np.unique(np.asarray(seeds, dtype=int))

        # Multi-source Dijkstra from the prototypes; the cost of a path is the
        # maximum edge weight on it.
        self.cost = np.full(n, np.inf)
        self.cost[self.prototypes] = 0
        self.label[self.prototypes] = Y[self.prototypes]

        pq = [(0.0, int(u)) for u in self.prototypes]
        heapq.heapify(pq)
        while pq:
            u_w, u = heapq.heappop(pq)
            if self.cost[u] < u_w:
                # Stale heap entry: a cheaper path to u was already settled.
                continue
            for v, w in self.adj[u]:
                path_cost = max(u_w, w)
                if self.cost[v] > path_cost:
                    self.cost[v] = path_cost
                    self.label[v] = self.label[u]
                    heapq.heappush(pq, (path_cost, v))

        # Ascending cost order lets classification stop scanning early.
        self.ordered_nodes = sorted(
            ((u, self.cost[u]) for u in range(n)), key=itemgetter(1)
        )

    def _classify_one_vertex(self, x):
        """Return the label of the training sample offering ``x`` the cheapest path.

        The cost of reaching ``x`` through training sample ``s`` is
        ``max(cost[s], F(X[s], x))``.
        """
        best_index, best_cost = self.ordered_nodes[0]
        best_cost = max(best_cost, self.F(self.X[best_index], x))
        best_label = self.label[best_index]

        for cur_index, cur_cost in self.ordered_nodes[1:]:
            if cur_cost > best_cost:
                # Nodes are sorted by cost, so no later node can do better.
                break
            cur_cost = max(cur_cost, self.F(self.X[cur_index], x))
            if cur_cost < best_cost:
                best_cost = cur_cost
                best_label = self.label[cur_index]
        return best_label

    def classify(self, X_):
        """Predict a label for every row of ``X_``.

        Args:
            X_: array-like of samples with the same feature layout used in ``fit``.

        Returns:
            A list with one predicted label per input row.
        """
        samples = np.array(X_, copy=True, dtype=float)
        return [self._classify_one_vertex(x) for x in samples]
    
# Fit the classifier defined above on the training split, then label the test split.
opf = OptimumPathForestClassifier(cost='euclidean-distance')
opf.fit(X, Y)
preds = opf.classify(X_test)

# Show the raw predictions, followed by this model's error next to the
# reference value looked up from the dataset summary.
# NOTE(review): `error` comes from src.utils; presumably a misclassification
# rate -- confirm against its definition.
print(preds)
print(
    f"OPF error: {error(preds, Y_test)}",
    f"ED (w=0) error: {dataset_error}",
    sep="\n",
)

[4, 12, 14, 4, 14, 9, 2, 14, 4, 2, 20, 4, 2, 6, 21, 9, 22, 10, 7, 2, 2, 18, 22, 11, 4, 14, 6, 4, 13, 2, 6, 12, 4, 2, 22, 2, 2, 6, 4, 18, 23, 2, 18, 20, 24, 11, 16, 4, 2, 22, 4, 1, 20, 1, 21, 12, 8, 8, 24, 2, 23, 4, 21, 8, 2, 2, 13, 4, 14, 4, 2, 4, 22, 10, 11, 6, 14, 8, 19, 2, 18, 8, 2, 8, 4, 4, 4, 8, 21, 6, 4, 12, 2, 8, 2, 23, 4, 6, 8, 2, 9, 4, 10, 2, 11, 20, 11, 2, 2, 10, 4, 4, 4, 11, 22, 8, 6, 14, 2, 8, 14, 6, 22, 6, 14, 4, 21, 14, 2, 8, 12, 2, 4, 2, 8, 22, 22, 18, 9, 6, 4, 2, 18, 13, 4, 6, 4, 10, 2, 2, 10, 2, 4, 2, 9, 2, 8, 2, 2, 22, 6, 4, 16, 1, 4, 2, 4, 11, 2, 16, 14, 2, 2, 8, 10, 2, 4, 4, 2, 15, 2, 4, 8, 14, 23, 4, 2, 16, 12, 2, 4, 10, 14, 8, 11, 2, 4, 4, 2, 4, 2, 2, 4, 4, 22, 4, 11, 4, 24, 16, 2, 16, 2, 16, 4, 21, 10, 2, 8, 2, 10, 4, 5, 2, 18, 2, 2, 5, 2, 3, 17, 13, 16, 2, 14, 1, 4, 5, 10, 2, 8, 2, 2, 20, 4, 21, 8, 8, 2, 2, 6, 6, 18, 4, 4, 18, 24, 4, 6, 10, 11, 4, 23, 6, 4, 4, 2, 10, 2, 2, 4, 20, 2, 4, 2, 20, 6, 4, 1, 2, 1, 2, 12, 2, 6, 3, 6, 24, 10, 22, 9, 9, 2, 4, 20, 9, 6, 2,

In [4]:
# Cross-check: run opfython's SupervisedOPF on the same train / test split.
from opfython.models import SupervisedOPF

# Euclidean distance, with no pre-computed distance matrix supplied.
opf_2 = SupervisedOPF(
    distance="euclidean",
    pre_computed_distance=None,
)

# Train on fresh array copies of the training features and labels.
opf_2.fit(
    np.array(X, copy=True),
    np.array(Y, copy=True),
)

# Label the test split and report predictions plus the resulting error.
preds_2 = opf_2.predict(np.array(X_test, copy=True))
print(preds_2)
print(f"OPF error: {error(preds_2, Y_test)}")

2022-09-01 14:48:35,594 - opfython.models.supervised — INFO — Overriding class: OPF -> SupervisedOPF.
2022-09-01 14:48:35,595 - opfython.core.opf — INFO — Creating class: OPF.
2022-09-01 14:48:35,596 - opfython.core.opf — DEBUG — Distance: euclidean | Pre-computed distance: False.
2022-09-01 14:48:35,596 - opfython.core.opf — INFO — Class created.
2022-09-01 14:48:35,597 - opfython.models.supervised — INFO — Class overrided.
2022-09-01 14:48:35,598 - opfython.models.supervised — INFO — Fitting classifier ...
2022-09-01 14:48:35,601 - opfython.models.supervised — DEBUG — Finding prototypes ...
2022-09-01 14:48:36,663 - opfython.models.supervised — DEBUG — Prototypes: [124, 0, 174, 118, 157, 186, 133, 126, 161, 10, 86, 199, 52, 37, 208, 88, 18, 262, 104, 218, 213, 45, 30, 65, 247, 108, 28, 230, 235, 100, 64, 85, 3, 142, 96, 31, 227, 200, 55, 111, 202, 212, 106, 251, 99, 180, 17, 183, 39, 229, 19, 146, 256, 63, 91, 128, 34, 169, 207, 40, 232, 259, 260, 264, 78, 46, 149, 22, 56, 210, 204, 