In [68]:
import pandas as pd
import numpy as np
import numpy as np
from collections import defaultdict
from dtaidistance import dtw
from src.utils import error
from operator import itemgetter
import sys
import matplotlib.pyplot as plt
from time import time
import heapq
from sklearn.manifold import MDS
%matplotlib inline

### Changing Kruskral to Prim

In [60]:
"""Optimum Path Forest Classifier"""
class OptimumPathForestClassifier:    
    def __init__(self, cost='euclidean-distance'):
        available_cost_functions = {
            'euclidean-distance': lambda x, y: np.linalg.norm(x - y),
            'manhattan-distance': lambda x, y: np.sum(np.abs(x - y)),
            'dtw-distance': lambda x, y: dtw.distance_fast(x, y, window=100, use_pruning=True)
        }
        assert cost in available_cost_functions.keys(),\
            f"Invalid cost function. Should be one of {available_cost_functions.keys()}"
        self.F = available_cost_functions[cost]
    
    def find_prototypes(self, adj, Y):
        prototypes = []
        pq = [[0., 0, -1]]
        heapq.heapify(pq)
        
        cost = np.ones(len(adj)) * np.inf
        cost[0] = 0.
        while pq:
            u_w, u, p = heapq.heappop(pq)
            if u_w < cost[u]: continue
            cost[u] = 0.
                
            # Edge p->u is a part of MST.
            if p != -1 and Y[u] != Y[p]:
                prototypes += [u, v]

            for v, w in adj[u]:
                if cost[v] > w:
                    cost[v] = w
                    heapq.heappush(pq, [w, v, u])
        return list(np.unique(prototypes))
    
    def fit(self, X_, Y_):
        n = len(Y_)
        self.X = np.array(X_, copy=True, dtype=float)
        self.label = np.ones(n, dtype=int) * -1
        Y = np.array(Y_, copy=True, dtype=int)
        
        # First of all, builds the graph
        adj = defaultdict(list)
        for u in range(n):
            adj[u] = [(int(v), self.F(self.X[u], self.X[v])) for v in range(n)]
            
        # Runs MST (Prim) to choose PROTOTYPES (seed vertices)
        self.prototypes = self.find_prototypes(adj, Y)
        
        # Run multisourced dijkstra on prototypes to get the cost
        self.cost = np.ones(n) * np.inf
        self.cost[self.prototypes] = 0
        self.label[self.prototypes] = Y[self.prototypes]
        
        pq = [[0., u] for u in self.prototypes]
        heapq.heapify(pq)
        while pq:
            u_w, u = heapq.heappop(pq)
            if self.cost[u] < u_w:
                continue
            for v, w in adj[u]:
                if self.cost[v] > max(u_w, w):
                    self.cost[v] = max(u_w, w)
                    self.label[v] = self.label[u]
                    heapq.heappush(pq, [self.cost[v], v])
        self.ordered_nodes = [(u, self.cost[u]) for u in range(n)]
        self.ordered_nodes.sort(key=itemgetter(1))
                    
    def _classify_one_vertex(self, x):
        best_index, best_cost = self.ordered_nodes[0]
        best_cost = max(best_cost, self.F(self.X[best_index], x))
        best_label = self.label[best_index]
        
        for i in range(1, len(self.X)):
            cur_index, cur_cost = self.ordered_nodes[i]
            if cur_cost > best_cost:
                break
            cur_cost = max(cur_cost, self.F(self.X[cur_index], x))
            cur_label = self.label[cur_index]
            if cur_cost < best_cost:
                best_index, best_cost, best_label = cur_index, cur_cost, cur_label
        return best_label
    
    def classify(self, X_):
        X_train = np.array(X_, copy=True)
        return [self._classify_one_vertex(x) for x in X_train]

In [61]:
df_names = ['WordSynonyms', 'ChlorineConcentration', 'ShapesAll', 'EthanolLevel', 'FordA']
for df_name in df_names:
    df = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TRAIN.tsv', header=None)
    df_test = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TEST.tsv', header=None)
    datasets_df = pd.read_csv('data/DataSummary.csv')

    X, Y = df.iloc[:, 1:], df.iloc[:, 0]
    X_test, Y_test = df_test.iloc[:, 1:], df_test.iloc[:, 0]
    dataset_error = datasets_df.loc[datasets_df['Name'] == df_name].iloc[:, 7].values[0]
    
    start_time = time()
    opf = OptimumPathForestClassifier('euclidean-distance')
    opf.fit(X, Y)
    preds = opf.classify(X_test)
    print("%s (%.2fs):" % (df_name, time() - start_time))
    print(">> OPF error: %.3f" % error(preds, Y_test))
    print(">> ED (w=0) error: %.3f" % dataset_error)
    print()

WordSynonyms (1.07s):
>> OPF error: 0.400
>> ED (w=0) error: 0.382

ChlorineConcentration (8.38s):
>> OPF error: 0.355
>> ED (w=0) error: 0.350

ShapesAll (5.27s):
>> OPF error: 0.262
>> ED (w=0) error: 0.248

EthanolLevel (13.64s):
>> OPF error: 0.726
>> ED (w=0) error: 0.726

FordA (127.18s):
>> OPF error: 0.367
>> ED (w=0) error: 0.335



In [43]:
def find_prototypes_with_kruskal(adj, Y):
    from src.OPFClassifier import DSU
    n = len(Y)
    
    prototypes = []
    edges = []
    for u in range(n):
        edges += [(w, u, v) for v, w in adj[u]]
    edges.sort()
    dsu = DSU(n)
    for w, u, v in edges:
        if not dsu.same(u, v):
            dsu.merge(u, v)
            if Y[u] != Y[v]:
                prototypes += [u, v]
    return np.unique(prototypes)

In [73]:
df_name = 'WordSynonyms'
df = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TRAIN.tsv', header=None)
df_test = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TEST.tsv', header=None)
datasets_df = pd.read_csv('data/DataSummary.csv')

X, Y = df.iloc[:, 1:], df.iloc[:, 0]
X_test, Y_test = df_test.iloc[:, 1:], df_test.iloc[:, 0]
dataset_error = datasets_df.loc[datasets_df['Name'] == df_name].iloc[:, 7].values[0]

In [74]:
opf = OptimumPathForestClassifier('euclidean-distance')
opf.fit(X, Y)
preds = opf.classify(X_test)
error(preds, Y_test)

0.3996865203761755

In [75]:
opf = OptimumPathForestClassifier('euclidean-distance')
opf.find_prototypes = find_prototypes_with_kruskal
opf.fit(X, Y)
preds = opf.classify(X_test)
error(preds, Y_test)

0.3871473354231975

### Label Propagation

In [87]:
"""Optimum Path Forest Classifier"""
class OptimumPathForestClassifier:    
    def __init__(self, cost='euclidean-distance'):
        available_cost_functions = {
            'euclidean-distance': lambda x, y: np.linalg.norm(x - y),
            'manhattan-distance': lambda x, y: np.sum(np.abs(x - y)),
            'dtw-distance': lambda x, y: dtw.distance_fast(x, y, window=100, use_pruning=True)
        }
        assert cost in available_cost_functions.keys(),\
            f"Invalid cost function. Should be one of {available_cost_functions.keys()}"
        self.F = available_cost_functions[cost]
    
    def find_prototypes(self, adj, Y):
        from src.OPFClassifier import DSU
        n = len(Y)

        prototypes = []
        edges = []
        for u in range(n):
            edges += [(w, u, v) for v, w in adj[u]]
        edges.sort()
        dsu = DSU(n)
        for w, u, v in edges:
            if not dsu.same(u, v):
                dsu.merge(u, v)
                if Y[u] != Y[v]:
                    prototypes += [u, v]
        return np.unique(prototypes)
    
    def fit(self, X_, Y_):
        n = len(Y_)
        self.X = np.array(X_, copy=True, dtype=float)
        self.label = np.ones(n, dtype=int) * -1
        Y = np.array(Y_, copy=True, dtype=int)
        
        # First of all, builds the graph
        adj = defaultdict(list)
        for u in range(n):
            adj[u] = [(int(v), self.F(self.X[u], self.X[v])) for v in range(n)]
            
        # Runs MST (Prim) to choose PROTOTYPES (seed vertices)
        self.prototypes = self.find_prototypes(adj, Y)
        
        # Run multisourced dijkstra on prototypes to get the cost
        self.cost = np.ones(n) * np.inf
        self.cost[self.prototypes] = 0
        self.label[self.prototypes] = Y[self.prototypes]
        
        # Uses LabelPropagation
        from sklearn.semi_supervised import LabelPropagation
        self.label_prop_model = LabelPropagation()
        self.label_prop_model.fit(X_, self.label)
        self.label = self.label_prop_model.predict(X)
    
    def classify(self, X_):
        return self.label_prop_model.predict(X_)

In [89]:
df_name = 'WordSynonyms'
df = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TRAIN.tsv', header=None)
df_test = pd.read_table(f'data/UCRArchive_2018/{df_name}/{df_name}_TEST.tsv', header=None)
datasets_df = pd.read_csv('data/DataSummary.csv')

X, Y = df.iloc[:, 1:], df.iloc[:, 0]
X_test, Y_test = df_test.iloc[:, 1:], df_test.iloc[:, 0]
dataset_error = datasets_df.loc[datasets_df['Name'] == df_name].iloc[:, 7].values[0]

opf = OptimumPathForestClassifier('euclidean-distance')
opf.fit(X, Y)
preds = opf.classify(X_test)
error(preds, Y_test)
print(preds)

[ 1  1 14  1  1  1  1  1  1  2  1  4  1  1  1  1  1  1  1  1  1 18  1  1
  4  1  1  1  1  2  6  1  4  2  1  2  2  6  1 18  1  1  1  1  1  1  1  1
  1 22  4  1  1  1  1 12  1  1  1  2  1  4  1  1  1  2  1  1  1  1  1  1
  1  1  1  6  1  1  1  2 18  8  1  1  1  1  1  1  1  6  1  1  2  1  2  1
  1  1  1  2  1  1  1  1 11  1  1  1  1  1  1  1  1  1  1  1  1 14  1  1
  1  1  1  1 14  4  1  1  1  1  1  1  1  2  1  1  1  1  1  1  1  2  1  1
  1  6  1 10  1  1  1  1  1  2  1  1  1  1  2  1  1  1 16  1  1  2  4  1
  2  1  1  1  1  8  1  1  4  1  1 15  2  1  1  1  1  1  2  1  1  1  1  1
  1  1  1  2  1  4  2  1  1  1  1  4  1  1  1  1  1  1  1 16  2  1  4  1
 10  1  1  2 10  1  1  1  1  2  1  1  2  1  1  1 16  2  1  1  4  1  1  1
  1  2  1  1  4  1  1  1  1  2  1  6 18  4  4  1  1  1  1  1  1  1  1  1
  1  4  2  1  2  1  4  1  2  1  1  1  1  1  1  1  1  2  1  2  6  1  6  1
  1  1  1  1  1  4  1  1  1  2  1  1  1  1  8  6  1  4  2  1  1  6  1  1
  2  2  1  1  1  1  1 11  1  4 16  1  1  1  1  1  2

  probabilities /= normalizer
  probabilities /= normalizer
