# Downloading data and the encodings file

In [0]:
# IDS reference repository; the CSV files under bigdata18/results are read further down.
!git clone https://github.com/hfawaz/bigdata18.git
# Pre-trained models archive (downloaded and unpacked into the working directory).
!wget http://supplementarymaterial.xyz/bigdata2018/pre-trained-models.tar.gz
!tar -xzvf pre-trained-models.tar.gz
# UCR time-series archive; the zip is password protected, hence -P.
!wget https://www.cs.ucr.edu/~eamonn/time_series_data/UCR_TS_Archive_2015.zip
!unzip -P attempttoclassify UCR_TS_Archive_2015.zip
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install gdown
# The URL is quoted because it contains '?', which some shells treat as a glob character.
# NOTE(review): presumably this fetches the pickled encodings file loaded below - confirm.
!gdown "https://drive.google.com/uc?id=1C_11YFbtjlaFUg_XV8xHU8-qWCa-dC4W"

In [0]:
import numpy as np
import pandas as pd
from os import listdir, walk
from os.path import join
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from itertools import product
import pickle
from scipy.stats import spearmanr, pearsonr, kendalltau
from scipy.spatial.distance import cityblock
from scipy.spatial import distance
from scipy.stats import binom_test
from sklearn.metrics import calinski_harabasz_score,silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_distances
from scipy.stats import wilcoxon

In [0]:
# function to calculate MRR
def mrr(labels, ranking, is_ranked = False):
    """Mean reciprocal rank of the labelled item in each row of ``ranking``.

    Parameters
    ----------
    labels : sequence
        ``labels[k]`` is the item to look up in row ``k`` of the ranking.
    ranking : 2-D numpy array
        If ``is_ranked`` is False, a score matrix that is argsorted per row in
        ascending order (lowest score = rank 1). If ``is_ranked`` is True, each
        row already lists the items from best to worst.
    is_ranked : bool
        Whether ``ranking`` already holds ordered items rather than scores.

    Returns
    -------
    float
        Mean over rows of ``1 / rank`` (ranks are 1-based).

    Raises
    ------
    IndexError
        If a label does not occur in its row.
    """
    ordered = ranking if is_ranked else ranking.argsort(axis=1)
    reciprocal_ranks = []
    for row_idx, wanted in enumerate(labels):
        # 0-based position of the first occurrence of the label in this row
        position = np.flatnonzero(ordered[row_idx, :] == wanted)[0]
        reciprocal_ranks.append(1/(position+1))
    return np.array(reciprocal_ranks).mean()
# function to calculate the rank of the optimal source model
def ranks(labels, ranking, is_ranked = False):
    """Return the 1-based rank of the labelled item in each row of ``ranking``.

    Parameters
    ----------
    labels : sequence
        ``labels[k]`` is the item to look up in row ``k`` of the ranking.
    ranking : 2-D numpy array
        A score matrix (argsorted per row, ascending) when ``is_ranked`` is
        False, or rows of already ordered items when ``is_ranked`` is True.
    is_ranked : bool
        Whether ``ranking`` already holds ordered items rather than scores.

    Returns
    -------
    numpy.ndarray
        One integer rank per label; 1 means the label is ranked first.

    Raises
    ------
    IndexError
        If a label does not occur in its row.
    """
    ordered = ranking if is_ranked else ranking.argsort(axis=1)
    positions = []
    for row_idx, wanted in enumerate(labels):
        # first occurrence of the label in the row, converted to a 1-based rank
        positions.append(np.flatnonzero(ordered[row_idx, :] == wanted)[0] + 1)
    return np.array(positions)

# Reading results from IDS - Fawaz et al. (https://github.com/hfawaz/bigdata18)

In [0]:
# Similarity lists published with IDS: one row per dataset (the index), whose cells
# are dataset names.
similarity_lists = pd.read_csv("bigdata18/results/similar_datasets.csv", index_col=0)

# Encode every dataset name as its position in the sorted list of unique names.
sorted_unique_datasets = similarity_lists.index.unique().sort_values()
label_encoder_dict = {name: code for code, name in enumerate(sorted_unique_datasets.to_list())}

# Same table as similarity_lists, with names replaced by their integer codes.
ranked_similarity_IDS = np.vectorize(label_encoder_dict.get)(similarity_lists)

# Transfer accuracies. Rows and columns are sorted by label so that positions follow
# the sorted-name order used for the codes above.
# NOTE(review): this assumes both CSV files use the same dataset names - confirm.
transfer_results = (
    pd.read_csv("bigdata18/results/df_transfer_acc.csv", index_col=0)
    .sort_index(axis=0)
    .sort_index(axis=1)
    .to_numpy()
)

# The paper says that the rows are the source dataset.
# For each target (column) we therefore take the row index with the highest accuracy.
best_source = np.argmax(transfer_results, axis=0)

# Load encodings from file

In [0]:
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file.
# Only load this file if you trust where it came from.
# NOTE(review): presumably this is the file fetched with gdown above - confirm.
load_path = "encoding_transfer.pkl"  # a single-argument join() was a no-op
with open(load_path, "rb") as file:
    # Used below as a 2-D array-like (it has .shape and is indexed [i, j]) whose
    # entries hold the encodings at [0] and the matching labels at [1].
    encodings_with_labels = pickle.load(file)

# Calculate the a priori dissimilarity using the mean silhouette coefficient (MSC) with cosine distance

In [0]:
# may take a few minutes
# Negated silhouette score (cosine distance) for every off-diagonal (i, j) pair.
# Diagonal entries are never scored and keep a large sentinel (1e10), so they come
# last in any ascending ordering.
# NOTE(review): presumably i is the source model and j the target dataset, since the
# matrix is transposed before being ranked per target - confirm.
apriori_dissimilarity = np.full(encodings_with_labels.shape, 1e10)
for i, j in product(range(encodings_with_labels.shape[0]), range(encodings_with_labels.shape[1])):
    if i == j:
        continue
    pair = encodings_with_labels[i, j]
    # the sign is flipped so that a smaller value means a better score
    apriori_dissimilarity[i, j] = -silhouette_score(pair[0], pair[1], metric="cosine")

In [0]:
# Note: the transfer of a model to itself is removed because it is stored as 0 in the
# transfer_results file; leaving it in could have a dramatic effect on the results.
# Row t of the result holds column t of transfer_results with entry t dropped.
columns_without_self = []
for target_idx in range(transfer_results.shape[1]):
    column = transfer_results[:, target_idx]
    columns_without_self.append(np.delete(column, target_idx, axis=0))
transfer_results_without_self_transfer = np.array(columns_without_self)
# Mean transfer accuracy per target over the remaining entries.
transfer_results_mean_per_target = transfer_results_without_self_transfer.mean(axis=1)

In [15]:
# Random baseline: under a uniformly random ordering the best source is equally likely
# to land on any rank 1..K, so the expected reciprocal rank is mean(1/k for k in 1..K).
# K counts every dataset except the target itself (this was hard-coded as 84 before).
n_candidate_sources = transfer_results.shape[0] - 1
print("MRR of Random Source Model Selection: ",np.mean(1/(np.arange(n_candidate_sources)+1)))
# IDS already provides ordered lists, so no argsort is needed (is_ranked=True).
print("MRR of IDS: ", mrr(best_source, ranked_similarity_IDS, is_ranked = True))
# Transposed so that each row passed to mrr corresponds to one entry of best_source.
print("MRR of our method: ", mrr(best_source, apriori_dissimilarity.T))

MRR of Random Source Model Selection:  0.05969015517363009
MRR of IDS:  0.11626114810805344
MRR of our method:  0.11496363721744161


In [16]:
# Top-1 evaluation: count the targets for which a method's first choice is the source
# with the highest transfer accuracy, and test that count against chance with binom_test.
N = transfer_results.shape[0]
# Probability of a random pick being right: one of the N-1 other datasets.
chance_level = 1/(N-1)
true_best_source = transfer_results.argmax(axis=0)
our_top1_hits = np.sum(true_best_source == apriori_dissimilarity.T.argmin(axis=1))
IDS_top1_hits = np.sum(true_best_source == ranked_similarity_IDS[:,0])
print("Number of Wins of our method: {}, Accuracy: {:.1f}, P-value: {:.6f}".format(
    our_top1_hits,
    100*our_top1_hits/N,
    binom_test(our_top1_hits,N,chance_level)
))
print("Number of Wins of IDS: {}, Accuracy: {:.1f}, P-value: {:.6f}".format(
    IDS_top1_hits,
    100*IDS_top1_hits/N,
    binom_test(IDS_top1_hits,N,chance_level)
))
# Expected number of wins for a random pick over N targets (N was hard-coded as 85).
print("Number of wins Random: {}, Accuracy: {:.1f}, P-value: {:.6f}".format(
    N/(N-1),
    100/(N-1),
    binom_test(1,N,chance_level)))

Number of Wins of our method: 6, Accuracy: 7.1, P-value: 0.000558
Number of Wins of IDS: 4, Accuracy: 4.7, P-value: 0.018982
Number of wins Random: 1.0119047619047619, Accuracy: 1.2, P-value: 1.000000


In [18]:
# Rank that each method gives to the best source of every target, compared with
# scipy's wilcoxon on the two paired rank vectors.
ranks_of_best_model_IDS = ranks(best_source, ranked_similarity_IDS, is_ranked = True)
ranks_of_best_model_SMS = ranks(best_source, apriori_dissimilarity.T)
wilcoxon_result = wilcoxon(ranks_of_best_model_SMS, ranks_of_best_model_IDS)
# NOTE(review): the message states the conclusion regardless of the p-value printed
# next to it - re-check the wording whenever the inputs change.
print("The difference between our method and IDS isn't statistically significant: " , wilcoxon_result)

The difference between our method and IDS isn't statistically significant:  WilcoxonResult(statistic=1519.0, pvalue=0.7655931250772526)


## Using other clustering quality metrics

In [0]:
# helper function for outputting top1 results
def top1stats(apriori_dissimilarity, extra_text):
    """Print top-1 hit count, accuracy and binomial p-value for a dissimilarity matrix.

    A hit is a column of the global ``transfer_results`` whose argmax (over rows)
    equals the argmin of the matching row of ``apriori_dissimilarity.T``.

    Parameters
    ----------
    apriori_dissimilarity : 2-D numpy array
        Dissimilarity matrix; lower values mean a better candidate.
    extra_text : str
        Method name inserted into the printed line.

    Returns
    -------
    None
        The result is printed only. The p-value comes from ``binom_test`` with
        ``N`` trials and success probability ``1/(N-1)``, where ``N`` is the number
        of rows of ``transfer_results``.
    """
    n_datasets = transfer_results.shape[0]
    true_best = transfer_results.argmax(axis=0)
    predicted_best = apriori_dissimilarity.T.argmin(axis=1)
    hits = np.sum(true_best == predicted_best)
    accuracy_pct = 100*hits/n_datasets
    p_value = binom_test(hits, n_datasets, 1/(n_datasets-1))
    template = "Number of Wins of {}: {}, Accuracy: {:.1f}, P-value: {:.6f}"
    print(template.format(extra_text, hits, accuracy_pct, p_value))

### davies_bouldin_score

In [0]:
# Davies-Bouldin score for every off-diagonal (i, j) pair, stored without negation.
# Diagonal entries are never scored and keep a large sentinel (1e10).
apriori_dissimilarity_db = np.full(encodings_with_labels.shape, 1e10)
for i, j in product(range(encodings_with_labels.shape[0]), range(encodings_with_labels.shape[1])):
    if i == j:
        continue
    pair = encodings_with_labels[i, j]
    apriori_dissimilarity_db[i, j] = davies_bouldin_score(pair[0], pair[1])

In [0]:
# MRR and top-1 statistics for the Davies-Bouldin variant.
mrr_davies_bouldin = mrr(best_source, apriori_dissimilarity_db.T)
print("MRR of our method with davies_bouldin_score: ", mrr_davies_bouldin)
top1stats(apriori_dissimilarity_db, "our method with davies_bouldin_score")

MRR of our method with davies_bouldin_score:  0.09569933999670382
Number of Wins of our method with davies_bouldin_score: 3, Accuracy: 3.5, P-value: 0.081397


### calinski_harabasz_score

In [0]:
# Negated Calinski-Harabasz score for every off-diagonal (i, j) pair.
# Diagonal entries are never scored and keep a large sentinel (1e10).
apriori_dissimilarity_vrc = np.full(encodings_with_labels.shape, 1e10)
for i, j in product(range(encodings_with_labels.shape[0]), range(encodings_with_labels.shape[1])):
    if i == j:
        continue
    pair = encodings_with_labels[i, j]
    # the sign is flipped so that a smaller value means a better score
    apriori_dissimilarity_vrc[i, j] = -calinski_harabasz_score(pair[0], pair[1])

In [0]:
# MRR and top-1 statistics for the Calinski-Harabasz variant.
mrr_calinski_harabasz = mrr(best_source, apriori_dissimilarity_vrc.T)
print("MRR of method with calinski_harabasz_score: ", mrr_calinski_harabasz)
top1stats(apriori_dissimilarity_vrc, "our method with calinski_harabasz_score")

MRR of method with calinski_harabasz_score:  0.06173886810954078
Number of Wins of our method with calinski_harabasz_score: 2, Accuracy: 2.4, P-value: 0.268642


## Using other distance metrics within silhouette

In [0]:
# may take a few minutes
# Negated silhouette score with Euclidean distance for every off-diagonal (i, j) pair.
# Diagonal entries are never scored and keep a large sentinel (1e10).
apriori_dissimilarity_euclidean = np.full(encodings_with_labels.shape, 1e10)
for i, j in product(range(encodings_with_labels.shape[0]), range(encodings_with_labels.shape[1])):
    if i == j:
        continue
    pair = encodings_with_labels[i, j]
    # the sign is flipped so that a smaller value means a better score
    apriori_dissimilarity_euclidean[i, j] = -silhouette_score(pair[0], pair[1], metric="euclidean")
mrr_euclidean = mrr(best_source, apriori_dissimilarity_euclidean.T)
print("MRR of method with silhouette with euclidean: ", mrr_euclidean)
top1stats(apriori_dissimilarity_euclidean, "our method with euclidean")

MRR of method with silhouette with euclidean:  0.10669794129118935
Number of Wins of our method with euclidean: 5, Accuracy: 5.9, P-value: 0.003567


In [0]:
# may take a few minutes
# Negated silhouette score with Manhattan distance for every off-diagonal (i, j) pair.
# Diagonal entries are never scored and keep a large sentinel (1e10).
apriori_dissimilarity_manhattan = np.full(encodings_with_labels.shape, 1e10)
for i, j in product(range(encodings_with_labels.shape[0]), range(encodings_with_labels.shape[1])):
    if i == j:
        continue
    pair = encodings_with_labels[i, j]
    # the sign is flipped so that a smaller value means a better score
    apriori_dissimilarity_manhattan[i, j] = -silhouette_score(pair[0], pair[1], metric="manhattan")
mrr_manhattan = mrr(best_source, apriori_dissimilarity_manhattan.T)
print("MRR of method with silhouette with manhattan: ", mrr_manhattan)
top1stats(apriori_dissimilarity_manhattan, "our method with manhattan")

MRR of method with silhouette with manhattan:  0.11024946519492677
Number of Wins of our method with manhattan: 5, Accuracy: 5.9, P-value: 0.003567
