### Part 1: Downloading Metrics From Papers

In [1]:
import os
import pandas as pd
import numpy as np

# Break the current working directory into its path components. os.sep is
# used instead of a hard-coded '/' so the check below also works on Windows,
# where os.getcwd() returns backslash-separated paths.
cur_dir = os.getcwd().split(os.sep)
# If the kernel was started inside the notebooks/ folder, move one level up
# (presumably so the project-local `src` package imported below resolves
# from the repository root -- confirm against the repo layout).
if cur_dir[-1] == 'notebooks':
    os.chdir("..")

from src.data_loader import get_html_page_and_prepare_soup, get_content_list_from_html, load_model_results 
from src.data_stats import find_shape_of_datasets

In [2]:
def make_dir_if_not_exist(dir_name):
    """Create the directory ``dir_name`` unless it already exists.

    Missing parent directories are created as well, and an existing
    directory is left untouched, so the call is safe to repeat.

    Args:
        dir_name: Path of the directory to create.

    Returns:
        None.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of isdir() + mkdir();
    # makedirs() additionally copes with nested paths whose parents are absent.
    os.makedirs(dir_name, exist_ok=True)

In [3]:
# Directory layout for everything this notebook downloads: one root folder
# with a sub-folder for datasets and another for per-model metrics.
DATA_DIR_NAME = 'papers'
DATASET_DIR_NAME = f"{DATA_DIR_NAME}/datasets"
METRICS_DIR_NAME = f"{DATA_DIR_NAME}/metrics"

# Create the root first, then its two children (order matters for the root).
for _dir_name in (DATA_DIR_NAME, DATASET_DIR_NAME, METRICS_DIR_NAME):
    make_dir_if_not_exist(_dir_name)

In [4]:
# Echo the configured metrics directory path as a quick configuration check.
METRICS_DIR_NAME

'papers/metrics'

In [5]:
# Index page that lists the published-results folders, one per paper.
page_link = "https://timeseriesclassification.com/results/PublishedResults/"

# Project helper from src.data_loader; presumably downloads the page and
# returns a parsed BeautifulSoup object -- confirm against its implementation.
soup_main = get_html_page_and_prepare_soup(page_link)

In [6]:
# Collect the 'a' (anchor) entries of the index page; the first entry is
# dropped -- presumably a parent-directory link, TODO confirm.
paper_list = get_content_list_from_html(soup_main, 'a')[1:]

# Build a mapping paper -> {model name -> results table}.
# need_download=False presumably reuses files already stored under
# METRICS_DIR_NAME instead of fetching them again -- verify in src.data_loader.
paper_models_dict = load_model_results(paper_list, METRICS_DIR_NAME, need_download=False)

Parsing Bakeoff2017 models...

Parsing Bakeoff2021 models...

Parsing Bakeoff2023 models...

Parsing HIVE-COTEV2 models...



In [7]:
# List the model names available for the Bakeoff2023 paper.
paper_models_dict['Bakeoff2023'].keys()

dict_keys(['1NN-DTW', 'Arsenal', 'BOSS', 'CIF', 'CNN', 'Catch22', 'DrCIF', 'EE', 'FreshPRINCE', 'HC1', 'HC2', 'Hydra-MR', 'Hydra', 'InceptionT', 'Mini-R', 'MrSQM', 'Multi-R', 'PF', 'RDST', 'RISE', 'ROCKET', 'RSF', 'RSTSF', 'ResNet', 'STC', 'STSF', 'ShapeDTW', 'Signatures', 'TDE', 'TS-CHIEF', 'TSF', 'TSFresh', 'WEASEL-D', 'WEASEL', 'cBOSS'])

In [8]:
# Report how many models were parsed for each paper.
for paper, models in paper_models_dict.items():
    model_count = len(models)
    print(f"{paper}: {model_count} models")

Bakeoff2017: 25 models
Bakeoff2021: 10 models
Bakeoff2023: 35 models
HIVE-COTEV2: 18 models


In [9]:
# Tabulate the (row, col) shape of every Bakeoff2023 model's results table;
# differing row counts show which models were evaluated on more datasets.
find_shape_of_datasets(paper_models_dict['Bakeoff2023'])

Unnamed: 0,1NN-DTW,Arsenal,BOSS,CIF,CNN,Catch22,DrCIF,EE,FreshPRINCE,HC1,...,STSF,ShapeDTW,Signatures,TDE,TS-CHIEF,TSF,TSFresh,WEASEL-D,WEASEL,cBOSS
row,112,112,112,112,112,112,112,112,142,112,...,112,112,112,112,112,112,112,142,112,112
col,31,31,31,31,31,31,31,31,31,31,...,31,31,31,31,31,31,31,31,31,31


### Part 2: Downloading Datasets

In [10]:
from src.data_loader import get_size_of_file_in_mb, process_datasets, load_datasets_from_json

In [11]:
# First column of the Arsenal results table; it is used below as the list of
# dataset names to download.
dataset_lists = paper_models_dict['Bakeoff2023']['Arsenal'].iloc[:,0]

In [12]:
from aeon.datasets import load_classification
# Sanity check: load the first dataset of the list through aeon and inspect it.
# .iloc[0] selects by position, so "first dataset" stays correct even when the
# Series index is not the default 0..n-1 range (plain [0] is a label lookup).
X, y, meta_data = load_classification(dataset_lists.iloc[0], return_metadata=True)
print(" Shape of X = ", X.shape)
print(" Meta data = ", meta_data)

 Shape of X =  (781, 1, 176)
 Meta data =  {'problemname': 'adiac', 'timestamps': False, 'missing': False, 'univariate': True, 'equallength': True, 'classlabel': True, 'targetlabel': False, 'class_values': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37']}


In [13]:
# Debug helper (kept disabled): uncomment to list (position, dataset name) pairs.
# [(a, b) for a, b  in enumerate(dataset_lists)]

In [14]:
%%time
# Save every dataset named in dataset_lists as a JSON file under
# DATASET_DIR_NAME. Returns the per-file sizes in MB and the datasets that
# could not be processed (presumably those that raised an error -- confirm
# in src.data_loader).
file_sizes_mb, problematic_datasets = process_datasets(dataset_lists, DATASET_DIR_NAME)

Saved 'Adiac.json' successfully. Size: 1.58 MB.
Saved 'ArrowHead.json' successfully. Size: 0.61 MB.
Saved 'Beef.json' successfully. Size: 0.33 MB.
Saved 'BeetleFly.json' successfully. Size: 0.24 MB.
Saved 'BirdChicken.json' successfully. Size: 0.24 MB.
Saved 'Car.json' successfully. Size: 0.79 MB.
Saved 'CBF.json' successfully. Size: 1.38 MB.
Saved 'ChlorineConcentration.json' successfully. Size: 8.5 MB.
Saved 'CinCECGTorso.json' successfully. Size: 27.72 MB.
Saved 'Coffee.json' successfully. Size: 0.18 MB.
Saved 'Computers.json' successfully. Size: 4.32 MB.
Saved 'CricketX.json' successfully. Size: 2.75 MB.
Saved 'CricketY.json' successfully. Size: 2.73 MB.
Saved 'CricketZ.json' successfully. Size: 2.75 MB.
Saved 'DiatomSizeReduction.json' successfully. Size: 1.28 MB.
Saved 'DistalPhalanxOutlineAgeGroup.json' successfully. Size: 0.5 MB.
Saved 'DistalPhalanxOutlineCorrect.json' successfully. Size: 0.81 MB.
Saved 'DistalPhalanxTW.json' successfully. Size: 0.5 MB.
Saved 'Earthquakes.json

In [15]:
# Show the datasets reported as problematic; an empty list means none.
problematic_datasets

[]

In [16]:
# Spot-check the on-disk size (in MB) of one saved dataset file.
get_size_of_file_in_mb(f"{DATASET_DIR_NAME}/StarLightCurves.json")

109.71

In [17]:
# Rank the saved files by size (MB) and show the 25 largest.
pd.Series(file_sizes_mb).sort_values(ascending=False).head(25)

StarLightCurves               109.71
UWaveGestureLibraryAll         48.58
HandOutlines                   42.64
MixedShapesRegularTrain        34.54
NonInvasiveFetalECGThorax2     33.41
NonInvasiveFetalECGThorax1     33.40
MixedShapesSmallTrain          29.82
FordA                          28.57
Mallat                         28.37
CinCECGTorso                   27.72
FordB                          25.84
Phoneme                        25.35
EthanolLevel                   20.54
ElectricDevices                19.20
UWaveGestureLibraryX           16.31
UWaveGestureLibraryZ           16.31
UWaveGestureLibraryY           16.29
Yoga                           16.21
InlineSkate                    14.39
SemgHandGenderCh2              14.06
SemgHandMovementCh2            14.06
SemgHandSubjectCh2             14.06
Wafer                          12.46
FreezerRegularTrain            10.26
FreezerSmallTrain               9.84
dtype: float64

In [19]:
# Read the previously saved JSON files back into memory, keyed by dataset name.
loaded_datasets = load_datasets_from_json(dataset_lists, DATASET_DIR_NAME)

100%|█████████████████████████████████████████████████████████████████████████████████| 112/112 [00:06<00:00, 17.82it/s]


In [21]:
# List the names of the datasets that were loaded.
loaded_datasets.keys()

dict_keys(['Adiac', 'ArrowHead', 'Beef', 'BeetleFly', 'BirdChicken', 'Car', 'CBF', 'ChlorineConcentration', 'CinCECGTorso', 'Coffee', 'Computers', 'CricketX', 'CricketY', 'CricketZ', 'DiatomSizeReduction', 'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW', 'Earthquakes', 'ECG200', 'ECG5000', 'ECGFiveDays', 'ElectricDevices', 'FaceAll', 'FaceFour', 'FacesUCR', 'FiftyWords', 'Fish', 'FordA', 'FordB', 'GunPoint', 'Ham', 'HandOutlines', 'Haptics', 'Herring', 'InlineSkate', 'InsectWingbeatSound', 'ItalyPowerDemand', 'LargeKitchenAppliances', 'Lightning2', 'Lightning7', 'Mallat', 'Meat', 'MedicalImages', 'MiddlePhalanxOutlineAgeGroup', 'MiddlePhalanxOutlineCorrect', 'MiddlePhalanxTW', 'MoteStrain', 'NonInvasiveFetalECGThorax1', 'NonInvasiveFetalECGThorax2', 'OliveOil', 'OSULeaf', 'PhalangesOutlinesCorrect', 'Phoneme', 'Plane', 'ProximalPhalanxOutlineAgeGroup', 'ProximalPhalanxOutlineCorrect', 'ProximalPhalanxTW', 'RefrigerationDevices', 'ScreenType', 'ShapeletS

In [20]:
# Each loaded entry unpacks into (X, y, metadata); inspect Adiac to confirm
# the JSON round trip gives the same shapes as the direct aeon load above.
X_adiac, y_adiac, meta_data_adiac = loaded_datasets['Adiac']
print(X_adiac.shape, y_adiac.shape)
print(meta_data_adiac.keys())

(781, 1, 176) (781,)
dict_keys(['problemname', 'timestamps', 'missing', 'univariate', 'equallength', 'classlabel', 'targetlabel', 'class_values'])


__TS length, dim, num of classes (=2, >2), Gini/entropy for class imbalance, type__