In [2]:
import joblib
import sys
import torch
import math
import numpy as np
# from tqdm.notebook import tqdm
from tqdm import tqdm
import os
import pandas as pd
from sktime.datasets import load_from_tsfile_to_dataframe
from aeon.datasets import load_classification

import warnings
warnings.filterwarnings("ignore")

### Prepare the UEA classification datasets

In [5]:
dataset_names=[
    'ArticularyWordRecognition', 
    'AtrialFibrillation', 
    'BasicMotions', 
    'CharacterTrajectories', 
    'Cricket', 
    'DuckDuckGeese', 
    'ERing', 
    'EigenWorms', 
    'Epilepsy', 
    'EthanolConcentration', 
    'FaceDetection', 
    'FingerMovements', 
    'HandMovementDirection', 
    'Handwriting', 
    'Heartbeat', 
    # 'InsectWingbeat',
    'JapaneseVowels', 
    'LSST', 
    'Libras', 
    'MotorImagery', 
    'NATOPS', 
    'PEMS-SF', 
    'PenDigits', 
    'PhonemeSpectra', 
    'RacketSports', 
    'SelfRegulationSCP1', 
    'SelfRegulationSCP2', 
    'SpokenArabicDigits', 
    'StandWalkJump', 
    'UWaveGestureLibrary'
]


# Change the dataset root to the path of your dataset
dataset_root='~/data/UEA_multivariate'

# TODO: Change the path before using
# Moved the files from this path into scope of docker container.
save_root = '~/data/VQShape/uea'

def interpolate_uts(x, new_len):
    x = torch.from_numpy(x)
    x = torch.nn.functional.interpolate(x.view(1, 1, -1), new_len, mode='linear')
    return x.squeeze()

def dataframe_to_list(df):
    """
    Convert each element of a pandas DataFrame to a list.

    :param df: pandas DataFrame.
    :return: List containing all elements of the DataFrame.
    """
    elements_list = []
    for _, row in df.iterrows():
        elements_list.extend(row.tolist())
    return elements_list

def load_single(dataset_root, dataset):
    # df, labels = load_from_tsfile_to_dataframe(f"{dataset_root}/{dataset}/{dataset}_{flag}.ts")
    data_array, labels = load_classification(dataset)
    data_list = [data_array[i] for i in range(data_array.shape[0])]
    data_list = np.array(data_list)

    data_list = [interpolate_uts(data_list.flatten(), new_len=512).float() for x in data_list]
    # data_list = [(x - np.mean(x))/(np.std(x) + np.finfo(float).eps) for x in data_list]
    data_list = [(x - x.mean()) / (x.var() + 1e-5).sqrt() for x in data_list]
    return torch.stack(data_list, dim=0).numpy()


# Load the UEA datasets and write each univariate TS into a csv file
for flag in ['TRAIN', 'TEST']:
    for dataset in dataset_names:
        print(dataset, end=": ")
        save_dir = f"{save_root}/{flag}"
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        data = load_single(dataset_root, dataset)
        print(data.shape)

        for i, x in enumerate(tqdm(data)):
            df = pd.DataFrame(x)
            df.to_csv(f"{save_dir}/{i}.csv", index=False, header=False)

ArticularyWordRecognition: (575, 512)


100%|██████████| 575/575 [00:00<00:00, 1907.08it/s]


AtrialFibrillation: (30, 512)


100%|██████████| 30/30 [00:00<00:00, 826.49it/s]


BasicMotions: (80, 512)


100%|██████████| 80/80 [00:00<00:00, 1734.35it/s]


CharacterTrajectories: (2858, 512)


100%|██████████| 2858/2858 [00:01<00:00, 1846.29it/s]


Cricket: (180, 512)


100%|██████████| 180/180 [00:00<00:00, 1716.14it/s]

DuckDuckGeese: 




(100, 512)


100%|██████████| 100/100 [00:00<00:00, 1707.10it/s]

ERing: 




(300, 512)


100%|██████████| 300/300 [00:00<00:00, 1771.28it/s]

EigenWorms: 




(259, 512)


100%|██████████| 259/259 [00:00<00:00, 1734.70it/s]

Epilepsy: 




(275, 512)


100%|██████████| 275/275 [00:00<00:00, 1762.11it/s]

EthanolConcentration: 




(524, 512)


100%|██████████| 524/524 [00:00<00:00, 1764.73it/s]


FaceDetection: (9414, 512)


100%|██████████| 9414/9414 [00:05<00:00, 1853.96it/s]


FingerMovements: (416, 512)


100%|██████████| 416/416 [00:00<00:00, 1765.88it/s]


HandMovementDirection: (234, 512)


100%|██████████| 234/234 [00:00<00:00, 1757.01it/s]

Handwriting: 




(1000, 512)


100%|██████████| 1000/1000 [00:00<00:00, 1724.21it/s]


Heartbeat: (409, 512)


100%|██████████| 409/409 [00:00<00:00, 1763.36it/s]


JapaneseVowels: (640, 512)


100%|██████████| 640/640 [00:00<00:00, 1760.24it/s]


LSST: (4925, 512)


100%|██████████| 4925/4925 [00:02<00:00, 1727.34it/s]


Libras: (360, 512)


100%|██████████| 360/360 [00:00<00:00, 1730.14it/s]


MotorImagery: (378, 512)


100%|██████████| 378/378 [00:00<00:00, 1758.47it/s]


NATOPS: (360, 512)


100%|██████████| 360/360 [00:00<00:00, 1778.49it/s]


PEMS-SF: (440, 512)


100%|██████████| 440/440 [00:00<00:00, 1764.99it/s]


PenDigits: (10992, 512)


100%|██████████| 10992/10992 [00:06<00:00, 1810.46it/s]


PhonemeSpectra: (6668, 512)


100%|██████████| 6668/6668 [00:03<00:00, 1774.56it/s]


RacketSports: (303, 512)


100%|██████████| 303/303 [00:00<00:00, 1731.96it/s]

SelfRegulationSCP1: 




(561, 512)


100%|██████████| 561/561 [00:00<00:00, 1746.98it/s]


SelfRegulationSCP2: (380, 512)


100%|██████████| 380/380 [00:00<00:00, 1733.19it/s]


SpokenArabicDigits: (8798, 512)


100%|██████████| 8798/8798 [00:05<00:00, 1754.59it/s]


StandWalkJump: (27, 512)


100%|██████████| 27/27 [00:00<00:00, 1689.59it/s]

UWaveGestureLibrary: 




(440, 512)


100%|██████████| 440/440 [00:00<00:00, 1773.46it/s]


ArticularyWordRecognition: (575, 512)


100%|██████████| 575/575 [00:00<00:00, 1912.07it/s]


AtrialFibrillation: (30, 512)


100%|██████████| 30/30 [00:00<00:00, 1712.80it/s]


BasicMotions: (80, 512)


100%|██████████| 80/80 [00:00<00:00, 1740.88it/s]


CharacterTrajectories: (2858, 512)


100%|██████████| 2858/2858 [00:01<00:00, 1856.42it/s]


Cricket: (180, 512)


100%|██████████| 180/180 [00:00<00:00, 1747.03it/s]

DuckDuckGeese: 




(100, 512)


100%|██████████| 100/100 [00:00<00:00, 1732.95it/s]


ERing: (300, 512)


100%|██████████| 300/300 [00:00<00:00, 1777.79it/s]


EigenWorms: (259, 512)


100%|██████████| 259/259 [00:00<00:00, 1726.42it/s]

Epilepsy: 




(275, 512)


100%|██████████| 275/275 [00:00<00:00, 1739.14it/s]

EthanolConcentration: 




(524, 512)


100%|██████████| 524/524 [00:00<00:00, 1779.68it/s]


FaceDetection: (9414, 512)


100%|██████████| 9414/9414 [00:05<00:00, 1843.22it/s]


FingerMovements: (416, 512)


100%|██████████| 416/416 [00:00<00:00, 1738.35it/s]


HandMovementDirection: (234, 512)


100%|██████████| 234/234 [00:00<00:00, 1784.76it/s]

Handwriting: 




(1000, 512)


100%|██████████| 1000/1000 [00:00<00:00, 1734.60it/s]


Heartbeat: (409, 512)


100%|██████████| 409/409 [00:00<00:00, 1791.69it/s]


JapaneseVowels: (640, 512)


100%|██████████| 640/640 [00:00<00:00, 1768.52it/s]


LSST: (4925, 512)


100%|██████████| 4925/4925 [00:02<00:00, 1754.75it/s]


Libras: (360, 512)


100%|██████████| 360/360 [00:00<00:00, 1810.08it/s]


MotorImagery: (378, 512)


100%|██████████| 378/378 [00:00<00:00, 1784.03it/s]


NATOPS: (360, 512)


100%|██████████| 360/360 [00:00<00:00, 1811.31it/s]


PEMS-SF: (440, 512)


100%|██████████| 440/440 [00:00<00:00, 1782.92it/s]


PenDigits: (10992, 512)


100%|██████████| 10992/10992 [00:05<00:00, 1835.02it/s]


PhonemeSpectra: (6668, 512)


100%|██████████| 6668/6668 [00:03<00:00, 1787.18it/s]


RacketSports: (303, 512)


100%|██████████| 303/303 [00:00<00:00, 1743.48it/s]


SelfRegulationSCP1: (561, 512)


100%|██████████| 561/561 [00:00<00:00, 1757.91it/s]


SelfRegulationSCP2: (380, 512)


100%|██████████| 380/380 [00:00<00:00, 1755.44it/s]


SpokenArabicDigits: (8798, 512)


100%|██████████| 8798/8798 [00:05<00:00, 1748.19it/s]


StandWalkJump: (27, 512)


100%|██████████| 27/27 [00:00<00:00, 1699.40it/s]


UWaveGestureLibrary: (440, 512)


100%|██████████| 440/440 [00:00<00:00, 1773.78it/s]


### Prepare the Forecasting datasets

In [6]:
import pandas as pd
import glob

glob.glob("../data/PILE/forecasting/autoformer/*")

[]

In [7]:
from tqdm import tqdm
import os
import numpy as np

dataset = 'traffic'
df = pd.read_csv(f'../data/PILE/forecasting/autoformer/{dataset}.csv')
save_dir = f'../data/VQShape/forecasting/TRAIN/{dataset}'
test_size = 0.2
step_size = 48
seq_lengths = [512] # [96, 192, 336, 512]
data = df.values[:, 1:]
data = data[:int(data.shape[0]*(1-test_size)), :]
start_idx = np.arange(0, data.shape[0] - step_size, step_size)

if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

for l in seq_lengths:
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


FileNotFoundError: [Errno 2] No such file or directory: '../data/PILE/forecasting/autoformer/traffic.csv'

In [63]:
from tqdm import tqdm
import os
import numpy as np

dataset = 'ETTh2'
df = pd.read_csv(f'../data/PILE/forecasting/autoformer/{dataset}.csv')
save_dir = f'../data/VQShape/forecasting/TRAIN/{dataset}'

step_size = 48
seq_lengths = [512] # [96, 192, 336, 512]
data = df.values[:, 1:]
data = data[:12 * 30 * 24 + 4 * 30 * 24, :]
print(data.shape)
start_idx = np.arange(0, data.shape[0] - step_size, step_size)

if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

for l in seq_lengths:
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


(11520, 7)


512: 100%|██████████| 7/7 [00:00<00:00,  9.77it/s]


In [64]:
from tqdm import tqdm
import os
import numpy as np

dataset = 'national_illness'
df = pd.read_csv(f'../data/PILE/forecasting/autoformer/{dataset}.csv')
save_dir = f'../data/VQShape/forecasting/TRAIN/{dataset}'
test_size = 0.2
step_size = 12
seq_lengths = [24, 36, 48, 60]
data = df.values[:, 1:]
data = data[:int(data.shape[0]*(1-test_size)), :]
start_idx = np.arange(0, data.shape[0] - step_size, step_size)

if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

for l in seq_lengths:
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


24: 100%|██████████| 7/7 [00:00<00:00, 85.36it/s]
36: 100%|██████████| 7/7 [00:00<00:00, 85.98it/s]
48: 100%|██████████| 7/7 [00:00<00:00, 85.31it/s]
60: 100%|██████████| 7/7 [00:00<00:00, 83.13it/s]


### Prepare the UCR classification datasets


In [11]:
from sktime.datasets import load_from_ucr_tsv_to_dataframe
import os
from tqdm import tqdm
import numpy as np


def df_to_feature(df, labels):
    features = []
    for _, row in df.iterrows():
        sample = []
        for c in row.tolist():
            # sample.append(interpolate_uts(c.values.flatten(), seq_len))
            sample.append(c.values.flatten())
        features.append(np.stack(sample, axis=0))
    features = np.stack(features, axis=0)

    labels = pd.Series(labels, dtype="category")
    labels = pd.DataFrame(labels.cat.codes, dtype=np.int8).values

    return features, labels


root = f"../data/timeseries_lib/UCR_2018"
datasets = os.listdir(root)
datasets = sorted([d for d in datasets if not d.startswith(".")])

count = 0

for d in tqdm(datasets):
    save_dir = f'../data/VQShape/ucr/TEST/{d}'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    features, labels = load_from_ucr_tsv_to_dataframe(f"{root}/{d}/{d}_TEST.tsv")
    features, labels = df_to_feature(features, labels)

    for i in range(features.shape[0]):
        x = features[i].flatten()
        pd.DataFrame(x).to_csv(f"{save_dir}/{i}.csv", index=False, header=False)
        count += 1

count

100%|██████████| 128/128 [01:01<00:00,  2.09it/s]


130603