In [1]:
import os
import json
from collections import OrderedDict
import pandas as pd

In [2]:
# Directory that is scanned for the per-dataset JSON metadata files.
meta_info_path = './meta_info'

# SNR bucket -> {dataset name: [variable indices]}; starts empty and is
# populated while the metadata files are read.
snr_info = {level: {} for level in ('snr30', 'snr20', 'snr10', 'snr0')}

def categorize_snr(vn, dataset_info):
    """Group variable indices by the SNR band each variable falls into.

    Args:
        vn: Number of variables to inspect; the entries ``var0`` ..
            ``var{vn-1}`` are read from ``dataset_info``.
        dataset_info: Mapping where ``dataset_info[f'var{i}']['snr']`` holds
            the SNR value of variable ``i``.

    Returns:
        A dict with the keys ``'snr30'``, ``'snr20'``, ``'snr10'`` and
        ``'snr0'``. Each value is the list of variable indices (ascending)
        whose SNR is ``>= 30``, ``>= 20``, ``>= 10`` or none of those,
        respectively; an index appears in exactly one list.
    """
    # Lower bounds are tested from highest to lowest; the first hit wins.
    bands = (('snr30', 30), ('snr20', 20), ('snr10', 10))

    categories = {label: [] for label, _ in bands}
    categories['snr0'] = []  # catch-all for everything below the lowest bound

    for idx in range(vn):
        value = dataset_info[f'var{idx}']['snr']
        bucket = next(
            (label for label, floor in bands if value >= floor),
            'snr0',
        )
        categories[bucket].append(idx)

    return categories

In [3]:
# Read every metadata file and record, per SNR bucket, which variable indices
# of each dataset fall into that bucket.
# sorted() makes the insertion order into snr_info independent of the
# order in which the filesystem happens to list the directory.
for fp in sorted(os.listdir(meta_info_path)):
    full_path = os.path.join(meta_info_path, fp)

    # os.listdir also returns sub-directories and hidden entries (for example
    # .ipynb_checkpoints or .DS_Store); opening or parsing those would raise,
    # so they are skipped.
    if fp.startswith('.') or not os.path.isfile(full_path):
        continue

    # The dataset name is the file name up to its first dot.
    name = fp.split('.')[0]
    # Open explicitly as UTF-8 rather than relying on the locale default.
    with open(full_path, 'r', encoding='utf-8') as file:
        dataset_info = json.load(file)

    vn = dataset_info['variance_num']
    categories = categorize_snr(vn, dataset_info)

    for snr_level, indices in categories.items():
        # Only buckets that actually contain variables get an entry.
        if indices:
            snr_info[snr_level][name] = indices

In [4]:
def merge_dicts(dict1, dict2):
    """Combine two mappings, concatenating the values of shared keys.

    Args:
        dict1: First mapping; its keys come first in the result.
        dict2: Second mapping; keys not present in ``dict1`` are appended
            after them.

    Returns:
        A new dict. A key found in only one input keeps that input's value
        object; a key found in both maps to ``dict1[key] + dict2[key]``.
    """
    merged_dict = dict(dict1)
    for key, value in dict2.items():
        if key in dict1:
            # Shared key: dict1's value goes first, then dict2's.
            merged_dict[key] = dict1[key] + value
        else:
            merged_dict[key] = value
    return merged_dict

# Pool the 'snr20' and 'snr30' buckets; for a dataset present in both, the
# 'snr20' indices come first, followed by the 'snr30' indices.
snr_greater_20 = merge_dicts(
    dict1=snr_info['snr20'],
    dict2=snr_info['snr30'],
)

In [5]:
# Rebuild the mapping with its dataset names in ascending sorted order.
snr_greater_20 = OrderedDict(
    (dataset, snr_greater_20[dataset]) for dataset in sorted(snr_greater_20)
)

In [6]:
def condition(k):
    """Decide whether the dataset named ``k`` passes the filter.

    Args:
        k: Dataset name.

    Returns:
        For names starting with ``'cmip6'`` or ``'era5'``: True only for
        ``'cmip6_1850'``, ``'cmip6_1855'``, ``'era5_1989'`` and
        ``'era5_1990'``. For every other name: False when it contains
        ``'missing'``, True otherwise.
    """
    if k.startswith(('cmip6', 'era5')):
        # Of these two families only an explicit allow-list is kept.
        return k in ('cmip6_1850', 'cmip6_1855', 'era5_1989', 'era5_1990')

    return 'missing' not in k

In [7]:
# Keep only the datasets accepted by condition(); the iteration order of
# snr_greater_20 carries over to the result.
filtered_datasets = dict(
    entry for entry in snr_greater_20.items() if condition(entry[0])
)
filtered_datasets

{'LOOP_SEATTLE': [0],
 'M_DENSE': [0],
 'PEMS04': [2],
 'PEMS07': [0],
 'PEMS08': [0, 2],
 'PEMS_BAY': [0],
 'Q-TRAFFIC': [0],
 'alibaba_cluster_trace_2018': [1],
 'australian_electricity_demand': [0],
 'bdg-2_bear': [0],
 'bdg-2_fox': [0],
 'bdg-2_panther': [0],
 'beijing_air_quality': [7],
 'borg_cluster_data_2011': [1],
 'cif_2016_12': [0],
 'cif_2016_6': [0],
 'cmip6_1850': [11,
  13,
  14,
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38],
 'cmip6_1855': [11,
  13,
  14,
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38],
 'covid19_energy': [0],
 'covid_deaths': [0],
 'elecdemand': [0],
 'elf': [0],
 'era5_1989': [9, 0, 3, 4, 5, 6, 7, 8, 17, 24, 25, 26, 27, 28, 29, 30],
 'era5_1990': [9, 0, 3, 4, 5, 6, 7, 8, 17, 24, 25, 26, 27, 28, 29, 30],
 'favorita_transactions': [0],
 'fred_md': [0],
 'gfc14_load': [0],
 'gfc17_l

In [8]:
# Write the selection as compact JSON: no indentation and no spaces after
# the ',' and ':' separators.
with open('filtered_datasets.json', mode='w') as f:
    json.dump(
        filtered_datasets,
        fp=f,
        indent=None,
        separators=(',', ':'),
    )