In [None]:
import os

In [None]:
# Workspace root comes from the WORK environment variable; when it is unset the
# empty string is used, which makes DATA_DIR the relative path "data".
WORK_DIR = os.getenv("WORK", "")
DATA_DIR = os.path.join(WORK_DIR, "data")

In [None]:
# Numeric suffix interpolated into the subset directory names built further down
# (f"final_skew{n_periods}" and f"final_base{n_periods}").
n_periods = 80

### Utils

In [None]:
def get_subdirs_from_splits(
    subset: str, splits: list[str], data_dir: "str | None" = None
):
    """Collect the unique sub-directory names found under the given splits.

    Each split is looked up at ``<data_dir>/<subset>/<split_name>``. Splits whose
    path is missing (or is not a directory) are skipped. Only immediate child
    directories are collected; regular files are ignored.

    Args:
        subset: Name of the subset directory below the data root.
        splits: Names of the split directories to scan inside ``subset``.
        data_dir: Root directory to scan. When ``None`` (the default) the
            module-level ``DATA_DIR`` is used.

    Returns:
        A sorted list of the unique sub-directory names across all splits.
        Sorting keeps the result stable between runs (set iteration order is
        not).

    Side effects:
        Prints how many unique sub-directories were found.
    """
    root = DATA_DIR if data_dir is None else data_dir
    subdirs = set()
    for split_name in splits:
        split_dir = os.path.join(root, subset, split_name)
        # isdir rather than exists: a stray regular file named like a split is
        # skipped instead of making the directory listing raise.
        if not os.path.isdir(split_dir):
            continue
        with os.scandir(split_dir) as entries:
            subdirs.update(entry.name for entry in entries if entry.is_dir())
    print(f"Found {len(subdirs)} subdirs in {subset} splits {splits}")
    return sorted(subdirs)

In [None]:
def find_substrings(
    setA: list[str],
    setB: list[str],
    setA_name: str = "setA",
    setB_name: str = "setB",
):
    """Find which names in ``setB`` contain a name from ``setA`` as a token.

    Despite the function name, matching is not a raw substring test: every
    ``setB`` entry is split on ``"_"`` and an entry of ``setA`` matches only
    when it equals one of the resulting tokens exactly.

    Args:
        setA: Candidate token names to look for.
        setB: Names to split on ``"_"`` and search.
        setA_name: Label for ``setA`` used in the printed summary.
        setB_name: Label for ``setB`` used in the printed summary.

    Returns:
        A tuple ``(matching_strings, matching_patterns)`` of sorted,
        de-duplicated lists: the ``setB`` entries containing at least one
        ``setA`` token, and the ``setA`` entries that occur as a token in at
        least one ``setB`` entry.

    Side effects:
        Prints the size of both result lists.
    """
    patterns = set(setA)
    matching_strings = set()
    matching_patterns = set()
    # One pass over setB with a set intersection per entry replaces the two
    # nested any(...) scans over both inputs.
    for name in setB:
        hits = patterns.intersection(name.split("_"))
        if hits:
            matching_strings.add(name)
            matching_patterns.update(hits)
    # Sorted so the output is reproducible across runs (set order is arbitrary).
    res = (sorted(matching_strings), sorted(matching_patterns))
    print(f"Found {len(res[0])} {setB_name} systems names that use {setA_name} systems")
    print(f"Found {len(res[1])} {setA_name} systems names in {setB_name}")
    return res

### Compare Skew and Base Subdirectories

In [None]:
# Subset directory names passed as the `subset` argument of
# get_subdirs_from_splits; both carry the same n_periods suffix.
skew_subset = f"final_skew{n_periods}"
base_subset = f"final_base{n_periods}"

In [None]:
# Unique sub-directory names found across the three skew training splits;
# reused as the `setB` argument in the comparisons that follow.
skew_splits_train = ["train", "train_z5_z10", "train_z10_z15"]
skew_subdirs_train = get_subdirs_from_splits(skew_subset, skew_splits_train)

In [None]:
# Step 1: Compare skew train vs base test_zeroshot
# List the sub-directories of the three base test_zeroshot splits, then look for
# those names among the "_"-separated tokens of the skew train names.
base_splits_test_zeroshot = [
    "test_zeroshot",
    "test_zeroshot_z5_z10",
    "test_zeroshot_z10_z15",
]
base_subdirs_test_zeroshot = get_subdirs_from_splits(
    base_subset, base_splits_test_zeroshot
)
res = find_substrings(
    base_subdirs_test_zeroshot,
    skew_subdirs_train,
    setA_name="base test_zeroshot",
    setB_name="skew train",
)
# res[1]: the base test_zeroshot names that occur as a token of a skew train name.
print(res[1])

In [None]:
# Step 2: Compare skew train vs base test
# Same check as the zero-shot comparison, but against the sub-directories of the
# single base "test" split.
base_splits_test = ["test"]
base_subdirs_test = get_subdirs_from_splits(base_subset, base_splits_test)
res = find_substrings(
    base_subdirs_test, skew_subdirs_train, setA_name="base test", setB_name="skew train"
)
# res[1]: the base test names that occur as a token of a skew train name.
print(res[1])

In [None]:
# Step 3: Compare skew test vs base test_zeroshot
skew_splits_test = ["test"]
skew_subdirs_test = get_subdirs_from_splits(skew_subset, skew_splits_test)

# The base zero-shot listing is built here and passed straight to the comparison,
# so this cell uses what it computes instead of a variable left by another cell.
base_splits = ["test_zeroshot", "test_zeroshot_z5_z10", "test_zeroshot_z10_z15"]
base_subdirs = get_subdirs_from_splits(base_subset, base_splits)
res = find_substrings(
    base_subdirs,
    skew_subdirs_test,
    setA_name="base test_zeroshot",
    setB_name="skew test",
)
# res[1]: the base test_zeroshot names that occur as a token of a skew test name.
print(res[1])

In [None]:
# Step 4: Compare skew test_zeroshot vs base test_zeroshot
# List the sub-directories of the three skew test_zeroshot splits.
skew_splits_test_zeroshot = [
    "test_zeroshot",
    "test_zeroshot_z5_z10",
    "test_zeroshot_z10_z15",
]
skew_subdirs_test_zeroshot = get_subdirs_from_splits(
    skew_subset, skew_splits_test_zeroshot
)

# base_subdirs_test_zeroshot is not defined in this cell; it must already exist
# in the kernel (it is assigned in the Step 1 cell).
res = find_substrings(
    base_subdirs_test_zeroshot,
    skew_subdirs_test_zeroshot,
    setA_name="base test_zeroshot",
    setB_name="skew test_zeroshot",
)
# res[1]: the base test_zeroshot names that occur as a token of a skew
# test_zeroshot name.
print(res[1])

### Compare Train vs Test_Zeroshot

In [None]:
skew_split_train = ["train", "train_z5_z10", "train_z10_z15"]
skew_split_test_zeroshot = [
    "test_zeroshot",
    "test_zeroshot_z5_z10",
    "test_zeroshot_z10_z15",
]
# Compare the sub-directories inside the splits, not the split names themselves:
# no train split name is a "_"-separated token of a test_zeroshot split name, so
# passing the split lists directly to find_substrings always reports 0 matches.
skew_train_systems = get_subdirs_from_splits(skew_subset, skew_split_train)
skew_test_zeroshot_systems = get_subdirs_from_splits(
    skew_subset, skew_split_test_zeroshot
)
res = find_substrings(
    skew_train_systems,
    skew_test_zeroshot_systems,
    setA_name="skew train",
    setB_name="skew test_zeroshot",
)

In [None]:
base_split_train = ["train", "train_z5_z10", "train_z10_z15"]
base_split_test_zeroshot = [
    "test_zeroshot",
    "test_zeroshot_z5_z10",
    "test_zeroshot_z10_z15",
]
# Compare the sub-directories inside the splits, not the split names themselves:
# no train split name is a "_"-separated token of a test_zeroshot split name, so
# passing the split lists directly to find_substrings always reports 0 matches.
base_train_systems = get_subdirs_from_splits(base_subset, base_split_train)
base_test_zeroshot_systems = get_subdirs_from_splits(
    base_subset, base_split_test_zeroshot
)
res = find_substrings(
    base_train_systems,
    base_test_zeroshot_systems,
    setA_name="base train",
    setB_name="base test_zeroshot",
)