In [1]:
import itertools
import os
import pickle as pkl
import random
import warnings
from collections import Counter
from pathlib import Path
# Every data location is resolved relative to the directory the notebook runs from.
cwd = os.getcwd()
# Input tree read by the loading cell below.
biclfs_paper_data = Path(cwd).joinpath("data_biclfs_paper", "data")
# Output directory that the later cells pickle their results into.
biclfs_data_dir = Path(cwd).joinpath("data_biclfs")
training_dir = biclfs_paper_data.joinpath("training_dicts")
testing_dir = biclfs_paper_data.joinpath("testing_dicts")

In [2]:
def _load_task_dicts(directory):
    """Load every ``*.pkl`` file in ``directory`` into a dict keyed by file stem.

    Files are visited in sorted path order so the resulting dict order (and
    every split or pickle derived from it) is the same on each run and
    machine; ``Path.glob`` on its own yields entries in filesystem order.

    Args:
        directory: ``Path`` of the folder to scan (non-recursive).

    Returns:
        dict mapping each file's stem to the unpickled object.
    """
    loaded = {}
    for filepath in sorted(directory.glob("*.pkl")):
        with open(filepath, "rb") as f:
            # NOTE: pickle can execute arbitrary code while loading; only
            # point this at trusted data files.
            loaded[filepath.stem] = pkl.load(f)
    return loaded


# task name (file stem) -> raw task data, for the training and testing dumps.
task2training_data = _load_task_dicts(training_dir)
task2testing_data = _load_task_dicts(testing_dir)

In [3]:
# The group of a task is the part of its name before the first underscore.
# One entry per task, so a group name repeats once for every task it contains.
train_groups = [name.partition("_")[0] for name in task2training_data.keys()]
test_groups = [name.partition("_")[0] for name in task2testing_data.keys()]

In [4]:
# Three folds, each given as an inclusive (first, last) range of group indices
# and expanded into group names of the form "group<N>".
cv_split = [
    [f"group{idx}" for idx in range(first, last + 1)]
    for first, last in ([0, 3], [4, 7], [8, 19])
]
# Per fold, keep only the groups that actually occur among the training tasks
# (train_folds) or the testing tasks (test_folds), preserving fold order.
train_folds = [[name for name in groups if name in train_groups] for groups in cv_split]
test_folds = [[name for name in groups if name in test_groups] for groups in cv_split]
train_folds, test_folds

([['group0', 'group1', 'group2', 'group3'],
  ['group4', 'group5', 'group6', 'group7'],
  ['group8',
   'group9',
   'group10',
   'group11',
   'group12',
   'group13',
   'group14',
   'group15',
   'group16',
   'group17',
   'group18',
   'group19']],
 [['group0', 'group1', 'group2', 'group3'],
  ['group4', 'group7'],
  ['group8', 'group9', 'group10', 'group14', 'group18', 'group19']])

In [5]:
# Convert to the format used in paper
# Each row assigns the three folds to the (train, val, test) roles; the rows are
# rotations of one another, so every fold takes each role exactly once.
index_split = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]
# Shuffle with a dedicated, seeded generator so the order of the splits (and of
# the pickle written from them) is identical on every run. This also leaves the
# global `random` state untouched.
CV_SHUFFLE_SEED = 0
random.Random(CV_SHUFFLE_SEED).shuffle(index_split)
# train_folds (not test_folds) supplies the group names for all three roles;
# the following cell turns these group names into concrete task names.
cv_split_ict = [
    {
        "train": train_folds[train_idx],
        "val": train_folds[val_idx],
        "test": train_folds[test_idx],
    }
    for train_idx, val_idx, test_idx in index_split
]
cv_split_ict

[{'train': ['group0', 'group1', 'group2', 'group3'],
  'val': ['group4', 'group5', 'group6', 'group7'],
  'test': ['group8',
   'group9',
   'group10',
   'group11',
   'group12',
   'group13',
   'group14',
   'group15',
   'group16',
   'group17',
   'group18',
   'group19']},
 {'train': ['group4', 'group5', 'group6', 'group7'],
  'val': ['group8',
   'group9',
   'group10',
   'group11',
   'group12',
   'group13',
   'group14',
   'group15',
   'group16',
   'group17',
   'group18',
   'group19'],
  'test': ['group0', 'group1', 'group2', 'group3']},
 {'train': ['group8',
   'group9',
   'group10',
   'group11',
   'group12',
   'group13',
   'group14',
   'group15',
   'group16',
   'group17',
   'group18',
   'group19'],
  'val': ['group0', 'group1', 'group2', 'group3'],
  'test': ['group4', 'group5', 'group6', 'group7']}]

In [6]:
# Replace with actual task names
# "train" is drawn from the training tasks, "val" and "test" from the testing
# tasks; a task belongs to a role when the prefix before its first underscore
# is one of the role's group names.
# NOTE: the group-name splits are overwritten in place, so this cell is meant
# to run once right after the splits are built.
cv_split_ict[:] = [
    {
        "train": [name for name in task2training_data if name.partition("_")[0] in groups["train"]],
        "val": [name for name in task2testing_data if name.partition("_")[0] in groups["val"]],
        "test": [name for name in task2testing_data if name.partition("_")[0] in groups["test"]],
    }
    for groups in cv_split_ict
]

In [7]:
# Sanity check: across all splits, the "train" lists together contain every
# training task exactly once.
check_train = [name for fold in cv_split_ict for name in fold["train"]]
assert sorted(check_train) == sorted(task2training_data)

In [8]:
# Sanity check: across all splits, both the "val" lists and the "test" lists
# contain every testing task exactly once.
check_val = [name for fold in cv_split_ict for name in fold["val"]]
check_test = [name for fold in cv_split_ict for name in fold["test"]]
expected_testing_tasks = sorted(task2testing_data)
assert sorted(check_val) == expected_testing_tasks
assert sorted(check_test) == expected_testing_tasks

In [9]:
# Persist the cross-validation splits (a list of {"train", "val", "test"} dicts).
# This is the first write into the output directory, so create it if needed;
# otherwise open() raises FileNotFoundError on a fresh checkout.
biclfs_data_dir.mkdir(parents=True, exist_ok=True)
with open(biclfs_data_dir / "cross_validation_splits.pkl", "wb") as f:
    pkl.dump(cv_split_ict, f)

In [10]:
# Convert data to the format used in paper (which I believe is incorrect)
# The only difference is that the data from the training
# fold will be separate from the one in the val and testing fold
# Each raw example contributes its "c" field as <input> and its "a" field as <label>.
training_data = {
    task: [{"<input>": example["c"], "<label>": example["a"]} for example in examples]
    for task, examples in task2training_data.items()
}
# Unique templates per task, built from each example's "q" field.
# dict.fromkeys() removes duplicates while keeping first-seen order; a set
# would order the strings by their (per-process randomized) hash, making the
# saved template order differ from one kernel session to the next.
training_templates = {
    task: list(dict.fromkeys(f"<input> {example['q']} <label>" for example in examples))
    for task, examples in task2training_data.items()
}

In [11]:
# Write the training examples and their templates to the output directory,
# one pickle file each.
for filename, payload in (
    ("training_data.pkl", training_data),
    ("training_templates.pkl", training_templates),
):
    with (biclfs_data_dir / filename).open("wb") as handle:
        pkl.dump(payload, handle)

In [12]:
# The testing format is different from the training one:
# each task maps to a dict whose values are lists of examples.
testing_data = {}
testing_templates = {}
for task, keyed_examples in task2testing_data.items():
    # Flatten the per-key example lists into a single list for the task.
    testing_data[task] = [
        {"<input>": example["c"], "<label>": example["a"]}
        for example_group in keyed_examples.values()
        for example in example_group
    ]
    # One template per dict key, built from the key's element at index 1
    # (presumably the question text -- TODO confirm against the raw data).
    testing_templates[task] = [f"<input> {key[1]} <label>" for key in keyed_examples]

In [13]:
# Write the testing examples and their templates to the output directory,
# one pickle file each.
for filename, payload in (
    ("testing_data.pkl", testing_data),
    ("testing_templates.pkl", testing_templates),
):
    with (biclfs_data_dir / filename).open("wb") as handle:
        pkl.dump(payload, handle)

In [5]:
def _read_pickle(filename):
    """Unpickle ``filename`` from ``biclfs_data_dir`` and return the object."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted
    # files such as the ones written by the earlier cells of this notebook.
    with (biclfs_data_dir / filename).open("rb") as handle:
        return pkl.load(handle)


# Reload the saved examples and templates from disk.
training_data = _read_pickle("training_data.pkl")
testing_data = _read_pickle("testing_data.pkl")
training_templates = _read_pickle("training_templates.pkl")
testing_templates = _read_pickle("testing_templates.pkl")

In [15]:
# Attach a template to each training example, producing per task a list of
# {"<input>", "template", "<label>"} dicts.
training_data_templated = {}
assert training_data.keys() == training_templates.keys()
training_tasks_truncated = []
for task, examples in training_data.items():
    templates = training_templates[task]
    # zip() pairs purely by position and stops at the shorter list, so a
    # length mismatch silently drops the surplus examples (or templates).
    # Record such tasks so the loss is visible instead of silent.
    if len(examples) != len(templates):
        training_tasks_truncated.append(task)
    training_data_templated[task] = [
        {
            "<input>": example["<input>"],
            "template": template,  # Might need to remove the period at the end
            "<label>": example["<label>"],
        }
        for example, template in zip(examples, templates)
    ]
if training_tasks_truncated:
    # NOTE(review): the output is left exactly as before; decide whether each
    # example should instead carry its own question from the raw data.
    warnings.warn(
        f"{len(training_tasks_truncated)} of {len(training_data)} training tasks have a "
        "different number of examples and templates; zip() truncated them to the "
        f"shorter length (first few: {training_tasks_truncated[:5]})."
    )

{'<input>': '* food : tea , sugar , macaroni , canned beef , wheat flour , condensed milk ; * hygiene : detergent , laundry soap ; * bedding : bed linen , blankets ; * clothing : warm jackets , trousers , suits , felt boots ( for both adults and children ) .',
 'template': '<input> Are the people described in the text in need of food? <label>',
 '<label>': 1}

In [23]:
# Attach a template to each testing example, producing per task a list of
# {"<input>", "template", "<label>"} dicts.
testing_data_templated = {}
assert testing_data.keys() == testing_templates.keys()
testing_tasks_truncated = []
for task, examples in testing_data.items():
    templates = testing_templates[task]
    # zip() pairs purely by position and stops at the shorter list, so a
    # length mismatch silently drops the surplus examples (or templates).
    # Record such tasks so the loss is visible instead of silent.
    if len(examples) != len(templates):
        testing_tasks_truncated.append(task)
    testing_data_templated[task] = [
        {
            "<input>": example["<input>"],
            "template": template,  # Might need to remove the period at the end
            "<label>": example["<label>"],
        }
        for example, template in zip(examples, templates)
    ]
if testing_tasks_truncated:
    # NOTE(review): the output is left exactly as before; decide whether each
    # example should instead carry the template of the key it came from.
    warnings.warn(
        f"{len(testing_tasks_truncated)} of {len(testing_data)} testing tasks have a "
        "different number of examples and templates; zip() truncated them to the "
        f"shorter length (first few: {testing_tasks_truncated[:5]})."
    )

In [24]:
# Write the templated training and testing examples to the output directory,
# one pickle file each.
for filename, payload in (
    ("training_data_templated.pkl", training_data_templated),
    ("testing_data_templated.pkl", testing_data_templated),
):
    with (biclfs_data_dir / filename).open("wb") as handle:
        pkl.dump(payload, handle)