In [2]:
import os
import pathlib as pl

import pandas as pd

In [3]:
# Root of the EXIST 2023 shared-task download. Set the EXIST_2023_DIR environment
# variable to point at your own copy; the fallback keeps the original location so
# existing runs are unaffected.
exist_root = os.environ.get('EXIST_2023_DIR', '/Users/alexb/Documents/EXIST_2023_Shared_Task')

train_file_path = os.path.join(exist_root, 'training', 'EXIST2023_training.json')
dev_file_path = os.path.join(exist_root, 'dev', 'EXIST2023_dev.json')
test_file_path = os.path.join(exist_root, 'test', 'EXIST2023_test.json')
# The trailing '' keeps the trailing separator this folder string has always had.
gold_folder = os.path.join(exist_root, 'evaluation', 'golds', '')
output_folder = './output/'
# Make sure the output folder exists
pl.Path(output_folder).mkdir(parents=True, exist_ok=True)


In [11]:
# Read each split's JSON file into its own DataFrame, keyed by split name.
# orient='index' makes every top-level JSON key a row label.
split_data = {
    split_name: pd.read_json(json_path, orient='index')
    for split_name, json_path in (
        ('train', train_file_path),
        ('dev', dev_file_path),
        ('test', test_file_path),
    )
}

In [12]:
# Collect the gold annotations as gold_data[task][gold_type], for tasks 1-3 and
# the two label flavours ('hard', 'soft'). Each entry stacks the training, dev
# and test gold files of that task/flavour into a single DataFrame.
# Example file name: EXIST2023_training_task1_gold_hard.json
gold_root = pl.Path(gold_folder)
gold_data = {}
for task in (1, 2, 3):
    gold_data[task] = {}
    for gold_type in ('hard', 'soft'):
        gold_file_paths = (
            gold_root / f'EXIST2023_training_task{task}_gold_{gold_type}.json',
            gold_root / f'EXIST2023_dev_task{task}_gold_{gold_type}.json',
            gold_root / f'EXIST2023_test_task{task}_gold_{gold_type}.json',
        )
        # Row-wise concatenation, in training -> dev -> test order
        gold_data[task][gold_type] = pd.concat(
            [pd.read_json(path, orient='index') for path in gold_file_paths]
        )

In [14]:
# Attach every gold label column to each split, prefixed with its task number
# (e.g. 'task1_'). The cell is safe to re-run: columns left behind by a previous
# run are replaced instead of being duplicated under '_left' / '_right' suffixes,
# which would otherwise accumulate and leak into the exported files.
for task in range(1, 4):
    prefix = f'task{task}_'
    for gold_type in ['hard', 'soft']:
        # The prefixed gold frame is the same for every split, so build it once.
        new_df = gold_data[task][gold_type].add_prefix(prefix)
        new_columns = set(new_df.columns)
        for split, data in split_data.items():
            # Columns that a previous execution of this cell already joined in.
            stale = [col for col in data.columns if col in new_columns]
            # DataFrame.join defaults to a left join on the index, so each split
            # keeps exactly its own rows and only gains the label columns.
            split_data[split] = data.drop(columns=stale).join(new_df)

            

In [25]:
# For each task, create a folder holding one file per (split, language) with that
# task's gold label columns next to the tweet id and text.
for task in range(1, 4):
    prefix = f'task{task}_'
    # Create output folder for this task
    task_output_folder = pl.Path(output_folder, f'exist_2023_t{task}')
    task_output_folder.mkdir(parents=True, exist_ok=True)
    for split, data in split_data.items():
        # Select this task's columns by PREFIX: a plain substring test would also
        # pick up any column that merely contains 'task<N>_' somewhere in its name.
        label_columns = [col for col in data.columns if col.startswith(prefix)]
        task_columns = label_columns + ['id_EXIST', 'tweet']
        # Strip only the leading prefix, then give id / tweet their output names.
        renamed_columns = {col: col[len(prefix):] for col in label_columns}
        renamed_columns.update({'id_EXIST': 'id', 'tweet': 'text'})
        for lang in ['en', 'es']:
            # Keep the rows of this language and the columns of this task
            task_data = data.loc[data['lang'] == lang, task_columns].rename(columns=renamed_columns)
            output_file_path = task_output_folder / f'{split}_{lang}.json'
            # NOTE(review): lines=True together with indent=4 likely makes every
            # record span several lines, which strict JSON Lines readers reject --
            # confirm which of the two formats is actually wanted.
            task_data.to_json(output_file_path, orient='records', lines=True, force_ascii=False, indent=4)

In [24]:
# Debug peek at the column selection left over from the last iteration of the
# export loop; this relies on the loop variable leaking into the notebook
# namespace, so the export cell must have been run first.
task_columns

['task3_hard_label', 'task3_soft_label', 'id_EXIST', 'tweet']