In [25]:
import json

# Language codes used in the experiment config file names
LANG1, LANG2 = "en", "nl"

# Example length in tokens
EXAMPLE_TOKEN_LEN = 250

# Model sizes used in the experiment config file names
MODEL_SIZE1, MODEL_SIZE2, MODEL_SIZE3, MODEL_SIZE4 = "125M", "1.3B", "2.7B", "6B"

# Dataset location and name
DATASET_DIR = "EMEA"
DATASET_NAME = "EMEA-c"
SOURCE_DIR = "./datasets"

# Settings passed to the scripts via --config_file config.json
config = dict(
    dataset_dir=DATASET_DIR,
    dataset_name=DATASET_NAME,
    source_dir=SOURCE_DIR,
    example_token_len=EXAMPLE_TOKEN_LEN,
)

# Write the settings to config.json (4-space indented JSON)
with open("config.json", "w") as f:
    f.write(json.dumps(config, indent=4))


In [20]:
# Step 1. Preprocess the data

# Runs preprocessing.py with the settings stored in config.json.
# Purpose (per the original note): increase the number of usable sentences.
# A single run covers both languages (the recorded log shows en, then nl).
# NOTE(review): the recorded log reads EMEA/EMEA.en, while the config cell sets
# DATASET_NAME = "EMEA-c" -- presumably the name was "EMEA" when this step ran; confirm.
!python preprocessing.py --config_file config.json

2024-06-14 17:13:59,838 - INFO - Parsing arguments...
Parsing arguments...
2024-06-14 17:13:59,838 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-14 17:14:00,058 - INFO - ==== Starting data preprocessing script ====
==== Starting data preprocessing script ====
2024-06-14 17:14:00,058 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-14 17:14:00,058 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  EMEA/EMEA.en
2024-06-14 17:14:19,368 - INFO - Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.en.csv: 147
Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.en.csv: 147
2024-06-14 17:14:19,368 - INFO - Counting tokens for nl...
Counting tokens for nl...
Generating byte offset dataset from file:  EMEA/EMEA.nl
2024-06-14 17:14:45,676 - INFO - Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.nl.csv

In [22]:
# Step 2. Process the data into the correct format

# NOTE: when running this right after preprocessing, first change the dataset name
# to name + "-c" (DATASET_NAME in the config cell) and re-run that cell so that
# config.json is rewritten before this script reads it.
# This step gets the dataset into the correct format for the experiment.
!python process_data.py --config_file config.json

2024-06-14 17:15:01,261 - INFO - Parsing arguments...
Parsing arguments...
2024-06-14 17:15:01,262 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-14 17:15:01,499 - INFO - ==== Sarting data processing script ====
==== Sarting data processing script ====
2024-06-14 17:15:01,499 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-14 17:15:01,499 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  EMEA/250/EMEA-c.en
2024-06-14 17:15:15,014 - INFO - Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA-c.en.csv: 15942
Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA-c.en.csv: 15942
2024-06-14 17:15:15,014 - INFO - Filtering sentences for en...
Filtering sentences for en...
2024-06-14 17:15:15,047 - INFO - Generating JSONL for en...
Generating JSONL for en...
2024-06-14 17:15:15,372 - INFO - Counting tokens for nl...

In [24]:
# Step 3 (optional). Split the data into train and eval sets for training the model

# One run covers both languages (the recorded log shows en and nl being processed).
# The model size is not relevant for this step, so any experiment config file can be passed.
# {EXAMPLE_TOKEN_LEN}, {MODEL_SIZE1} and {LANG2} are expanded by IPython from the notebook variables.
!python split_train_val.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json

2024-06-14 17:19:19,585 - INFO - ==== Starting data train+val split script ====
==== Starting data train+val split script ====
2024-06-14 17:19:19,604 - INFO - Splitting indices...
Splitting indices...
2024-06-14 17:19:19,613 - INFO - Splitting datasets into train and validation sets...
Splitting datasets into train and validation sets...
2024-06-14 17:19:19,613 - INFO - Processing language: en
Processing language: en
Output file: EMEA/250/EMEA-c-250.en-train.jsonl
2024-06-14 17:19:19,886 - INFO - Processing language: nl
Processing language: nl
Output file: EMEA/250/EMEA-c-250.nl-train.jsonl
2024-06-14 17:19:20,081 - INFO - ==== Data train+val split script completed ====
==== Data train+val split script completed ====


In [27]:
# Step 4. Split the (training) data into a (pre)prefix set and a suffix set

# Supply the training dataset here only if it is the training data that you want to split.
# One command per model-size/language config; the commented-out lines are the
# configurations that are not run here (only the MODEL_SIZE4 configs are active).
# !python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG1}.json

# !python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
# !python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
# !python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG2}.json

tokenizer_config.json: 100%|███████████████████| 619/619 [00:00<00:00, 2.60MB/s]
vocab.json: 100%|████████████████████████████| 798k/798k [00:00<00:00, 6.91MB/s]
merges.txt: 100%|████████████████████████████| 456k/456k [00:00<00:00, 1.87MB/s]
tokenizer.json: 100%|██████████████████████| 1.37M/1.37M [00:00<00:00, 3.28MB/s]
added_tokens.json: 100%|███████████████████| 4.04k/4.04k [00:00<00:00, 19.8MB/s]
special_tokens_map.json: 100%|█████████████████| 357/357 [00:00<00:00, 1.50MB/s]
2024-06-14 17:22:31,042 - INFO
===== Starting dataset token split generation for language en with token length 250 =====
2024-06-14 17:22:31,042 - INFO
Opened file: EMEA/250/EMEA-c-250.en-train.jsonl
2024-06-14 17:22:31,061 - INFO
Processed 32 lines
2024-06-14 17:22:31,077 - INFO
Processed 64 lines
2024-06-14 17:22:31,092 - INFO
Processed 96 lines
2024-06-14 17:22:31,107 - INFO
Processed 128 lines
2024-06-14 17:22:31,121 - INFO
Processed 160 lines
2024-06-14 17:22:31,135 - INFO
Processed 192 lines
2024-06-14 

In [None]:
# Step 5. Train the model + perform extraction

# Run this directly in a terminal: loading the model inside the notebook crashes the kernel, probably due to memory constraints.
# NOTE: This cannot be run locally, so it is run on an HPC cluster of the university.
# The full contents of the datasets + EMEA folders were uploaded to Habrok so that it has all data for training + extraction.

# !python train.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python extraction.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json

In [31]:
# Decode the model generations from the numpy files to jsonl files
# NOTE: numpy files have been downloaded from the HPC where they were generated

from transformers import AutoTokenizer
import os
import numpy as np
from experiment_lib import generations_to_jsonl

# Tokenizer checkpoint and the experiment directory whose generations are decoded
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
experiment_base = os.path.join("tmp", "EMEA", "nl", "nl-100-100-2.7B")

# NOTE: these reassign names that the config cell at the top of the notebook also
# sets (EXAMPLE_TOKEN_LEN is 250 there), so cells run afterwards see these values.
SOURCE_DIR, DATASET_DIR = "./datasets", "EMEA"
EXAMPLE_TOKEN_LEN = 100
NUM_TRIALS = 100

# Path of the common example-id CSV for this token length
exids = os.path.join(
    SOURCE_DIR,
    DATASET_DIR,
    "csv",
    f"{EXAMPLE_TOKEN_LEN}",
    f"common_exids-{EXAMPLE_TOKEN_LEN}.csv",
)


def decode_generations(
    experiment_dir,
    source_dir,
    dataset_dir,
    tokenizer,
    num_trials,
    example_token_len,
    exids,
):
    """Convert each trial's saved generations (.npy) into a decoded JSONL file.

    For every trial index ``i`` in ``range(num_trials)`` this loads
    ``<experiment_dir>/generations/<i>.npy`` and passes the array to
    ``generations_to_jsonl`` together with the output path
    ``<experiment_dir>/decoded/decoded_strings_trial_<i>.jsonl``.

    Args:
        experiment_dir: Directory that contains the ``generations`` folder; the
            ``decoded`` folder is created inside it if it does not exist.
        source_dir: Not used by this function; kept so existing calls keep working.
        dataset_dir: Not used by this function; kept so existing calls keep working.
        tokenizer: Passed through unchanged to ``generations_to_jsonl``.
        num_trials: Number of trials; files ``0.npy`` .. ``<num_trials - 1>.npy`` are read.
        example_token_len: Not used by this function; kept so existing calls keep working.
        exids: Passed through unchanged to ``generations_to_jsonl``.

    Raises:
        FileNotFoundError: If a ``<i>.npy`` file is missing (raised by ``np.load``).
    """
    # Use the arguments rather than the notebook globals (NUM_TRIALS /
    # experiment_base), so the function decodes the experiment it is given.
    generations_dir = os.path.join(experiment_dir, "generations")
    decoded_dir = os.path.join(experiment_dir, "decoded")

    # The output directory is the same for every trial, so create it once.
    os.makedirs(decoded_dir, exist_ok=True)

    for trial in range(num_trials):
        data = np.load(os.path.join(generations_dir, f"{trial}.npy"))
        # print() does not apply %-formatting to extra arguments; format explicitly.
        print(f"Data shape: {data.shape}")

        output_file_path = os.path.join(
            decoded_dir, f"decoded_strings_trial_{trial}.jsonl"
        )
        generations_to_jsonl(output_file_path, data, tokenizer, exids)

    print("done")


# NOTE(review): no call to decode_generations is visible in this cell; it has to be
# invoked explicitly for the decoded files to be written, e.g.
#   decode_generations(experiment_base, SOURCE_DIR, DATASET_DIR, tokenizer,
#                      NUM_TRIALS, EXAMPLE_TOKEN_LEN, exids)

In [None]:
# Calculate BLEU and METEOR scores for the generated outputs.
# Runs calculate_scores.py once per config, in the same order as six hand-written
# commands: every model size for LANG1 first, then every model size for LANG2.
# {EXAMPLE_TOKEN_LEN} is read from the notebook namespace when the cell runs.
for lang in (LANG1, LANG2):
    for model_size in (MODEL_SIZE1, MODEL_SIZE2, MODEL_SIZE3):
        !python calculate_scores.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{model_size}-{lang}.json

In [None]:
# Evaluate the model outputs: sort and merge scores into single files to simplify analysis & plotting.
# Runs evaluation.py once per config: every model size for LANG1 first, then for LANG2.
# {EXAMPLE_TOKEN_LEN} is read from the notebook namespace when the cell runs.
for lang in (LANG1, LANG2):
    for model_size in (MODEL_SIZE1, MODEL_SIZE2, MODEL_SIZE3):
        !python evaluation.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{model_size}-{lang}.json