In [83]:
import json

# Language codes used in the dataset and experiment-config file names
LANG1, LANG2 = "en", "nl"

# Token length of each example
EXAMPLE_TOKEN_LEN = 100

# Model sizes used in the experiment-config file names
MODEL_SIZE1, MODEL_SIZE2, MODEL_SIZE3, MODEL_SIZE4 = "125M", "1.3B", "2.7B", "6B"

# Dataset location
DATASET_DIR = "EMEA"
DATASET_NAME = "EMEA-c"
SOURCE_DIR = "./datasets"

# Settings handed to the data scripts through config.json
config = dict(
    dataset_dir=DATASET_DIR,
    dataset_name=DATASET_NAME,
    source_dir=SOURCE_DIR,
    example_token_len=EXAMPLE_TOKEN_LEN,
)

# Persist the settings so the scripts can read them via --config_file
with open("config.json", "w") as config_file:
    config_file.write(json.dumps(config, indent=4))


In [51]:
# Step 1. Preprocess the data

# Goal: increase the number of usable sentences.
# A single invocation handles both languages; settings are read from config.json.
# NOTE(review): the saved output below reports a ">= 250 tokens" threshold and EMEA.* files,
# while the config cell in this notebook writes example_token_len = 100 and dataset_name = "EMEA-c";
# the output appears to come from an earlier run with different settings -- confirm before relying on it.
!python preprocessing.py --config_file config.json

2024-06-15 20:59:05,685 - INFO - Parsing arguments...
Parsing arguments...
2024-06-15 20:59:05,685 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-15 20:59:05,907 - INFO - ==== Starting data preprocessing script ====
==== Starting data preprocessing script ====
2024-06-15 20:59:05,907 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-15 20:59:05,907 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  EMEA/EMEA.en
2024-06-15 20:59:25,203 - INFO - Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.en.csv: 147
Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.en.csv: 147
2024-06-15 20:59:25,203 - INFO - Counting tokens for nl...
Counting tokens for nl...
Generating byte offset dataset from file:  EMEA/EMEA.nl
2024-06-15 20:59:51,878 - INFO - Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.nl.csv

In [84]:
# Step 2. Process the data into the correct format

# NOTE: when running this right after preprocessing, first change the dataset name to
# name + "-c" (DATASET_NAME in the config cell) and re-run that cell so config.json is rewritten.
# Produces the dataset in the format the experiment expects.
!python process_data.py --config_file config.json

2024-06-15 21:25:26,517 - INFO - Parsing arguments...
Parsing arguments...
2024-06-15 21:25:26,517 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-15 21:25:26,759 - INFO - ==== Sarting data processing script ====
==== Sarting data processing script ====
2024-06-15 21:25:26,759 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-15 21:25:26,759 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  EMEA/100/EMEA-c.en
2024-06-15 21:25:41,026 - INFO - Number of samples >= 100 tokens in ./datasets/EMEA/csv/100/EMEA-c.en.csv: 31176
Number of samples >= 100 tokens in ./datasets/EMEA/csv/100/EMEA-c.en.csv: 31176
2024-06-15 21:25:41,026 - INFO - Filtering sentences for en...
Filtering sentences for en...
2024-06-15 21:25:41,070 - INFO - Generating JSONL for en...
Generating JSONL for en...
2024-06-15 21:25:41,420 - INFO - Counting tokens for nl...

In [85]:
import numpy as np
import os

# Shrink the dataset to a smaller size
def shrink_datasets(path1, path2, size, seed=None):
    """Downsample two line-aligned files in place to the same random subset of lines.

    Both files are read completely, ``size`` distinct line indices are drawn
    once, and each file is overwritten with its own lines at those indices (in
    the drawn order). Line ``k`` of the first output therefore still pairs with
    line ``k`` of the second output.

    Args:
        path1: Path of the first file; overwritten in place.
        path2: Path of the second file; overwritten in place.
        size: Number of lines to keep in each file.
        seed: Optional seed for a dedicated random generator. When ``None``
            (the default) NumPy's global random state is used.

    Raises:
        ValueError: If the two files have a different number of lines, or if
            ``size`` is larger than the number of available lines. Nothing is
            written in either case.
    """
    with open(path1, "r") as f:
        data1 = f.readlines()
    with open(path2, "r") as f:
        data2 = f.readlines()

    # The same indices are applied to both files, so they must be line-aligned.
    if len(data1) != len(data2):
        raise ValueError(
            f"Line count mismatch: {path1} has {len(data1)} lines, "
            f"{path2} has {len(data2)} lines"
        )

    num_lines = len(data1)
    if size > num_lines:
        raise ValueError(
            f"Cannot sample {size} lines from files with only {num_lines} lines"
        )

    sampler = np.random if seed is None else np.random.default_rng(seed)
    indices = sampler.choice(num_lines, size, replace=False)

    # A final line without a trailing newline would otherwise be glued to the
    # line that follows it once the order is shuffled.
    def _terminated(line):
        return line if line.endswith("\n") else line + "\n"

    new_data1 = [_terminated(data1[i]) for i in indices]
    new_data2 = [_terminated(data2[i]) for i in indices]

    with open(path1, "w") as f:
        f.writelines(new_data1)
    with open(path2, "w") as f:
        f.writelines(new_data2)

# Number of examples to keep per file
size = 11000

# Fixed seed so the subsample is reproducible. NumPy's global generator is
# re-seeded before each call, so the plain-text pair and the JSONL pair are
# reduced with the same line indices as long as they have the same line count
# (presumably both formats hold the same examples line-for-line -- TODO confirm
# against process_data.py).
SHRINK_SEED = 42

# First the plain-text pair, then the JSONL pair. Files are overwritten in place.
for suffix in ("", ".jsonl"):
    path1 = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}{suffix}")
    path2 = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}{suffix}")
    np.random.seed(SHRINK_SEED)
    shrink_datasets(path1, path2, size)

In [57]:
def _set_json_key(directory, key, value):
    """Set ``key`` to ``value`` in every ``.json`` file of ``directory`` that already defines it."""
    for filename in os.listdir(directory):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r') as json_file:
            data = json.load(json_file)
        # Only a JSON object can carry the key; other top-level values (lists,
        # strings, ...) are skipped instead of failing on item assignment.
        if isinstance(data, dict) and key in data:
            data[key] = value
            with open(filepath, 'w') as json_file:
                json.dump(data, json_file, indent=4)


def update_batch_size(directory, batch_size=64):
    """Overwrite ``batch_size`` in the JSON config files found directly in ``directory``.

    Files that do not already contain a ``batch_size`` key are left untouched;
    files that do are rewritten with 4-space indentation.

    Args:
        directory: Folder whose ``*.json`` files are updated (not recursive).
        batch_size: Value to store; defaults to 64.
    """
    _set_json_key(directory, 'batch_size', batch_size)


def update_val_percentage(directory, val_percentage=0.1):
    """Overwrite ``validation_split_percentage`` in the JSON config files of ``directory``.

    Files that do not already contain the key are left untouched; files that do
    are rewritten with 4-space indentation.

    Args:
        directory: Folder whose ``*.json`` files are updated (not recursive).
        val_percentage: Value to store; defaults to 0.1.
            NOTE(review): whether the consumer of this key expects a fraction
            (0.1) or a percent (10) is not visible here -- confirm against the
            script that reads it.
    """
    _set_json_key(directory, 'validation_split_percentage', val_percentage)

# Experiment-config directories, one per example token length.
# NOTE: the name `dir` shadows the builtin dir() for the rest of the session.
dir, dir2, dir3, dir4 = (
    "exp-configs/EMEA/100",
    "exp-configs/EMEA/150",
    "exp-configs/EMEA/200",
    "exp-configs/EMEA/250",
)

# Only the 150/200/250 directories are updated; the 100 directory (`dir`) is
# currently excluded.
for config_dir in (dir2, dir3, dir4):
    update_batch_size(config_dir)

In [86]:
# Step 3. (optional) Split the data into train and eval sets to train the model

# One invocation splits both languages.
# The model size is not relevant here; any experiment config file can be passed as input.

# With 11k examples in the data: 10k go to train, 1k to eval.
!python split_train_val.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json

2024-06-15 21:27:14,410 - INFO - ==== Starting data train+val split script ====
==== Starting data train+val split script ====
2024-06-15 21:27:14,416 - INFO - Splitting indices...
Splitting indices...
# of indices:  11000
2024-06-15 21:27:14,419 - INFO - Splitting datasets into train and validation sets...
Splitting datasets into train and validation sets...
2024-06-15 21:27:14,420 - INFO - Processing language: en
Processing language: en
Output file: EMEA/100/EMEA-c-100.en-train.jsonl
2024-06-15 21:27:14,558 - INFO - Processing language: nl
Processing language: nl
Output file: EMEA/100/EMEA-c-100.nl-train.jsonl
2024-06-15 21:27:14,655 - INFO - ==== Data train+val split script completed ====
==== Data train+val split script completed ====


In [87]:
# Step 4. Split (training) data to (pre)prefix set and suffix set

# supply the training dataset here only if you want to split the training data
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG1}.json

!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG2}.json

2024-06-15 21:27:18,802 - INFO
===== Starting dataset token split generation for language en with token length 100 =====
2024-06-15 21:27:18,803 - INFO
Opened file: EMEA/100/EMEA-c-100.en-train.jsonl
2024-06-15 21:27:18,858 - INFO
Processed 64 lines
2024-06-15 21:27:18,873 - INFO
Processed 128 lines
2024-06-15 21:27:18,887 - INFO
Processed 192 lines
2024-06-15 21:27:18,901 - INFO
Processed 256 lines
2024-06-15 21:27:18,914 - INFO
Processed 320 lines
2024-06-15 21:27:18,927 - INFO
Processed 384 lines
2024-06-15 21:27:18,940 - INFO
Processed 448 lines
2024-06-15 21:27:18,953 - INFO
Processed 512 lines
2024-06-15 21:27:18,966 - INFO
Processed 576 lines
2024-06-15 21:27:18,980 - INFO
Processed 640 lines
2024-06-15 21:27:18,992 - INFO
Processed 704 lines
2024-06-15 21:27:19,005 - INFO
Processed 768 lines
2024-06-15 21:27:19,018 - INFO
Processed 832 lines
2024-06-15 21:27:19,030 - INFO
Processed 896 lines
2024-06-15 21:27:19,043 - INFO
Processed 960 lines
2024-06-15 21:27:19,056 - INFO
Proce

In [None]:
# Step 5. Train the model + perform extraction

# Run these commands directly in a terminal: loading the model inside the notebook crashes the kernel (presumably due to memory constraints).
# NOTE: this cannot be run locally, so it is run on the university's HPC cluster.
# The full contents of the datasets + EMEA folders were uploaded to Habrok so that it has all data needed for training + extraction.

# !python train.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python extraction.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json

In [31]:
# Decode the model generations from the numpy files to jsonl files
# NOTE: numpy files have been downloaded from the HPC where they were generated

from transformers import AutoTokenizer
import os
import numpy as np
from experiment_lib import generations_to_jsonl

# Locations and sizes for the run being decoded
SOURCE_DIR = "./datasets"
DATASET_DIR = "EMEA"
EXAMPLE_TOKEN_LEN = 100
NUM_TRIALS = 100

# GPT-Neo 2.7B tokenizer (experiment_base below also points at the 2.7B run)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")

# Root folder holding the downloaded outputs of the experiment
experiment_base = os.path.join("tmp", "EMEA", "nl", "nl-100-100-2.7B")

# Path of the common_exids CSV (presumably the shared example IDs); it is
# passed on to generations_to_jsonl.
exids = os.path.join(
    SOURCE_DIR,
    DATASET_DIR,
    "csv",
    f"{EXAMPLE_TOKEN_LEN}",
    f"common_exids-{EXAMPLE_TOKEN_LEN}.csv",
)


def decode_generations(
    experiment_dir,
    source_dir,
    dataset_dir,
    tokenizer,
    num_trials,
    example_token_len,
    exids,
):
    """Hand each trial's saved generations to ``generations_to_jsonl``.

    For every trial ``i`` in ``range(num_trials)`` the array stored at
    ``<experiment_dir>/generations/<i>.npy`` is loaded and passed to
    ``generations_to_jsonl`` together with the target path
    ``<experiment_dir>/decoded/decoded_strings_trial_<i>.jsonl``. The
    ``decoded`` folder is created when missing.

    Args:
        experiment_dir: Root folder of the experiment outputs.
        source_dir: Unused; kept for interface compatibility.
        dataset_dir: Unused; kept for interface compatibility.
        tokenizer: Tokenizer forwarded to ``generations_to_jsonl``.
        num_trials: Number of trial files to process, starting at trial 0.
        example_token_len: Unused; kept for interface compatibility.
        exids: Forwarded unchanged to ``generations_to_jsonl``.
    """
    generations_dir = os.path.join(experiment_dir, "generations")
    decoded_dir = os.path.join(experiment_dir, "decoded")

    # Use the arguments, not the notebook-level NUM_TRIALS / experiment_base
    # globals, so the function works for any experiment it is called with.
    for i in range(num_trials):
        data = np.load(os.path.join(generations_dir, f"{i}.npy"))
        print(f"Data shape: {data.shape}")

        output_file_path = os.path.join(decoded_dir, f"decoded_strings_trial_{i}.jsonl")
        os.makedirs(decoded_dir, exist_ok=True)
        generations_to_jsonl(output_file_path, data, tokenizer, exids)

    print("done")

In [None]:
# Calculate BLEU and METEOR scores for the generated outputs

!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json

!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json

In [38]:
# Evaluate the model outputs: sort and merge scores into single files to simplify analysis & plotting
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json

!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json

2024-06-14 19:28:00,403 - INFO
Model directory not provided, using default model specified in config.
2024-06-14 19:28:00,403 - INFO
==== Starting evaluation ====
2024-06-14 19:28:00,403 - INFO
Experiment name: en-100-100-125M
2024-06-14 19:28:00,403 - INFO
Language: en
2024-06-14 19:28:00,403 - INFO
Model: EleutherAI/gpt-neo-125M
2024-06-14 19:28:00,403 - INFO
Loading list of example IDs for dataset europarl...
2024-06-14 19:28:00,405 - INFO
Loaded 7398 example IDs
2024-06-14 19:28:00,405 - INFO
Bleu scores for this experiment previously merged, skipping...
2024-06-14 19:28:00,406 - INFO
Sorting BLEU scores...
2024-06-14 19:28:00,406 - INFO
Output file tmp/europarl/en/en-100-100-125M/scores/sorted_compl_bleu_scores.jsonl already exists and is not empty, skipping...
2024-06-14 19:28:00,406 - INFO
Sorted BLEU scores saved to tmp/europarl/en/en-100-100-125M/scores/sorted_compl_bleu_scores.jsonl
2024-06-14 19:28:00,407 - INFO
METEOR scores for this experiment previously merged, skipping..

In [97]:
import os
import fileinput

def update_memory_allocation(
    directory,
    old='#SBATCH --time=24:00:00',
    new='#SBATCH --time=26:00:00',
):
    """Replace ``old`` with ``new`` in every ``.sh`` file found directly in ``directory``.

    Despite the function's name, the default arguments do not touch any memory
    setting: they change the ``#SBATCH --time`` directive from 24:00:00 to
    26:00:00. Files are edited in place; lines that do not contain ``old`` are
    written back unchanged.

    Args:
        directory: Folder whose ``*.sh`` files are rewritten (not recursive).
        old: Text to search for in each line.
        new: Text that replaces every occurrence of ``old``.
    """
    for filename in os.listdir(directory):
        if not filename.endswith('.sh'):
            continue
        filepath = os.path.join(directory, filename)
        # With inplace=True, print() writes into the file being read, so every
        # line has to be echoed back whether it changed or not.
        with fileinput.FileInput(filepath, inplace=True) as file:
            for line in file:
                print(line.replace(old, new), end='')

# Rewrites '#SBATCH --time=24:00:00' to '#SBATCH --time=26:00:00' in the .sh scripts under habrok-scripts/EMEA/200.
update_memory_allocation('habrok-scripts/EMEA/200')

In [None]:
def update_experiment_number(directory, old='200', new='250'):
    """Replace ``old`` with ``new`` in every ``.sh`` file found directly in ``directory``.

    Files are edited in place. The replacement is a plain substring match on
    each line, so with the defaults *every* occurrence of ``200`` becomes
    ``250`` -- including occurrences that are not an experiment number (for
    example inside a larger number). Check the scripts after running.

    Args:
        directory: Folder whose ``*.sh`` files are rewritten (not recursive).
        old: Text to search for in each line; defaults to ``'200'``.
        new: Text that replaces every occurrence of ``old``; defaults to ``'250'``.
    """
    for filename in os.listdir(directory):
        if not filename.endswith('.sh'):
            continue
        filepath = os.path.join(directory, filename)
        # With inplace=True, print() writes into the file being read, so every
        # line has to be echoed back whether it changed or not.
        with fileinput.FileInput(filepath, inplace=True) as file:
            for line in file:
                print(line.replace(old, new), end='')

# Rewrites every '200' to '250' in the .sh scripts under habrok-scripts/EMEA/250
# (presumably copies of the 200 scripts -- confirm before running).
update_experiment_number('habrok-scripts/EMEA/250')