In [2]:
LANG1 = "en"
LANG2 = "nl"
EXAMPLE_TOKEN_LEN = 100
MODEL_SIZE1 = "125M"
MODEL_SIZE2 = "1.3B"
MODEL_SIZE3 = "2.7B"
MODEL_SIZE4 = "6B"

DATASET_DIR = "europarl"
DATASET_NAME = "europarl-v7.nl-en"
SOURCE_DIR = "./datasets"

# Create config file
config = { "dataset_dir": DATASET_DIR,
              "dataset_name": DATASET_NAME,
              "source_dir": SOURCE_DIR,
              "example_token_len": EXAMPLE_TOKEN_LEN
        }

# Save to file
import json
with open("config.json", "w") as f:
    json.dump(config, f, indent=4)


In [51]:
# Step 1. Preprocess the data 

# increase number of usable sentences
# runs for both languages
!python preprocessing.py --config_file config.json

2024-06-15 20:59:05,685 - INFO - Parsing arguments...
Parsing arguments...
2024-06-15 20:59:05,685 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-15 20:59:05,907 - INFO - ==== Starting data preprocessing script ====
==== Starting data preprocessing script ====
2024-06-15 20:59:05,907 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-15 20:59:05,907 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  EMEA/EMEA.en
2024-06-15 20:59:25,203 - INFO - Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.en.csv: 147
Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.en.csv: 147
2024-06-15 20:59:25,203 - INFO - Counting tokens for nl...
Counting tokens for nl...
Generating byte offset dataset from file:  EMEA/EMEA.nl
2024-06-15 20:59:51,878 - INFO - Number of samples >= 250 tokens in ./datasets/EMEA/csv/250/EMEA.nl.csv

In [84]:
# Step 2. Process the data to correct format

# NOTE: change dataset name to name + "-c" in running this right after preprocessing!!!
# gets dataset in the correct format for the experiment
!python process_data.py --config_file config.json

2024-06-15 21:25:26,517 - INFO - Parsing arguments...
Parsing arguments...
2024-06-15 21:25:26,517 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-15 21:25:26,759 - INFO - ==== Sarting data processing script ====
==== Sarting data processing script ====
2024-06-15 21:25:26,759 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-15 21:25:26,759 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  EMEA/100/EMEA-c.en
2024-06-15 21:25:41,026 - INFO - Number of samples >= 100 tokens in ./datasets/EMEA/csv/100/EMEA-c.en.csv: 31176
Number of samples >= 100 tokens in ./datasets/EMEA/csv/100/EMEA-c.en.csv: 31176
2024-06-15 21:25:41,026 - INFO - Filtering sentences for en...
Filtering sentences for en...
2024-06-15 21:25:41,070 - INFO - Generating JSONL for en...
Generating JSONL for en...
2024-06-15 21:25:41,420 - INFO - Counting tokens for nl...

In [85]:
import numpy as np
import os

# Shrink the dataset to a smaller size
def shrink_datasets(path1, path2, size):
    with open(path1, "r") as f:
        data1 = f.readlines()
    with open(path2, "r") as f:
        data2 = f.readlines()
    
    num_indices = int(len(data1))
    indices = np.random.choice(num_indices, size, replace=False)

    new_data1 = [data1[i] for i in indices]
    new_data2 = [data2[i] for i in indices]

    with open(path1, "w") as f:
        f.writelines(new_data1)
    with open(path2, "w") as f:
        f.writelines(new_data2)

size = 11000

path1 = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}")
path2 = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}")
shrink_datasets(path1, path2, size)

path1 = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}.jsonl")
path2 = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}.jsonl")
shrink_datasets(path1, path2, size)

In [2]:
def update_batch_size(directory):
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r') as json_file:
                data = json.load(json_file)
            if 'batch_size' in data:
                data['batch_size'] = 64
                with open(filepath, 'w') as json_file:
                    json.dump(data, json_file, indent=4)

def update_trials(directory):
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r') as json_file:
                data = json.load(json_file)
            if 'num_trials' in data:
                data['num_trials'] = 50
                with open(filepath, 'w') as json_file:
                    json.dump(data, json_file, indent=4)

dir = "exp-configs/EMEA/100"
dir2 = "exp-configs/EMEA/150"
dir3 = "exp-configs/EMEA/200"
dir4 = "exp-configs/EMEA/250"


update_trials(dir)
update_trials(dir2)
update_trials(dir3)
update_trials(dir4)

NameError: name 'os' is not defined

In [86]:
# Step 3. (optional) Split data to train and eval sets to train the model

# This will be done for both languages
# model size not relevant here, put in any config file as input

# 11k examples in data: 10k in train, 1k in eval
!python split_train_val.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json

2024-06-15 21:27:14,410 - INFO - ==== Starting data train+val split script ====
==== Starting data train+val split script ====
2024-06-15 21:27:14,416 - INFO - Splitting indices...
Splitting indices...
# of indices:  11000
2024-06-15 21:27:14,419 - INFO - Splitting datasets into train and validation sets...
Splitting datasets into train and validation sets...
2024-06-15 21:27:14,420 - INFO - Processing language: en
Processing language: en
Output file: EMEA/100/EMEA-c-100.en-train.jsonl
2024-06-15 21:27:14,558 - INFO - Processing language: nl
Processing language: nl
Output file: EMEA/100/EMEA-c-100.nl-train.jsonl
2024-06-15 21:27:14,655 - INFO - ==== Data train+val split script completed ====
==== Data train+val split script completed ====


In [87]:
# Step 4. Split (training) data to (pre)prefix set and suffix set

# supply the training dataset here only if you want to split the training data
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG1}.json

!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG2}.json

2024-06-15 21:27:18,802 - INFO
===== Starting dataset token split generation for language en with token length 100 =====
2024-06-15 21:27:18,803 - INFO
Opened file: EMEA/100/EMEA-c-100.en-train.jsonl
2024-06-15 21:27:18,858 - INFO
Processed 64 lines
2024-06-15 21:27:18,873 - INFO
Processed 128 lines
2024-06-15 21:27:18,887 - INFO
Processed 192 lines
2024-06-15 21:27:18,901 - INFO
Processed 256 lines
2024-06-15 21:27:18,914 - INFO
Processed 320 lines
2024-06-15 21:27:18,927 - INFO
Processed 384 lines
2024-06-15 21:27:18,940 - INFO
Processed 448 lines
2024-06-15 21:27:18,953 - INFO
Processed 512 lines
2024-06-15 21:27:18,966 - INFO
Processed 576 lines
2024-06-15 21:27:18,980 - INFO
Processed 640 lines
2024-06-15 21:27:18,992 - INFO
Processed 704 lines
2024-06-15 21:27:19,005 - INFO
Processed 768 lines
2024-06-15 21:27:19,018 - INFO
Processed 832 lines
2024-06-15 21:27:19,030 - INFO
Processed 896 lines
2024-06-15 21:27:19,043 - INFO
Processed 960 lines
2024-06-15 21:27:19,056 - INFO
Proce

In [None]:
# Step 5. Train the model + perform extraction

# run this directly in terminal, model cannot be loaded in notebook due to memory constraints I think, crashes kernel
# NOTE: I cannot run this locally, so I run this on a HPC of the university
# Uploaded full contents of datasets + EMEA folders to Habrok so it has all data for training + extraction

# !python train.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python extraction.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json

In [31]:
# Decode the model generations from the numpy files to jsonl files
# NOTE: numpy files have been downloaded from the HPC where they were generated

from transformers import AutoTokenizer
import os
import numpy as np
from experiment_lib import generations_to_jsonl

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
experiment_base = os.path.join("tmp", "EMEA", "nl", "nl-100-100-2.7B")

SOURCE_DIR = "./datasets"
DATASET_DIR = "EMEA"
EXAMPLE_TOKEN_LEN = 100
NUM_TRIALS = 100
exids = os.path.join(
    SOURCE_DIR,
    DATASET_DIR,
    "csv",
    str(EXAMPLE_TOKEN_LEN),
    "common_exids-" + str(EXAMPLE_TOKEN_LEN) + ".csv",
)


def decode_generations(
    experiment_dir,
    source_dir,
    dataset_dir,
    tokenizer,
    num_trials,
    example_token_len,
    exids,
):
    for i in range(0, NUM_TRIALS):
        file_path = os.path.join(experiment_base, f"generations/{i}.npy")
        data = np.load(file_path)
        print("Data shape: %s", str(data.shape))

        output_file_path = os.path.join(
            experiment_base, f"decoded/decoded_strings_trial_{i}.jsonl"
        )
        output_dir = os.path.dirname(output_file_path)
        os.makedirs(output_dir, exist_ok=True)
        generations_to_jsonl(output_file_path, data, tokenizer, exids)

    print("done")

In [6]:
# Calculate BLEU and METEOR scores for the generated outputs

!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json

2024-06-16 23:32:25,494 - INFO - ===== Starting BLEU- & METEOR-score calculation between generated and original text in language en for 50 prefix & suffix length =====
===== Starting BLEU- & METEOR-score calculation between generated and original text in language en for 50 prefix & suffix length =====
2024-06-16 23:32:25,494 - INFO - ===== Decoding original preprefixes, prefixes & suffixes =====
===== Decoding original preprefixes, prefixes & suffixes =====
2024-06-16 23:32:25,500 - INFO - Starting BLEU-score calculation for trial 0
Starting BLEU-score calculation for trial 0
2024-06-16 23:32:25,500 - INFO - Saving BLEU scores for trial 0 to tmp/europarl/en/en-100-100-125M/bleu_scores/bleu_scores_trial_0.jsonl
Saving BLEU scores for trial 0 to tmp/europarl/en/en-100-100-125M/bleu_scores/bleu_scores_trial_0.jsonl
2024-06-16 23:32:25,500 - INFO - BLEU scores for trial 0 previously calculated, skipping calculation
BLEU scores for trial 0 previously calculated, skipping calculation
2024-06

In [8]:
# Evaluate the model outputs: sort and merge scores into single files to simplify analysis & plotting
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json

2024-06-17 00:23:12,795 - INFO
Model directory not provided, using default model specified in config.
2024-06-17 00:23:12,795 - INFO
==== Starting evaluation ====
2024-06-17 00:23:12,795 - INFO
Experiment name: en-100-100-125M
2024-06-17 00:23:12,795 - INFO
Language: en
2024-06-17 00:23:12,795 - INFO
Model: EleutherAI/gpt-neo-125M
2024-06-17 00:23:12,795 - INFO
Loading list of example IDs for dataset europarl...
2024-06-17 00:23:12,796 - INFO
Loaded 7398 example IDs
2024-06-17 00:23:12,797 - INFO
Processing example 81...
2024-06-17 00:23:12,901 - INFO
Merged BLEU scores for exid 81
2024-06-17 00:23:12,901 - INFO
Processing example 83...
2024-06-17 00:23:12,938 - INFO
Merged BLEU scores for exid 83
2024-06-17 00:23:12,939 - INFO
Processing example 568...
2024-06-17 00:23:12,977 - INFO
Merged BLEU scores for exid 568
2024-06-17 00:23:12,977 - INFO
Processing example 577...
2024-06-17 00:23:13,014 - INFO
Merged BLEU scores for exid 577
2024-06-17 00:23:13,014 - INFO
Processing example 765

In [5]:
!python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json

# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json

2024-06-16 23:31:54,163 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-06-16 23:31:54,164 - INFO - Saving output to tmp/europarl/nl/nl-100-100-125M/accuracy.jsonl
Saving output to tmp/europarl/nl/nl-100-100-125M/accuracy.jsonl
2024-06-16 23:31:56,111 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-06-16 23:31:56,111 - INFO - Saving output to tmp/europarl/nl/nl-100-100-1.3B/accuracy.jsonl
Saving output to tmp/europarl/nl/nl-100-100-1.3B/accuracy.jsonl
2024-06-16 23:31:57,972 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-06-16 23:31:57,972 - INFO - Saving output to tmp/europarl/nl/nl-100-100-2.7B/accuracy.jsonl
Saving output to tmp/europarl/nl/nl-100-100-2.7B/accuracy.jsonl


In [102]:
import os
import fileinput

def update_memory_allocation(directory):
    for filename in os.listdir(directory):
        if filename.endswith('.sh'):
            filepath = os.path.join(directory, filename)
            with fileinput.FileInput(filepath, inplace=True) as file:
                for line in file:
                    print(line.replace('#SBATCH --time=26:00:00', '#SBATCH --time=24:00:00'), end='')

update_memory_allocation('habrok-scripts/EMEA/100')

In [98]:
def update_experiment_number(directory):
    for filename in os.listdir(directory):
        if filename.endswith('.sh'):
            filepath = os.path.join(directory, filename)
            with fileinput.FileInput(filepath, inplace=True) as file:
                for line in file:
                    print(line.replace('200', '250'), end='')

update_experiment_number('habrok-scripts/EMEA/250')

In [10]:
from nltk.translate.bleu_score import sentence_bleu
import json
import nltk

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

# Function to calculate the BLEU score between the reference and candidate text
def calc_bleu_score(reference, candidate):
    return sentence_bleu([reference], candidate)

suff_file = 'datasets/europarl/nl/100/EleutherAI/gpt-neo-1.3B/_suffix.jsonl'
with open(suff_file, "r", encoding="utf-8", newline="") as file:
    suffix_lines = file.readlines()

index = 1567
# index = 19

json_line = json.loads(suffix_lines[index])
exid = json_line["exid"]
suffix = json.loads(suffix_lines[index])["text"].strip()


# trial 44 should give 1 for this one
# guess = {"exid": "402237", "text": "Aan de orde is de aanbeveling voor de tweede lezing (A5-0099/2003) van mevrouw Schörling, namens de Commissie milieubeheer, volksgezondheid en consumentenbeleid, betreffende het gemeenschappelijk standpunt, door de Raad vastgesteld met het oog op de aanneming"}

# guess = {"exid": "402237", "text": "Aan de orde is de aanbeveling voor de tweede lezing (A5-0099/2003) van mevrouw Schörling, namens de Commissie milieubeheer, volksgezondheid en consumentenbescherming, over de mededeling van de Commissie aan de Raad en het Europees Parlement betreffende de voorkoming van de ondoord"}

guess = {"exid": "402237", "text": "Aan de orde is de aanbeveling voor de tweede lezing (A5-0099/2003) van mevrouw Schörling, namens de Commissie milieubeheer Van Liettjes, zegt het Europees Parlement om een zeer geval op het belang van onze lezing en de ontwikkeling van de Europese Unie (EVE"}

candidate = guess["text"]


suffix_ref = tokenizer.tokenize(suffix)
suffix_ref = [s.replace('Ġ', ' ') for s in suffix_ref]
cand = tokenizer.tokenize(candidate)


cand = cand[50:]
suffix_cand = [c.replace('Ġ', ' ') for c in cand]


print(suffix_ref)
print(suffix_cand)

print(len(suffix_ref))
print(len(suffix_cand))


suffix_score = calc_bleu_score(suffix_ref, suffix_cand)
print(suffix_score)


[',', ' vol', 'ks', 'ge', 'z', 'ond', 'heid', ' en', ' cons', 'ument', 'en', 'be', 'le', 'id', ',', ' bet', 're', 'ff', 'ende', ' he', 't', ' gem', 'e', 'ens', 'ch', 'app', 'el', 'ijk', ' stand', 'p', 'unt', ',', ' door', ' de', ' Ra', 'ad', ' vast', 'gest', 'e', 'ld', ' met', ' he', 't', ' o', 'og', ' op', ' de', ' a', 'ann', 'eming']
[' Van', ' L', 'iet', 't', 'j', 'es', ',', ' z', 'eg', 't', ' he', 't', ' Europe', 'es', ' Par', 'lement', ' om', ' e', 'en', ' z', 'eer', ' g', 'eval', ' op', ' he', 't', ' bel', 'ang', ' van', ' on', 'ze', ' le', 'zing', ' en', ' de', ' on', 'tw', 'ik', 'ke', 'ling', ' van', ' de', ' Euro', 'p', 'ese', ' Un', 'ie', ' (', 'E', 'VE']
50
50
4.591835960079284e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
