In [2]:
LANG1 = "en"
LANG2 = "nl"
EXAMPLE_TOKEN_LEN = 100
MODEL_SIZE1 = "125M"
MODEL_SIZE2 = "1.3B"
MODEL_SIZE3 = "2.7B"
MODEL_SIZE4 = "6B"

DATASET_DIR = "EMEA"
DATASET_NAME = "europarl-v7.nl-en"
SOURCE_DIR = "./test"

# Create config file
config = {
    "dataset_dir": "test",
    "dataset_name": "EMEA-c",
    "source_dir": SOURCE_DIR,
    "example_token_len": EXAMPLE_TOKEN_LEN,
    "root_dir": "test",
    "experiment_name": "test1",
    "model_name": "gpt2",
    "num_trials": 1,
    "language": "en",
    "split": "train",
    "suffix_len": 50,
    "prefix_len": 50,
    "example_token_len": 100,
    "preprefix_len": 0,
    "source_file": "train_dataset.npy",
    "batch_size": 64,
    "model": "EleutherAI/gpt-neo-2.7B",
    "train_file": "test/100/train-en.txt",
    "validation_file": "test/100/validation-en.txt",
    "validation_split_percentage": 0.1,
    "seed": 42,
    "num_trial": 50,
}

# Save to file
import json

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)

In [77]:
# Step 1. Preprocess the data 

# increase number of usable sentences
# runs for both languages
!python preprocessing.py --config_file config.json

2024-06-27 13:16:26,868 - INFO - Parsing arguments...
Parsing arguments...
2024-06-27 13:16:26,869 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-27 13:16:27,112 - INFO - ==== Starting data preprocessing script ====
==== Starting data preprocessing script ====
2024-06-27 13:16:27,112 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-27 13:16:27,112 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  test/EMEA.en
2024-06-27 13:16:46,824 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.en.csv: 2848
Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.en.csv: 2848
2024-06-27 13:16:46,824 - INFO - Counting tokens for nl...
Counting tokens for nl...
Generating byte offset dataset from file:  test/EMEA.nl
2024-06-27 13:17:13,646 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.nl.csv: 17156
Nu

In [80]:
# Step 2. Process the data to correct format

# NOTE: change dataset name to name + "-c" in running this right after preprocessing!!!
# gets dataset in the correct format for the experiment
!python process_data.py --config_file config.json

2024-06-27 13:18:15,796 - INFO - Parsing arguments...
Parsing arguments...
2024-06-27 13:18:15,796 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-27 13:18:16,088 - INFO - ==== Sarting data processing script ====
==== Sarting data processing script ====
2024-06-27 13:18:16,088 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-27 13:18:16,088 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  test/100/EMEA-c.en
2024-06-27 13:18:29,580 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA-c.en.csv: 31176
Number of samples >= 100 tokens in ./test/test/csv/100/EMEA-c.en.csv: 31176
2024-06-27 13:18:29,580 - INFO - Filtering sentences for en...
Filtering sentences for en...
2024-06-27 13:18:29,625 - INFO - Generating JSONL for en...
Generating JSONL for en...
2024-06-27 13:18:29,989 - INFO - Counting tokens for nl...
Countin

In [85]:
import numpy as np
import os


# Shrink the dataset to a smaller size
def shrink_datasets(path1, path2, size):
    with open(path1, "r") as f:
        data1 = f.readlines()
    with open(path2, "r") as f:
        data2 = f.readlines()

    num_indices = int(len(data1))
    indices = np.random.choice(num_indices, size, replace=False)

    new_data1 = [data1[i] for i in indices]
    new_data2 = [data2[i] for i in indices]

    with open(path1, "w") as f:
        f.writelines(new_data1)
    with open(path2, "w") as f:
        f.writelines(new_data2)


size = 11000

path1 = os.path.join(
    DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}"
)
path2 = os.path.join(
    DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}"
)
shrink_datasets(path1, path2, size)

path1 = os.path.join(
    DATASET_DIR,
    str(EXAMPLE_TOKEN_LEN),
    f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}.jsonl",
)
path2 = os.path.join(
    DATASET_DIR,
    str(EXAMPLE_TOKEN_LEN),
    f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}.jsonl",
)
shrink_datasets(path1, path2, size)

In [2]:
def update_batch_size(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r") as json_file:
                data = json.load(json_file)
            if "batch_size" in data:
                data["batch_size"] = 64
                with open(filepath, "w") as json_file:
                    json.dump(data, json_file, indent=4)


def update_trials(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r") as json_file:
                data = json.load(json_file)
            if "num_trials" in data:
                data["num_trials"] = 50
                with open(filepath, "w") as json_file:
                    json.dump(data, json_file, indent=4)


dir = "exp-configs/EMEA/100"
dir2 = "exp-configs/EMEA/150"
dir3 = "exp-configs/EMEA/200"
dir4 = "exp-configs/EMEA/250"


update_trials(dir)
update_trials(dir2)
update_trials(dir3)
update_trials(dir4)

NameError: name 'os' is not defined

In [90]:
# Step 3. (optional) Split data to train and eval sets to train the model

# This will be done for both languages
# model size not relevant here, put in any config file as input

# 11k examples in data: 10k in train, 1k in eval
!python split_train_val.py --config_file config.json

2024-06-27 13:23:20,017 - INFO - ==== Starting data train+val split script ====
==== Starting data train+val split script ====
2024-06-27 13:23:20,071 - INFO - Splitting indices...
Splitting indices...
# of indices:  32646
2024-06-27 13:23:20,084 - INFO - Splitting datasets into train and validation sets...
Splitting datasets into train and validation sets...
2024-06-27 13:23:20,084 - INFO - Processing language: en
Processing language: en
Traceback (most recent call last):
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_train_val.py", line 155, in <module>
    main()
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_train_val.py", line 135, in main
    with open(os.path.join(dataset_path + ".jsonl") , "r") as f, open(indices_file, "r") as idx_file:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'test/100/EMEA-c.en.jsonl'


In [15]:
# Step 4. Split (training) data to (pre)prefix set and suffix set

# supply the training dataset here only if you want to split the training data
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG1}.json

!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG2}.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:37:24,492 - INFO
===== Starting dataset token split generation for language en with token length 250 =====
2024-06-18 17:37:24,493 - INFO
Opened file: EMEA/250/EMEA-c-250.en-train.jsonl
2024-06-18 17:37:24,565 - INFO
Processed 64 lines
2024-06-18 17:37:24,594 - INFO
Processed 128 lines
2024-06-18 17:37:24,623 - INFO
Processed 192 lines
2024-06-18 17:37:24,653 - INFO
Processed 256 lines
2024-06-18 17:37:24,681 - INFO
Processed 320 lines
2024-06-18 17:37:24,709 - INFO
Processed 384 lines
2024-06-18 17:37:24,738 - INFO
Processed 448 lines
2024-06-18 17:37:24,765 - INFO
Processed 512 lines
2024-06-18 17:37:24,794 - INFO
Processed 576 lines
2024-06-18 17:37:24,822 - INFO
Processed 640 lines
2024-06-18 17:37:24,850 - INFO
Processed 704 lines
2024-06-18 17:37:24,878 - INFO
Processed 768 lines
2024-06-18 17:37:24,905 - INFO
Processed 832 lines
2024-06-18 17:37:24,933 - INFO
Processed 896 lines
2024-06-18 17:37:24,961 - INFO
Processed 960 lines
2024-06-18 17:37:24,987 - INFO
Proce

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:37:30,940 - INFO
===== Starting dataset token split generation for language en with token length 250 =====
2024-06-18 17:37:30,940 - INFO
Opened file: EMEA/250/EMEA-c-250.en-train.jsonl
2024-06-18 17:37:30,973 - INFO
Processed 64 lines
2024-06-18 17:37:31,003 - INFO
Processed 128 lines
2024-06-18 17:37:31,030 - INFO
Processed 192 lines
2024-06-18 17:37:31,058 - INFO
Processed 256 lines
2024-06-18 17:37:31,085 - INFO
Processed 320 lines
2024-06-18 17:37:31,113 - INFO
Processed 384 lines
2024-06-18 17:37:31,142 - INFO
Processed 448 lines
2024-06-18 17:37:31,170 - INFO
Processed 512 lines
2024-06-18 17:37:31,198 - INFO
Processed 576 lines
2024-06-18 17:37:31,225 - INFO
Processed 640 lines
2024-06-18 17:37:31,252 - INFO
Processed 704 lines
2024-06-18 17:37:31,279 - INFO
Processed 768 lines
2024-06-18 17:37:31,306 - INFO
Processed 832 lines
2024-06-18 17:37:31,334 - INFO
Processed 896 lines
2024-06-18 17:37:31,363 - INFO
Processed 960 lines
2024-06-18 17:37:31,391 - INFO
Proce

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:37:37,406 - INFO
===== Starting dataset token split generation for language en with token length 250 =====
2024-06-18 17:37:37,406 - INFO
Opened file: EMEA/250/EMEA-c-250.en-train.jsonl
2024-06-18 17:37:37,441 - INFO
Processed 64 lines
2024-06-18 17:37:37,471 - INFO
Processed 128 lines
2024-06-18 17:37:37,500 - INFO
Processed 192 lines
2024-06-18 17:37:37,531 - INFO
Processed 256 lines
2024-06-18 17:37:37,559 - INFO
Processed 320 lines
2024-06-18 17:37:37,587 - INFO
Processed 384 lines
2024-06-18 17:37:37,614 - INFO
Processed 448 lines
2024-06-18 17:37:37,642 - INFO
Processed 512 lines
2024-06-18 17:37:37,671 - INFO
Processed 576 lines
2024-06-18 17:37:37,700 - INFO
Processed 640 lines
2024-06-18 17:37:37,727 - INFO
Processed 704 lines
2024-06-18 17:37:37,754 - INFO
Processed 768 lines
2024-06-18 17:37:37,783 - INFO
Processed 832 lines
2024-06-18 17:37:37,820 - INFO
Processed 896 lines
2024-06-18 17:37:37,850 - INFO
Processed 960 lines
2024-06-18 17:37:37,878 - INFO
Proce

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:37:43,980 - INFO
===== Starting dataset token split generation for language en with token length 250 =====
2024-06-18 17:37:43,981 - INFO
Opened file: EMEA/250/EMEA-c-250.en-train.jsonl
2024-06-18 17:37:44,013 - INFO
Processed 64 lines
2024-06-18 17:37:44,043 - INFO
Processed 128 lines
2024-06-18 17:37:44,070 - INFO
Processed 192 lines
2024-06-18 17:37:44,100 - INFO
Processed 256 lines
2024-06-18 17:37:44,127 - INFO
Processed 320 lines
2024-06-18 17:37:44,156 - INFO
Processed 384 lines
2024-06-18 17:37:44,184 - INFO
Processed 448 lines
2024-06-18 17:37:44,211 - INFO
Processed 512 lines
2024-06-18 17:37:44,240 - INFO
Processed 576 lines
2024-06-18 17:37:44,267 - INFO
Processed 640 lines
2024-06-18 17:37:44,295 - INFO
Processed 704 lines
2024-06-18 17:37:44,322 - INFO
Processed 768 lines
2024-06-18 17:37:44,350 - INFO
Processed 832 lines
2024-06-18 17:37:44,378 - INFO
Processed 896 lines
2024-06-18 17:37:44,405 - INFO
Processed 960 lines
2024-06-18 17:37:44,432 - INFO
Proce

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:37:50,507 - INFO
===== Starting dataset token split generation for language nl with token length 250 =====
2024-06-18 17:37:50,507 - INFO
Opened file: EMEA/250/EMEA-c-250.nl-train.jsonl
2024-06-18 17:37:50,572 - INFO
Processed 64 lines
2024-06-18 17:37:50,594 - INFO
Processed 128 lines
2024-06-18 17:37:50,613 - INFO
Processed 192 lines
2024-06-18 17:37:50,634 - INFO
Processed 256 lines
2024-06-18 17:37:50,654 - INFO
Processed 320 lines
2024-06-18 17:37:50,674 - INFO
Processed 384 lines
2024-06-18 17:37:50,693 - INFO
Processed 448 lines
2024-06-18 17:37:50,713 - INFO
Processed 512 lines
2024-06-18 17:37:50,734 - INFO
Processed 576 lines
2024-06-18 17:37:50,756 - INFO
Processed 640 lines
2024-06-18 17:37:50,776 - INFO
Processed 704 lines
2024-06-18 17:37:50,796 - INFO
Processed 768 lines
2024-06-18 17:37:50,815 - INFO
Processed 832 lines
2024-06-18 17:37:50,835 - INFO
Processed 896 lines
2024-06-18 17:37:50,856 - INFO
Processed 960 lines
2024-06-18 17:37:50,876 - INFO
Proce

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:37:55,776 - INFO
===== Starting dataset token split generation for language nl with token length 250 =====
2024-06-18 17:37:55,776 - INFO
Opened file: EMEA/250/EMEA-c-250.nl-train.jsonl
2024-06-18 17:37:55,799 - INFO
Processed 64 lines
2024-06-18 17:37:55,821 - INFO
Processed 128 lines
2024-06-18 17:37:55,842 - INFO
Processed 192 lines
2024-06-18 17:37:55,863 - INFO
Processed 256 lines
2024-06-18 17:37:55,889 - INFO
Processed 320 lines
2024-06-18 17:37:55,929 - INFO
Processed 384 lines
2024-06-18 17:37:55,949 - INFO
Processed 448 lines
2024-06-18 17:37:55,969 - INFO
Processed 512 lines
2024-06-18 17:37:55,990 - INFO
Processed 576 lines
2024-06-18 17:37:56,009 - INFO
Processed 640 lines
2024-06-18 17:37:56,029 - INFO
Processed 704 lines
2024-06-18 17:37:56,049 - INFO
Processed 768 lines
2024-06-18 17:37:56,069 - INFO
Processed 832 lines
2024-06-18 17:37:56,090 - INFO
Processed 896 lines
2024-06-18 17:37:56,111 - INFO
Processed 960 lines
2024-06-18 17:37:56,131 - INFO
Proce

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:38:01,006 - INFO
===== Starting dataset token split generation for language nl with token length 250 =====
2024-06-18 17:38:01,006 - INFO
Opened file: EMEA/250/EMEA-c-250.nl-train.jsonl
2024-06-18 17:38:01,031 - INFO
Processed 64 lines
2024-06-18 17:38:01,053 - INFO
Processed 128 lines
2024-06-18 17:38:01,075 - INFO
Processed 192 lines
2024-06-18 17:38:01,095 - INFO
Processed 256 lines
2024-06-18 17:38:01,116 - INFO
Processed 320 lines
2024-06-18 17:38:01,135 - INFO
Processed 384 lines
2024-06-18 17:38:01,153 - INFO
Processed 448 lines
2024-06-18 17:38:01,173 - INFO
Processed 512 lines
2024-06-18 17:38:01,192 - INFO
Processed 576 lines
2024-06-18 17:38:01,211 - INFO
Processed 640 lines
2024-06-18 17:38:01,231 - INFO
Processed 704 lines
2024-06-18 17:38:01,251 - INFO
Processed 768 lines
2024-06-18 17:38:01,270 - INFO
Processed 832 lines
2024-06-18 17:38:01,292 - INFO
Processed 896 lines
2024-06-18 17:38:01,312 - INFO
Processed 960 lines
2024-06-18 17:38:01,330 - INFO
Proce

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-06-18 17:38:06,376 - INFO
===== Starting dataset token split generation for language nl with token length 250 =====
2024-06-18 17:38:06,376 - INFO
Opened file: EMEA/250/EMEA-c-250.nl-train.jsonl
2024-06-18 17:38:06,400 - INFO
Processed 64 lines
2024-06-18 17:38:06,423 - INFO
Processed 128 lines
2024-06-18 17:38:06,444 - INFO
Processed 192 lines
2024-06-18 17:38:06,464 - INFO
Processed 256 lines
2024-06-18 17:38:06,485 - INFO
Processed 320 lines
2024-06-18 17:38:06,506 - INFO
Processed 384 lines
2024-06-18 17:38:06,525 - INFO
Processed 448 lines
2024-06-18 17:38:06,545 - INFO
Processed 512 lines
2024-06-18 17:38:06,564 - INFO
Processed 576 lines
2024-06-18 17:38:06,583 - INFO
Processed 640 lines
2024-06-18 17:38:06,602 - INFO
Processed 704 lines
2024-06-18 17:38:06,621 - INFO
Processed 768 lines
2024-06-18 17:38:06,642 - INFO
Processed 832 lines
2024-06-18 17:38:06,661 - INFO
Processed 896 lines
2024-06-18 17:38:06,680 - INFO
Processed 960 lines
2024-06-18 17:38:06,700 - INFO
Proce

In [4]:
# Step 5. Train the model + perform extraction

# run this directly in terminal, model cannot be loaded in notebook due to memory constraints I think, crashes kernel
# NOTE: I cannot run this locally, so I run this on a HPC of the university
# Uploaded full contents of datasets + EMEA folders to Habrok so it has all data for training + extraction

# !python train.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python extraction.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json --model_dir finetuned/en-100-100-125M --cache_dir cache

# !python extraction.py --config_file exp-configs/EMEA/100/config-125M-en.json --model_dir finetuned/en-100-100-125M --cache_dir cache

2024-06-18 13:31:00,922 - INFO - Parsing arguments...
Parsing arguments...
2024-06-18 13:31:00,923 - INFO - Model directory provided: finetuned/en-100-100-125M
Model directory provided: finetuned/en-100-100-125M
2024-06-18 13:31:00,923 - INFO - Executing extraction on finetuned model.
Executing extraction on finetuned model.
2024-06-18 13:31:00,933 - INFO - Default device: mps
Default device: mps
2024-06-18 13:31:00,933 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-18 13:31:01,033 - INFO - Loading model...
Loading model...


In [6]:
# Decode the model generations from the numpy files to jsonl files
# NOTE: numpy files have been downloaded from the HPC where they were generated

from transformers import AutoTokenizer
import os
import numpy as np
from experiment_lib import (
    generations_to_jsonl,
    load_constants_from_config,
    generate_exid_list,
)
import json


def decode_generations(tokenizer, exids, num_trials, exp_base):
    for i in range(0, num_trials):
        file_path = os.path.join(exp_base, f"generations/{i}.npy")
        data = np.load(file_path)
        print(f"Data shape: {str(data.shape)}")

        output_file_path = os.path.join(
            exp_base, f"decoded/decoded_strings_trial_{i}.jsonl"
        )
        output_dir = os.path.dirname(output_file_path)
        os.makedirs(output_dir, exist_ok=True)
        if os.path.exists(output_file_path):
            print("Trial already decoded, skipping...")
        else:
            generations_to_jsonl(output_file_path, data, tokenizer, exids)

    print("done")


def decoding(path):

    with open(path, "r") as f:
        config = json.load(f)

    (
        ROOT_DIR,
        DATASET_DIR,
        SOURCE_DIR,
        DATASET_NAME,
        EXPERIMENT_NAME,
        NUM_TRIALS,
        PREFIX_LEN,
        SUFFIX_LEN,
        PREPREFIX_LEN,
        LANGUAGE,
        SPLIT,
        EXAMPLE_TOKEN_LEN,
        SOURCE_FILE,
        BATCH_SIZE,
        MODEL_NAME,
        TRAIN_FILE,
        VAL_FILE,
        VAL_SPLIT,
        SEED,
    ) = load_constants_from_config(config)

    NUM_TRIALS = 100

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    experiment_base = os.path.join(ROOT_DIR, DATASET_DIR, LANGUAGE, EXPERIMENT_NAME)

    # pretain
    # exids_path = os.path.join(
    #     SOURCE_DIR,
    #     DATASET_DIR,
    #     "csv",
    #     str(EXAMPLE_TOKEN_LEN),
    #     "common_exids-" + str(EXAMPLE_TOKEN_LEN) + ".csv",
    # )
    # exids = generate_exid_list(exids_path)

    # Train exids only
    exids_path = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "split_indices.json")
    with open(exids_path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            exids = obj["train"]

        print(exids[:10])

    decode_generations(tokenizer, exids, NUM_TRIALS, experiment_base)

In [7]:
# decoding("exp-configs/EMEA/150/config-125M-en.json")
# decoding("exp-configs/EMEA/200/config-125M-en.json")
# decoding("exp-configs/EMEA/250/config-125M-en.json")

decoding("exp-configs/EMEA/100/config-125M-en.json")
# decoding("exp-configs/EMEA/150/config-125M-nl.json")
# decoding("exp-configs/EMEA/200/config-125M-nl.json")
# decoding("exp-configs/EMEA/250/config-125M-nl.json")

# decoding("exp-configs/EMEA/200/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/150/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/250/config-1.3B-nl.json")

[8542, 8107, 4428, 10340, 1678, 190, 7870, 5254, 654, 7084]
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_trial_0.jsonl
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_trial_1.jsonl
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_trial_2.jsonl
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_trial_3.jsonl
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_trial_4.jsonl
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_trial_5.jsonl
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_trial_6.jsonl
Data shape: (9900, 100)
Decoded strings saved to: %s tmp/EMEA/en/en-100-100-125M/decoded/decoded_strings_tr

In [3]:
# Calculate BLEU and METEOR scores for the generated outputs

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE2}-{LANG2}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE2}-{LANG2}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG2}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE2}-{LANG2}.json

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE2}-{LANG1}.json

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE3}-{LANG1}.json

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE3}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE3}-{LANG1}.json

!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE1}-{LANG1}.json




2024-06-27 23:19:58,864 - INFO - ===== Starting BLEU- & METEOR-score calculation between generated and original text in language en for 50 prefix & suffix length =====
===== Starting BLEU- & METEOR-score calculation between generated and original text in language en for 50 prefix & suffix length =====
2024-06-27 23:19:58,864 - INFO - ===== Decoding original preprefixes, prefixes & suffixes =====
===== Decoding original preprefixes, prefixes & suffixes =====
2024-06-27 23:19:58,864 - INFO - Loading split indices from EMEA/100/split_indices.json
Loading split indices from EMEA/100/split_indices.json
2024-06-27 23:20:00,872 - INFO - Starting BLEU-score calculation for trial 0
Starting BLEU-score calculation for trial 0
2024-06-27 23:20:00,874 - INFO - Saving BLEU scores for trial 0 to tmp/EMEA/en/en-100-100-125M/bleu_scores/bleu_scores_trial_0.jsonl
Saving BLEU scores for trial 0 to tmp/EMEA/en/en-100-100-125M/bleu_scores/bleu_scores_trial_0.jsonl
The hypothesis contains 0 counts of 2-gra

In [69]:
# Evaluate the model outputs: sort and merge scores into single files to simplify analysis & plotting
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
# !python evalu;.ation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE2}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE3}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE3}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE1}-{LANG1}.json --trained True


!python evaluation.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE2}-{LANG1}.json --trained True

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE2}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE2}-{LANG1}.json --trained True

2024-06-27 11:54:30,301 - INFO
Evaluating scores on finetuned model.
2024-06-27 11:54:39,226 - INFO
==== Starting evaluation ====
2024-06-27 11:54:39,226 - INFO
Experiment name: en-150-100-1.3B
2024-06-27 11:54:39,226 - INFO
Language: en
2024-06-27 11:54:39,226 - INFO
Model: EleutherAI/gpt-neo-1.3B
2024-06-27 11:54:39,226 - INFO
Loading list of example IDs for dataset EMEA...
2024-06-27 11:54:39,227 - INFO
Loaded 9900 example IDs
2024-06-27 11:54:41,233 - INFO
Bleu scores for this experiment previously merged, skipping...
2024-06-27 11:54:41,233 - INFO
Sorting BLEU scores...
2024-06-27 11:54:41,233 - INFO
Output file tmp/EMEA/en/en-150-100-1.3B/bleu_scores/sorted_compl_bleu_scores.jsonl already exists and is not empty, skipping...
2024-06-27 11:54:41,233 - INFO
Sorted BLEU scores saved to tmp/EMEA/en/en-150-100-1.3B/bleu_scores/sorted_compl_bleu_scores.jsonl
2024-06-27 11:54:41,233 - INFO
Decoding losses...
2024-06-27 11:54:41,235 - INFO
Decoded losses for trial 0 already computed, ski

In [67]:
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json
# # !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG1}.json

# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG2}.json

# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE2}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE2}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE2}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/EMEA/150/config-125M-nl.json
# !python accuracy.py --config_file exp-configs/EMEA/200/config-125M-nl.json
# !python accuracy.py --config_file exp-configs/EMEA/250/config-125M-nl.json

!python accuracy.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE3}-{LANG2}.json
!python accuracy.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE3}-{LANG2}.json
!python accuracy.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE3}-{LANG2}.json

2024-06-26 23:51:42,573 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-06-26 23:51:42,573 - INFO - Saving output to tmp/EMEA/nl/nl-100-100-2.7B/accuracy.jsonl
Saving output to tmp/EMEA/nl/nl-100-100-2.7B/accuracy.jsonl
2024-06-26 23:51:44,355 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-06-26 23:51:44,355 - INFO - Saving output to tmp/EMEA/nl/nl-150-100-2.7B/accuracy.jsonl
Saving output to tmp/EMEA/nl/nl-150-100-2.7B/accuracy.jsonl
2024-06-26 23:51:46,087 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-06-26 23:51:46,087 - INFO - Saving output to tmp/EMEA/nl/nl-200-100-2.7B/accuracy.jsonl
Saving output to tmp/EMEA/nl/nl-200-100-2.7B/accuracy.jsonl


In [10]:
from nltk.translate.bleu_score import sentence_bleu
import json
import nltk

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")


# Function to calculate the BLEU score between the reference and candidate text
def calc_bleu_score(reference, candidate):
    return sentence_bleu([reference], candidate)


suff_file = "datasets/europarl/nl/100/EleutherAI/gpt-neo-1.3B/_suffix.jsonl"
with open(suff_file, "r", encoding="utf-8", newline="") as file:
    suffix_lines = file.readlines()

index = 1567
# index = 19

json_line = json.loads(suffix_lines[index])
exid = json_line["exid"]
suffix = json.loads(suffix_lines[index])["text"].strip()


# trial 44 should give 1 for this one
# guess = {"exid": "402237", "text": "Aan de orde is de aanbeveling voor de tweede lezing (A5-0099/2003) van mevrouw Schörling, namens de Commissie milieubeheer, volksgezondheid en consumentenbeleid, betreffende het gemeenschappelijk standpunt, door de Raad vastgesteld met het oog op de aanneming"}

# guess = {"exid": "402237", "text": "Aan de orde is de aanbeveling voor de tweede lezing (A5-0099/2003) van mevrouw Schörling, namens de Commissie milieubeheer, volksgezondheid en consumentenbescherming, over de mededeling van de Commissie aan de Raad en het Europees Parlement betreffende de voorkoming van de ondoord"}

guess = {
    "exid": "402237",
    "text": "Aan de orde is de aanbeveling voor de tweede lezing (A5-0099/2003) van mevrouw Schörling, namens de Commissie milieubeheer Van Liettjes, zegt het Europees Parlement om een zeer geval op het belang van onze lezing en de ontwikkeling van de Europese Unie (EVE",
}

candidate = guess["text"]


suffix_ref = tokenizer.tokenize(suffix)
suffix_ref = [s.replace("Ġ", " ") for s in suffix_ref]
cand = tokenizer.tokenize(candidate)


cand = cand[50:]
suffix_cand = [c.replace("Ġ", " ") for c in cand]


print(suffix_ref)
print(suffix_cand)

print(len(suffix_ref))
print(len(suffix_cand))


suffix_score = calc_bleu_score(suffix_ref, suffix_cand)
print(suffix_score)

[',', ' vol', 'ks', 'ge', 'z', 'ond', 'heid', ' en', ' cons', 'ument', 'en', 'be', 'le', 'id', ',', ' bet', 're', 'ff', 'ende', ' he', 't', ' gem', 'e', 'ens', 'ch', 'app', 'el', 'ijk', ' stand', 'p', 'unt', ',', ' door', ' de', ' Ra', 'ad', ' vast', 'gest', 'e', 'ld', ' met', ' he', 't', ' o', 'og', ' op', ' de', ' a', 'ann', 'eming']
[' Van', ' L', 'iet', 't', 'j', 'es', ',', ' z', 'eg', 't', ' he', 't', ' Europe', 'es', ' Par', 'lement', ' om', ' e', 'en', ' z', 'eer', ' g', 'eval', ' op', ' he', 't', ' bel', 'ang', ' van', ' on', 'ze', ' le', 'zing', ' en', ' de', ' on', 'tw', 'ik', 'ke', 'ling', ' van', ' de', ' Euro', 'p', 'ese', ' Un', 'ie', ' (', 'E', 'VE']
50
50
4.591835960079284e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [29]:
# Count numnber of unique sentences

import numpy as np

# Load the data
data = np.load(
    "datasets/EMEA/nl/250/EleutherAI/gpt-neo-2.7B/train_dataset.npy", allow_pickle=True
)

# Convert the lists of tokens to tuples so they can be put in a set
data = [tuple(sentence) for sentence in data]

# Count the number of unique sentences
num_unique_sentences = len(set(data))

print(f"The number of unique sentences is {num_unique_sentences}.")
print(f"the total number of sentences is {len(data)}.")

The number of unique sentences is 9893.
the total number of sentences is 9900.


In [59]:
import os


def rename_npy_files(directory):
    # List all files in the specified directory
    files = os.listdir(directory)

    # Filter for .npy files
    npy_files = [f for f in files if f.endswith(".npy")]

    # Create a temporary name for each file to avoid conflicts
    for file in npy_files:
        base_name = os.path.splitext(file)[0]

        try:
            original_number = int(base_name)
            temp_filename = f"{original_number}_temp.npy"
            original_path = os.path.join(directory, file)
            temp_path = os.path.join(directory, temp_filename)
            os.rename(original_path, temp_path)

        except ValueError:
            print(f"Skipping file {file} as it does not have a numeric base name")

    # Rename temporary files to the final names
    temp_files = [f for f in os.listdir(directory) if f.endswith("_temp.npy")]

    for temp_file in temp_files:
        base_name = os.path.splitext(temp_file)[0]

        try:
            original_number = int(base_name.split("_")[0])
            new_number = original_number + 89
            new_filename = f"{new_number}.npy"
            temp_path = os.path.join(directory, temp_file)
            final_path = os.path.join(directory, new_filename)
            os.rename(temp_path, final_path)

            print(f"Renamed {temp_file} to {new_filename}")

        except ValueError:
            print(f"Skipping file {temp_file} as it does not have a numeric base name")


rename_npy_files("tmp/EMEA/en/en-150-100-2.7B-2/generations")
rename_npy_files("tmp/EMEA/en/en-150-100-2.7B-2/losses")

Renamed 7_temp.npy to 96.npy
Renamed 6_temp.npy to 95.npy
Renamed 0_temp.npy to 89.npy
Renamed 1_temp.npy to 90.npy
Renamed 4_temp.npy to 93.npy
Renamed 5_temp.npy to 94.npy
Renamed 3_temp.npy to 92.npy
Renamed 2_temp.npy to 91.npy
Renamed 9_temp.npy to 98.npy
Renamed 8_temp.npy to 97.npy
Renamed 10_temp.npy to 99.npy
Renamed 7_temp.npy to 96.npy
Renamed 6_temp.npy to 95.npy
Renamed 0_temp.npy to 89.npy
Renamed 1_temp.npy to 90.npy
Renamed 4_temp.npy to 93.npy
Renamed 5_temp.npy to 94.npy
Renamed 3_temp.npy to 92.npy
Renamed 2_temp.npy to 91.npy
Renamed 9_temp.npy to 98.npy
Renamed 8_temp.npy to 97.npy
Renamed 10_temp.npy to 99.npy


In [15]:
from experiment_lib import generations_to_jsonl
from transformers import AutoTokenizer
import os

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
data = np.load("datasets/EMEA/en/100/EleutherAI/gpt-neo-125M/train_prefix.npy")

output_file = "test/100-125M-en.jsonl"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

file = "EMEA/100/split_indices.json"
with open(file, "r") as f:
    split_indices = json.load(f)
            # this gives a list of indices present in the training dataset
    exids = split_indices["train"]

print(exids[:10])

generations_to_jsonl(output_file, data, tokenizer, exids)

[8542, 8107, 4428, 10340, 1678, 190, 7870, 5254, 654, 7084]
Decoded strings saved to: %s test/100-125M-en.jsonl


In [17]:
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

def convert_to_text(data):
    # Decode each item in the data array to text
    text_data = [tokenizer.decode(item, skip_special_tokens=True) for item in data]
    return "\n".join(text_data)

def combine_npy_files_and_convert_to_text():
    # Load the data from each file
    # preprefix_data = np.load('preprefix.npy', allow_pickle=True)
    prefix_data = np.load('datasets/EMEA/en/100/EleutherAI/gpt-neo-125M/train_prefix.npy', allow_pickle=True)
    suffix_data = np.load('datasets/EMEA/en/100/EleutherAI/gpt-neo-125M/train_suffix.npy', allow_pickle=True)
    
    # Concatenate the arrays
    # combined_data = np.concatenate((preprefix_data, prefix_data, suffix_data))
    combined_data = np.concatenate((prefix_data, suffix_data))

    # Convert the combined array to text
    text_data = convert_to_text(combined_data)
    
    # Save the combined array to a new file
    np.save('test/combined.npy', combined_data)
    
    # Save the text data to a file
    with open('test/combined_text.txt', 'w') as text_file:
        text_file.write(text_data)
    
    print("Combined data saved to combined.npy.")
    print("Text data saved to combined_text.txt.")

# Call the function to execute the operations
combine_npy_files_and_convert_to_text()

Combined data saved to combined.npy.
Text data saved to combined_text.txt.


In [24]:
def check_sentence_existence_with_counts(train_file, check_file):
    # Read the second file and store sentences in a dictionary with their counts
    sentences_dict = {}
    with open(train_file, 'r') as f:
        for line in f:
            sentence = line.strip()
            if sentence in sentences_dict:
                sentences_dict[sentence] += 1
            else:
                sentences_dict[sentence] = 1
    
    count  = 0

    # Loop over each sentence in the first file
    with open(check_file, 'r') as f:
        for sentence in f:
            sentence = sentence.strip()
            # Check if the sentence is in the dictionary
            if sentence in sentences_dict:
                # If found, return True
                print("found")
                count += 1

    # If no sentence from the first file is found in the second, return False
    return count

# Example usage
train_set = "test/train-en.txt"
other_file = "test/combined_text.txt"
result = check_sentence_existence_with_counts(train_set, other_file)
print(result)

0
