In [29]:
LANG1 = "en"
LANG2 = "nl"
EXAMPLE_TOKEN_LEN = 250
MODEL_SIZE1 = "125M"
MODEL_SIZE2 = "1.3B"
MODEL_SIZE3 = "2.7B"
MODEL_SIZE4 = "6B"

DATASET_DIR = "EMEA"
DATASET_NAME = "europarl-v7.nl-en"
SOURCE_DIR = "./datasets"
TARGET_DIR = "tmp"

# Create config file
config = {
    "dataset_dir": "EMEA",
    "dataset_name": "EMEA-c",
    "source_dir": SOURCE_DIR,
    "example_token_len": EXAMPLE_TOKEN_LEN,
    "root_dir": "tmp",
    "experiment_name": "test1",
    "model_name": "gpt2",
    "num_trials": 1,
    "language": "en",
    "split": "train",
    "suffix_len": 50,
    "prefix_len": 50,
    "example_token_len": 100,
    "preprefix_len": 0,
    "source_file": "train_dataset.npy",
    "batch_size": 64,
    "model": "EleutherAI/gpt-neo-2.7B",
    "train_file": "test/100/train-en.txt",
    "validation_file": "test/100/validation-en.txt",
    "validation_split_percentage": 0.1,
    "seed": 42,
    "num_trial": 50,
}

# Save to file
import json

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)

In [77]:
# Step 1. Preprocess the data 

# increase number of usable sentences
# runs for both languages
!python preprocessing.py --config_file config.json

2024-06-27 13:16:26,868 - INFO - Parsing arguments...
Parsing arguments...
2024-06-27 13:16:26,869 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-27 13:16:27,112 - INFO - ==== Starting data preprocessing script ====
==== Starting data preprocessing script ====
2024-06-27 13:16:27,112 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-27 13:16:27,112 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  test/EMEA.en
2024-06-27 13:16:46,824 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.en.csv: 2848
Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.en.csv: 2848
2024-06-27 13:16:46,824 - INFO - Counting tokens for nl...
Counting tokens for nl...
Generating byte offset dataset from file:  test/EMEA.nl
2024-06-27 13:17:13,646 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.nl.csv: 17156
Nu

In [80]:
# Step 2. Process the data to correct format

# NOTE: change dataset name to name + "-c" in running this right after preprocessing!!!
# gets dataset in the correct format for the experiment
!python process_data.py --config_file config.json

2024-06-27 13:18:15,796 - INFO - Parsing arguments...
Parsing arguments...
2024-06-27 13:18:15,796 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-27 13:18:16,088 - INFO - ==== Sarting data processing script ====
==== Sarting data processing script ====
2024-06-27 13:18:16,088 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-27 13:18:16,088 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  test/100/EMEA-c.en
2024-06-27 13:18:29,580 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA-c.en.csv: 31176
Number of samples >= 100 tokens in ./test/test/csv/100/EMEA-c.en.csv: 31176
2024-06-27 13:18:29,580 - INFO - Filtering sentences for en...
Filtering sentences for en...
2024-06-27 13:18:29,625 - INFO - Generating JSONL for en...
Generating JSONL for en...
2024-06-27 13:18:29,989 - INFO - Counting tokens for nl...
Countin

In [85]:
import numpy as np
import os


# Shrink the dataset to a smaller size
def shrink_datasets(path1, path2, size):
    with open(path1, "r") as f:
        data1 = f.readlines()
    with open(path2, "r") as f:
        data2 = f.readlines()

    num_indices = int(len(data1))
    indices = np.random.choice(num_indices, size, replace=False)

    new_data1 = [data1[i] for i in indices]
    new_data2 = [data2[i] for i in indices]

    with open(path1, "w") as f:
        f.writelines(new_data1)
    with open(path2, "w") as f:
        f.writelines(new_data2)


size = 11000

path1 = os.path.join(
    DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}"
)
path2 = os.path.join(
    DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}"
)
shrink_datasets(path1, path2, size)

path1 = os.path.join(
    DATASET_DIR,
    str(EXAMPLE_TOKEN_LEN),
    f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}.jsonl",
)
path2 = os.path.join(
    DATASET_DIR,
    str(EXAMPLE_TOKEN_LEN),
    f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}.jsonl",
)
shrink_datasets(path1, path2, size)

In [2]:
def update_batch_size(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r") as json_file:
                data = json.load(json_file)
            if "batch_size" in data:
                data["batch_size"] = 64
                with open(filepath, "w") as json_file:
                    json.dump(data, json_file, indent=4)


def update_trials(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r") as json_file:
                data = json.load(json_file)
            if "num_trials" in data:
                data["num_trials"] = 50
                with open(filepath, "w") as json_file:
                    json.dump(data, json_file, indent=4)


dir = "exp-configs/EMEA/100"
dir2 = "exp-configs/EMEA/150"
dir3 = "exp-configs/EMEA/200"
dir4 = "exp-configs/EMEA/250"


update_trials(dir)
update_trials(dir2)
update_trials(dir3)
update_trials(dir4)

NameError: name 'os' is not defined

In [29]:
# Step 3. (optional) Split data to train and eval sets to train the model

# This will be done for both languages
# model size not relevant here, put in any config file as input

# 11k examples in data: 10k in train, 1k in eval
!python split_train_val.py --config_file config.json

2024-06-28 01:09:39,763 - INFO - ==== Starting data train+val split script ====
==== Starting data train+val split script ====
Traceback (most recent call last):
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_train_val.py", line 156, in <module>
    main()
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_train_val.py", line 81, in main
    with open(dataset_path, "r") as f:
         ^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'test/100/EMEA-c.en'


In [144]:
# Step 4. Split (training) data to (pre)prefix set and suffix set

# supply the training dataset here only if you want to split the training data
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG1}.json

# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG2}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG2}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG2}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG2}.json


^C
Traceback (most recent call last):
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_dataset.py", line 8, in <module>
    from experiment_lib import load_constants_from_config
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/experiment_lib.py", line 4, in <module>
    import matplotlib.pyplot as plt
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/pyplot.py", line 66, in <module>
    from matplotlib.figure import Figure, FigureBase, figaspect
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/figure.py", line 43, in <module>
    from matplotlib import _blocking_input, backend_bases, _docstring, projections
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/projections/__init__.py", line 55, in <module>
    from .. import axes, _docstring
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/axes/__init__.py", line 1, in <module>
    from . i

In [4]:
# Step 5. Train the model + perform extraction

# run this directly in terminal, model cannot be loaded in notebook due to memory constraints I think, crashes kernel
# NOTE: I cannot run this locally, so I run this on a HPC of the university
# Uploaded full contents of datasets + EMEA folders to Habrok so it has all data for training + extraction

# !python train.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python extraction.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json --model_dir finetuned/en-100-100-125M --cache_dir cache

# !python extraction.py --config_file exp-configs/EMEA/100/config-125M-en.json --model_dir finetuned/en-100-100-125M --cache_dir cache

2024-06-18 13:31:00,922 - INFO - Parsing arguments...
Parsing arguments...
2024-06-18 13:31:00,923 - INFO - Model directory provided: finetuned/en-100-100-125M
Model directory provided: finetuned/en-100-100-125M
2024-06-18 13:31:00,923 - INFO - Executing extraction on finetuned model.
Executing extraction on finetuned model.
2024-06-18 13:31:00,933 - INFO - Default device: mps
Default device: mps
2024-06-18 13:31:00,933 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-18 13:31:01,033 - INFO - Loading model...
Loading model...


In [18]:
# Decode the model generations from the numpy files to jsonl files
# NOTE: numpy files have been downloaded from the HPC where they were generated

from transformers import AutoTokenizer
import os
import numpy as np
from experiment_lib import (
    generations_to_jsonl,
    load_constants_from_config,
    generate_exid_list,
)
import json


def decode_generations(tokenizer, exids, num_trials, exp_base):
    for i in range(0, num_trials):
        file_path = os.path.join(exp_base, f"generations/{i}.npy")
        data = np.load(file_path)
        print(f"Data shape: {str(data.shape)}")

        output_file_path = os.path.join(
            exp_base, f"decoded/decoded_strings_trial_{i}.jsonl"
        )
        output_dir = os.path.dirname(output_file_path)
        os.makedirs(output_dir, exist_ok=True)
        if os.path.exists(output_file_path):
            print("Trial already decoded, skipping...")
        else:
            generations_to_jsonl(output_file_path, data, tokenizer, exids)

    print("done")


def decoding(path):

    with open(path, "r") as f:
        config = json.load(f)

    (
        ROOT_DIR,
        DATASET_DIR,
        SOURCE_DIR,
        DATASET_NAME,
        EXPERIMENT_NAME,
        NUM_TRIALS,
        PREFIX_LEN,
        SUFFIX_LEN,
        PREPREFIX_LEN,
        LANGUAGE,
        SPLIT,
        EXAMPLE_TOKEN_LEN,
        SOURCE_FILE,
        BATCH_SIZE,
        MODEL_NAME,
        TRAIN_FILE,
        VAL_FILE,
        VAL_SPLIT,
        SEED,
    ) = load_constants_from_config(config)

    NUM_TRIALS = 100

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    experiment_base = os.path.join(ROOT_DIR, DATASET_DIR, LANGUAGE, EXPERIMENT_NAME)

    # pretain
    # exids_path = os.path.join(
    #     SOURCE_DIR,
    #     DATASET_DIR,
    #     "csv",
    #     str(EXAMPLE_TOKEN_LEN),
    #     "common_exids-" + str(EXAMPLE_TOKEN_LEN) + ".csv",
    # )
    # exids = generate_exid_list(exids_path)

    # Train exids only
    exids_path = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "split_indices.json")
    with open(exids_path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            exids = obj["train"]

        print(exids[:10])

    decode_generations(tokenizer, exids, NUM_TRIALS, experiment_base)

In [19]:
# decoding("exp-configs/EMEA/200/config-2.7B-en.json")
decoding("exp-configs/EMEA/250/config-1.3B-en.json")
# decoding("exp-configs/EMEA/200/config-125M-en.json")

# decoding("exp-configs/EMEA/100/config-2.7B-nl.json")
# decoding("exp-configs/EMEA/100/config-125M-nl.json")
# decoding("exp-configs/EMEA/100/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/100/config-2.7B-nl.json")

# decoding("exp-configs/EMEA/150/config-125M-nl.json")
# decoding("exp-configs/EMEA/200/config-125M-nl.json")
# decoding("exp-configs/EMEA/250/config-125M-nl.json")

# decoding("exp-configs/EMEA/200/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/150/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/250/config-1.3B-nl.json")

[8542, 8107, 4428, 10340, 1678, 190, 7870, 5254, 654, 7084]
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_trial_0.jsonl
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_trial_1.jsonl
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_trial_2.jsonl
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_trial_3.jsonl
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_trial_4.jsonl
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_trial_5.jsonl
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_trial_6.jsonl
Data shape: (9900, 250)
Decoded strings saved to: %s tmp/EMEA/en/en-250-100-1.3B/decoded/decoded_strings_tr

In [31]:
# Calculate BLEU scores for the generated outputs
!python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE1}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE3}-{LANG1}.json

2024-07-01 00:12:11,334 - INFO - ===== Starting BLEU-score calculation between generated and original text in language en for 50 prefix & suffix length =====
===== Starting BLEU-score calculation between generated and original text in language en for 50 prefix & suffix length =====
2024-07-01 00:12:11,334 - INFO - ===== Decoding original preprefixes, prefixes & suffixes =====
===== Decoding original preprefixes, prefixes & suffixes =====
2024-07-01 00:12:11,334 - INFO - Loading split indices from EMEA/100/split_indices.json
Loading split indices from EMEA/100/split_indices.json
Decoded strings saved to: %s ./datasets/EMEA/en/100/EleutherAI/gpt-neo-125M/train_suffix.jsonl
2024-07-01 00:12:16,815 - INFO - Filtered suffixes to only include exids in the exids list
Filtered suffixes to only include exids in the exids list
2024-07-01 00:12:16,836 - INFO - Saved filtered suffixes toEMEA/100/prompt-train_dataset_suffixes.jsonl
Saved filtered suffixes toEMEA/100/prompt-train_dataset_suffixes.js

In [32]:
# Evaluate the model outputs: sort and merge scores into single files to simplify analysis & plotting

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE3}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG1}.json --trained True
!python evaluation.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE1}-{LANG1}.json --trained True

2024-07-01 00:14:54,261 - INFO
Evaluating scores on finetuned model.
2024-07-01 00:14:56,397 - INFO
==== Starting evaluation ====
2024-07-01 00:14:56,397 - INFO
Experiment name: en-100-100-125M
2024-07-01 00:14:56,398 - INFO
Language: en
2024-07-01 00:14:56,398 - INFO
Model: EleutherAI/gpt-neo-125M
2024-07-01 00:14:56,398 - INFO
Loading list of example IDs for dataset EMEA...
2024-07-01 00:14:56,398 - INFO
Loading exids from EMEA/100/prompt-train_dataset_exids.json
2024-07-01 00:14:56,399 - INFO
Loaded 3327 example IDs
2024-07-01 00:14:58,406 - INFO
Processing example 2...
2024-07-01 00:14:58,453 - INFO
Merged BLEU scores for exid 2
2024-07-01 00:14:58,453 - INFO
Processing example 4...
2024-07-01 00:14:58,479 - INFO
Merged BLEU scores for exid 4
2024-07-01 00:14:58,479 - INFO
Processing example 7...
2024-07-01 00:14:58,502 - INFO
Merged BLEU scores for exid 7
2024-07-01 00:14:58,502 - INFO
Processing example 11...
2024-07-01 00:14:58,523 - INFO
Merged BLEU scores for exid 11
2024-07-0

In [33]:
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE3}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG1}.json
!python accuracy.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE1}-{LANG1}.json

2024-07-01 00:16:37,410 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-07-01 00:16:37,410 - INFO - Saving output to tmp/EMEA/en/en-100-100-125M/accuracy.jsonl
Saving output to tmp/EMEA/en/en-100-100-125M/accuracy.jsonl


In [29]:
# Count numnber of unique sentences

import numpy as np

# Load the data
data = np.load(
    "datasets/EMEA/nl/250/EleutherAI/gpt-neo-2.7B/train_dataset.npy", allow_pickle=True
)

# Convert the lists of tokens to tuples so they can be put in a set
data = [tuple(sentence) for sentence in data]

# Count the number of unique sentences
num_unique_sentences = len(set(data))

print(f"The number of unique sentences is {num_unique_sentences}.")
print(f"the total number of sentences is {len(data)}.")

The number of unique sentences is 9893.
the total number of sentences is 9900.


In [27]:
import os

def rename_npy_files(directory):
    # List all files in the specified directory
    files = os.listdir(directory)

    # Filter for .npy files
    npy_files = [f for f in files if f.endswith(".npy")]

    # Create a temporary name for each file to avoid conflicts
    for file in npy_files:
        base_name = os.path.splitext(file)[0]

        try:
            original_number = int(base_name)
            temp_filename = f"{original_number}_temp.npy"
            original_path = os.path.join(directory, file)
            temp_path = os.path.join(directory, temp_filename)
            os.rename(original_path, temp_path)

        except ValueError:
            print(f"Skipping file {file} as it does not have a numeric base name")

    # Rename temporary files to the final names
    temp_files = [f for f in os.listdir(directory) if f.endswith("_temp.npy")]

    for temp_file in temp_files:
        base_name = os.path.splitext(temp_file)[0]

        try:
            original_number = int(base_name.split("_")[0])
            new_number = original_number + 36
            new_filename = f"{new_number}.npy"
            temp_path = os.path.join(directory, temp_file)
            final_path = os.path.join(directory, new_filename)
            os.rename(temp_path, final_path)

            print(f"Renamed {temp_file} to {new_filename}")

        except ValueError:
            print(f"Skipping file {temp_file} as it does not have a numeric base name")


rename_npy_files("tmp/EMEA/nl/nl-250-100-2.7B-2/generations")
rename_npy_files("tmp/EMEA/nl/nl-250-100-2.7B-2/losses")

Renamed 15_temp.npy to 51.npy
Renamed 14_temp.npy to 50.npy
Renamed 7_temp.npy to 43.npy
Renamed 6_temp.npy to 42.npy
Renamed 21_temp.npy to 57.npy
Renamed 20_temp.npy to 56.npy
Renamed 36_temp.npy to 72.npy
Renamed 31_temp.npy to 67.npy
Renamed 30_temp.npy to 66.npy
Renamed 12_temp.npy to 48.npy
Renamed 13_temp.npy to 49.npy
Renamed 18_temp.npy to 54.npy
Renamed 19_temp.npy to 55.npy
Renamed 0_temp.npy to 36.npy
Renamed 1_temp.npy to 37.npy
Renamed 26_temp.npy to 62.npy
Renamed 27_temp.npy to 63.npy
Renamed 35_temp.npy to 71.npy
Renamed 34_temp.npy to 70.npy
Renamed 4_temp.npy to 40.npy
Renamed 5_temp.npy to 41.npy
Renamed 22_temp.npy to 58.npy
Renamed 23_temp.npy to 59.npy
Renamed 16_temp.npy to 52.npy
Renamed 17_temp.npy to 53.npy
Renamed 28_temp.npy to 64.npy
Renamed 29_temp.npy to 65.npy
Renamed 3_temp.npy to 39.npy
Renamed 2_temp.npy to 38.npy
Renamed 25_temp.npy to 61.npy
Renamed 24_temp.npy to 60.npy
Renamed 9_temp.npy to 45.npy
Renamed 8_temp.npy to 44.npy
Renamed 11_temp.npy 

In [203]:
from experiment_lib import generations_to_jsonl
from transformers import AutoTokenizer
import os

MODEL = "gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/" + MODEL)

# Load the data that was used to prompt the model
data = np.load(os.path.join(SOURCE_DIR, DATASET_DIR, LANG1, str(EXAMPLE_TOKEN_LEN), "EleutherAI/" + MODEL, "train_dataset.npy"))

output_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt_dataset.jsonl")
os.makedirs(os.path.dirname(output_file), exist_ok=True)

split_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "split_indices.json")
with open(split_file, "r") as f:
    split_indices = json.load(f)
            # this gives a list of indices present in the training dataset
    exids = split_indices["train"]

generations_to_jsonl(output_file, data, tokenizer, exids)

Decoded strings saved to: %s EMEA/250/prompt_dataset.jsonl


In [204]:
# to txt file
in_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt_dataset.jsonl")
out_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt_dataset.txt")

with open(out_file, "w") as f, open(in_file, "r") as f2:
    for line in f2:
        obj = json.loads(line)
        f.write(obj["text"] + "\n") 

In [205]:
import json

# finding the intersection of the prompt dataset and the training dataset

in_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt_dataset.jsonl")
train_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "train-en.txt")

with open(in_file, "r") as prompt_file, open(train_file, "r") as train_file:
    prompts = prompt_file.readlines()
    train_lines = train_file.readlines()

print(len(prompts))
print(len(train_lines))

out_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset.jsonl")
count = 0

counted = []

with open(out_file, "w") as f:
    for line in prompts:
        obj = json.loads(line) 
        l = (obj["text"]).split()
        for index, train in enumerate(train_lines):
            t = train.split()
            # if the sentences are the same
            # and the training sentence has not been counted yet (to avoid duplicate exids)
            if l == t and obj["exid"] not in counted:
                # if we find a matching sentence, save the exid to a file
                counted.append(obj["exid"])
                # add to total number of matches
                count += 1
                # save the matching json sentence from prompt set to a file
                json.dump(obj, f, ensure_ascii=False)
                f.write("\n")
                # Replace the processed line with None
                # train_lines[index] = None

print(count)

9900
9900
6101


In [234]:
import json

def find_and_save_duplicates(input_file_path, output_file_path):
    # Step 2: Initialize a dictionary to track occurrences and exids
    sentence_details = {}
    
    # Step 3: Read the input file and count sentences, storing exids
    with open(input_file_path, 'r') as input_file:
        for line in input_file:
            obj = json.loads(line)
            text = obj.get('text', '')
            exid = obj.get('exid', '')
            if text:
                if text in sentence_details:
                    sentence_details[text]['count'] += 1
                    sentence_details[text]['exids'].append(exid)
                else:
                    sentence_details[text] = {'count': 1, 'exids': [exid]}
    
    # Step 4: Filter for duplicates and prepare the output structure
    duplicates = {text: details for text, details in sentence_details.items() if details['count'] > 1}
     
    # Step 5: Write duplicates to the output file
    with open(output_file_path, 'w') as output_file:
        for text, details in duplicates.items():
            # Creating a dict to represent the JSON structure including exids
            duplicate_entry = {'text': text, 'count': details['count'], 'exids': details['exids']}
            json.dump(duplicate_entry, output_file, ensure_ascii=False)
            output_file.write('\n')  # Ensure each entry is on a new line

# Example usage
input_file_path = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset.jsonl")
output_file_path = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_datasetDUP.jsonl")
find_and_save_duplicates(input_file_path, output_file_path)

# Count number of repeated sentences in the output file
with open(output_file_path, 'r') as f:
    count = 0
    for line in f:
        obj = json.loads(line)
        count += obj['count']

print(count)

4


In [235]:
# to txt file
in_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset.jsonl")
out_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset.txt")

with open(out_file, "w") as f, open(in_file, "r") as f2:
    for line in f2:
        obj = json.loads(line)
        f.write(obj["text"] + "\n") 

In [236]:
file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset.jsonl")


with open(file, "r") as f:
    exids = []
    for line in f:
        obj = json.loads(line)
        exids.append(obj["exid"])

print(len(exids))

# save to file
out = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset_exids.json")
with open(out, "w") as f:
    json.dump(exids, f, indent=4)

6101


In [21]:
import os
import json

EXP_NAME = "en-250-100-1.3B"
dir_path = os.path.join(TARGET_DIR, DATASET_DIR, LANG1, EXP_NAME, "decoded")
exids_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset_exids.json")

def filter_decoded_generations(dir_path, exids_file):

    # Load exids once, assuming they are the same for all files
    with open(exids_file, "r") as e:
        exids = json.load(e)

    # 100 trials
    for i in range(100):
        # Correctly format the filenames using f-strings
        gens = os.path.join(dir_path, f"decoded_strings_trial_{i}.jsonl")
        new_gens = os.path.join(dir_path, f"decoded_strings_trial_{i}_filtered.jsonl")

        with open(gens, "r") as f, open(new_gens, "w") as n:
            for line in f:
                obj = json.loads(line)
                exid = obj.get("exid")  # Use get to avoid KeyError if exid is missing
                if exid in exids:
                    json.dump(obj, n, ensure_ascii=False)
                    n.write("\n")

filter_decoded_generations(dir_path, exids_file)

