In [5]:
LANG1 = "en"
LANG2 = "nl"
EXAMPLE_TOKEN_LEN = 100
MODEL_SIZE1 = "125M"
MODEL_SIZE2 = "1.3B"
MODEL_SIZE3 = "2.7B"
MODEL_SIZE4 = "6B"

DATASET_DIR = "EMEA"
DATASET_NAME = "europarl-v7.nl-en"
SOURCE_DIR = "./datasets"
TARGET_DIR = "tmp"

# Create config file
config = {
    "dataset_dir": "EMEA",
    "dataset_name": "EMEA-c",
    "source_dir": SOURCE_DIR,
    "example_token_len": EXAMPLE_TOKEN_LEN,
    "root_dir": "tmp",
    "experiment_name": "test1",
    "model_name": "gpt2",
    "num_trials": 1,
    "language": "en",
    "split": "train",
    "suffix_len": 50,
    "prefix_len": 50,
    "example_token_len": 100,
    "preprefix_len": 0,
    "source_file": "train_dataset.npy",
    "batch_size": 64,
    "model": "EleutherAI/gpt-neo-2.7B",
    "train_file": "test/100/train-en.txt",
    "validation_file": "test/100/validation-en.txt",
    "validation_split_percentage": 0.1,
    "seed": 42,
    "num_trial": 50,
}

# Save to file
import json

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)

In [77]:
# Step 1. Preprocess the data 

# increase number of usable sentences
# runs for both languages
!python preprocessing.py --config_file config.json

2024-06-27 13:16:26,868 - INFO - Parsing arguments...
Parsing arguments...
2024-06-27 13:16:26,869 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-27 13:16:27,112 - INFO - ==== Starting data preprocessing script ====
==== Starting data preprocessing script ====
2024-06-27 13:16:27,112 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-27 13:16:27,112 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  test/EMEA.en
2024-06-27 13:16:46,824 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.en.csv: 2848
Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.en.csv: 2848
2024-06-27 13:16:46,824 - INFO - Counting tokens for nl...
Counting tokens for nl...
Generating byte offset dataset from file:  test/EMEA.nl
2024-06-27 13:17:13,646 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA.nl.csv: 17156
Nu

In [80]:
# Step 2. Process the data to correct format

# NOTE: change dataset name to name + "-c" in running this right after preprocessing!!!
# gets dataset in the correct format for the experiment
!python process_data.py --config_file config.json

2024-06-27 13:18:15,796 - INFO - Parsing arguments...
Parsing arguments...
2024-06-27 13:18:15,796 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-27 13:18:16,088 - INFO - ==== Sarting data processing script ====
==== Sarting data processing script ====
2024-06-27 13:18:16,088 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
2024-06-27 13:18:16,088 - INFO - Counting tokens for en...
Counting tokens for en...
Generating byte offset dataset from file:  test/100/EMEA-c.en
2024-06-27 13:18:29,580 - INFO - Number of samples >= 100 tokens in ./test/test/csv/100/EMEA-c.en.csv: 31176
Number of samples >= 100 tokens in ./test/test/csv/100/EMEA-c.en.csv: 31176
2024-06-27 13:18:29,580 - INFO - Filtering sentences for en...
Filtering sentences for en...
2024-06-27 13:18:29,625 - INFO - Generating JSONL for en...
Generating JSONL for en...
2024-06-27 13:18:29,989 - INFO - Counting tokens for nl...
Countin

In [85]:
import numpy as np
import os


# Shrink the dataset to a smaller size
def shrink_datasets(path1, path2, size):
    with open(path1, "r") as f:
        data1 = f.readlines()
    with open(path2, "r") as f:
        data2 = f.readlines()

    num_indices = int(len(data1))
    indices = np.random.choice(num_indices, size, replace=False)

    new_data1 = [data1[i] for i in indices]
    new_data2 = [data2[i] for i in indices]

    with open(path1, "w") as f:
        f.writelines(new_data1)
    with open(path2, "w") as f:
        f.writelines(new_data2)


size = 11000

path1 = os.path.join(
    DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}"
)
path2 = os.path.join(
    DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}"
)
shrink_datasets(path1, path2, size)

path1 = os.path.join(
    DATASET_DIR,
    str(EXAMPLE_TOKEN_LEN),
    f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG1}.jsonl",
)
path2 = os.path.join(
    DATASET_DIR,
    str(EXAMPLE_TOKEN_LEN),
    f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANG2}.jsonl",
)
shrink_datasets(path1, path2, size)

In [2]:
def update_batch_size(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r") as json_file:
                data = json.load(json_file)
            if "batch_size" in data:
                data["batch_size"] = 64
                with open(filepath, "w") as json_file:
                    json.dump(data, json_file, indent=4)


def update_trials(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r") as json_file:
                data = json.load(json_file)
            if "num_trials" in data:
                data["num_trials"] = 50
                with open(filepath, "w") as json_file:
                    json.dump(data, json_file, indent=4)


dir = "exp-configs/EMEA/100"
dir2 = "exp-configs/EMEA/150"
dir3 = "exp-configs/EMEA/200"
dir4 = "exp-configs/EMEA/250"


update_trials(dir)
update_trials(dir2)
update_trials(dir3)
update_trials(dir4)

NameError: name 'os' is not defined

In [29]:
# Step 3. (optional) Split data to train and eval sets to train the model

# This will be done for both languages
# model size not relevant here, put in any config file as input

# 11k examples in data: 10k in train, 1k in eval
!python split_train_val.py --config_file config.json

2024-06-28 01:09:39,763 - INFO - ==== Starting data train+val split script ====
==== Starting data train+val split script ====
Traceback (most recent call last):
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_train_val.py", line 156, in <module>
    main()
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_train_val.py", line 81, in main
    with open(dataset_path, "r") as f:
         ^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'test/100/EMEA-c.en'


In [144]:
# Step 4. Split (training) data to (pre)prefix set and suffix set

# supply the training dataset here only if you want to split the training data
!python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE2}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE3}-{LANG1}.json
# !python split_dataset.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE4}-{LANG1}.json


^C
Traceback (most recent call last):
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/split_dataset.py", line 8, in <module>
    from experiment_lib import load_constants_from_config
  File "/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/experiment_lib.py", line 4, in <module>
    import matplotlib.pyplot as plt
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/pyplot.py", line 66, in <module>
    from matplotlib.figure import Figure, FigureBase, figaspect
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/figure.py", line 43, in <module>
    from matplotlib import _blocking_input, backend_bases, _docstring, projections
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/projections/__init__.py", line 55, in <module>
    from .. import axes, _docstring
  File "/opt/miniconda3/envs/torch/lib/python3.11/site-packages/matplotlib/axes/__init__.py", line 1, in <module>
    from . i

In [4]:
# Step 5. Train the model + perform extraction

# run this directly in terminal, model cannot be loaded in notebook due to memory constraints I think, crashes kernel
# NOTE: I cannot run this locally, so I run this on a HPC of the university
# Uploaded full contents of datasets + EMEA folders to Habrok so it has all data for training + extraction

# !python train.py --config_file exp-configs/EMEA/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json
# !python extraction.py --config_file exp-configs/{DATASET_DIR}/{EXAMPLE_TOKEN_LEN}/config-{MODEL_SIZE1}-{LANG1}.json --model_dir finetuned/en-100-100-125M --cache_dir cache

# !python extraction.py --config_file exp-configs/EMEA/100/config-125M-en.json --model_dir finetuned/en-100-100-125M --cache_dir cache

2024-06-18 13:31:00,922 - INFO - Parsing arguments...
Parsing arguments...
2024-06-18 13:31:00,923 - INFO - Model directory provided: finetuned/en-100-100-125M
Model directory provided: finetuned/en-100-100-125M
2024-06-18 13:31:00,923 - INFO - Executing extraction on finetuned model.
Executing extraction on finetuned model.
2024-06-18 13:31:00,933 - INFO - Default device: mps
Default device: mps
2024-06-18 13:31:00,933 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-18 13:31:01,033 - INFO - Loading model...
Loading model...


In [8]:
# Decode the model generations from the numpy files to jsonl files
# NOTE: numpy files have been downloaded from the HPC where they were generated

from transformers import AutoTokenizer
import os
import numpy as np
from experiment_lib import (
    generations_to_jsonl,
    load_constants_from_config,
    generate_exid_list,
)
import json


def decode_generations(tokenizer, exids, num_trials, exp_base):
    for i in range(0, num_trials):
        file_path = os.path.join(exp_base, f"generations/{i}.npy")
        data = np.load(file_path)
        print(f"Data shape: {str(data.shape)}")

        output_file_path = os.path.join(
            exp_base, f"decoded/decoded_strings_trial_{i}.jsonl"
        )
        output_dir = os.path.dirname(output_file_path)
        os.makedirs(output_dir, exist_ok=True)
        if os.path.exists(output_file_path):
            print("Trial already decoded, skipping...")
        else:
            generations_to_jsonl(output_file_path, data, tokenizer, exids)

    print("done")


def decoding(path):

    with open(path, "r") as f:
        config = json.load(f)

    (
        ROOT_DIR,
        DATASET_DIR,
        SOURCE_DIR,
        DATASET_NAME,
        EXPERIMENT_NAME,
        NUM_TRIALS,
        PREFIX_LEN,
        SUFFIX_LEN,
        PREPREFIX_LEN,
        LANGUAGE,
        SPLIT,
        EXAMPLE_TOKEN_LEN,
        SOURCE_FILE,
        BATCH_SIZE,
        MODEL_NAME,
        TRAIN_FILE,
        VAL_FILE,
        VAL_SPLIT,
        SEED,
    ) = load_constants_from_config(config)

    NUM_TRIALS = 100

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    experiment_base = os.path.join(ROOT_DIR, DATASET_DIR, LANGUAGE, EXPERIMENT_NAME)

    # pretain
    # exids_path = os.path.join(
    #     SOURCE_DIR,
    #     DATASET_DIR,
    #     "csv",
    #     str(EXAMPLE_TOKEN_LEN),
    #     "common_exids-" + str(EXAMPLE_TOKEN_LEN) + ".csv",
    # )
    # exids = generate_exid_list(exids_path)

    # Train exids only
    exids_path = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "split_indices.json")
    with open(exids_path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            exids = obj["train"]

    decode_generations(tokenizer, exids, NUM_TRIALS, experiment_base)

In [14]:
# decoding("exp-configs/EMEA/250/config-1.3B-en.json")
# decoding("exp-configs/EMEA/250/config-2.7B-en.json")
# decoding("exp-configs/EMEA/250/config-125M-en.json")

# decoding("exp-configs/EMEA/150/config-125M-nl.json")
# decoding("exp-configs/EMEA/150/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/150/config-2.7B-nl.json")

# decoding("exp-configs/EMEA/200/config-125M-nl.json")
# decoding("exp-configs/EMEA/200/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/200/config-2.7B-nl.json")

# decoding("exp-configs/EMEA/250/config-125M-nl.json")
# decoding("exp-configs/EMEA/250/config-1.3B-nl.json")
# decoding("exp-configs/EMEA/250/config-2.7B-nl.json")

# decoding("exp-configs/EMEA/epoch/5/config-2.7B-nl.json")
# decoding("exp-configs/EMEA/epoch/10/config-2.7B-nl.json")
decoding("exp-configs/EMEA/epoch/20/config-2.7B-nl.json")
# decoding("exp-configs/EMEA/epoch/50/config-2.7B-nl.json")

decoding("exp-configs/EMEA/epoch/5/config-2.7B-en.json")
decoding("exp-configs/EMEA/epoch/10/config-2.7B-en.json")
decoding("exp-configs/EMEA/epoch/20/config-2.7B-en.json")
# decoding("exp-configs/EMEA/epoch/50/config-2.7B-en.json")


Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping...
Data shape: (9900, 100)
Trial already decoded, skipping.

In [3]:
# Calculate BLEU scores for the generated outputs

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE3}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE2}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE1}-{LANG1}.json

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE3}-{LANG2}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE2}-{LANG2}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE1}-{LANG2}.json

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE1}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE3}-{LANG1}.json

# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE1}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE2}-{LANG1}.json
# !python calculate_scores.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE3}-{LANG1}.json

!python calculate_scores.py --config_file exp-configs/EMEA/epoch/5/config-1.3B-nl.json
!python calculate_scores.py --config_file exp-configs/EMEA/epoch/10/config-1.3B-nl.json
!python calculate_scores.py --config_file exp-configs/EMEA/epoch/20/config-1.3B-nl.json
!python calculate_scores.py --config_file exp-configs/EMEA/epoch/50/config-1.3B-nl.json

2024-07-08 12:20:47,596 - INFO - ===== Starting BLEU-score calculation between generated and original text in language nl for 50 prefix & suffix length =====
===== Starting BLEU-score calculation between generated and original text in language nl for 50 prefix & suffix length =====
2024-07-08 12:20:47,596 - INFO - ===== Decoding original preprefixes, prefixes & suffixes =====
===== Decoding original preprefixes, prefixes & suffixes =====
2024-07-08 12:20:47,596 - INFO - Loading split indices from EMEA/100/split_indices.json
Loading split indices from EMEA/100/split_indices.json
Decoded strings saved to: %s ./datasets/EMEA/nl/100/EleutherAI/gpt-neo-1.3B/train_suffix.jsonl
2024-07-08 12:20:53,020 - INFO - Filtered suffixes to only include exids in the exids list
Filtered suffixes to only include exids in the exids list
2024-07-08 12:20:53,036 - INFO - Saved filtered suffixes toEMEA/100/prompt-train_dataset_suffixes-nl.jsonl
Saved filtered suffixes toEMEA/100/prompt-train_dataset_suffixes

In [4]:
# Evaluate the model outputs: sort and merge scores into single files to simplify analysis & plotting

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE1}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE2}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/100/config-{MODEL_SIZE3}-{LANG1}.json --trained True

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE1}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE2}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/150/config-{MODEL_SIZE3}-{LANG1}.json --trained True

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE1}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE2}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/200/config-{MODEL_SIZE3}-{LANG1}.json --trained True

# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE3}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE2}-{LANG1}.json --trained True
# !python evaluation.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE1}-{LANG1}.json --trained True

!python evaluation.py --config_file exp-configs/EMEA/epoch/5/config-1.3B-nl.json --trained Trueå
!python evaluation.py --config_file exp-configs/EMEA/epoch/10/config-1.3B-nl.json --trained True
!python evaluation.py --config_file exp-configs/EMEA/epoch/20/config-1.3B-nl.json --trained True
!python evaluation.py --config_file exp-configs/EMEA/epoch/50/config-1.3B-nl.json --trained True

2024-07-08 12:29:25,054 - INFO
Evaluating scores on finetuned model.
2024-07-08 12:29:26,988 - INFO
==== Starting evaluation ====
2024-07-08 12:29:26,988 - INFO
Experiment name: nl-100-100-1.3B-E5
2024-07-08 12:29:26,988 - INFO
Language: nl
2024-07-08 12:29:26,988 - INFO
Model: EleutherAI/gpt-neo-1.3B
2024-07-08 12:29:26,988 - INFO
Loading list of example IDs for dataset EMEA...
2024-07-08 12:29:26,989 - INFO
Loading exids from EMEA/100/prompt-train_dataset-exids-intersect.json
2024-07-08 12:29:26,989 - INFO
Loaded 3259 example IDs
2024-07-08 12:29:28,996 - INFO
Pulling BLEU scores from tmp/EMEA/nl/nl-100-100-1.3B-E5/bleu_scores
2024-07-08 12:29:31,001 - INFO
Processing example 2...
2024-07-08 12:29:31,042 - INFO
Merged BLEU scores for exid 2
2024-07-08 12:29:31,043 - INFO
Processing example 4...
2024-07-08 12:29:31,069 - INFO
Merged BLEU scores for exid 4
2024-07-08 12:29:31,069 - INFO
Processing example 7...
2024-07-08 12:29:31,090 - INFO
Merged BLEU scores for exid 7
2024-07-08 12:2

In [5]:
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE3}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE2}-{LANG2}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE1}-{LANG2}.json

!python accuracy.py --config_file exp-configs/EMEA/epoch/5/config-1.3B-nl.json
!python accuracy.py --config_file exp-configs/EMEA/epoch/10/config-1.3B-nl.json
!python accuracy.py --config_file exp-configs/EMEA/epoch/20/config-1.3B-nl.json
!python accuracy.py --config_file exp-configs/EMEA/epoch/50/config-1.3B-nl.json

# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE2}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE3}-{LANG1}.json
# !python accuracy.py --config_file exp-configs/{DATASET_DIR}/250/config-{MODEL_SIZE1}-{LANG1}.json


2024-07-08 12:35:51,172 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-07-08 12:35:51,172 - INFO - Saving output to tmp/EMEA/nl/nl-100-100-1.3B-E5/accuracy.jsonl
Saving output to tmp/EMEA/nl/nl-100-100-1.3B-E5/accuracy.jsonl
2024-07-08 12:35:53,473 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-07-08 12:35:53,473 - INFO - Saving output to tmp/EMEA/nl/nl-100-100-1.3B-E10/accuracy.jsonl
Saving output to tmp/EMEA/nl/nl-100-100-1.3B-E10/accuracy.jsonl
2024-07-08 12:35:55,518 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-07-08 12:35:55,518 - INFO - Saving output to tmp/EMEA/nl/nl-100-100-1.3B-E20/accuracy.jsonl
Saving output to tmp/EMEA/nl/nl-100-100-1.3B-E20/accuracy.jsonl
2024-07-08 12:35:57,463 - INFO - Finished counting amount of correct guesses.
Finished counting amount of correct guesses.
2024-07-08 12:35:57,463 - I

In [29]:
# Count numnber of unique sentences

import numpy as np

# Load the data
data = np.load(
    "datasets/EMEA/nl/250/EleutherAI/gpt-neo-2.7B/train_dataset.npy", allow_pickle=True
)

# Convert the lists of tokens to tuples so they can be put in a set
data = [tuple(sentence) for sentence in data]

# Count the number of unique sentences
num_unique_sentences = len(set(data))

print(f"The number of unique sentences is {num_unique_sentences}.")
print(f"the total number of sentences is {len(data)}.")

The number of unique sentences is 9893.
the total number of sentences is 9900.


In [73]:
import json

def count_unique_sentences(token_len):
    # Initialize an empty set to store unique sentences
    unique_sentences = set()

    file = f"EMEA/{token_len}/prompt-train_dataset.jsonl"

    exids = f"EMEA/{token_len}/prompt-train_dataset-exids-intersect.json"

    def search_sentence(exid, objects):
        for obj in objects:
            if obj["exid"] == exid:
                return obj
        return None

    # Load the data from the .jsonl file
    with open(file, "r") as file, open(exids, "r") as exids_file:
        json_objects = [json.loads(line) for line in file.readlines()]

        exids = json.load(exids_file)

        for exid in exids:
            obj = search_sentence(exid, json_objects)
            if obj is not None:
                sentence = obj["text"]
                if isinstance(sentence, list):
                    sentence = tuple(sentence)
                unique_sentences.add(sentence)

    num_unique_sentences = len(unique_sentences)
    print(f"The total number of sentences is {len(exids)}.")
    print(f"The number of unique sentences is {num_unique_sentences}.")

count_unique_sentences(100)
count_unique_sentences(150)
count_unique_sentences(200)
count_unique_sentences(250)


The total number of sentences is 3259.
The number of unique sentences is 3249.
The total number of sentences is 3655.
The number of unique sentences is 3650.
The total number of sentences is 4419.
The number of unique sentences is 4414.
The total number of sentences is 6054.
The number of unique sentences is 6053.


In [11]:
import os

def rename_npy_files(directory):
    # List all files in the specified directory
    files = os.listdir(directory)

    # Filter for .npy files
    npy_files = [f for f in files if f.endswith(".npy")]

    # Create a temporary name for each file to avoid conflicts
    for file in npy_files:
        base_name = os.path.splitext(file)[0]

        try:
            original_number = int(base_name)
            temp_filename = f"{original_number}_temp.npy"
            original_path = os.path.join(directory, file)
            temp_path = os.path.join(directory, temp_filename)
            os.rename(original_path, temp_path)

        except ValueError:
            print(f"Skipping file {file} as it does not have a numeric base name")

    # Rename temporary files to the final names
    temp_files = [f for f in os.listdir(directory) if f.endswith("_temp.npy")]

    for temp_file in temp_files:
        base_name = os.path.splitext(temp_file)[0]

        try:
            original_number = int(base_name.split("_")[0])
            new_number = original_number + 50
            new_filename = f"{new_number}.npy"
            temp_path = os.path.join(directory, temp_file)
            final_path = os.path.join(directory, new_filename)
            os.rename(temp_path, final_path)

            print(f"Renamed {temp_file} to {new_filename}")

        except ValueError:
            print(f"Skipping file {temp_file} as it does not have a numeric base name")


rename_npy_files("tmp/EMEA/en/en-100-100-2.7B-E5-2/generations")
rename_npy_files("tmp/EMEA/en/en-100-100-2.7B-E5-2/losses")


Renamed 15_temp.npy to 65.npy
Renamed 14_temp.npy to 64.npy
Renamed 7_temp.npy to 57.npy
Renamed 6_temp.npy to 56.npy
Renamed 21_temp.npy to 71.npy
Renamed 20_temp.npy to 70.npy
Renamed 41_temp.npy to 91.npy
Renamed 40_temp.npy to 90.npy
Renamed 36_temp.npy to 86.npy
Renamed 37_temp.npy to 87.npy
Renamed 46_temp.npy to 96.npy
Renamed 47_temp.npy to 97.npy
Renamed 31_temp.npy to 81.npy
Renamed 30_temp.npy to 80.npy
Renamed 12_temp.npy to 62.npy
Renamed 13_temp.npy to 63.npy
Renamed 18_temp.npy to 68.npy
Renamed 19_temp.npy to 69.npy
Renamed 0_temp.npy to 50.npy
Renamed 1_temp.npy to 51.npy
Renamed 26_temp.npy to 76.npy
Renamed 27_temp.npy to 77.npy
Renamed 42_temp.npy to 92.npy
Renamed 43_temp.npy to 93.npy
Renamed 35_temp.npy to 85.npy
Renamed 34_temp.npy to 84.npy
Renamed 48_temp.npy to 98.npy
Renamed 49_temp.npy to 99.npy
Renamed 4_temp.npy to 54.npy
Renamed 5_temp.npy to 55.npy
Renamed 22_temp.npy to 72.npy
Renamed 23_temp.npy to 73.npy
Renamed 16_temp.npy to 66.npy
Renamed 17_temp.

In [133]:
from experiment_lib import generations_to_jsonl
from transformers import AutoTokenizer
import os

# use this model???
MODEL = "gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/" + MODEL)

# Load the data that was used to prompt the model

# english
# dutch
data = np.load(os.path.join(SOURCE_DIR, DATASET_DIR, LANG2, str(EXAMPLE_TOKEN_LEN), "EleutherAI/" + MODEL, "train_dataset.npy"))

output_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt_dataset-{LANG2}.jsonl")
os.makedirs(os.path.dirname(output_file), exist_ok=True)

split_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "split_indices.json")
with open(split_file, "r") as f:
    split_indices = json.load(f)
            # this gives a list of indices present in the training dataset
    exids = split_indices["train"]

generations_to_jsonl(output_file, data, tokenizer, exids)

Decoded strings saved to: %s EMEA/250/prompt_dataset-nl.jsonl


In [134]:
# to txt file
in_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt_dataset-{LANG2}.jsonl")
out_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt_dataset-{LANG2}.txt")

with open(out_file, "w") as f, open(in_file, "r") as f2:
    for line in f2:
        obj = json.loads(line)
        f.write(obj["text"] + "\n") 

In [135]:
import json

# finding the intersection of the prompt dataset and the training dataset

in_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt_dataset-{LANG2}.jsonl")
train_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"train-{LANG2}.txt")

with open(in_file, "r") as prompt_file, open(train_file, "r") as train_file:
    prompts = prompt_file.readlines()
    train_lines = train_file.readlines()

print(len(prompts))
print(len(train_lines))

out_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt-train_dataset-{LANG2}.jsonl")
count = 0

counted = []

marker = 0
with open(out_file, "w") as f:
    # loop over all train lines
    for line in train_lines:
        # rest the marker to zero, so we know we have not found a match yet for this sentence
        marker = 0
        # split the sentence into tokens
        t = line.split()
        # loop over all prompt lines
        for index, prompt in enumerate(prompts):
    
            # if we have not marked the train line as matched yet
            if marker == 0:

                # load the json sentence
                prompt = json.loads(prompt)
                # grab the exid
                exid = prompt["exid"]
                # split the prompt into tokens
                p = (prompt["text"]).split()
            
                # if the sentences are the same
                # and the prompt sentence has not been counted yet (to avoid duplicate exids)
                if p == t and exid not in counted:
                    # if we find a matching sentence, save the exid to a file
                    counted.append(prompt["exid"])
                    # add to total number of matches
                    count += 1
                    # set marker to 1
                    marker = 1
                    # save the matching json sentence from prompt set to a file
                    json.dump(prompt, f, ensure_ascii=False)
                    f.write("\n")
                    # Replace the processed line with None
                    # train_lines[index] = None
# train line has been matched, move on to the next
            else: break

print(count)

9900
9900
6133


In [136]:
import json

def find_and_save_duplicates(input_file_path, output_file_path):
    # Step 2: Initialize a dictionary to track occurrences and exids
    sentence_details = {}
    
    # Step 3: Read the input file and count sentences, storing exids
    with open(input_file_path, 'r') as input_file:
        for line in input_file:
            obj = json.loads(line)
            text = obj.get('text', '')
            exid = obj.get('exid', '')
            if text:
                if text in sentence_details:
                    sentence_details[text]['count'] += 1
                    sentence_details[text]['exids'].append(exid)
                else:
                    sentence_details[text] = {'count': 1, 'exids': [exid]}
    
    # Step 4: Filter for duplicates and prepare the output structure
    duplicates = {text: details for text, details in sentence_details.items() if details['count'] > 1}
     
    # Step 5: Write duplicates to the output file
    with open(output_file_path, 'w') as output_file:
        for text, details in duplicates.items():
            # Creating a dict to represent the JSON structure including exids
            duplicate_entry = {'text': text, 'count': details['count'], 'exids': details['exids']}
            json.dump(duplicate_entry, output_file, ensure_ascii=False)
            output_file.write('\n')  # Ensure each entry is on a new line

# Example usage
input_file_path = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt-train_dataset-{LANG2}.jsonl")
output_file_path = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt-train_datasetDUP-{LANG2}.jsonl")
find_and_save_duplicates(input_file_path, output_file_path)

# Count number of repeated sentences in the output file
with open(output_file_path, 'r') as f:
    count = 0
    for line in f:
        obj = json.loads(line)
        count += obj['count']

print(count)

6


In [137]:
# to txt file
in_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt-train_dataset-{LANG2}.jsonl")
out_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt-train_dataset-{LANG2}.txt")

with open(out_file, "w") as f, open(in_file, "r") as f2:
    for line in f2:
        obj = json.loads(line)
        f.write(obj["text"] + "\n") 

In [138]:
file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt-train_dataset-{LANG2}.jsonl")


with open(file, "r") as f:
    exids = []
    for line in f:
        obj = json.loads(line)
        exids.append(obj["exid"])

print(len(exids))

# save to file
out = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), f"prompt-train_dataset_exids-{LANG2}.json")
with open(out, "w") as f:
    json.dump(exids, f, indent=4)

6133


In [140]:
import json

# INTERSECT EN & NL!!!!!!!!
# Path to your .jsonl file
file_path = f"EMEA/{str(EXAMPLE_TOKEN_LEN)}/prompt-train_dataset-nl.jsonl"
file_path2 = f"EMEA/{str(EXAMPLE_TOKEN_LEN)}/prompt-train_dataset.jsonl"
def unique_sentences(file_path):
    # Set to keep track of unique exids
    exids_seen = set()
    # List to keep track of duplicates
    duplicates = []

    exids = []
    # Open and read the .jsonl file
    with open(file_path, "r") as file:
        lines = file.readlines()
        for line in lines:
            # skip first line
            data = json.loads(line)
            exid = data["exid"]
            # Check if exid is already seen
            exids.append(exid)
            if exid in exids_seen:
                duplicates.append(exid)
            else:
                exids_seen.add(exid)

    if duplicates:
        print("Duplicate exids found:", duplicates)
    else:
        print("No duplicate exids found.")

    return exids

exids = unique_sentences(file_path)
exids2 = unique_sentences(file_path2)

print(len(exids))
print(len(exids2))

# find the intersection of the two sets

print("intersection")
intersection = set(exids).intersection(set(exids2))

diff = set(exids).difference(set(exids2))
# find the 
print(len(intersection))
print(len(diff))

# write the intersection to a file

# USE THESE TO FILTER THE DECODED GENERATIONS
out = f"EMEA/{str(EXAMPLE_TOKEN_LEN)}/prompt-train_dataset-exids-intersect.json"
with open(out, "w") as f:
    json.dump(list(intersection), f, indent=4)



No duplicate exids found.
No duplicate exids found.
6133
6101
intersection
6054
79


In [6]:
import os
import json

def filter_decoded_generations(dir_path, exids_file):

    # Load exids once, assuming they are the same for all files
    with open(exids_file, "r") as e:
        exids = json.load(e)

    # 100 trials
    for i in range(100):
        gens = os.path.join(dir_path, f"decoded_strings_trial_{i}.jsonl")
        new_gens = os.path.join(dir_path, f"decoded_strings_trial_{i}_filtered.jsonl")

        with open(gens, "r") as f, open(new_gens, "w") as n:
            for line in f:
                obj = json.loads(line)
                exid = obj.get("exid")
                if exid in exids:
                    json.dump(obj, n, ensure_ascii=False)
                    n.write("\n")

def filter(language, model_size, epoch=1):
    if epoch != 1:
        EXP_NAME = language + "-" + str(EXAMPLE_TOKEN_LEN) + "-100-" + model_size+ "-" + str(epoch)
    else: 
        EXP_NAME = language + "-" + str(EXAMPLE_TOKEN_LEN) + "-100-" + model_size

    dir_path = os.path.join(TARGET_DIR, DATASET_DIR, language, EXP_NAME, "decoded")
    exids_file = os.path.join(DATASET_DIR, str(EXAMPLE_TOKEN_LEN), "prompt-train_dataset-exids-intersect.json")
    filter_decoded_generations(dir_path, exids_file)

LANG = "nl"
MODEL_SIZE = "2.7B"

# filter("en", "125M")
# # filter("nl", "125M")
# filter("en", "1.3B")
# # filter("nl", "1.3B")
# filter("en", "2.7B")
# # filter("nl", "2.7B")

filter(LANG, MODEL_SIZE, "E5")
filter(LANG, MODEL_SIZE, "E10")
filter(LANG, MODEL_SIZE, "E20")
filter(LANG, MODEL_SIZE, "E50")

FileNotFoundError: [Errno 2] No such file or directory: 'tmp/EMEA/nl/nl-100-100-2.7B-E5/decoded/decoded_strings_trial_0.jsonl'

In [3]:
def calc(nom, denom):
    return (nom / denom) * 100

print(calc(3219, 3249))
print(calc(3017, 3249))
print(calc(2834, 3249))
print(calc(2601, 3249))

# print(calc(288, 325900))
# print(calc(399, 325900))




99.07663896583564
92.85934133579563
87.22683902739304
80.05540166204986


In [69]:
file_path = "tmp/EMEA/en/en-100-100-1.3B-E5"

out_file = "tmp/EMEA/en/en-100-100-1.3B-E5/correct_sentences.jsonl"
with open(f"{file_path}/accuracy.jsonl", "r") as f, open(out_file, "w") as out:
    lines = f.readlines()
    for line in lines[1:]:
        obj = json.loads(line)
        exid = obj["exid"]
        correct_guesses = obj["trials_correct"]
        sentences = []

        for trial in correct_guesses:
            trial_num = trial["trial"]

            # open trial file
            trial_file = f"{file_path}/decoded/decoded_strings_trial_{trial_num}_filtered.jsonl"
            # grab the sentence of the exid
            with open(trial_file, "r") as f2:
                for line in f2:
                    obj = json.loads(line)
                    if obj["exid"] == exid:
                        sentence = obj["text"]
                        sentences.append(sentence)
                        break
        object_new = {
            "exid": exid,
            "sentences": sentences
        }

        json.dump(object_new, out, ensure_ascii=False)
        out.write("\n")

KeyboardInterrupt: 