In [4]:
# imports
import json
import matplotlib.pyplot as plt
import numpy as np
import os
from transformers import AutoTokenizer
from experiment_lib import losses_to_jsonl

In [29]:
# Experiment configuration.
# Every tunable value is declared here and then exported to config.json,
# the file handed to the pipeline scripts through --config_file.

# --- where things live ---------------------------------------------------
# directory that receives the results
ROOT_DIR = "tmp/"
# directory holding the split datasets
SOURCE_DIR = "./datasets"
# file name of the (split) tokenized dataset
SOURCE_FILE = "train_dataset.npy"
# directory holding the raw dataset files
DATASET_DIR = "europarl"
# base name of the raw dataset files inside DATASET_DIR
# (presumably extended with token-length / language suffixes by the scripts — confirm)
DATASET_FILE = "europarl-v7.nl-en"

# --- what is being run ---------------------------------------------------
# identifier of this experiment
EXPERIMENT_NAME = "nl-100-100-125M"
# model used for the experiment
MODEL = "EleutherAI/gpt-neo-125M"
# language of the setup
LANGUAGE = "nl"
# dataset split to use
SPLIT = "train"
# how many trials the experiment runs
NUM_TRIALS = 100
BATCH_SIZE = 32

# --- sequence parameters -------------------------------------------------
EXAMPLE_TOKEN_LEN = 100
PREPREFIX_LEN = 0
PREFIX_LEN = 50
SUFFIX_LEN = 50

# Assemble the configuration; the keyword order below is the key order that
# ends up in config.json.
config = dict(
    root_dir=ROOT_DIR,
    experiment_name=EXPERIMENT_NAME,
    dataset_dir=DATASET_DIR,
    dataset_file=DATASET_FILE,
    num_trials=NUM_TRIALS,
    language=LANGUAGE,
    split=SPLIT,
    suffix_len=SUFFIX_LEN,
    prefix_len=PREFIX_LEN,
    example_token_len=EXAMPLE_TOKEN_LEN,
    preprefix_len=PREPREFIX_LEN,
    source_dir=SOURCE_DIR,
    source_file=SOURCE_FILE,
    batch_size=BATCH_SIZE,
    model=MODEL,
)

# Serialize with a 4-space indent into config.json in the working directory.
with open("config.json", "w") as f:
    f.write(json.dumps(config, indent=4))

In [28]:
# 1. Split the examples into two parts: prefix and suffix (and preprefix).
#
# The script receives all of its settings through --config_file, so
# config.json must already exist in the working directory before this runs.

# Assumption: the data is already prepared in JSON Lines format, and every
# sentence has the desired token length.
# NOTE(review): the saved output of this cell reports "language en", while the
# configuration sets the language to "nl" — the output looks stale (the cell
# execution counts are also out of order); re-run on a fresh kernel to confirm.
!python split_dataset.py --config_file config.json

2024-06-02 19:48:15,679 - INFO
===== Starting dataset token split generation for language en with token length 100 =====
2024-06-02 19:48:15,680 - INFO
Opened file: europarl/europarl-v7.nl-en-100.en.jsonl
2024-06-02 19:48:15,690 - INFO
Processed 32 lines
2024-06-02 19:48:15,698 - INFO
Processed 64 lines
2024-06-02 19:48:15,706 - INFO
Processed 96 lines
2024-06-02 19:48:15,713 - INFO
Processed 128 lines
2024-06-02 19:48:15,721 - INFO
Processed 160 lines
2024-06-02 19:48:15,728 - INFO
Processed 192 lines
2024-06-02 19:48:15,735 - INFO
Processed 224 lines
2024-06-02 19:48:15,742 - INFO
Processed 256 lines
2024-06-02 19:48:15,749 - INFO
Processed 288 lines
2024-06-02 19:48:15,756 - INFO
Processed 320 lines
2024-06-02 19:48:15,763 - INFO
Processed 352 lines
2024-06-02 19:48:15,770 - INFO
Processed 384 lines
2024-06-02 19:48:15,777 - INFO
Processed 416 lines
2024-06-02 19:48:15,783 - INFO
Processed 448 lines
2024-06-02 19:48:15,790 - INFO
Processed 480 lines
2024-06-02 19:48:15,796 - INFO
Pr

In [1]:
# 2. Generate model output using the prefixes as prompts.

# Run this step directly in a terminal rather than from the notebook: loading
# the model here crashes the kernel, presumably because of memory constraints
# (cause not confirmed).

# Existing results will not be overwritten (per the author's note; the logic
# lives in extraction.py).
!python extraction.py --config_file config.json

2024-05-30 23:12:58,729 - INFO
Parsing arguments...
2024-05-30 23:12:58,736 - INFO
Default device: mps
2024-05-30 23:12:58,736 - INFO
Loading tokenizer...
2024-05-30 23:12:59,093 - INFO
Loading model...


In [35]:
# 3. Evaluate the model output against the dataset.
# Runs calculate_bleu.py with the same config.json as the previous steps.
# Judging by the saved output, it computes the BLEU score between generated
# and original text and logs the run to Weights & Biases, so a wandb login
# appears to be required — confirm against calculate_bleu.py.
!python calculate_bleu.py --config_file config.json

[34m[1mwandb[0m: Currently logged in as: [33manna-visman[0m ([33mannavisman[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/annavisman/.netrc
[34m[1mwandb[0m: Tracking run with wandb version 0.17.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/wandb/run-20240602_202209-t0l9j5zu[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mEvaluation BLEU Score - nl-100-100-125M - EleutherAI/gpt-neo-125M[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/annavisman/thesis-llm-privacy[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/annavisman/thesis-llm-privacy/runs/t0l9j5zu[0m
2024-06-02 20:22:12,441 - INFO
===== Starting BLEU-score calculation between generated and original text in language nl for 50 prefix & suffix length =====
2024-