In [12]:
# imports
import json
import matplotlib.pyplot as plt
import numpy as np
import os
from transformers import AutoTokenizer

In [13]:
# Experiment configuration: every tunable value for this run lives in this cell.

# --- output location and run identity ---
ROOT_DIR = "tmp/"                          # directory the results are saved in
EXPERIMENT_NAME = "nl-100-100-1.3B(test)"  # name of the experiment
NUM_TRIALS = 100                           # number of trials to run in the experiment

# --- dataset ---
LANGUAGE = "nl"                     # language of the setup
SPLIT = "train"                     # split of the dataset to use
DATASET_DIR = "europarl"            # path to the dataset raw files
DATASET_FILE = "europarl-v7.nl-en"  # path to the dataset files
SOURCE_DIR = "./datasets"           # path where the split datasets are stored
SOURCE_FILE = "train_dataset.npy"   # file name of the (split) tokenized version of the dataset

# --- sequence parameters (lengths, presumably counted in tokens) ---
EXAMPLE_TOKEN_LEN = 100
PREPREFIX_LEN = 0
PREFIX_LEN = 50
SUFFIX_LEN = 50

# --- generation ---
MODEL = "EleutherAI/gpt-neo-1.3B"
BATCH_SIZE = 32

In [10]:
# Serialize the experiment constants to config.json, the file that the pipeline
# scripts receive through their --config_file argument.
# The file is only (re)written when this cell runs, so re-run it after changing
# any constant; otherwise the scripts keep reading the old values.
config = dict(
    root_dir=ROOT_DIR,
    experiment_name=EXPERIMENT_NAME,
    dataset_dir=DATASET_DIR,
    dataset_file=DATASET_FILE,
    num_trials=NUM_TRIALS,
    language=LANGUAGE,
    split=SPLIT,
    suffix_len=SUFFIX_LEN,
    prefix_len=PREFIX_LEN,
    example_token_len=EXAMPLE_TOKEN_LEN,
    preprefix_len=PREPREFIX_LEN,
    source_dir=SOURCE_DIR,
    source_file=SOURCE_FILE,
    batch_size=BATCH_SIZE,
    model=MODEL,
)

# Render to text first, then write it out in one go (4-space indented JSON).
with open("config.json", "w") as config_fh:
    config_fh.write(json.dumps(config, indent=4))

In [16]:
# 1. Split the examples into two parts: prefix and suffix (and preprefix)

# Runs split_dataset.py as a shell command, pointing it at config.json.
# Assumption: the data is already prepared in JSON Lines format, with every
# sentence having the desired token length.
!python split_dataset.py --config_file config.json

2024-05-30 22:17:43,344 - INFO
===== Starting dataset token split generation for language nl with token length 100 =====
2024-05-30 22:17:43,344 - INFO
Opened file: nl-en/europarl-v7.nl-en-100.nl.jsonl
2024-05-30 22:17:43,351 - INFO
Processed 32 lines
2024-05-30 22:17:43,357 - INFO
Processed 64 lines
2024-05-30 22:17:43,362 - INFO
Processed 96 lines
2024-05-30 22:17:43,367 - INFO
Processed 128 lines
2024-05-30 22:17:43,371 - INFO
Processed 160 lines
2024-05-30 22:17:43,377 - INFO
Processed 192 lines
2024-05-30 22:17:43,382 - INFO
Processed 224 lines
2024-05-30 22:17:43,386 - INFO
Processed 256 lines
2024-05-30 22:17:43,391 - INFO
Processed 288 lines
2024-05-30 22:17:43,396 - INFO
Processed 320 lines
2024-05-30 22:17:43,401 - INFO
Processed 352 lines
2024-05-30 22:17:43,406 - INFO
Processed 384 lines
2024-05-30 22:17:43,411 - INFO
Processed 416 lines
2024-05-30 22:17:43,416 - INFO
Processed 448 lines
2024-05-30 22:17:43,420 - INFO
Processed 480 lines
2024-05-30 22:17:43,425 - INFO
Proce

In [1]:
# 2. Generate model output using prefixes as prompts

# NOTE: prefer running this command directly in a terminal. Loading the model
# inside the notebook kernel seems to crash it, presumably because of memory
# constraints (not confirmed).

# Existing results will not be overwritten.
!python extraction.py --config_file config.json

2024-05-30 23:12:58,729 - INFO
Parsing arguments...
2024-05-30 23:12:58,736 - INFO
Default device: mps
2024-05-30 23:12:58,736 - INFO
Loading tokenizer...
2024-05-30 23:12:59,093 - INFO
Loading model...


In [16]:
# 3. Evaluate the model output against the dataset
# Runs calculate_bleu.py as a shell command, pointing it at config.json.
# NOTE(review): judging by the recorded output, the run is tracked with
# Weights & Biases (wandb), so a wandb login is presumably needed — confirm.
!python calculate_bleu.py --config_file config.json

[34m[1mwandb[0m: Currently logged in as: [33manna-visman[0m ([33mannavisman[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.17.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/annavisman/stack/RUG/CS/Year3/thesis/thesis-llm-privacy/wandb/run-20240601_211616-rv0ewi60[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mEvaluation BLEU Score - nl-100-100-1.3B(test) - nl - 50 - 100[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/annavisman/thesis-llm-privacy[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/annavisman/thesis-llm-privacy/runs/rv0ewi60[0m
2024-06-01 21:16:18,843 - INFO
===== Starting evaluation of similarity between generated and original text in language nl for 50 prefix & suffix length =====
2024-06-01 21:16:18,843 - INFO
===== Decoding original prefixes & suffixes =====
2024-06-01 21:16:18,84