In [6]:
# imports
import json
import matplotlib.pyplot as plt
import numpy as np
import os
from transformers import AutoTokenizer
from experiment_lib import *


In [13]:
# Experiment configuration: every tunable constant lives here and is then
# serialised to config.json for the pipeline scripts.

# --- output location and experiment identity ---
ROOT_DIR = "tmp/"                      # results are saved in this directory
EXPERIMENT_NAME = "nl-100-100-1.3B"    # the name of the experiment
NUM_TRIALS = 100                       # number of trials to run in the experiment
LANGUAGE = "nl"                        # language of the setup
SPLIT = "train"                        # split of the dataset to use

# --- dataset locations ---
SOURCE_DIR = "./datasets"              # path where the split datasets are stored
SOURCE_FILE = "train_dataset.npy"      # file name of the (split) tokenized dataset
DATASET_DIR = "europarl"               # path to the dataset raw files
DATASET_FILE = "europarl-v7.nl-en.nl"  # path to the dataset files

# --- sequence parameters (token counts) ---
SUFFIX_LEN = 50
PREFIX_LEN = 50
EXAMPLE_TOKEN_LEN = 100
PREPREFIX_LEN = 0

# --- generation settings ---
BATCH_SIZE = 32
MODEL = "EleutherAI/gpt-neo-1.3B"

# Collect the constants into the mapping that becomes config.json.
# Keyword order is kept as-is so the keys appear in the same order in the file.
config = dict(
    root_dir=ROOT_DIR,
    experiment_name=EXPERIMENT_NAME,
    dataset_dir=DATASET_DIR,
    dataset_file=DATASET_FILE,
    num_trials=NUM_TRIALS,
    language=LANGUAGE,
    split=SPLIT,
    suffix_len=SUFFIX_LEN,
    prefix_len=PREFIX_LEN,
    example_token_len=EXAMPLE_TOKEN_LEN,
    preprefix_len=PREPREFIX_LEN,
    source_dir=SOURCE_DIR,
    source_file=SOURCE_FILE,
    batch_size=BATCH_SIZE,
    model=MODEL,
)

# Write the configuration to config.json in the current working directory.
with open("config.json", "w") as config_file:
    config_file.write(json.dumps(config, indent=4))

In [8]:
# 1. Split each example into two parts: prefix and suffix (and an optional preprefix).

# Assumption: the data is already prepared in JSON Lines format, with every
# sentence having the desired token length.
# NOTE(review): this step reads exp-configs/EMEA/config-125M-nl.json rather than
# the config.json written by the configuration cell - confirm this is intended.
# NOTE(review): the saved output below reports language "en" although an "-nl"
# config is passed here; the output looks stale - re-run to confirm.
!python split_dataset.py --config_file exp-configs/EMEA/config-125M-nl.json

2024-06-08 14:30:07,630 - INFO
===== Starting dataset token split generation for language en with token length 100 =====
2024-06-08 14:30:07,630 - INFO
Opened file: EMEA/EMEA-c-100.en-train.jsonl
2024-06-08 14:30:07,639 - INFO
Processed 32 lines
2024-06-08 14:30:07,647 - INFO
Processed 64 lines
2024-06-08 14:30:07,654 - INFO
Processed 96 lines
2024-06-08 14:30:07,661 - INFO
Processed 128 lines
2024-06-08 14:30:07,668 - INFO
Processed 160 lines
2024-06-08 14:30:07,675 - INFO
Processed 192 lines
2024-06-08 14:30:07,681 - INFO
Processed 224 lines
2024-06-08 14:30:07,687 - INFO
Processed 256 lines
2024-06-08 14:30:07,693 - INFO
Processed 288 lines
2024-06-08 14:30:07,699 - INFO
Processed 320 lines
2024-06-08 14:30:07,706 - INFO
Processed 352 lines
2024-06-08 14:30:07,712 - INFO
Processed 384 lines
2024-06-08 14:30:07,718 - INFO
Processed 416 lines
2024-06-08 14:30:07,724 - INFO
Processed 448 lines
2024-06-08 14:30:07,729 - INFO
Processed 480 lines
2024-06-08 14:30:07,736 - INFO
Processed 5

In [1]:
# 2. Generate model output using the prefixes as prompts.

# Run this command directly in a terminal instead of from the notebook: loading
# the model here crashes the kernel (presumably due to memory constraints - not confirmed).

# Existing results will not be overwritten.
!python extraction.py --config_file config.json

2024-05-30 23:12:58,729 - INFO
Parsing arguments...
2024-05-30 23:12:58,736 - INFO
Default device: mps
2024-05-30 23:12:58,736 - INFO
Loading tokenizer...
2024-05-30 23:12:59,093 - INFO
Loading model...


In [15]:
# 3. Evaluate the model output against the dataset: run the BLEU-score
# calculation once per model-size config (125M, 1.3B, 2.7B).
# NOTE(review): "-en" configs are passed here, but the saved output below
# reports language "nl"; the output looks stale - re-run to confirm.
!python calculate_bleu.py --config_file exp-configs/EMEA/100/config-125M-en.json

!python calculate_bleu.py --config_file exp-configs/EMEA/100/config-1.3B-en.json

!python calculate_bleu.py --config_file exp-configs/EMEA/100/config-2.7B-en.json

2024-06-12 17:48:40,800 - INFO - ===== Starting BLEU-score calculation between generated and original text in language nl for 50 prefix & suffix length =====
===== Starting BLEU-score calculation between generated and original text in language nl for 50 prefix & suffix length =====
2024-06-12 17:48:40,800 - INFO - ===== Decoding original prefixes & suffixes =====
===== Decoding original prefixes & suffixes =====
Decoded strings saved to: %s ./datasets/EMEA/nl/100/EleutherAI/gpt-neo-1.3B/train_prefix.jsonl
Decoded strings saved to: %s ./datasets/EMEA/nl/100/EleutherAI/gpt-neo-1.3B/train_suffix.jsonl
2024-06-12 17:48:41,126 - INFO - Starting BLEU-score calculation for trial 0
Starting BLEU-score calculation for trial 0
2024-06-12 17:48:41,126 - INFO - Saving BLEU scores for trial 0 to tmp/EMEA/nl/nl-100-100-1.3B/scores/bleu_scores_trial_0.jsonl
Saving BLEU scores for trial 0 to tmp/EMEA/nl/nl-100-100-1.3B/scores/bleu_scores_trial_0.jsonl
2024-06-12 17:48:43,984 - INFO - Finished BLEU-sco

In [1]:
# 4. Run the evaluation script for the given experiment config.
# Judging by the saved output below, this step sorts the BLEU scores and merges
# the METEOR scores - TODO confirm against evaluation.py.
# NOTE(review): the 125M "-en" config is passed here, but the saved output
# reports experiment "nl-100-100-2.7B"; the output looks stale - re-run to confirm.
!python evaluation.py --config_file exp-configs/EMEA/100/config-125M-en.json

2024-06-13 14:29:04,123 - INFO
Model directory not provided, using default model specified in config.
2024-06-13 14:29:04,358 - INFO
==== Starting evaluation ====
2024-06-13 14:29:04,358 - INFO
Experiment name: nl-100-100-2.7B
2024-06-13 14:29:04,358 - INFO
Language: nl
2024-06-13 14:29:04,358 - INFO
Model: EleutherAI/gpt-neo-2.7B
2024-06-13 14:29:04,358 - INFO
Loading list of example IDs for dataset EMEA...
2024-06-13 14:29:04,360 - INFO
Loaded 8309 example IDs
2024-06-13 14:29:04,360 - INFO
Bleu scores for this experiment previously merged, skipping...
2024-06-13 14:29:04,360 - INFO
Sorting BLEU scores...
2024-06-13 14:29:04,360 - INFO
Output file tmp/EMEA/nl/nl-100-100-2.7B/scores/sorted_compl_bleu_scores.jsonl already exists and is not empty, skipping...
2024-06-13 14:29:04,360 - INFO
Sorted BLEU scores saved to tmp/EMEA/nl/nl-100-100-2.7B/scores/sorted_compl_bleu_scores.jsonl
2024-06-13 14:29:04,360 - INFO
Processing example 27...
2024-06-13 14:29:04,457 - INFO
Merged METEOR score

In [1]:
# Run the METEOR-score calculation once per model-size config (2.7B, 1.3B, 125M)
# for the "nl" experiments.
!python calculate_meteor.py --config_file exp-configs/EMEA/100/config-2.7B-nl.json

!python calculate_meteor.py --config_file exp-configs/EMEA/100/config-1.3B-nl.json

!python calculate_meteor.py --config_file exp-configs/EMEA/100/config-125M-nl.json


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annavisman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2024-06-13 13:44:21,016 - INFO - ===== Starting METEOR-score calculation between generated and original text in language nl for 50 prefix & suffix length =====
===== Starting METEOR-score calculation between generated and original text in language nl for 50 prefix & suffix length =====
2024-06-13 13:44:21,016 - INFO - ===== Decoding original prefixes & suffixes =====
===== Decoding original prefixes & suffixes =====
2024-06-13 13:44:21,022 - INFO - Starting METEOR-score calculation for trial 0
Starting METEOR-score calculation for trial 0
2024-06-13 13:44:21,022 - INFO - Saving METEOR scores for trial 0 to tmp/EMEA/nl/nl-100-100-2.7B/meteor_scores/meteor_scores_trial_0.jsonl
Saving METEOR scores for trial 0 to tmp/EMEA/nl/nl-100-100-2.7B/meteor_scores/meteor_scores_trial_0.jsonl
2024-06-13 13:44:26,231 - INFO - Finished METEOR-score calculation 

In [8]:
# Decode the results: for every trial, load the saved generation array and
# write it out as a JSONL file via generations_to_jsonl.
from transformers import AutoTokenizer
import os
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")

# Base directory of the experiment whose generations are decoded.
experiment_base = os.path.join("tmp", "EMEA", "nl", "nl-100-100-2.7B")

# NOTE: these assignments rebind constants of the same name from the
# configuration cell (e.g. DATASET_DIR becomes "EMEA"); cells run afterwards
# see these values.
SOURCE_DIR = "./datasets"
DATASET_DIR = "EMEA"
EXAMPLE_TOKEN_LEN = 100
NUM_TRIALS = 100

# Path to the CSV of common example IDs; it is passed as a path (not as loaded
# IDs) to generations_to_jsonl - presumably read there, TODO confirm.
exids = os.path.join(
    SOURCE_DIR,
    DATASET_DIR,
    "csv",
    str(EXAMPLE_TOKEN_LEN),
    f"common_exids-{EXAMPLE_TOKEN_LEN}.csv",
)
print(exids)

# All decoded files share one directory, so create it once up front instead of
# on every loop iteration.
output_dir = os.path.join(experiment_base, "decoded")
os.makedirs(output_dir, exist_ok=True)

for i in range(NUM_TRIALS):
    # Generations of trial i, saved as a NumPy array.
    file_path = os.path.join(experiment_base, "generations", f"{i}.npy")
    data = np.load(file_path)
    # print() does not interpolate logging-style "%s" placeholders, so the
    # original emitted a literal "%s"; format the shape explicitly instead.
    print(f"Data shape: {data.shape}")

    output_file_path = os.path.join(output_dir, f"decoded_strings_trial_{i}.jsonl")
    generations_to_jsonl(output_file_path, data, tokenizer, exids)

print("done")


./datasets/EMEA/csv/100/common_exids-100.csv
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_strings_trial_0.jsonl
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_strings_trial_1.jsonl
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_strings_trial_2.jsonl
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_strings_trial_3.jsonl
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_strings_trial_4.jsonl
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_strings_trial_5.jsonl
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_strings_trial_6.jsonl
Data shape: %s (7478, 100)
Decoded strings saved to: %s tmp/EMEA/nl/nl-100-100-2.7B/decoded/decoded_s