In [23]:
# imports
import json
import matplotlib.pyplot as plt
import numpy as np
import os

In [24]:
# Experiment constants. Every value below is copied into config.json by the
# next cell and handed to the pipeline scripts through --config_file.

# Root directory for results; the plotting cell reads scores from
# <ROOT_DIR>/<EXPERIMENT_NAME>/scores/.
ROOT_DIR = "tmp/"
# Name of the experiment (used as a sub-directory of ROOT_DIR when reading scores).
EXPERIMENT_NAME = "test"
# Path to the (pretraining) dataset of the model.
# NOTE(review): the path embeds "en" and "200", which mirror LANGUAGE and
# EXAMPLE_TOKEN_LEN below — presumably they have to be kept in sync by hand.
DATASET_DIR = "./datasets/en/200/"
# File name of the tokenized version of the dataset.
DATASET_FILE = "train_dataset.npy"
# Number of trials to run in the experiment.
NUM_TRIALS = 10
# Language of the setup.
LANGUAGE = "en"
# Split of the dataset to use.
SPLIT = "train"

# Sequence parameters (lengths, presumably counted in tokens — confirm).
# NOTE(review): PREPREFIX_LEN + PREFIX_LEN + SUFFIX_LEN = 100 + 50 + 50 equals
# EXAMPLE_TOKEN_LEN (200); presumably the split script relies on that, so
# confirm before changing one of these values in isolation.
SUFFIX_LEN = 50
PREFIX_LEN = 50
EXAMPLE_TOKEN_LEN = 200
PREPREFIX_LEN = 100

In [26]:
# Gather the experiment constants into one mapping and persist it as
# config.json, the file each pipeline script receives via --config_file.
config = {
    "root_dir": ROOT_DIR,
    "experiment_name": EXPERIMENT_NAME,
    "dataset_dir": DATASET_DIR,
    "dataset_file": DATASET_FILE,
    "num_trials": NUM_TRIALS,
    "language": LANGUAGE,
    "split": SPLIT,
    "suffix_len": SUFFIX_LEN,
    "prefix_len": PREFIX_LEN,
    "example_token_len": EXAMPLE_TOKEN_LEN,
    "preprefix_len": PREPREFIX_LEN,
}

# Serialise to a 4-space-indented JSON string, then write it out in one call.
config_text = json.dumps(config, indent=4)
with open("config.json", "w") as config_fh:
    config_fh.write(config_text)

In [None]:
# 1. Split each example into a prefix and a suffix (plus a preprefix).
#    The script takes its settings from the config.json written by the config cell above.

# Assumption: the data is already prepared in JSON Lines format with the desired token length.
# NOTE(review): DATASET_FILE in the constants cell is a .npy file — confirm which
# input format split_dataset.py actually expects.
!python split_dataset.py --config_file config.json

In [None]:
# 2. Generate model output using the prefixes as prompts.
#    The script takes its settings from the config.json written by the config cell above.

!python extraction.py --config_file config.json

In [None]:
# 3. Evaluate the model output against the dataset.
#    The script takes its settings from the config.json written by the config cell above.
#    Presumably this step produces the score files that the plotting cell reads
#    from <ROOT_DIR>/<EXPERIMENT_NAME>/scores/ — confirm against evaluation.py.
!python evaluation.py --config_file config.json

In [None]:
# 4. Plot distribution of scores

def read_bleu_scores(file_path):
    """Load the per-record scores stored in a JSON Lines file.

    Every non-blank line is parsed as a JSON object and its ``'score'`` entry
    is collected. Blank or whitespace-only lines (e.g. an extra empty line at
    the end of the file) are skipped instead of aborting the whole read with
    a ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        list: The ``'score'`` values in file order; empty when the file
        contains no records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``'score'`` key.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # `line.strip()` is falsy for blank lines, which json.loads would reject.
        return [json.loads(line)['score'] for line in file if line.strip()]

def plot_bleu_distribution(scores, num_bins=10):
    """Draw a histogram of BLEU scores over the interval [0, 1] and show it.

    Args:
        scores: Sequence of numeric scores to bin.
        num_bins: Number of equal-width bins spanning [0, 1]; also fixes the
            x-axis ticks, which are placed on the ``num_bins + 1`` bin edges.

    Note:
        The histogram range is fixed to (0, 1), so scores outside that
        interval are ignored by ``np.histogram`` and do not appear in the plot.
    """
    # New 10x6 inch figure; grab its axes so all drawing goes through one object.
    figure = plt.figure(figsize=(10, 6))
    axes = figure.gca()

    # Bin the scores; `edges` holds num_bins + 1 boundaries.
    frequencies, edges = np.histogram(scores, bins=num_bins, range=(0, 1))
    bin_width = edges[1] - edges[0]

    # One bar per bin, anchored at the bin's left edge.
    axes.bar(edges[:-1], frequencies, width=bin_width, edgecolor='black', align='edge')

    # Grid, title and axis labels.
    axes.grid(True)
    axes.set_title('Distribution of BLEU Scores')
    axes.set_xlabel('BLEU Score')
    axes.set_ylabel('Frequency')

    # Ticks on every bin boundary.
    axes.set_xticks(np.linspace(0, 1, num_bins + 1))

    plt.show()

# Driver: load the scores of one trial and plot their distribution.
# In a notebook kernel __name__ is "__main__", so this guard always passes there;
# it only makes a difference if the cell is exported to an importable module.
if __name__ == "__main__":

    # Index of the trial whose scores are plotted.
    trial = 0
    
    # Scores file: <ROOT_DIR>/<EXPERIMENT_NAME>/scores/bleu_scores_trial_<trial>.
    # NOTE(review): read_bleu_scores parses this file as JSON Lines, yet the name
    # built here has no extension — confirm it matches what evaluation.py writes.
    bleu_scores_file = os.path.join(ROOT_DIR, EXPERIMENT_NAME, "scores", f"bleu_scores_trial_{trial}")
    
    # Collect the 'score' value of every record in that file.
    bleu_scores = read_bleu_scores(bleu_scores_file)
    
    # Plot the distribution of BLEU scores (default: 10 bins over [0, 1]).
    plot_bleu_distribution(bleu_scores)