In [3]:
import sys
sys.path.append('/data/andy_lee/EvalLatentKnowledge/Probes')
import Generate_Embeddings

In [23]:
import logging, json
import pandas as pd
import numpy as np

In [22]:
from pathlib import Path
from typing import Dict, List
from Generate_Embeddings import init_model, load_data, process_batch, save_data
from tqdm import tqdm

In [10]:
# Load the run configuration from disk.
#
# `config_parameters` is reset to None before the attempt so that a failed load
# can never leave this name unbound, or silently keep a stale value from an
# earlier execution of this cell in the same kernel. Later cells index into
# `config_parameters`, so a failed load surfaces there immediately.
config_parameters = None
try:
    with open("/data/andy_lee/EvalLatentKnowledge/Probes/config.json") as config_file:
        config_parameters = json.load(config_file)
except FileNotFoundError:
    print("Configuration file not found. Please ensure the file exists and the path is correct.")
    logging.error("Configuration file not found. Please ensure the file exists and the path is correct.")
except PermissionError:
    print("Permission denied. Please check your file permissions.")
    logging.error("Permission denied. Please check your file permissions.")
except json.JSONDecodeError:
    print("Configuration file is not valid JSON. Please check the file's contents.")
    logging.error("Configuration file is not valid JSON. Please check the file's contents.")
else:
    # Report success only once the file has been opened AND parsed; previously
    # this message was printed before json.load() and so also appeared when
    # parsing failed.
    print('config.json fetched...')

config.json fetched...


In [25]:
# Unpack the run settings from the loaded configuration in a single pass.
# "model" is the name of the language model to use: '6.7b', '2.7b', '1.3b', '350m'.
(
    model_name,
    should_remove_period,
    layers_to_process,
    dataset_names,
    true_false,
    BATCH_SIZE,
) = (
    config_parameters[setting]
    for setting in (
        "model",
        "remove_period",
        "layers_to_use",
        "list_of_datasets",
        "true_false",
        "batch_size",
    )
)

# Input and output directories are fixed here rather than read from the config.
# The config-driven alternative for the input directory is kept for reference:
# dataset_path = Path(config_parameters["dataset_path"])
dataset_path, output_path = map(
    Path,
    (
        '/data/andy_lee/EvalLatentKnowledge/Probes/datasets',
        '/data/andy_lee/EvalLatentKnowledge/Probes/processed_dataset_path',
    ),
)

In [26]:
# Echo the effective settings, one "label: value" line each, so the captured
# cell output records exactly what this run used.
print(
    "\n".join(
        f"{label}: {value}"
        for label, value in (
            ("model", model_name),
            ("should_remove_period", should_remove_period),
            ("layers_to_process", layers_to_process),
            ("dataset_names", dataset_names),
            ("true_false", true_false),
            ("BATCH_SIZE", BATCH_SIZE),
            ("dataset_path", dataset_path),
            ("output_path", output_path),
        )
    )
)

model: 6.7b
should_remove_period: True
layers_to_process: [-1, -4, -6, -8]
dataset_names: ['facts', 'elements']
true_false: True
BATCH_SIZE: 32
dataset_path: /data/andy_lee/EvalLatentKnowledge/Probes/datasets
output_path: /data/andy_lee/EvalLatentKnowledge/Probes/processed_dataset_path


In [17]:
# One result frame per requested layer, keyed by the layer index. It starts out
# empty and is filled in by the dataset-processing cell.
model_output_per_layer: Dict[int, pd.DataFrame] = dict()

# Build the model/tokenizer pair for the configured model name. A None in
# either slot is treated as a failed initialization and is reported both on
# stdout and through the logging module; execution is not stopped here.
model, tokenizer = init_model(model_name)
if not (model is not None and tokenizer is not None):
    print("Model or tokenizer initialization failed.")
    logging.error("Model or tokenizer initialization failed.")

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [27]:
"""
Loads configuration parameters, initializes the model and tokenizer, and processes datasets.

Configuration parameters are loaded from a JSON file named "config.json".
These parameters specify the model to use, whether to remove periods from the end of sentences,
which layers of the model to use for generating embeddings, the list of datasets to process,
and the batch size. The paths to the input datasets and the output location are set directly in this notebook.

The script processes each dataset according to the configuration parameters, generates embeddings for
each sentence in the dataset using the specified model and layers, and saves the processed data to a CSV file.
If a dataset cannot be loaded (load_data returns None), the script skips it and continues with the next dataset.
"""

# Generate per-layer embeddings for every configured dataset.
#
# For each name in `dataset_names`:
#   1. `load_data` reads the dataset; a dataset that comes back as None is skipped.
#   2. The 'statement' column is passed to `process_batch` in chunks of BATCH_SIZE rows.
#   3. The returned embeddings are stored, per layer, in an 'embeddings' column
#      of a copy of the dataset (held in `model_output_per_layer`).
#   4. `save_data` is called once per layer with the filled-in frame.
#
# Exceptions raised by `process_batch` or `save_data` are not caught here.

# Make numpy print arrays in full instead of eliding the middle with "...".
# Presumably this keeps embeddings intact if save_data stringifies them -- TODO
# confirm against save_data. The setting is global and loop-invariant, so it is
# applied once up front instead of on every dataset iteration.
np.set_printoptions(threshold=np.inf)

for dataset_name in tqdm(dataset_names, desc="Processing datasets"):
    dataset = load_data(dataset_path, dataset_name, true_false=true_false)
    if dataset is None:
        continue

    # Ceiling division: a trailing partial batch still counts as one batch.
    num_batches = -(-len(dataset) // BATCH_SIZE)

    # Start every layer from a fresh copy of the dataset with an empty
    # object-typed 'embeddings' column to be filled in batch by batch.
    for layer in layers_to_process:
        model_output_per_layer[layer] = dataset.copy()
        model_output_per_layer[layer]['embeddings'] = pd.Series(dtype='object')

    for batch_num in tqdm(range(num_batches), desc=f"Processing batches in {dataset_name}"):
        start_idx = batch_num * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, len(dataset))
        batch = dataset.iloc[start_idx:end_idx]
        batch_prompts = batch['statement'].tolist()
        batch_embeddings = process_batch(batch_prompts, model, tokenizer, layers_to_process, should_remove_period)

        for layer in layers_to_process:
            # `.at` addresses rows by index LABEL, whereas the batch was sliced
            # by POSITION with `.iloc`. Writing through the batch's own labels
            # keeps each embedding on the row it was computed from even when
            # the dataset's index is not the default 0..n-1 range (assumes the
            # index labels are unique).
            for i, row_label in enumerate(batch.index):
                model_output_per_layer[layer].at[row_label, 'embeddings'] = batch_embeddings[layer][i]

        if batch_num % 10 == 0:
            logging.info(f"Processing batch {batch_num}")

    for layer in layers_to_process:
        save_data(model_output_per_layer[layer], output_path, dataset_name, model_name, layer, should_remove_period)

Processing datasets:   0%|          | 0/2 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing batches in facts: 100%|██████████| 20/20 [05:29<00:00, 16.45s/it]
Processing batches in elements: 100%|██████████| 30/30 [08:00<00:00, 16.00s/it]
Processing datasets: 100%|██████████| 2/2 [13:47<00:00, 413.98s/it]
