# Grokking LLM Emotional Latent Space in Human Interpretable Dimensions
### Parsing LLM emotional latent space dataset.
### This is the notebook version of ../scripts/grok_intrinsic_geometry.py
### cayden, Aman

In [25]:
# --- Run configuration -------------------------------------------------------
USE_PCA = False        # reduce features with PCA before training the classifiers
PLOT_LR = True         # plot per-layer LR coefficients (only done when USE_PCA is False)
PLOT_ALL_DATA = True   # produce the 3D PCA plots of the whole dataset
PCA_COMPONENTS = 20    # n_components handed to PCA in preprocess_features
KNN_CLUSTERS = 5       # passed as n_neighbors to the KNN classifier
TOTAL_LAYERS = 12      # layers 0..TOTAL_LAYERS-1 are used as features
OUTPUT_DIR = "../cache/low_high_arousal_0330b2024"
DATASET_JSON = "../cache/gpt2_low_high_arousal_0330b2024.json"
MODEL_NAME = "gpt2"

# SETUP
if USE_PCA:
    print("Using PCA, thus PLOT_LR will not run.")

# Make the output directory if it doesn't exist. An existing directory is
# reused as-is (no overwrite confirmation); exist_ok avoids the race between
# checking for the directory and creating it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# open data file
latent_space_data = None
json_file_path = DATASET_JSON
print("Loading training data JSON...")

with open(json_file_path, 'r') as file:
    latent_space_data = json.load(file)

Loading training data JSON...


FileNotFoundError: [Errno 2] No such file or directory: '../cache/gpt2_low_high_arousal_0330b2024.json'

In [24]:
# Imports 

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import json
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import json
import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import argparse
import pdb
import re

In [None]:
# All helper functions

def load_and_split_data(data, train_ratio=0.6, e1_ratio=0.15, e2_ratio=0.15, e3_ratio=0.1):
    """
    Split the dataset into a training set and three held-out evaluation sets.

    Every entry is assigned to exactly one bucket, checked in this order:
    - E1: entries whose 'adjective' is one of the randomly held-out adjectives.
    - E2: entries whose 'prompt_template' is one of the randomly held-out prompts.
    - E3: entries that were picked by a uniform random sample over the dataset.
    - train: everything else.
    Because of that priority, E3 may end up smaller than len(data) * e3_ratio.
    Finally only a random train_ratio fraction of the leftover entries is kept
    as the training set; the rest of the leftover entries is discarded.

    Parameters:
    - data: List of dict entries with at least 'adjective' and 'prompt_template' keys.
    - train_ratio: Fraction of the leftover (non-eval) entries kept for training.
    - e1_ratio: Fraction of distinct adjectives held out for E1.
    - e2_ratio: Fraction of distinct prompt templates held out for E2.
    - e3_ratio: Fraction of entries sampled for E3.

    Returns:
    - A tuple (train_set, E1_set, E2_set, E3_set) of lists of entries.
    """
    # Sorted so that the sampling below is reproducible under random.seed();
    # the iteration order of a set of strings can differ between interpreter runs.
    all_adjectives = sorted({entry['adjective'] for entry in data})
    print("all_adjectives")
    print(all_adjectives)
    all_prompts = sorted({entry['prompt_template'] for entry in data})
    print("all_prompts")
    print(all_prompts)

    # get the size of each eval set
    e1_size_adj = int(len(all_adjectives) * e1_ratio)
    e2_size_prompts = int(len(all_prompts) * e2_ratio)
    e3_size = int(len(data) * e3_ratio)

    # Hold-outs are kept in sets so each membership test in the loop is O(1).
    # E3 is tracked by position instead of by comparing whole entries, which
    # would deep-compare the latent vectors for every candidate pair.
    e1_adjs = set(random.sample(all_adjectives, e1_size_adj))
    e2_prompts = set(random.sample(all_prompts, e2_size_prompts))
    e3_indices = set(random.sample(range(len(data)), e3_size))

    # make the eval sets and the training pool in a single pass
    E1_set = []
    E2_set = []
    E3_set = []
    train_set = []

    for index, entry in enumerate(data):
        if entry['adjective'] in e1_adjs:
            E1_set.append(entry)
        elif entry['prompt_template'] in e2_prompts:
            E2_set.append(entry)
        elif index in e3_indices:
            E3_set.append(entry)
        else:
            train_set.append(entry)

    # training data is a random subsample of what's left over
    train_size = int(len(train_set) * train_ratio)
    train_set = random.sample(train_set, train_size)

    print("Initial dataset size: ", len(data))
    print("Training set size: ", len(train_set))
    print("E1 set size: ", len(E1_set))
    print("E2 set size: ", len(E2_set))
    print("E3 set size: ", len(E3_set))

    print("--- Data loaded.")
    return train_set, E1_set, E2_set, E3_set

def extract_features_labels(data, layers_to_use=None):
    """
    Build the feature matrix and label vector for a list of dataset entries.

    Parameters:
    - data: Iterable of entries; each entry provides 'latent_space' (nested as
      layer -> head -> values) and a truthy/falsy 'class_0_true' flag.
    - layers_to_use: Optional list of layer indices to keep, in that order.
      When None, every layer of the entry is used.

    Returns:
    - (features, labels) as NumPy arrays: one flattened row per entry, and a
      1/0 label per entry (1 when 'class_0_true' is truthy).
    """
    feature_rows = []
    label_column = []

    for entry in data:
        layers = entry['latent_space']
        if layers_to_use is not None:
            # keep only the requested layers, in the requested order
            layers = [layers[layer_idx] for layer_idx in layers_to_use]

        # two levels of nesting (layer, then head) are flattened into one row
        row = []
        for layer in layers:
            for head in layer:
                row.extend(head)

        feature_rows.append(row)
        label_column.append(int(bool(entry['class_0_true'])))

    return np.array(feature_rows), np.array(label_column)

def preprocess_features(features, mean=None, scale=None, pca_model=None, use_pca=False):
    """
    Standardize features and optionally project them with PCA.

    Parameters:
    - features: Feature matrix to preprocess.
    - mean, scale: Previously computed StandardScaler statistics. When mean is
      None the scaler is fitted on `features`; otherwise the given statistics
      are applied as-is.
    - pca_model: Previously fitted PCA model to reuse. When None (and use_pca
      is set) a new PCA with PCA_COMPONENTS components is fitted on `features`.
    - use_pca: Whether to apply PCA after standardization.

    Returns:
    - (processed_features, mean, scale, pca_model); pca_model is None when
      use_pca is False.
    """
    scaler = StandardScaler()

    if mean is not None:
        # reuse the statistics computed on another (training) set
        scaler.mean_ = mean
        scaler.scale_ = scale
        normalized_features = scaler.transform(features)
    else:
        normalized_features = scaler.fit_transform(features)
        mean, scale = scaler.mean_, scaler.scale_

    if not use_pca:
        return normalized_features, mean, scale, None

    if pca_model is not None:
        reduced_features = pca_model.transform(normalized_features)
    else:
        pca_model = PCA(n_components=PCA_COMPONENTS)
        reduced_features = pca_model.fit_transform(normalized_features)
    return reduced_features, mean, scale, pca_model

def train_lr_classifier(features, labels):
    """Fit a LogisticRegression (max_iter=1000) on the given data and return it."""
    model = LogisticRegression(max_iter=1000)
    model.fit(features, labels)
    return model

def test_lr_classifier(classifier, features, labels, out_path = None):
    """
    Test (evaluate) the trained Logistic Regression classifier on a dataset.

    Parameters:
    - classifier: The trained classifier (anything exposing predict()).
    - features: The feature matrix for test data.
    - labels: The label vector for test data.
    - out_path: Optional file path; when given, the report text is written there.

    Prints:
    - Classification report including precision, recall, and F1-score.
    """
    y_pred = classifier.predict(features)
    classification_report_ = classification_report(labels, y_pred)
    print("Classification Report:\n", classification_report_)

    # out_path is used directly, the same way test_knn_classifier does
    if out_path is not None:
        with open(out_path, 'w') as f:
            f.write(classification_report_)

def train_knn_classifier(features, labels, n_neighbors=5):
    """
    Fit a K-Nearest Neighbors classifier and hand it back.

    Parameters:
    - features: Training feature matrix.
    - labels: Training label vector.
    - n_neighbors: Number of neighbors that vote on each prediction.

    Returns:
    - The fitted KNeighborsClassifier.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(features, labels)
    return knn

def sanitize_filename(filename):
    """
    Turn an arbitrary string into something usable as a filename.

    The rules are applied in order: drop characters that Windows or UNIX/Linux
    filesystems reject (including control characters), strip leading and then
    trailing periods/spaces, collapse runs of spaces to one space, and finally
    cut the result to at most 255 characters.

    Args:
    filename (str): The original filename string to sanitize.

    Returns:
    str: A sanitized version of the filename (may be empty).
    """
    rewrite_rules = (
        (r'[<>:"/\\|?*\x00-\x1F]', ''),  # characters invalid on Windows or UNIX/Linux
        (r'^[. ]+', ''),                  # leading periods and spaces (Windows)
        (r'[. ]+$', ''),                  # trailing periods and spaces (Windows)
        (r' +', ' '),                     # consecutive spaces -> single space
    )
    cleaned = filename
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    # keep the name within the common 255-character limit
    return cleaned[:255]

def test_knn_classifier(classifier, features, labels, out_path = None):
    """
    Evaluate a trained K-Nearest Neighbors classifier on a dataset.

    Parameters:
    - classifier: The trained KNN classifier.
    - features: The feature matrix to predict on.
    - labels: The true labels for those features.
    - out_path: Optional file path; when given, the report text is written there.

    Prints:
    - Classification report including precision, recall, and F1-score.
    """
    report_text = classification_report(labels, classifier.predict(features))
    print("Classification Report:\n", report_text)

    if out_path is None:
        return
    with open(out_path, 'w') as report_file:
        report_file.write(report_text)

def plot_3d_pca(data, labels, adjectives=None, output_dir = None, prompt=""):
    """
    Show a 3D scatter plot of the first three columns of `data`, colored by label.

    Parameters:
    - data: 2D array; columns 0, 1 and 2 are plotted as PC1, PC2 and PC3.
    - labels: Numeric label per row, rescaled to [0, 1] for marker colors.
    - adjectives: Optional text per row, drawn next to each marker.
    - output_dir: Optional directory; when set, the figure is also written there
      as "3d_pca_visualization<sanitized prompt>.html".
    - prompt: Prompt description used in the plot title and the file name.
    """
    # Normalize labels for color scaling. When every label is identical the
    # range is zero; the shifted values (all 0) are then used as-is instead of
    # dividing by zero and producing NaN colors.
    colors = np.asarray(labels, dtype=float)
    colors = colors - np.min(colors)
    color_range = np.max(colors)
    if color_range > 0:
        colors = colors / color_range

    # Create a 3D scatter plot
    fig = go.Figure(data=[go.Scatter3d(
        x=data[:, 0],
        y=data[:, 1],
        z=data[:, 2],
        text=adjectives,  # Use adjectives as markers' text
        mode='markers+text',  # Display both markers and text
        marker=dict(
            size=5,
            color=colors,  # Use normalized labels for color
            opacity=0.8
        )
    )])

    # Customize layout
    fig.update_layout(
        title=f'3D PCA Visualization, prompt={prompt}',
        scene=dict(
            xaxis_title='PC1',
            yaxis_title='PC2',
            zaxis_title='PC3'
        )
    )

    # Show plot in notebook
    fig.show()
    # Export to HTML when an output directory was given
    if output_dir:
        out_path = os.path.join(output_dir, f"3d_pca_visualization{sanitize_filename(prompt)}.html")
        print("OUTPUT PATH FOR PCA: ", out_path)
        fig.write_html(out_path)

def plot_mean_coefficients_per_layer_with_plotly(lr_classifier, layers_to_use, output_dir=None):
    """
    Plot the mean coefficient weights per layer for a trained Logistic Regression classifier using Plotly.

    The coefficient vector is cut into len(layers_to_use) equally sized
    consecutive chunks (any remainder at the end is ignored). Each bar shows the
    absolute value of a chunk's mean coefficient; the bar is blue when that mean
    is positive and red otherwise.

    Parameters:
    - lr_classifier: The trained Logistic Regression classifier.
    - layers_to_use: The layers we trained the classifier on
    - output_dir: Optional directory; when set, the figure is also saved there
      as "mean_coefficients_per_layer.html".
    """
    coefficients = lr_classifier.coef_.flatten()  # Extract model coefficients
    print("Number of coefficients: {}".format(len(coefficients)))
    total_layers = len(layers_to_use)

    # Assuming equal feature contribution from each layer if not specified
    features_per_layer = len(coefficients) // total_layers

    print("Features per layer: {}".format(features_per_layer))

    # Signed mean coefficient per layer. The sign is kept separately because
    # taking the color from the absolute value would make every bar "positive".
    signed_means = np.array([
        np.mean(coefficients[i * features_per_layer:(i + 1) * features_per_layer])
        for i in range(total_layers)
    ])
    mean_coefficients_per_layer = np.abs(signed_means)

    # Plotting with Plotly
    fig = go.Figure(data=[go.Bar(
        x=[f'Layer {i}' for i in layers_to_use],
        y=mean_coefficients_per_layer,
        marker_color=np.where(signed_means > 0, 'blue', 'red')  # Color code positive and negative
    )])

    fig.update_layout(
        title='Mean Coefficient Weights per Layer in Logistic Regression Classifier',
        xaxis_title='Layer',
        yaxis_title='Mean Coefficient Value',
        template='plotly_white'
    )

    fig.show()
    # output to disk if output_dir is specified
    if output_dir:
        print("Saving figure uwu")
        out_path = os.path.join(output_dir, "mean_coefficients_per_layer.html")
        fig.write_html(out_path)
        print("Done -- saved to ", out_path)

In [22]:
# Split the loaded dataset into the training set and the three eval sets
train_set, e1_set, e2_set, e3_set = load_and_split_data(latent_space_data)

# Report how many entries landed in each split
for size_prefix, split_entries in (
    ("Training set size: ", train_set),
    ("E1 set size: ", e1_set),
    ("E2 set size: ", e2_set),
    ("E3 set size: ", e3_set),
):
    print(f"{size_prefix}{len(split_entries)}")

NameError: name 'load_and_split_data' is not defined

In [None]:
# Preprocessing data, extract features, normalize, PCA, etc.

# Drop/only use certain layers
#layers_to_use = [0,4,7,9,11] 
layers_to_use = list(range(0,TOTAL_LAYERS))

# Training set: extract the selected layers, then fit the normalization (and
# optionally PCA) that every eval set below reuses.
train_features, train_labels = extract_features_labels(train_set, layers_to_use)
train_preprocessed_features, mean_norm, scale_norm, pca_model = preprocess_features(train_features, use_pca=USE_PCA)

# Every eval set must be extracted with the same layers_to_use as the training
# set; otherwise the feature width no longer matches the fitted normalization
# and classifiers as soon as a layer subset is selected above.

# Preprocess features for E1 set using the same normalization and PCA model
e1_features, e1_labels = extract_features_labels(e1_set, layers_to_use)
e1_preprocessed_features, _, _, _ = preprocess_features(e1_features, mean_norm, scale_norm, pca_model, use_pca=USE_PCA)

# Preprocess features for E2 set using the same normalization and PCA model
e2_features, e2_labels = extract_features_labels(e2_set, layers_to_use)
e2_preprocessed_features, _, _, _ = preprocess_features(e2_features, mean_norm, scale_norm, pca_model, use_pca=USE_PCA)

# Preprocess features for E3 set using the same normalization and PCA model
e3_features, e3_labels = extract_features_labels(e3_set, layers_to_use)
e3_preprocessed_features, _, _, _ = preprocess_features(e3_features, mean_norm, scale_norm, pca_model, use_pca=USE_PCA)

In [None]:
# Fit the logistic-regression classifier on the preprocessed training features
lr_classifier = train_lr_classifier(train_preprocessed_features, train_labels)

# Persist the LR weights to OUTPUT_DIR/weights.npz. The arrays are passed
# positionally: coefficients first, intercept second.
weights_path = os.path.join(OUTPUT_DIR, "weights.npz")
np.savez(weights_path, lr_classifier.coef_, lr_classifier.intercept_)

# Fit the KNN classifier on the same features; KNN_CLUSTERS is the neighbor count
knn_classifier = train_knn_classifier(train_preprocessed_features, train_labels, n_neighbors=KNN_CLUSTERS)

In [None]:
# Assess Classifier, Save Results

# Each row: banner to print, evaluation function, classifier, features, labels,
# and the report file (under OUTPUT_DIR) that the evaluation writes.
evaluation_runs = (
    ("LR Classifier test on E1:", test_lr_classifier, lr_classifier,
     e1_preprocessed_features, e1_labels, "lr_classifier_eval_e1.txt"),
    ("KNN Classifier test on E1:", test_knn_classifier, knn_classifier,
     e1_preprocessed_features, e1_labels, "knn_classifier_eval_e1.txt"),
    ("LR Classifier test on E2:", test_lr_classifier, lr_classifier,
     e2_preprocessed_features, e2_labels, "lr_classifier_eval_e2.txt"),
    ("KNN Classifier test on E2:", test_knn_classifier, knn_classifier,
     e2_preprocessed_features, e2_labels, "knn_classifier_eval_e2.txt"),
    ("LR Classifier test on E3:", test_lr_classifier, lr_classifier,
     e3_preprocessed_features, e3_labels, "lr_classifier_eval_e3.txt"),
    ("KNN Classifier test on E3:", test_knn_classifier, knn_classifier,
     e3_preprocessed_features, e3_labels, "knn_classifier_eval_e3.txt"),
)
for banner, evaluate, fitted_classifier, eval_features, eval_labels, report_name in evaluation_runs:
    print(banner)
    evaluate(fitted_classifier, eval_features, eval_labels,
             out_path=os.path.join(OUTPUT_DIR, report_name))


def _titled_report(title_line, report_name):
    # Read one saved report back from OUTPUT_DIR and put its title line in front.
    with open(os.path.join(OUTPUT_DIR, report_name), 'r') as report_file:
        return title_line + report_file.read()


print("Combining all output texts with titles into a main results.txt") 
e1_text = (_titled_report("\n=== E1 LINEAR CLASSIFIER EVAL ===\n", "lr_classifier_eval_e1.txt")
           + _titled_report("\n=== E1 KNN CLASSIFIER EVAL ===\n", "knn_classifier_eval_e1.txt"))
e2_text = (_titled_report("\n=== E2 LINEAR CLASSIFIER EVAL ===\n", "lr_classifier_eval_e2.txt")
           + _titled_report("\n=== E2 KNN CLASSIFIER EVAL ===\n", "knn_classifier_eval_e2.txt"))
e3_text = (_titled_report("\n=== E3 LINEAR CLASSIFIER EVAL ===\n", "lr_classifier_eval_e3.txt")
           + _titled_report("\n=== E3 KNN CLASSIFIER EVAL ===\n", "knn_classifier_eval_e3.txt"))

use_pca_text = f"=== USE_PCA = {USE_PCA} ===\n"

# results.txt = PCA flag header followed by the E1, E2 and E3 sections
with open(os.path.join(OUTPUT_DIR, "results.txt"), 'w') as f:
    f.write("".join((use_pca_text, e1_text, e2_text, e3_text)))

In [None]:
# Plot/Visualize - dataset, adjectives, latent space, classifier

# View the dataset PCA
all_features, all_labels = extract_features_labels(latent_space_data)
adjectives = [item['adjective'] for item in latent_space_data]  # one adjective per dataset entry
prompts = [item['prompt_template'] for item in latent_space_data]  # one prompt template per dataset entry

# Sorted so the per-prompt plots come out in the same order on every run
all_adjectives = sorted(set(adjectives))
all_prompts = sorted(set(prompts))

if PLOT_ALL_DATA:
    # Built once; indexed with a boolean mask for every prompt below
    adjectives_array = np.array(adjectives)

    # all data plot. The PCA fitted here is for visualization only, so it is
    # discarded instead of overwriting the `pca_model` fitted on the training set.
    all_preprocessed_features, _, _, _ = preprocess_features(all_features, use_pca = True)
    plot_3d_pca(all_preprocessed_features, all_labels, adjectives=adjectives, output_dir = OUTPUT_DIR, prompt="ALL PROMPTS")
    for prompt in all_prompts: 
        # Select the rows that belong to this prompt template
        prompt_mask = np.array([entry_prompt == prompt for entry_prompt in prompts], dtype=bool)

        all_features_filtered = all_features[prompt_mask, :]
        all_labels_filtered = all_labels[prompt_mask]
        # Normalization and PCA are refitted on this prompt's rows only
        prompt_preprocessed_features, _, _, _ = preprocess_features(all_features_filtered, use_pca = True)
        plot_3d_pca(prompt_preprocessed_features, all_labels_filtered, adjectives=adjectives_array[prompt_mask].tolist(), output_dir = OUTPUT_DIR, prompt=prompt)

# View the logistic regression classifier coefficients per layer (only makes sense to do if we haven't PCA'ed)
if not USE_PCA and PLOT_LR:
    plot_mean_coefficients_per_layer_with_plotly(lr_classifier, layers_to_use, output_dir = OUTPUT_DIR)


In [None]:
# Nudge the latent space

def adjust_feature_vector(feature_vector, lr_classifier, layers_to_use, step_size=0.01):
    """
    Adjusts the feature vector towards a specific class using the logistic regression classifier coefficients.

    Each feature covered by the per-layer blocks is moved by
    `coefficient * step_size`. The input is never modified; a new float array
    is returned, so lists and integer arrays are accepted as well.

    Parameters:
    - feature_vector: The original feature vector to adjust (array-like, 1D).
    - lr_classifier: The trained logistic regression classifier.
    - layers_to_use: List of integers specifying which transformer layers were used.
    - step_size: The magnitude of the adjustment step (default: 0.01).

    Returns:
    - adjusted_feature_vector: The adjusted feature vector as a NumPy array.
    """
    # Extract the logistic regression coefficients
    coefficients = np.asarray(lr_classifier.coef_[0])

    # Number of features per layer; only whole per-layer blocks are adjusted,
    # so a remainder at the end of the vector (if any) is left untouched.
    features_per_layer = len(coefficients) // len(layers_to_use)
    covered = features_per_layer * len(layers_to_use)

    # np.array copies, keeping the caller's vector intact. Non-float input is
    # promoted so the fractional in-place update below is valid.
    adjusted_feature_vector = np.array(feature_vector)
    if not np.issubdtype(adjusted_feature_vector.dtype, np.floating):
        adjusted_feature_vector = adjusted_feature_vector.astype(float)

    # One vectorized update over all per-layer blocks at once
    adjusted_feature_vector[:covered] += coefficients[:covered] * step_size

    return adjusted_feature_vector

def get_tokenizer_and_model(model_name): 
    """
    Load the Hugging Face tokenizer and causal LM for `model_name`.

    The tokenizer's pad token is set to its EOS token. The model is moved to
    the GPU when CUDA is available and stays on the CPU otherwise, so the
    notebook also runs on machines without a GPU.

    Returns:
        (tokenizer, model)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    return tokenizer, model

def get_bob_vals(past_kvs): 
    """
    Collect the last-position value vectors of every layer for each batch element.

    Args: 
        `past_kvs`: model output['past_key_values'] from running a batch of 
        left-padded sentences through the model. It is indexed as
        past_kvs[layer][1] to get the value tensor of a layer, which is
        expected to have shape [batch, num_heads, seq_len, head_dim].

    Returns: 
        `bob_kvs`: list of length BATCH_SIZE; each item is a numpy array of
        shape [num_layers, num_heads, head_dim] holding the values at the final
        sequence position.
    """
    # value tensors only (index 1 of each per-layer pair)
    layer_values = [past_kvs[layer_idx][1] for layer_idx in range(len(past_kvs))]
    n_batch = layer_values[0].shape[0]

    batch_bob_values = []
    for batch_idx in range(n_batch):
        # final-position values of every layer, each [num_heads, head_dim]
        per_layer = [
            values[batch_idx, :, -1, :].detach().cpu().numpy()
            for values in layer_values
        ]
        # stack the layers along a new leading axis
        batch_bob_values.append(np.stack(per_layer, axis=0))

    return batch_bob_values

def get_latent_space(full_prompt_list, model, tokenizer):
    """
    Generate value representations for prompts.

    Each prompt is run through the model on its own (batch of one) and the
    value vectors of its final token are stored under the 'latent_space' key.
    The entries of `full_prompt_list` are updated in place; the returned list
    contains those same entry objects.

    Args:
        full_prompt_list: Full list of prompts. Each entry provides
            'final_prompt_ids' (1-dim list of token ids) and
            'token_of_interest' (the decoded text expected for the last id).
        model: Hugging Face model.
        tokenizer: Hugging Face tokenizer.

    Returns:
        list: Full list of prompts with value representations. 
              This will ALWAYS be the representation of the final token. 
    """
    final_prompt_list = []
    for prompt_entry in tqdm(full_prompt_list):
        # 1-dim list of ids -> 2-dim tensor [batch=1, num_tokens] on the model's device
        prompt_ids = torch.tensor(prompt_entry['final_prompt_ids']).unsqueeze(0).to(model.device)
        # check that the final token is the token of interest
        assert tokenizer.decode(prompt_ids[0, -1]) == prompt_entry['token_of_interest']
        # Inference only: skip autograd bookkeeping, and call the module itself
        # (rather than .forward) so registered hooks still run.
        with torch.no_grad():
            outputs = model(prompt_ids, return_dict=True)
        past_kv = outputs['past_key_values']
        # past_kv is a tuple of length num_layers
        # past_kv[0] is a tuple of length 2 (keys, values)
        # past_kv[0][1] is a tensor of shape [batch=1, num_heads=12, num_tokens, dim_head=64]
        #  --> num_heads * dim_head = 12*64 = d_model = 768
        bob_reps = get_bob_vals(past_kv)
        assert len(bob_reps) == 1
        prompt_entry['latent_space'] = bob_reps[0]
        final_prompt_list.append(prompt_entry)
    return final_prompt_list

def np_to_lists(final_prompt_list): 
    """ Replace each entry's 'latent_space' with the result of its .tolist(),
    in place, and return the same list.
    """
    for prompt_entry in tqdm(final_prompt_list):
        prompt_entry['latent_space'] = prompt_entry['latent_space'].tolist()
    return final_prompt_list

def main(args):
    """
    Build the latent-space dataset and write it to `args.out_path` as JSON.

    Steps: load tokenizer and model, ask before overwriting an existing output
    file, build the prompt list, compute the final-token value representations,
    convert them to plain lists, and dump everything to disk.

    Args:
        args: Object with at least `model_name` and `out_path` attributes. It
            is also handed to get_full_prompt_list.
            NOTE(review): get_full_prompt_list is not defined in this
            notebook -- confirm it is available before calling main().
    """
    print(f"\nLoading tokenizer and model `{args.model_name}`...")
    # get_tokenizer_and_model expects the model name, not the whole args object
    tokenizer, model = get_tokenizer_and_model(args.model_name)
    print("Done!\n")

    # ensure output path doesn't exist. prompt the user if it does 
    if os.path.exists(args.out_path):
        print(f"\nOutput path {args.out_path} already exists. Overwrite? (y/n) ")
        if input().lower() != 'y':
            print("Exiting...")
            return

    print("\nGenerating combined prompt list from adjectives * prompt templates...")
    full_prompt_list = get_full_prompt_list(args, tokenizer)
    print("Done!")

    # get value representations
    print("\nGetting value representations of final tokens...")
    final_prompt_list = get_latent_space(full_prompt_list, model, tokenizer)
    print("Done!")

    # convert numpy arrays to lists so the result is JSON-serializable
    print("\nConverting numpy arrays to lists...")
    final_prompt_list = np_to_lists(final_prompt_list)
    print("Done!")

    # save to file
    print("\nSaving to file...")
    with open(args.out_path, 'w') as f:
        json.dump(final_prompt_list, f, indent=4)
    print("Done! Thank you for shopping at the Language Game!")


# spin up GPT2
tokenizer, model = get_tokenizer_and_model(MODEL_NAME)

# select a feature vector to mess with:
# pick a random row from latent_space_data and run that single row through
# extract_features_labels()
fv_row = random.choice(latent_space_data)
fv_features, _ = extract_features_labels([fv_row], layers_to_use)
fv_to_test = fv_features[0]
# NOTE(review): lr_classifier is trained on the preprocessed (normalized)
# features, while this vector is the raw extracted one -- confirm whether it
# should go through preprocess_features first.

# push feature vector continually more and more in the direction we choose
num_steps = 100
step_size = 0.01
for i in range(num_steps):
    # Adjust a feature vector towards the favored class
    fv_to_test = adjust_feature_vector(fv_to_test, lr_classifier, layers_to_use, step_size=step_size)

    # Inject the new adjusted feature vector into GPT2 here
    # use an inject version of get_value_reps.py

    # Generate the next 3 tokens to see if they went in the direction that we want OR just look at the probability distribution
    # generate with GPT2 here