In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from xgboost import XGBClassifier
from xgboost.core import XGBoostError

import csv

import cupy as cp

import gc

In [2]:
# Load the story dataset from CSV into a DataFrame.
# The bare `df` on the last line renders the frame as this cell's output.
df = pd.read_csv("./story_dataset_2.csv")
df

Unnamed: 0,prompt_id,prompt,story,len_generated_story
0,1,Once upon a time there was a boy,Once upon a time there was a boy named Timmy. ...,320
1,1,Once upon a time there was a boy,Once upon a time there was a boy named Timmy. ...,363
2,1,Once upon a time there was a boy,Once upon a time there was a boy named Timmy. ...,244
3,1,Once upon a time there was a boy,Once upon a time there was a boy who loved to ...,304
4,1,Once upon a time there was a boy,Once upon a time there was a boy named Timmy. ...,294
...,...,...,...,...
1995,2,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,287
1996,2,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,306
1997,2,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,318
1998,2,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,297


In [13]:
# Gather, for a single layer, one hidden-state vector per saved array from each
# prompt's .npz archive. Results accumulate in hidden_states_by_layer under the
# key "layer_<layer>", in prompt order and then array order.
hidden_states_by_layer = {}
NUM_PROMPTS = 2
SAMPLES_PER_PROMPT = 1000  # arrays arr_0 .. arr_{N-1} are read from every archive
layer = 8
layer_key = f"layer_{layer}"

#Load hidden states to generate first token
for prompt_id in range(1, NUM_PROMPTS + 1):
    with np.load(f'./hidden_states_2/first_prompt_{prompt_id}.npz') as loaded_data:
        for i in tqdm(range(SAMPLES_PER_PROMPT)):
            curr_hidden_states = loaded_data[f"arr_{i}"]

            # Select the configured layer, then element 0 of the next axis and
            # the last element of the axis after that.
            # NOTE(review): presumably (layer, batch, token, hidden) - confirm.
            # Cast to float32 so downstream arrays use single precision.
            curr_layer_hidden_states = curr_hidden_states[layer][0][-1].astype('float32')

            # setdefault creates the list on first use, then appends in place.
            hidden_states_by_layer.setdefault(layer_key, []).append(curr_layer_hidden_states)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1229.20it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1117.01it/s]


In [4]:
# Pull out the list of vectors collected for the selected layer.
layer_hs_array = hidden_states_by_layer[f"layer_{layer}"]

# Sanity check: show the shape of the first and the last entry.
print(layer_hs_array[0].shape)
print(layer_hs_array[-1].shape)

# Flatten every entry to 1-D. Slice assignment rewrites the existing list
# object, so the dictionary entry it came from sees the flattened vectors too.
layer_hs_array[:] = [vec.flatten() for vec in layer_hs_array]

(512,)
(512,)


In [5]:
# Smallest value in the len_generated_story column, taken over every row of df.
# The bare name on the last line displays it as the cell output.
min_story_len = min(df["len_generated_story"])
min_story_len

157

In [6]:
# Largest len_generated_story among the first NUM_PROMPTS*1000 rows of df only.
# NOTE(review): min_story_len is computed over the whole frame while this scans
# a prefix - confirm the asymmetry is intended.
max_story_len = max(df[:NUM_PROMPTS*1000]["len_generated_story"])

In [7]:
# Display the row(s) of df (all rows are searched) whose len_generated_story
# is at least max_story_len.
df[df["len_generated_story"] >= max_story_len]

Unnamed: 0,prompt_id,prompt,story,len_generated_story
1622,2,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,522


In [8]:
def safe_split(total, train_ratio=0.8):
    """
    Split a sample count into train and test sizes that always sum to `total`.

    The train size is `train_ratio * total` truncated to an integer; the test
    size is whatever remains, so no sample is dropped or counted twice.

    Returns:
        (train, test) tuple of sizes.
    """
    n_train = int(total * train_ratio)
    return n_train, total - n_train

def build_dataset(curr_context_level_hs, curr_labels, train_ratio=0.8):
    """
    Groups samples by prompt_id, splits safely into train/test,
    and returns X_train, y_train, X_test, y_test.

    The split is positional, not shuffled: within each label group the first
    `train` rows (as sized by safe_split) go to the training set and the
    remaining rows go to the test set. Groups are processed in ascending
    label order and concatenated in that order.

    Args:
        curr_context_level_hs: array-like of samples, one row per sample.
        curr_labels: array-like of labels, one per sample (same length).
        train_ratio: fraction of each label group assigned to training.

    Returns:
        Tuple (X_train, y_train, X_test, y_test) of NumPy arrays.

    Raises:
        ValueError: if the number of samples and labels differ.
    """
    # Coerce to ndarrays so that `labels == pid` is always an element-wise mask;
    # with a plain Python list the comparison would collapse to a single bool.
    features = np.asarray(curr_context_level_hs)
    labels = np.asarray(curr_labels)

    if len(features) != len(labels):
        raise ValueError(
            f"Got {len(features)} samples but {len(labels)} labels; "
            "they must have the same length."
        )

    X_train_list = []
    X_test_list = []
    y_train_list = []
    y_test_list = []

    # np.unique returns the distinct labels in sorted order.
    for pid in np.unique(labels):

        # get samples for this prompt id
        mask = (labels == pid)
        X_pid = features[mask]
        y_pid = labels[mask]

        # Only the train size is needed; the test part is the leftover slice.
        train_n, _ = safe_split(len(X_pid), train_ratio)

        # split
        X_train_list.append(X_pid[:train_n])
        y_train_list.append(y_pid[:train_n])

        X_test_list.append(X_pid[train_n:])
        y_test_list.append(y_pid[train_n:])

    # concatenate all prompt-id blocks
    X_train = np.concatenate(X_train_list, axis=0)
    y_train = np.concatenate(y_train_list, axis=0)
    X_test  = np.concatenate(X_test_list, axis=0)
    y_test  = np.concatenate(y_test_list, axis=0)

    return X_train, y_train, X_test, y_test
    

In [9]:
#Context-level analysis
# Train an XGBoost classifier to predict the prompt id from the per-sample
# hidden-state vectors, report overall and per-prompt accuracy, append one
# result row to the CSV log, then release the GPU-backed objects.

# Zero-based class labels derived from the prompt ids in df.
curr_labels = df.prompt_id.to_numpy() - 1
print(curr_labels)
curr_context_level_hs = np.array(layer_hs_array)

# Distinct class ids drive both num_class and the per-prompt report below,
# so nothing is hard-coded to a particular number of prompts.
unique_ids = sorted(set(curr_labels))
num_classes = len(unique_ids)

X_train, y_train, X_test, y_test = build_dataset(curr_context_level_hs, curr_labels)

print("Train Data Shape: ", X_train.shape)
print("Test Data Shape: ", X_test.shape)

# Copy the splits into CuPy arrays to match the classifier's device='cuda'.
X_train = cp.array(X_train)
y_train = cp.array(y_train)
X_test = cp.array(X_test)
y_test = cp.array(y_test)

classifier = XGBClassifier(
    seed=42,
    objective='multi:softmax',
    eval_metric="merror",
    num_class=num_classes,
    tree_method='hist',
    device='cuda',
)
classifier.fit(X_train, y_train)

# Convert predictions to CuPy once so every comparison below is CuPy-vs-CuPy.
preds_train = cp.asarray(classifier.predict(X_train))
preds = cp.asarray(classifier.predict(X_test))

# float(...) turns the 0-d CuPy results into plain Python floats for
# printing and for the CSV row.
train_accuracy = float(cp.mean(preds_train == y_train))
accuracy = float(cp.mean(preds == y_test))

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {accuracy}")

# Per-prompt accuracies, stored as [train, test] pairs in class order.
prompt_accs = []

for class_id in unique_ids:
    mask_test = y_test == class_id
    prompt_test_acc = float(cp.mean(preds[mask_test] == y_test[mask_test]))

    mask_train = y_train == class_id
    prompt_train_acc = float(cp.mean(preds_train[mask_train] == y_train[mask_train]))

    # Labels are prompt_id - 1, so add 1 back for display.
    print(f"Prompt {int(class_id) + 1} Train Accuracy: {prompt_train_acc}")
    print(f"Prompt {int(class_id) + 1} Test Accuracy: {prompt_test_acc}")

    prompt_accs.append(prompt_train_acc)
    prompt_accs.append(prompt_test_acc)


# Append one row per run. Column order:
#   Layer, Context_Level, Train_Accuracy, Test_Accuracy,
#   then Prompt_<k>_Train_Accuracy, Prompt_<k>_Test_Accuracy for each prompt.
# Context_Level is fixed at 1 here. No header row is written by this cell.
with open("./results_same_one_token_context.csv", "a+", newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    values = [layer, 1, train_accuracy, accuracy]
    values.extend(prompt_accs)
    csv_writer.writerow(values)


# Drop every reference to the model and the CuPy arrays (including both
# prediction arrays) before asking the pools to release their cached blocks.
del classifier, X_train, y_train, X_test, y_test, preds, preds_train

# CuPy cleanup (CRITICAL)
cp.get_default_memory_pool().free_all_blocks()
cp.get_default_pinned_memory_pool().free_all_blocks()

gc.collect()

[0 0 0 ... 1 1 1]
Train Data Shape:  (1600, 512)
Test Data Shape:  (400, 512)
Train Accuracy: 1.0
Test Accuracy: 1.0
Prompt 1 Train Accuracy: 1.0
Prompt 1 Test Accuracy: 1.0
Prompt 2 Train Accuracy: 1.0
Prompt 2 Test Accuracy: 1.0


196

In [10]:
# print(type(preds))

In [11]:
# print(type(y_test))