In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from xgboost import XGBClassifier
from xgboost.core import XGBoostError

import csv

import cupy as cp

import gc

In [2]:
# Load the story metadata table; the columns used later in this notebook are
# `prompt_id` and `len_generated_story`.
df = pd.read_csv("../../story_dataset.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon. It was bi...,./hidden_states/prompt_1.npz,273
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon. He loved ...,./hidden_states/prompt_1.npz,246
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Sam....,./hidden_states/prompt_1.npz,397
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. He was a ...,./hidden_states/prompt_1.npz,294
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,296
...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,315
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,270
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,206
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,375


In [3]:
# Accumulators are re-created on every run so that re-executing this cell
# does not append duplicates to a previous run's results.
hidden_states_by_layer = {}
curr_labels = []
NUM_PROMPTS = 10
# NUM_PROMPTS = 2
# Index along the first axis of each stored hidden-state array that is
# extracted for classification.
context_level = 23

# For every prompt file, collect the layer-0 hidden state found at index
# `context_level` of each stored array, together with a 0-based prompt label.
for prompt_id in range(1, NUM_PROMPTS + 1):
    with np.load(f'../../hidden_states/prompt_{prompt_id}.npz') as loaded_data:
        for i in tqdm(range(1000)):
            curr_hidden_states = loaded_data[f"arr_{i}"]

            # Valid indices are 0 .. shape[0] - 1, so an array whose length
            # equals `context_level` must be skipped as well; otherwise the
            # indexing below raises IndexError.
            if context_level >= curr_hidden_states.shape[0]:
                continue

            curr_hidden_states = curr_hidden_states[context_level]
            # Labels are 0-based (prompt_id - 1); one label per kept sample.
            curr_labels.append(prompt_id - 1)

            # By layer (only layer 0 is collected here)
            for layer in range(1):
                # Cast to float32 to halve the memory footprint relative to float64.
                curr_layer_hidden_states = curr_hidden_states[layer][0].astype('float32')
                hidden_states_by_layer.setdefault(f"layer_{layer}", []).append(curr_layer_hidden_states)

100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:22<00:00, 45.37it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 46.97it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:23<00:00, 43.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:20<00:00, 47.92it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:22<00:00, 44.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:20<00:00, 47.76it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 47.07it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 45.61it/s]
100%|███████████████████████████

In [4]:
# Work on the layer-0 list itself (same object as the dict entry), so the
# flattening below is visible through `hidden_states_by_layer` too.
layer_hs_array = hidden_states_by_layer["layer_0"]

# Shapes of the first and last collected entries, one per line (1x512 each).
print(layer_hs_array[0].shape, layer_hs_array[-1].shape, sep="\n")

# Replace every entry, in place, with its flattened 1-D copy.
for hs, vec in enumerate(layer_hs_array):
    layer_hs_array[hs] = vec.flatten()

(1, 512)
(1, 512)


In [5]:
# Longest generated story among the first NUM_PROMPTS * 1000 rows of the table.
max_story_len = max(df["len_generated_story"].iloc[: NUM_PROMPTS * 1000])
max_story_len

522

In [6]:
# Show the row(s) whose generated story reaches the maximum length found above.
df.loc[df["len_generated_story"] >= max_story_len]

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story
9851,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522


In [7]:
# Shortest generated story over the whole table.
min_story_len = min(df.loc[:, "len_generated_story"])
min_story_len

37

In [8]:
def safe_split(total, train_ratio=0.8):
    """Turn a sample count into ``(train, test)`` sizes that sum to ``total``.

    The train size is ``train_ratio * total`` truncated to an integer; the
    test size is whatever remains, so no sample is lost to rounding.
    """
    n_train = int(train_ratio * total)
    return n_train, total - n_train

def build_dataset(curr_context_level_hs, curr_labels, train_ratio=0.8):
    """
    Build a per-label train/test split of the hidden-state samples.

    Samples are grouped by label; within each group the first
    ``train_ratio`` fraction (in their existing order, no shuffling) goes to
    the training set and the remainder to the test set. The per-label pieces
    are then concatenated in ascending label order.

    Parameters
    ----------
    curr_context_level_hs : array-like
        One row of features per sample. Lists are accepted and converted to
        an ndarray.
    curr_labels : array-like
        One label per sample, aligned with ``curr_context_level_hs``. Lists
        are accepted and converted to an ndarray.
    train_ratio : float, default 0.8
        Fraction of each label's samples assigned to the training set.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
    """
    # Coerce up front: the boolean-mask indexing below only works on ndarrays
    # (with plain lists `curr_labels == pid` is a single bool, not a mask).
    features = np.asarray(curr_context_level_hs)
    labels = np.asarray(curr_labels)

    X_train_list = []
    X_test_list = []
    y_train_list = []
    y_test_list = []

    # np.unique returns the distinct labels in sorted order.
    for pid in np.unique(labels):
        # get samples for this label
        mask = labels == pid
        X_pid = features[mask]
        y_pid = labels[mask]

        train_n, _ = safe_split(len(X_pid), train_ratio)

        # order-based split: head goes to train, tail goes to test
        X_train_list.append(X_pid[:train_n])
        y_train_list.append(y_pid[:train_n])

        X_test_list.append(X_pid[train_n:])
        y_test_list.append(y_pid[train_n:])

    # concatenate all per-label blocks
    X_train = np.concatenate(X_train_list, axis=0)
    y_train = np.concatenate(y_train_list, axis=0)
    X_test = np.concatenate(X_test_list, axis=0)
    y_test = np.concatenate(y_test_list, axis=0)

    return X_train, y_train, X_test, y_test
    

In [11]:
# Stack the list of flattened hidden-state vectors into one array
# (one row per sample).
curr_context_level_hs = np.array(layer_hs_array)
# Rebind the label list as an ndarray; build_dataset compares it element-wise
# against each label to build boolean masks.
curr_labels = np.array(curr_labels)

In [12]:
# Context-level analysis: train an XGBoost classifier to predict the prompt
# label from the hidden state at `context_level`, then append the accuracy to
# a CSV file.
unique_ids = sorted(set(curr_labels))

X_train, y_train, X_test, y_test = build_dataset(curr_context_level_hs, curr_labels)

# No scaling/PCA is applied: the classifier is trained on the raw hidden states.
print("Train Data Shape: ", X_train.shape)
print("Test Data Shape: ", X_test.shape)

# Move the data to the GPU so XGBoost (device='cuda') can consume it directly.
X_train = cp.array(X_train)
y_train = cp.array(y_train)
X_test = cp.array(X_test)
y_test = cp.array(y_test)

# Shared classifier settings; the fallback below only adds `max_bin`.
xgb_params = dict(
    seed=42,
    objective='multi:softmax',
    eval_metric="merror",
    num_class=len(unique_ids),
    tree_method='hist',
    device='cuda',
)

try:
    classifier = XGBClassifier(**xgb_params)
    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)
except XGBoostError as err:
    # Any XGBoostError lands here (not only out-of-memory), so report the
    # actual error before retrying with fewer histogram bins.
    print(f"XGBoost GPU training failed ({err}); retrying with max_bin=128")
    classifier = XGBClassifier(**xgb_params, max_bin=128)
    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)

# Compare on the GPU, then convert to a plain Python float so the printed
# value and the CSV cell do not depend on a 0-d CuPy array's string form.
accuracy = float(cp.mean(cp.asarray(preds) == y_test))
print(f"Accuracy: {accuracy}")
with open("results-50.csv", "a+", newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow([context_level + 1, accuracy])

# Drop every reference to the model and the GPU arrays. The previous
# `classifier.get_booster()` cleanup ran after this `del` and only ever hit a
# silently swallowed NameError, so it has been removed.
del classifier, X_train, y_train, X_test, y_test, preds

# CuPy cleanup: hand the now-unused pooled blocks back to the device/host.
cp.get_default_memory_pool().free_all_blocks()
cp.get_default_pinned_memory_pool().free_all_blocks()

gc.collect()

Train Data Shape:  (8000, 512)
Test Data Shape:  (2000, 512)
Accuracy: 0.182


837

In [13]:
# Sanity check: number of collected samples per prompt label.
pd.Series(curr_labels).value_counts()

0    1000
1    1000
2    1000
3    1000
4    1000
5    1000
6    1000
7    1000
8    1000
9    1000
Name: count, dtype: int64

In [None]:
# print(type(preds))

In [None]:
# print(type(y_test))