In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost.core import XGBoostError
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


import csv

import cupy as cp

import gc

In [2]:
# Story metadata table: one row per generated story (prompt id, prompt text,
# story text, hidden-state file path and generated-story length).
story_csv_path = "../../story_dataset.csv"
df = pd.read_csv(story_csv_path)
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon. It was bi...,./hidden_states/prompt_1.npz,273
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon. He loved ...,./hidden_states/prompt_1.npz,246
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Sam....,./hidden_states/prompt_1.npz,397
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. He was a ...,./hidden_states/prompt_1.npz,294
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,296
...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,315
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,270
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,206
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,375


In [3]:
# Experiment configuration.
NUM_PROMPTS, layer = 10, 8  # number of prompt classes / index of the layer to probe

# Per-context-level containers; both start empty and are filled by a later cell.
hidden_states_by_cl, curr_labels = {}, {}

In [4]:
# Longest generated story among the first NUM_PROMPTS * 1000 rows of the table.
story_lengths_in_scope = df["len_generated_story"].iloc[:NUM_PROMPTS * 1000]
max_story_len = int(story_lengths_in_scope.max())
max_story_len

522

In [5]:
# Show the row(s) whose generated story reaches the maximum length.
is_longest_story = df["len_generated_story"] >= max_story_len
df[is_longest_story]

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story
9851,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522


In [6]:
# Shortest generated story over the whole table.
min_story_len = int(df["len_generated_story"].min())
min_story_len

37

In [7]:
# Positions at which hidden states are sampled.
# Index 0 is the hidden state used to generate the second token; after that one
# position is taken every 25 steps starting at 23. The reported context level is
# always context_level + 2 (the hidden state at index context_level is the one
# used to generate token number context_level + 2).
context_levels = [0, *range(23, max_story_len, 25)]

In [8]:
# Gather, for every sampled context level, one hidden-state vector per story
# together with its zero-based prompt label. Keys are "cl_<context_level + 2>".
hidden_states_by_cl = {}
curr_labels = {}
for cl in context_levels:
    hidden_states_by_cl[f"cl_{cl+2}"] = []
    curr_labels[f"cl_{cl+2}"] = []

for prompt_id in range(1, NUM_PROMPTS + 1):
    npz_path = f'../../../llamatales-xgboost-ii/hidden_states/prompt_{prompt_id}.npz'
    with np.load(npz_path) as loaded_data:

        for story_idx in tqdm(range(1000)):
            # Read this story's array from the archive once and reuse it for all context levels.
            hs = loaded_data[f"arr_{story_idx}"]
            n_positions = hs.shape[0]

            for context_level in context_levels:
                # Only stories that have a stored position at this index contribute a sample.
                if context_level < n_positions:
                    key = f"cl_{context_level + 2}"

                    # NOTE(review): indexing presumes axes are (position, layer, batch, ...) — confirm.
                    hidden_states_by_cl[key].append(
                        hs[context_level][layer][0].astype("float32")
                    )
                    curr_labels[key].append(prompt_id - 1)


100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:28<00:00, 34.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:27<00:00, 35.98it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:30<00:00, 32.99it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:27<00:00, 36.36it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:29<00:00, 33.98it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:27<00:00, 36.25it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:27<00:00, 35.95it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:28<00:00, 35.49it/s]
100%|███████████████████████████

In [9]:
def safe_split(total, train_ratio=0.8):
    """Return ``(n_train, n_test)`` sizes for a group of ``total`` samples.

    The training size is ``train_ratio * total`` truncated to an integer; the
    test size is whatever is left over, so the two always add up to ``total``.
    """
    n_train = int(train_ratio * total)
    return n_train, total - n_train

def build_dataset(curr_context_level_hs, curr_labels, train_ratio=0.8):
    """
    Split samples into train/test sets separately for every label value.

    Samples are grouped by label. Within each group the first
    ``int(train_ratio * n)`` samples (in their existing order, no shuffling)
    go to the training set and the remaining ones to the test set, so each
    label is split with the same ratio. The per-label blocks are concatenated
    in ascending label order.

    Parameters
    ----------
    curr_context_level_hs : array-like
        Feature rows, one per sample (first axis indexes samples).
    curr_labels : array-like
        Label of each sample, aligned with ``curr_context_level_hs``.
        Plain Python lists are accepted; they are converted to NumPy arrays
        so that the element-wise label comparison below works as intended.
    train_ratio : float, default 0.8
        Fraction of each label's samples assigned to the training set.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` when there are no samples at all.
    """
    # Coerce to arrays: with a plain list, ``curr_labels == pid`` would be a
    # single ``False`` instead of a boolean mask and silently corrupt the split.
    features = np.asarray(curr_context_level_hs)
    labels = np.asarray(curr_labels)

    X_train_list = []
    X_test_list = []
    y_train_list = []
    y_test_list = []

    # np.unique returns the distinct labels in ascending order.
    for pid in np.unique(labels):
        mask = labels == pid
        X_pid = features[mask]
        y_pid = labels[mask]

        # Only the train size is needed; the test part is "everything after it".
        train_n, _ = safe_split(len(X_pid), train_ratio)

        X_train_list.append(X_pid[:train_n])
        y_train_list.append(y_pid[:train_n])

        X_test_list.append(X_pid[train_n:])
        y_test_list.append(y_pid[train_n:])

    # Concatenate all per-label blocks.
    X_train = np.concatenate(X_train_list, axis=0)
    y_train = np.concatenate(y_train_list, axis=0)
    X_test = np.concatenate(X_test_list, axis=0)
    y_test = np.concatenate(y_test_list, axis=0)

    return X_train, y_train, X_test, y_test
    

In [10]:
# Sanity check: shape of the stacked hidden states at the first context level.
cl_2_hidden_states = hidden_states_by_cl["cl_2"]
np.array(cl_2_hidden_states).shape

(10000, 1, 512)

In [11]:
# Sanity check: there must be one label per hidden state at the first context level.
cl_2_labels = np.array(curr_labels["cl_2"])
print(cl_2_labels.shape)

(10000,)


In [12]:
# Train one XGBoost prompt classifier per context level (skipping the first
# entry of `context_levels`) on the hidden states of layer `layer`, and append
# overall and per-prompt train/test accuracies to a CSV file.

RESULTS_CSV = "results-test.csv"

# Column layout of the results file; written only when the file is still empty,
# so re-running this cell keeps appending rows without duplicating the header.
results_header = ['Layer', 'Context_Level', 'Train_Accuracy', 'Test_Accuracy']
for prompt_number in range(1, NUM_PROMPTS + 1):
    results_header.extend([f"Prompt_{prompt_number}_Train_Accuracy", f"Prompt_{prompt_number}_Test_Accuracy"])

for context_level in context_levels[1:]:
    print(f"Optimizing Layer {layer} at Context Level {context_level + 2}")
    cl_hs_array = hidden_states_by_cl[f"cl_{context_level + 2}"]

    stacked_hs = np.array(cl_hs_array)
    print(stacked_hs.shape)

    # One flat feature row per story. Reshaping the stacked copy (instead of
    # flattening each list element in place) leaves hidden_states_by_cl untouched.
    if len(cl_hs_array):
        curr_context_level_hs = stacked_hs.reshape(len(cl_hs_array), -1)
    else:
        curr_context_level_hs = stacked_hs
    print(curr_context_level_hs.shape)

    curr_label_set = np.array(curr_labels[f"cl_{context_level + 2}"])

    unique_ids = sorted(set(curr_label_set))

    # Stop as soon as some prompt has no story reaching this context level.
    if len(unique_ids) < NUM_PROMPTS:
        break

    X_train, y_train, X_test, y_test = build_dataset(curr_context_level_hs, curr_label_set)

    # These stay on the host: nothing in this loop fits or evaluates on them,
    # so copying them to the GPU would only hold device memory.
    X_train_opt, X_valid, y_train_opt, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state=42)

    # Class balance of every split (a class missing from the test set would
    # explain a 0.0 test accuracy for that prompt).
    print(pd.Series(y_train).value_counts())
    print(pd.Series(y_test).value_counts())
    print(pd.Series(y_train_opt).value_counts())
    print(pd.Series(y_valid).value_counts())

    print("Train Data Shape: ", X_train.shape)
    print("Test Data Shape: ", X_test.shape)
    print("Optimizer Train Data Shape: ", X_train_opt.shape)

    # Device copies used for fitting / predicting; metrics are computed on the host arrays.
    X_train_gpu = cp.array(X_train)
    y_train_gpu = cp.array(y_train)
    X_test_gpu = cp.array(X_test)

    classifier = XGBClassifier(max_depth = 3,
                               reg_alpha = 10,
                               reg_lambda = 10,
                               gamma = 10,
                               subsample = 0.75,
                               colsample_bytree = 0.75,
                               eta = 0.01,
                               n_estimators = 500,
                               seed = 42, objective = 'multi:softmax', eval_metric = "merror", num_class = len(unique_ids), tree_method='hist', device='cuda')
    classifier.fit(X_train_gpu, y_train_gpu)

    # cp.asnumpy yields a host array whether predict() returned NumPy or CuPy data.
    preds_train = cp.asnumpy(classifier.predict(X_train_gpu))
    preds = cp.asnumpy(classifier.predict(X_test_gpu))

    train_accuracy = float(np.mean(preds_train == y_train))
    accuracy = float(np.mean(preds == y_test))

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {accuracy}")

    # Per-prompt accuracies, stored as [train_1, test_1, train_2, test_2, ...].
    prompt_accs = []

    for prompt_idx in range(NUM_PROMPTS):
        test_mask = y_test == prompt_idx
        train_mask = y_train == prompt_idx

        # Within a prompt's samples the true label is prompt_idx, so accuracy is
        # the share of predictions equal to it; NaN when the prompt has no samples.
        prompt_test_acc = float(np.mean(preds[test_mask] == prompt_idx)) if test_mask.any() else float("nan")
        prompt_train_acc = float(np.mean(preds_train[train_mask] == prompt_idx)) if train_mask.any() else float("nan")

        print(f"Prompt {prompt_idx + 1} Train Accuracy: {prompt_train_acc}")
        print(f"Prompt {prompt_idx + 1} Test Accuracy: {prompt_test_acc}")

        prompt_accs.append(prompt_train_acc)
        prompt_accs.append(prompt_test_acc)

    with open(RESULTS_CSV, "a", newline='') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        # In append mode the stream starts at end-of-file, so 0 means new/empty file.
        if csvfile.tell() == 0:
            csv_writer.writerow(results_header)
        values = [layer, context_level + 2, train_accuracy, accuracy]
        values.extend(prompt_accs)
        csv_writer.writerow(values)

    # Drop every reference to the model and the per-iteration data first
    # (deleting the classifier also drops its booster), then collect garbage,
    # and only then hand the now-unused CuPy pool blocks back to the device.
    del classifier, X_train_gpu, y_train_gpu, X_test_gpu
    del X_train, y_train, X_test, y_test, preds, preds_train

    gc.collect()

    cp.get_default_memory_pool().free_all_blocks()
    cp.get_default_pinned_memory_pool().free_all_blocks()

Optimizing Layer 8 at Context Level 25
(10000, 1, 512)
(10000, 512)
0    800
1    800
2    800
3    800
4    800
5    800
6    800
7    800
8    800
9    800
Name: count, dtype: int64
0    200
1    200
2    200
3    200
4    200
5    200
6    200
7    200
8    200
9    200
Name: count, dtype: int64
6    654
4    653
2    642
9    642
3    639
5    639
8    635
1    634
7    634
0    628
Name: count, dtype: int64
0    172
1    166
7    166
8    165
3    161
5    161
2    158
9    158
4    147
6    146
Name: count, dtype: int64
Train Data Shape:  (8000, 512)
Test Data Shape:  (2000, 512)
Optimizer Train Data Shape:  (6400, 512)
Train Accuracy: 0.73425
Test Accuracy: 0.6405
Prompt 1 Train Accuracy: 0.79625
Prompt 1 Test Accuracy: 0.715
Prompt 2 Train Accuracy: 0.65375
Prompt 2 Test Accuracy: 0.515
Prompt 3 Train Accuracy: 0.8375
Prompt 3 Test Accuracy: 0.77
Prompt 4 Train Accuracy: 0.46125
Prompt 4 Test Accuracy: 0.325
Prompt 5 Train Accuracy: 0.85875
Prompt 5 Test Accuracy: 0.76
Prompt 6