In [1]:
import csv
import gc
import platform

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
import xgboost
from tqdm import tqdm
from xgboost import XGBClassifier

In [2]:
# Record the interpreter and library versions this notebook was run with.
# platform.python_version() reports the interpreter running this kernel; a shell
# `python3 --version` reports whichever python3 is first on PATH, which can differ.
print(f"Python {platform.python_version()}")
print(f"XGBoost Version: {xgboost.__version__}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Transformers Version: {transformers.__version__}")
print(f"NumPy Version: {np.__version__}")

Python 3.10.12
XGBoost Version: 3.1.2
PyTorch Version: 2.2.2+cu121
Transformers Version: 4.33.3
NumPy Version: 1.26.4


In [3]:
# Load the story metadata table and display it. Later cells read the columns
# prompt_id, len_generated_story and len_new_story from this frame.
df = pd.read_csv("../../llamatales/story_dataset.csv")
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Blaz...,./hidden_states/prompt_1.npz,270,271
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Spar...,./hidden_states/prompt_1.npz,349,350
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Scor...,./hidden_states/prompt_1.npz,278,278
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,117,118
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,129,130
...,...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,289,290
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,119,119
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,127,128
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,441,441


In [4]:
NUM_PROMPTS = 10
# NUM_PROMPTS = 2
hidden_states_by_layer = {}

# Build, per layer key ("layer_<n>"), a list holding one float32 array per story.
# Prompts are visited in order 1..NUM_PROMPTS and the 1000 arrays of each prompt's
# .npz archive in index order, so list positions follow that same ordering.
for prompt_id in range(1, NUM_PROMPTS + 1):
    archive_path = f'../../llamatales/hidden_states/prompt_{prompt_id}.npz'
    with np.load(archive_path) as loaded_data:
        for story_idx in tqdm(range(1000)):
            story_states = loaded_data[f"arr_{story_idx}"][0]

            #By layer (only layer 1 is collected)
            for layer in range(1, 2):
                # Cast to float32 (original note: FAISS expects float32 rather than float64; it also saves memory).
                layer_states = story_states[layer][0].astype('float32')
                hidden_states_by_layer.setdefault(f"layer_{layer}", []).append(layer_states)

100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:22<00:00, 43.55it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:37<00:00, 26.46it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:36<00:00,  6.37it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:51<00:00,  5.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:48<00:00,  5.94it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:31<00:00,  6.58it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:36<00:00,  6.41it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:28<00:00,  6.75it/s]
100%|███████████████████████████

In [5]:
# Layer-1 hidden states: a list with one 2-D array per story, in loading order.
# Print the shapes of the first two entries as a sanity check; in the recorded output
# the first axis equals len_generated_story of the first two rows of df (270, 349)
# and the second axis is 512.
layer_hs_array = hidden_states_by_layer["layer_1"]
print(layer_hs_array[0].shape)
print(layer_hs_array[1].shape)

(270, 512)
(349, 512)


In [6]:
# Smallest len_generated_story over the whole table (all rows of df).
min_story_len = min(df["len_generated_story"])
min_story_len

21

In [7]:
# Largest len_generated_story among the first NUM_PROMPTS*1000 rows of df
# (the loading cell reads 1000 arrays per prompt).
# NOTE: the context-level analysis cell reassigns max_story_len from len_new_story.
max_story_len = max(df[:NUM_PROMPTS*1000]["len_generated_story"])

In [8]:
# Largest len_new_story over every row of df. The following cell overwrites this
# name with the same statistic restricted to the first NUM_PROMPTS*1000 rows.
max_new_story_len = max(df["len_new_story"])

In [9]:
# Largest len_new_story among the first NUM_PROMPTS*1000 rows of df
# (the loading cell reads 1000 arrays per prompt).
max_new_story_len = max(df[:NUM_PROMPTS*1000]["len_new_story"])

In [10]:
# Display the maximum len_new_story computed above.
max_new_story_len

523

In [11]:
# Shape of the sub-frame whose len_generated_story reaches max_story_len.
# NOTE: max_story_len is reassigned from len_new_story in the context-level analysis
# cell, so this result depends on which of the two assignments ran last.
df[df["len_generated_story"] >= max_story_len].shape

(13, 6)

In [12]:
# Show the rows whose len_new_story reaches the maximum (max_new_story_len).
df[df["len_new_story"] >= max_new_story_len]

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
2360,3,Once upon a time there were two children,"Once upon a time there were two children, Emma...",./hidden_states/prompt_3.npz,522,523
2628,3,Once upon a time there were two children,Once upon a time there were two children named...,./hidden_states/prompt_3.npz,522,523
7795,8,Once upon a time there was a wolf,Once upon a time there was a wolf named Max. M...,./hidden_states/prompt_8.npz,522,523
9178,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9201,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9270,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9313,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9603,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9626,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9685,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523


In [13]:
# Same expression that was assigned to max_new_story_len earlier; re-evaluated for display only.
max(df[:NUM_PROMPTS*1000]["len_new_story"])

523

In [14]:
# First-axis length of every per-story hidden-state array, in list order.
hs_shapes = [story_states.shape[0] for story_states in layer_hs_array]

In [15]:
# Spot check: stored hidden-state length of entry 9685. Row 9685 is one of the rows
# displayed above with len_generated_story 522 and len_new_story 523.
np.array(hs_shapes)[9685]

522

In [16]:
import numpy as np

def safe_split(total, train_ratio=0.8):
    """
    Split a sample count into train and test counts.

    Parameters
    ----------
    total : int
        Number of samples to split.
    train_ratio : float
        Fraction assigned to the train part; must lie in [0, 1].

    Returns
    -------
    (train, test) : tuple of int
        train is int(train_ratio * total) (rounded towards zero) and test is the
        remainder, so train + test == total always holds.

    Raises
    ------
    ValueError
        If train_ratio is outside [0, 1] (it would give a negative test count).
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be in [0, 1], got {train_ratio}")
    train = int(train_ratio * total)
    return train, total - train


def build_dataset(curr_context_level_hs, curr_labels, train_ratio=0.8):
    """
    Groups samples by prompt_id, splits safely into train/test,
    and returns X_train, y_train, X_test, y_test.

    Parameters
    ----------
    curr_context_level_hs : array-like, shape (n_samples, ...)
        One feature row per sample.
    curr_labels : array-like, shape (n_samples,)
        Label (prompt id) of each sample. Lists are accepted as well as arrays.
    train_ratio : float
        Fraction of every label group that goes to the train split.

    Returns
    -------
    X_train, y_train, X_test, y_test : np.ndarray
        Label groups are concatenated in ascending label order. Within each group
        the first int(train_ratio * group_size) samples, in their existing order,
        form the train part and the rest the test part; nothing is shuffled.
        Empty input yields four empty arrays.
    """
    # Convert up front: with a plain list, `curr_labels == pid` would be a single
    # bool rather than an element-wise mask.
    features = np.asarray(curr_context_level_hs)
    labels = np.asarray(curr_labels)

    X_train_list, y_train_list = [], []
    X_test_list, y_test_list = [], []

    # np.unique returns the distinct labels already sorted.
    for pid in np.unique(labels):
        # get samples for this prompt id
        mask = labels == pid
        X_pid = features[mask]
        y_pid = labels[mask]

        train_n, _ = safe_split(len(X_pid), train_ratio)

        # split
        X_train_list.append(X_pid[:train_n])
        y_train_list.append(y_pid[:train_n])
        X_test_list.append(X_pid[train_n:])
        y_test_list.append(y_pid[train_n:])

    # np.concatenate rejects an empty list, so handle "no samples" explicitly.
    if not X_train_list:
        return features[:0], labels[:0], features[:0], labels[:0]

    # concatenate all prompt-id blocks
    X_train = np.concatenate(X_train_list, axis=0)
    y_train = np.concatenate(y_train_list, axis=0)
    X_test = np.concatenate(X_test_list, axis=0)
    y_test = np.concatenate(y_test_list, axis=0)

    return X_train, y_train, X_test, y_test

In [17]:
#Context-level analysis
# For every context level, train an XGBoost classifier that predicts the prompt id
# from the flattened layer-1 hidden states of token positions
# [SKIPPED_PREFIX_TOKENS, context_level), and append its test accuracy to RESULTS_PATH.

RESULTS_PATH = "results-5.csv"
# Leading token positions left out of every feature vector
# (presumably the prompt tokens shared by all stories of a prompt - TODO confirm).
SKIPPED_PREFIX_TOKENS = 10

# min_story_len = min(df["len_new_story"])
# For the longest stories len_new_story is 523 while the stored hidden states have
# 522 positions; range() below excludes its stop value, so the last level tried is 522.
max_story_len = max(df[:NUM_PROMPTS*1000]["len_new_story"])

# Hoisted out of the loops: per-story token counts and prompt ids by row position.
# (Looking up df.iloc[i].prompt_id per sample at every context level is very slow.)
sample_lengths = np.array([hs.shape[0] for hs in layer_hs_array], dtype=int)
prompt_id_by_row = df["prompt_id"].to_numpy()

# Start a fresh results file containing only the header row.
with open(RESULTS_PATH, "w", newline='') as csvfile:
    csv.writer(csvfile, delimiter=',').writerow(['Context Level', 'Accuracy'])

for context_level in range(SKIPPED_PREFIX_TOKENS + 1, max_story_len):
    # Keep only stories that have at least `context_level` token positions.
    kept_rows = np.flatnonzero(sample_lengths >= context_level)

    # Prompt ids start at 1; shift to 0-based class labels for the classifier.
    curr_labels = prompt_id_by_row[kept_rows] - 1
    unique_ids = np.unique(curr_labels)

    # Stop once some prompt has no story long enough. Stories only drop out as the
    # context level grows, so no later level can have all classes again.
    # Checked before the feature matrix is built because it is cheap.
    if len(unique_ids) < NUM_PROMPTS:
        break

    # One row per kept story: its hidden states for the selected positions, flattened.
    curr_context_level_hs = np.array(
        [layer_hs_array[i][SKIPPED_PREFIX_TOKENS:context_level].flatten() for i in kept_rows],
        dtype='float32',
    )

    # Skip levels whose features are (almost) constant.
    if np.var(curr_context_level_hs) < 1e-6:
        print("Skipped context level: ", context_level)
        continue

    X_train, y_train, X_test, y_test = build_dataset(curr_context_level_hs, curr_labels)

    # A prompt with a single remaining story gets 0 train rows from the 80/20 split;
    # fitting would then fail because a class is absent from y_train.
    classes_without_train = np.setdiff1d(unique_ids, y_train)
    if classes_without_train.size:
        print(f"Stopped at context level {context_level}: no training samples for classes {classes_without_train.tolist()}")
        break

    print("Train Data Shape: ", X_train.shape)
    print("Test Data Shape: ", X_test.shape)

    # NOTE(review): the recorded output shows an XGBoost device-mismatch warning here
    # (NumPy input with a CUDA booster); presumably it only affects speed - confirm.
    classifier = XGBClassifier(
        random_state=42,
        objective='multi:softmax',
        eval_metric="merror",
        num_class=len(unique_ids),
        tree_method='hist',
        device='cuda',
        max_bin=128,
    )
    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)

    accuracy = np.mean(preds == y_test)
    print(f"Accuracy: {accuracy}")

    # Append one row per level and close the file each time, so results gathered so
    # far survive an interrupted run. The recorded value is the number of token
    # positions used as features.
    with open(RESULTS_PATH, "a", newline='') as csvfile:
        csv.writer(csvfile, delimiter=',').writerow([context_level - SKIPPED_PREFIX_TOKENS, accuracy])

Train Data Shape:  (8000, 512)
Test Data Shape:  (2000, 512)


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Accuracy: 0.992
Train Data Shape:  (8000, 1024)
Test Data Shape:  (2000, 1024)
Accuracy: 0.992
Train Data Shape:  (8000, 1536)
Test Data Shape:  (2000, 1536)
Accuracy: 0.99
Train Data Shape:  (8000, 2048)
Test Data Shape:  (2000, 2048)
Accuracy: 0.9905
Train Data Shape:  (8000, 2560)
Test Data Shape:  (2000, 2560)
Accuracy: 0.9905
Train Data Shape:  (8000, 3072)
Test Data Shape:  (2000, 3072)
Accuracy: 0.9895
Train Data Shape:  (8000, 3584)
Test Data Shape:  (2000, 3584)
Accuracy: 0.99
Train Data Shape:  (8000, 4096)
Test Data Shape:  (2000, 4096)
Accuracy: 0.989
Train Data Shape:  (8000, 4608)
Test Data Shape:  (2000, 4608)
Accuracy: 0.9895
Train Data Shape:  (8000, 5120)
Test Data Shape:  (2000, 5120)
Accuracy: 0.989
Train Data Shape:  (8000, 5632)
Test Data Shape:  (2000, 5632)
Accuracy: 0.9905
Train Data Shape:  (7999, 6144)
Test Data Shape:  (2000, 6144)
Accuracy: 0.9905
Train Data Shape:  (7999, 6656)
Test Data Shape:  (2000, 6656)
Accuracy: 0.99
Train Data Shape:  (7999, 7168)
T

In [18]:
# 1. Run Python's garbage collector to drop unreachable objects
gc.collect()

# 2. Clear PyTorch CUDA cache (if using PyTorch)
# NOTE(review): this acts on PyTorch's allocator only; presumably GPU memory held by
# XGBoost is not released by these calls - confirm.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()