In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from xgboost import XGBClassifier

import csv

import cupy as cp

In [2]:
# Load the story metadata table from CSV; the bare `df` on the last line makes
# the notebook render the frame as this cell's output.
df = pd.read_csv("../../llamatales/story_dataset.csv")
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Blaz...,./hidden_states/prompt_1.npz,270,271
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Spar...,./hidden_states/prompt_1.npz,349,350
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Scor...,./hidden_states/prompt_1.npz,278,278
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,117,118
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,129,130
...,...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,289,290
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,119,119
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,127,128
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,441,441


In [3]:
# Gather every story's hidden states, grouped under "layer_<n>" keys.
NUM_PROMPTS = 10  # use 2 instead for a quick partial run
hidden_states_by_layer = {}

for prompt_id in range(1, NUM_PROMPTS + 1):
    # One .npz archive per prompt; entries arr_0 .. arr_999 are read from it.
    with np.load(f'../../llamatales/hidden_states/prompt_{prompt_id}.npz') as loaded_data:
        for i in tqdm(range(1000)):
            story_states = loaded_data[f"arr_{i}"][0]

            # Only layer index 0 is collected.
            for layer in range(1):
                # Cast to float32 (original rationale: FAISS expects float32
                # rather than float64, and it saves memory).
                layer_states = story_states[layer][0].astype('float32')
                hidden_states_by_layer.setdefault(f"layer_{layer}", []).append(layer_states)

100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:05<00:00,  3.27it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:57<00:00,  4.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:25<00:00,  4.87it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:30<00:00,  3.70it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:04<00:00,  3.29it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:59<00:00,  4.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:08<00:00,  4.02it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:53<00:00,  4.28it/s]
100%|███████████████████████████

In [4]:
# Work with the layer-0 list from here on and peek at the first two entries.
# Each entry is 2-D; the first dimension differs per story (appears to be the
# token count), the second is the same for both.
layer_hs_array = hidden_states_by_layer["layer_0"]
for story_idx in (0, 1):
    print(layer_hs_array[story_idx].shape)

(270, 512)
(349, 512)


In [5]:
# Smallest len_generated_story over every row of df (no row slice is applied here).
min_story_len = min(df["len_generated_story"])
min_story_len

21

In [6]:
# Largest len_generated_story among the first NUM_PROMPTS*1000 rows of df
# (1000 matches the number of arrays read per prompt when the hidden states were loaded).
# NOTE(review): the context-level analysis cell later rebinds max_story_len from
# len_new_story, so the value held by this name depends on which cell ran last.
max_story_len = max(df[:NUM_PROMPTS*1000]["len_generated_story"])

In [7]:
# Largest len_new_story over all rows of df.
# NOTE(review): the next cell reassigns this same name using only the first
# NUM_PROMPTS*1000 rows, so this value is overwritten straight away.
max_new_story_len = max(df["len_new_story"])

In [8]:
# Largest len_new_story among the first NUM_PROMPTS*1000 rows of df; this
# replaces the all-rows value assigned to the same name in the previous cell.
max_new_story_len = max(df[:NUM_PROMPTS*1000]["len_new_story"])

In [9]:
# Display the current value of max_new_story_len.
max_new_story_len

523

In [10]:
# Shape of the subset of rows whose len_generated_story is at least max_story_len.
# The filter runs over the whole df, not only the first NUM_PROMPTS*1000 rows.
df[df["len_generated_story"] >= max_story_len].shape

(13, 6)

In [11]:
# Show the rows whose len_new_story is at least max_new_story_len
# (the filter runs over the whole df).
df[df["len_new_story"] >= max_new_story_len]

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
2360,3,Once upon a time there were two children,"Once upon a time there were two children, Emma...",./hidden_states/prompt_3.npz,522,523
2628,3,Once upon a time there were two children,Once upon a time there were two children named...,./hidden_states/prompt_3.npz,522,523
7795,8,Once upon a time there was a wolf,Once upon a time there was a wolf named Max. M...,./hidden_states/prompt_8.npz,522,523
9178,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9201,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9270,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9313,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9603,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9626,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9685,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523


In [12]:
# Same expression that was assigned to max_new_story_len earlier; evaluated
# again here only to display the result.
max(df[:NUM_PROMPTS*1000]["len_new_story"])

523

In [13]:
# First-dimension size of every story's layer-0 hidden-state array, in list order.
hs_shapes = [story_hs.shape[0] for story_hs in layer_hs_array]

In [14]:
# Spot check of the hidden-state length for story index 9685.
# NOTE(review): 9685 is a magic index -- it is one of the rows listed earlier
# with len_generated_story 522 / len_new_story 523.
np.array(hs_shapes)[9685]

522

In [15]:
def safe_split(total, train_ratio=0.8):
    """Return (train, test) counts for `total` items; the two always sum to `total`.

    The train count is `train_ratio * total` truncated by int(); every
    remaining item is assigned to the test count.
    """
    n_train = int(train_ratio * total)
    return n_train, total - n_train

def build_dataset(curr_context_level_hs, curr_labels, train_ratio=0.8):
    """
    Split samples into train/test sets separately within each label group.

    For every distinct value in `curr_labels` (visited in ascending order), the
    leading `train_ratio` share of that label's samples -- in their existing
    order, without shuffling -- goes to the training set and the rest to the
    test set. The per-label pieces are concatenated in label order.

    Parameters
    ----------
    curr_context_level_hs : array-like
        Feature rows; the first axis indexes samples.
    curr_labels : array-like
        One label per sample. Plain lists are accepted and converted to arrays
        so that the element-wise comparison used for masking works.
    train_ratio : float, default 0.8
        Share of each label group assigned to the training set; the count is
        computed by `safe_split`.

    Returns
    -------
    X_train, y_train, X_test, y_test : np.ndarray
        For empty input, four zero-length arrays are returned.

    Raises
    ------
    ValueError
        If the number of feature rows differs from the number of labels.
    """
    X = np.asarray(curr_context_level_hs)
    y = np.asarray(curr_labels)

    if len(X) != len(y):
        raise ValueError(
            f"got {len(X)} feature rows but {len(y)} labels; they must match"
        )

    # np.concatenate rejects an empty list, so handle "no samples" explicitly.
    if len(y) == 0:
        return X[:0], y[:0], X[:0], y[:0]

    X_train_list = []
    X_test_list = []
    y_train_list = []
    y_test_list = []

    # np.unique yields the distinct labels already sorted.
    for pid in np.unique(y):
        # Select this label's samples, keeping their original relative order.
        mask = (y == pid)
        X_pid = X[mask]
        y_pid = y[mask]

        # Only the train count is needed; the test part is everything after it.
        train_n, _ = safe_split(len(X_pid), train_ratio)

        X_train_list.append(X_pid[:train_n])
        y_train_list.append(y_pid[:train_n])

        X_test_list.append(X_pid[train_n:])
        y_test_list.append(y_pid[train_n:])

    # Concatenate all per-label blocks.
    X_train = np.concatenate(X_train_list, axis=0)
    y_train = np.concatenate(y_train_list, axis=0)
    X_test = np.concatenate(X_test_list, axis=0)
    y_test = np.concatenate(y_test_list, axis=0)

    return X_train, y_train, X_test, y_test
    

In [16]:
# Context-level analysis
#
# For every context length, train an XGBoost classifier that predicts which
# prompt a story came from, using the story's layer-0 hidden states for tokens
# [FIRST_TOKEN, context_level) flattened into one feature vector. The test
# accuracy of each context length is appended to RESULTS_PATH.
#
# NOTE: an earlier variant standardised the features and reduced them with
# PCA(n_components=0.95) before fitting; that step is not applied here.

# Hidden states were saved while generating the new story, so their length is
# that of the old story (522), one less than the largest len_new_story.
max_story_len = max(df[:NUM_PROMPTS*1000]["len_new_story"])

RESULTS_PATH = "results-3.csv"
# Leading tokens left out of the features -- presumably the shared prompt
# tokens; TODO confirm.
FIRST_TOKEN = 10

# Start a fresh results file that contains only the header row.
with open(RESULTS_PATH, "w", newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(['Context Level', 'Accuracy'])

# Look the labels up once; calling df.iloc[i] for every story at every context
# level repeats the same row lookup needlessly.
prompt_ids = df["prompt_id"].to_numpy()

for context_level in range(FIRST_TOKEN + 1, max_story_len):
    curr_context_level_hs = []
    curr_labels = []

    for i, story_hs in enumerate(layer_hs_array):
        # A story with fewer tokens than the current context length cannot
        # supply a full-width feature vector, so it is left out.
        if story_hs.shape[0] < context_level:
            continue
        curr_context_level_hs.append(story_hs[FIRST_TOKEN:context_level].ravel())
        curr_labels.append(prompt_ids[i])

    curr_context_level_hs = np.array(curr_context_level_hs).astype('float32')
    # prompt_id starts at 1; shift so class labels start at 0.
    curr_labels = np.array(curr_labels) - 1
    unique_ids = sorted(set(curr_labels))

    # Stop as soon as some prompt has no story long enough. Stories only drop
    # out as context_level grows, so no later level can recover the class.
    # Compare against NUM_PROMPTS rather than a literal 10 so that reduced runs
    # (e.g. NUM_PROMPTS = 2) do not stop on the first iteration.
    if len(unique_ids) < NUM_PROMPTS:
        break

    # Nothing to learn from (near-)constant features.
    if np.var(curr_context_level_hs) < 1e-6:
        print("Skipped context level: ", context_level)
        continue

    X_train, y_train, X_test, y_test = build_dataset(curr_context_level_hs, curr_labels)

    print("Train Data Shape: ", X_train.shape)
    print("Test Data Shape: ", X_test.shape)

    # Move the data to the GPU so XGBoost (device='cuda') trains without an
    # extra host-to-device copy.
    X_train = cp.array(X_train)
    y_train = cp.array(y_train)
    X_test = cp.array(X_test)
    y_test = cp.array(y_test)

    classifier = XGBClassifier(
        seed=42,
        objective='multi:softmax',
        eval_metric="merror",
        num_class=len(unique_ids),
        tree_method='hist',
        device='cuda',
    )

    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)

    # Compare on the GPU, then convert to a plain Python float so that the CSV
    # and the printout receive an ordinary number rather than a 0-d device array.
    accuracy = float(cp.mean(cp.asarray(preds) == y_test))
    print(f"Accuracy: {accuracy}")

    # Append after every level so partial results survive an interrupted run.
    with open(RESULTS_PATH, "a+", newline='') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',')
        # Number of tokens that contributed features at this level; for the raw
        # (un-reduced) features this equals X_train.shape[1] // hidden size.
        csv_writer.writerow([context_level - FIRST_TOKEN, accuracy])

Train Data Shape:  (8000, 38)
Test Data Shape:  (2000, 38)
Accuracy: 0.681
Train Data Shape:  (8000, 65)
Test Data Shape:  (2000, 65)
Accuracy: 0.733
Train Data Shape:  (8000, 111)
Test Data Shape:  (2000, 111)
Accuracy: 0.7665
Train Data Shape:  (8000, 177)
Test Data Shape:  (2000, 177)
Accuracy: 0.82
Train Data Shape:  (8000, 264)
Test Data Shape:  (2000, 264)
Accuracy: 0.845
Train Data Shape:  (8000, 371)
Test Data Shape:  (2000, 371)
Accuracy: 0.868
Train Data Shape:  (8000, 494)
Test Data Shape:  (2000, 494)
Accuracy: 0.861
Train Data Shape:  (8000, 623)
Test Data Shape:  (2000, 623)
Accuracy: 0.863
Train Data Shape:  (8000, 760)
Test Data Shape:  (2000, 760)
Accuracy: 0.8835
Train Data Shape:  (8000, 906)
Test Data Shape:  (2000, 906)
Accuracy: 0.883
Train Data Shape:  (8000, 1062)
Test Data Shape:  (2000, 1062)
Accuracy: 0.884
Train Data Shape:  (7999, 1221)
Test Data Shape:  (2000, 1221)
Accuracy: 0.893
Train Data Shape:  (7999, 1382)
Test Data Shape:  (2000, 1382)
Accuracy: 0.

In [17]:
# print(type(preds))

In [18]:
# print(type(y_test))