In [1]:
!pip install xgboost
!apt-get -y update
!apt-get -y install libatlas-base-dev

Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease              
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease                        
Hit:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libatlas-base-dev is already the newest version (3.10.3-5).
0 upgraded, 0 newly installed, 0 to remove and 83 not upgraded.


In [2]:
# !conda install -y -c conda-forge py-xgboost

In [3]:
# !conda install -y -c conda-forge py-xgboost=*=cuda*

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from xgboost import XGBClassifier

In [5]:
# Load the story metadata table. The columns used later in this notebook are
# prompt_id, len_generated_story and len_new_story.
df = pd.read_csv("../llamatales/story_dataset.csv")
# Bare expression: render the (truncated) frame as the cell output.
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Blaz...,./hidden_states/prompt_1.npz,270,271
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Spar...,./hidden_states/prompt_1.npz,349,350
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Scor...,./hidden_states/prompt_1.npz,278,278
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,117,118
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,129,130
...,...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,289,290
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,119,119
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,127,128
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,441,441


In [6]:
NUM_PROMPTS = 10
# NUM_PROMPTS = 2

# Maps "layer_<k>" -> list of per-story hidden-state arrays, appended in the
# order the stories are read (prompt 1 first, 1000 stories per prompt).
hidden_states_by_layer = {}

for prompt_id in range(1, NUM_PROMPTS + 1):
    npz_path = f'../llamatales/hidden_states/prompt_{prompt_id}.npz'
    with np.load(npz_path) as loaded_data:
        for i in tqdm(range(1000)):
            # Each story is stored under the key "arr_<i>"; the leading axis is
            # dropped with [0] (presumably a singleton batch axis — TODO confirm).
            curr_hidden_states = loaded_data[f"arr_{i}"][0]

            # By layer: only layer 0 is collected at the moment.
            for layer in range(1):
                # Cast to float32 to keep memory down (the original note mentions
                # FAISS expecting float32 rather than float64).
                curr_layer_hidden_states = curr_hidden_states[layer][0].astype('float32')

                # setdefault creates the per-layer list on first use, then appends.
                hidden_states_by_layer.setdefault(f"layer_{layer}", []).append(curr_layer_hidden_states)

100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:07<00:00, 14.89it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:57<00:00, 17.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:53<00:00, 18.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:01<00:00, 16.19it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:02<00:00, 16.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:59<00:00, 16.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:58<00:00, 17.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [

In [7]:
# Work with the per-story hidden states collected for layer 0 (the only layer
# the loading cell stores).
layer_hs_array = hidden_states_by_layer["layer_0"]
# Print the shapes of the first two stories: the first dimension varies per story
# (it matches len_generated_story of the first two df rows), the second is
# constant (presumably the hidden size — TODO confirm).
print(layer_hs_array[0].shape)
print(layer_hs_array[1].shape)

(270, 512)
(349, 512)


In [8]:
# Smallest len_generated_story over the whole table.
min_story_len = min(df["len_generated_story"])
min_story_len

21

In [9]:
# Largest len_generated_story among the first NUM_PROMPTS*1000 rows
# (assumes df rows are ordered by prompt with 1000 stories each, matching the
# order in which the hidden states were loaded — TODO confirm).
max_story_len = max(df[:NUM_PROMPTS*1000]["len_generated_story"])

In [10]:
# Largest len_new_story over the entire table.
# NOTE: the following cell rebinds max_new_story_len using only the first
# NUM_PROMPTS*1000 rows, so this value is overwritten when the cells run in order.
max_new_story_len = max(df["len_new_story"])

In [11]:
# Largest len_new_story restricted to the first NUM_PROMPTS*1000 rows; this
# replaces the whole-table value assigned in the previous cell.
max_new_story_len = max(df[:NUM_PROMPTS*1000]["len_new_story"])

In [12]:
# Display the value currently bound to max_new_story_len.
max_new_story_len

523

In [13]:
# Select the rows whose len_generated_story reaches max_story_len and show the
# shape of that selection as (rows, columns).
# NOTE: the context-level analysis cell further down rebinds max_story_len from
# len_new_story, so re-running this cell after it applies a different threshold.
df[df["len_generated_story"] >= max_story_len].shape

(13, 6)

In [14]:
# List the stories whose len_new_story reaches the maximum found above.
df[df["len_new_story"] >= max_new_story_len]

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
2360,3,Once upon a time there were two children,"Once upon a time there were two children, Emma...",./hidden_states/prompt_3.npz,522,523
2628,3,Once upon a time there were two children,Once upon a time there were two children named...,./hidden_states/prompt_3.npz,522,523
7795,8,Once upon a time there was a wolf,Once upon a time there was a wolf named Max. M...,./hidden_states/prompt_8.npz,522,523
9178,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9201,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9270,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9313,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9603,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9626,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523
9685,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,522,523


In [15]:
# Inline recomputation of the maximum len_new_story over the first
# NUM_PROMPTS*1000 rows (same expression that defines max_new_story_len).
max(df[:NUM_PROMPTS*1000]["len_new_story"])

523

In [16]:
# Length of the first axis of every layer-0 hidden-state array, in load order.
hs_shapes = [story_hs.shape[0] for story_hs in layer_hs_array]

In [17]:
# Hidden-state length of sample 9685, one of the rows listed above with
# len_generated_story == 522 and len_new_story == 523. The result shows which of
# the two length columns the stored hidden states follow.
np.array(hs_shapes)[9685]

522

In [18]:
import numpy as np

def safe_split(total, train_ratio=0.8):
    """Return ``(train, test)`` sample counts for a group of ``total`` samples.

    The training count is ``int(train_ratio * total)`` (truncated towards zero)
    and the test count is whatever is left, so the two always sum to ``total``.
    For small groups the training count can be 0 (e.g. ``total=1``).
    """
    train = int(train_ratio * total)   # floor
    test  = total - train              # leftover
    return train, test


def build_dataset(curr_context_level_hs, curr_labels, train_ratio=0.8):
    """Split samples into train/test sets separately for every label value.

    Samples are grouped by label (prompt id). Within each group the first
    ``train_ratio`` share (see ``safe_split``) goes to the training set and the
    remainder to the test set, keeping the original sample order; no shuffling
    is performed. Groups are concatenated in ascending label order.

    Parameters
    ----------
    curr_context_level_hs : array-like
        Feature rows, one per sample (first axis indexes samples).
    curr_labels : array-like
        One label per sample. Lists are accepted and converted to arrays.
    train_ratio : float, default 0.8
        Share of each group assigned to the training set.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If there are no samples, or if features and labels differ in length.
    """
    # Coerce to arrays so that `labels == pid` is always an element-wise mask;
    # with a plain list the comparison collapses to a single bool.
    features = np.asarray(curr_context_level_hs)
    labels = np.asarray(curr_labels)

    if len(features) != len(labels):
        raise ValueError(
            f"Got {len(features)} feature rows but {len(labels)} labels; they must match."
        )
    if len(labels) == 0:
        raise ValueError("build_dataset() received no samples to split.")

    X_train_list = []
    X_test_list = []
    y_train_list = []
    y_test_list = []

    # np.unique yields the distinct labels in ascending order.
    for pid in np.unique(labels):

        # get samples for this prompt id
        mask = (labels == pid)
        X_pid = features[mask]
        y_pid = labels[mask]

        # Only the training count is needed; the test part is the slice remainder.
        train_n, _ = safe_split(len(X_pid), train_ratio)

        # split
        X_train_list.append(X_pid[:train_n])
        y_train_list.append(y_pid[:train_n])

        X_test_list.append(X_pid[train_n:])
        y_test_list.append(y_pid[train_n:])

    # concatenate all prompt-id blocks
    X_train = np.concatenate(X_train_list, axis=0)
    y_train = np.concatenate(y_train_list, axis=0)
    X_test  = np.concatenate(X_test_list, axis=0)
    y_test  = np.concatenate(y_test_list, axis=0)

    return X_train, y_train, X_test, y_test

In [None]:
%%capture output
#Context-level analysis
# min_story_len = min(df["len_new_story"])
max_story_len = max(df[:NUM_PROMPTS*1000]["len_new_story"]) #save hidden states when generating new story - length of hidden states is of the old story (522)
for context_level in range(11, max_story_len):
    curr_context_level_hs = []
    curr_labels = []
    
    curr_hs_shapes = []
    for i in range(len(layer_hs_array)):
        # print(np.array(layer_hs_array[i][:context_level]).shape) #see if taking context level more than available causes error?
        if(layer_hs_array[i].shape[0] < context_level):
            # print(i) #see which samples have less tokens than current context level
            continue
        else:
            layer_hs_upto_context = layer_hs_array[i][10:context_level]
            curr_hs_shapes.append(layer_hs_array[i].shape[0])
            curr_context_level_hs.append(np.array(layer_hs_upto_context).flatten())
            curr_labels.append(df.iloc[i].prompt_id)
    
    curr_context_level_hs = np.array(curr_context_level_hs).astype('float32')
    # print([i for i in range(len(curr_context_level_hs)) if curr_context_level_hs[i].shape[0] < i]) #see if any residuals escape screening
    # print(curr_context_level_hs.shape) #print shape without samples less than context level
    curr_labels = np.array(curr_labels) - 1
    
    if np.var(curr_context_level_hs) < 1e-6:
        continue
    
    unique_ids = sorted(set(curr_labels))
    
    X_train, y_train, X_test, y_test = build_dataset(curr_context_level_hs, curr_labels)
    
    classifier = XGBClassifier(seed = 42, objective = 'multi:softmax', eval_metric = "merror", num_class = len(unique_ids))
    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)
    
    print("Train Data Shape: ", X_train.shape)
    print("Train Labels Shape: ", y_train.shape)
    print("Test Data Shape: ", X_test.shape)
    print("Test Labels Shape: ", y_test.shape)
    
    print("Predictions Shape: ", preds.shape)
    
    print("Unique Preds: ", pd.Series(preds).value_counts())
    print("Unique Test Labels: ", pd.Series(y_test).value_counts())
    print(f"Accuracy: {np.mean(preds == y_test)}")
    
    # print(max(curr_hs_shapes))

Train Data Shape:  (8000, 512)
Train Labels Shape:  (8000,)
Test Data Shape:  (2000, 512)
Test Labels Shape:  (2000,)
Predictions Shape:  (2000,)
Unique Preds:  0    354
3    268
2    261
8    212
6    212
9    210
4    188
1    134
7    129
5     32
dtype: int64
Unique Test Labels:  0    200
1    200
2    200
3    200
4    200
5    200
6    200
7    200
8    200
9    200
dtype: int64
Accuracy: 0.688
Train Data Shape:  (8000, 1024)
Train Labels Shape:  (8000,)
Test Data Shape:  (2000, 1024)
Test Labels Shape:  (2000,)
Predictions Shape:  (2000,)
Unique Preds:  7    260
2    260
8    221
9    204
1    199
4    190
0    176
6    174
3    168
5    148
dtype: int64
Unique Test Labels:  0    200
1    200
2    200
3    200
4    200
5    200
6    200
7    200
8    200
9    200
dtype: int64
Accuracy: 0.807
Train Data Shape:  (8000, 1536)
Train Labels Shape:  (8000,)
Test Data Shape:  (2000, 1536)
Test Labels Shape:  (2000,)
Predictions Shape:  (2000,)
Unique Preds:  5    236
7    224
3    213


In [None]:
# Per-class accuracy for the most recent `preds` / `y_test` pair.
# Labels are 0-based (prompt_id - 1), so class i is compared against i, and the
# samples of each class are selected through y_test rather than by assuming
# fixed 200-sample blocks (which only holds while every prompt has all stories).
for i in range(NUM_PROMPTS):
    class_mask = (y_test == i)
    # Classes with no test samples at this context level print nan.
    print(np.mean(preds[class_mask] == i) if class_mask.any() else float("nan"))