## Install Packages

In [1]:
# !conda install -y -c conda-forge py-xgboost
!pip install xgboost
!apt-get -y update
!apt-get -y install libatlas-base-dev

Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease                        
Hit:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Reading package lists... Done                     
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libatlas-base-dev is already the newest version (3.10.3-5).
0 upgraded, 0 newly installed, 0 to remove and 83 not upgraded.


## Load Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.random_projection import GaussianRandomProjection

from xgboost import XGBClassifier
import xgboost as xgb

In [3]:
df = pd.read_csv("../llamatales/story_dataset.csv")
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Blaz...,./hidden_states/prompt_1.npz,270,271
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Spar...,./hidden_states/prompt_1.npz,349,350
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Scor...,./hidden_states/prompt_1.npz,278,278
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,117,118
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,129,130
...,...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,289,290
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,119,119
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,127,128
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,441,441


In [4]:
max_story_len = max(df["len_generated_story"])
max_story_len

522

In [5]:
hidden_states_by_layer = {}
NUM_PROMPTS = 10

for prompt_id in range(1, NUM_PROMPTS + 1):
    with np.load(f'../llamatales/hidden_states/prompt_{prompt_id}.npz') as loaded_data:
        for i in tqdm(range(1000)):
            curr_hidden_states = loaded_data[f"arr_{i}"][0]
#             print(curr_hidden_states.shape)
            for layer in range(1):
                padded_arr = np.zeros((max_story_len, 512))
                padded_arr_len = len(curr_hidden_states[layer][0])
                
                padded_arr[:padded_arr_len] = curr_hidden_states[layer][0]
                
                padded_arr = padded_arr.flatten().astype('float32') #FAISS expects data in type float32 instead of float64 - saves memory too!
#                 print(padded_arr.shape)
                
                if(f"layer_{layer}" in hidden_states_by_layer):
                    hidden_states_by_layer[f"layer_{layer}"].append(padded_arr)
                else:
                    hidden_states_by_layer[f"layer_{layer}"] = [padded_arr]

100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:22<00:00, 45.28it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:20<00:00, 48.41it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:18<00:00, 54.92it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 46.78it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 47.13it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:20<00:00, 48.90it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:56<00:00,  5.68it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:11<00:00,  3.97it/s]
100%|███████████████████████████

In [6]:
layer_hs_array = np.array(hidden_states_by_layer["layer_0"])
layer_hs_array.shape

(10000, 267264)

## Layer 0 Clustering

In [7]:
# # Use original vectors for clustering - uncomment next line and comment out last two lines

dim_reduced_vecs = layer_hs_array

# random_projector = GaussianRandomProjection(random_state = 42)
# dim_reduced_vecs = random_projector.fit_transform(layer_hs_array).astype('float32')

In [8]:
# dim_reduced_vecs = np.array([v / np.linalg.norm(v) for v in dim_reduced_vecs])
# dim_reduced_vecs.shape

In [9]:
prompt_ids = df["prompt_id"]
prompt_ids = prompt_ids.to_numpy() - 1
prompt_ids

array([0, 0, 0, ..., 9, 9, 9])

In [10]:
# X_train, y_train, X_test, y_test = [], [], [], []
# for i in range(10):
#     X_train.extend(dim_reduced_vecs[i * 1000 : (i * 1000) + 800]) # [0:800] [1000:1800] etc.
#     y_train.extend(prompt_ids[i * 1000 : (i * 1000) + 800])
    
#     X_test.extend(dim_reduced_vecs[(i * 1000) + 800 : (i + 1) * 1000]) # [800:1000] [1800:2000] etc.
#     y_test.extend(prompt_ids[(i * 1000) + 800 : (i + 1) * 1000])

In [11]:
# X_train = np.array(X_train)
# y_train = np.array(y_train)
# X_test = np.array(X_test)
# y_test = np.array(y_test)

# print(X_train.shape)
# print(y_train.shape)
# print(X_test.shape)
# print(y_test.shape)

In [12]:
X_train = np.zeros((int(0.8 * dim_reduced_vecs.shape[0]), dim_reduced_vecs.shape[1]), dtype = np.float32)
y_train = np.zeros(int(0.8 * dim_reduced_vecs.shape[0]))

X_test = np.zeros((int(0.2 * dim_reduced_vecs.shape[0]), dim_reduced_vecs.shape[1]), dtype = np.float32)
y_test = np.zeros(int(0.2 * dim_reduced_vecs.shape[0]))

In [13]:
for i in range(10):
    X_train[i * 800 : (i + 1) * 800] = dim_reduced_vecs[i * 1000 : (i * 1000) + 800]
    y_train[i * 800 : (i + 1) * 800] = prompt_ids[i * 1000 : (i * 1000) + 800]
    
    X_test[i * 200 : (i + 1) * 200] = dim_reduced_vecs[(i * 1000) + 800 : (i + 1) * 1000]
    y_test[i * 200 : (i + 1) * 200] = prompt_ids[(i * 1000) + 800 : (i + 1) * 1000]

In [14]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8000, 267264)
(8000,)
(2000, 267264)
(2000,)


In [15]:
xgb.set_config(verbosity=3)

In [16]:
# dtrain = xgb.DMatrix(X_train, label = y_train)
# # params = {'seed': 42, 'objective': 'multi:softmax', 'eval_metric': "merror", 'num_class': 10, 'device': 'cuda'}
# params = {'seed': 42, 'objective': 'multi:softmax', 'eval_metric': "merror", 'num_class': 10, 'device': 'cpu'}

In [17]:
# classifier = xgb.train(params, dtrain)

In [18]:
# dtest = xgb.DMatrix(X_test)
# preds = classifier.predict(dtest)

In [19]:
classifier = XGBClassifier(seed = 42, objective = 'multi:softmax', eval_metric = "merror", num_class = 10, tree_method='gpu_hist', predictor='gpu_predictor', device='cuda')
classifier.fit(X_train, y_train)
preds = classifier.predict(X_test)

[06:08:40] DEBUG: ../src/tree/updater_gpu_hist.cu:817: [GPU Hist]: Configure
Parameters: { "device" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[06:08:43] MakeCuts: 0.036155s, 1 calls @ 36155us

[06:08:43] Prune: 0.030576s, 1 calls @ 30576us

[06:08:43] ScanInput: 0.388551s, 1 calls @ 388551us

[06:08:43] Unique: 0.024897s, 1 calls @ 24897us

[06:11:37] Configure: 0.003321s, 1 calls @ 3321us

[06:11:37] EvalOneIter: 0.000547s, 100 calls @ 547us

[06:11:37] GetGradient: 0.002966s, 100 calls @ 2966us

[06:11:37] PredictRaw: 0.496597s, 100 calls @ 496597us

[06:11:37] UpdateOneIter: 176.737s, 100 calls @ 176736906us

[06:11:37] BoostNewTrees: 176.233s, 100 calls @ 176232706us

[06:11:37] CommitModel: 5.1e-05s, 100 calls @ 51us

[06:11:37] Peak mem

In [20]:
print(f"Accuracy: {np.mean(preds == y_test)}")

Accuracy: 1.0


## Perform Inference on Test Set & Collect Test Accuracy

## Confusion Matrix of Results

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
cm = confusion_matrix(y_train, classifications_to_label)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[f"{i}" for i in range(1, 11)])

disp.plot()
plt.title('Confusion Matrix for Train Set')
plt.show()

In [None]:
cm = confusion_matrix(y_test, classifications_to_label_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[f"{i}" for i in range(1, 11)])

disp.plot()
plt.title('Confusion Matrix for Test Set')
plt.show()