## Install Packages

In [1]:
# Install the GPU build of FAISS from the conda-forge channel (-y auto-confirms the prompt).
!conda install -y -c conda-forge faiss-gpu
# Refresh the apt package lists, then install the ATLAS development package.
# NOTE(review): presumably a BLAS/LAPACK dependency needed by FAISS - confirm it is still required.
!apt-get -y update
!apt-get -y install libatlas-base-dev

done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - faiss-gpu


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.14.0               |   py37h89c1867_0        1010 KB  conda-forge
    toolz-0.12.1               |     pyhd8ed1ab_0          51 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.0 MB

The following NEW packages will be INSTALLED:

  toolz              conda-forge/noarch::toolz-0.12.1-pyhd8ed1ab_0

The following packages will be UPDATED:

  conda                               4.12.0-py37h89c1867_0 --> 4.14.0-py37h89c1867_0



Downloading and Extracting Packages
toolz-0.12.1         | 51 KB     | ##################################### | 100% 
conda-4.14.0         | 1010 KB   | ##################################### | 1

## Load Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.random_projection import GaussianRandomProjection

from tqdm import tqdm

import faiss

In [3]:
# Load the story metadata table from the CSV in the working directory and display it.
# The displayed columns include the prompt, the story text, the path of the .npz file
# holding the hidden states, and two story-length columns.
# NOTE(review): the "Unnamed: 0" column looks like a saved index - confirm before relying on it.
df = pd.read_csv("story_dataset.csv")
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Blaz...,./hidden_states/prompt_1.npz,270,271
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Spar...,./hidden_states/prompt_1.npz,349,350
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Scor...,./hidden_states/prompt_1.npz,278,278
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,117,118
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,129,130
...,...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,289,290
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,119,119
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,127,128
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,441,441


In [4]:
# Longest value in the `len_generated_story` column. It is used further down as the
# padded sequence length, so every story maps to a vector of identical size.
# Series.max() is a vectorised reduction (the builtin max() iterates element by element);
# int() keeps the result a plain Python int, exactly as the builtin returned.
max_story_len = int(df["len_generated_story"].max())
max_story_len

522

In [5]:
# Build, for every selected layer, a list holding one flattened, zero-padded
# hidden-state vector per story. Keys are "layer_<n>".
hidden_states_by_layer = {}
NUM_PROMPTS = 10
STORIES_PER_PROMPT = 1000  # arrays arr_0 .. arr_999 are read from each prompt archive
HIDDEN_DIM = 512           # width of one hidden-state row in the padded buffer
LAYERS = range(4, 5)       # only layer 4 is extracted

for prompt_id in range(1, NUM_PROMPTS + 1):
    # The context manager closes the .npz archive once all of its stories are read.
    with np.load(f'./hidden_states/prompt_{prompt_id}.npz') as loaded_data:
        for i in tqdm(range(STORIES_PER_PROMPT)):
            curr_hidden_states = loaded_data[f"arr_{i}"][0]
            for layer in LAYERS:
                layer_states = curr_hidden_states[layer][0]

                # FAISS expects float32, so allocate the padded buffer as float32 up
                # front instead of building a float64 buffer and converting it later.
                padded_arr = np.zeros((max_story_len, HIDDEN_DIM), dtype='float32')
                # Rows past the end of the story stay zero (padding). This assignment
                # raises if a story is longer than max_story_len.
                padded_arr[:len(layer_states)] = layer_states

                # reshape(-1) flattens without another copy; each buffer is freshly
                # allocated, so the stored vectors never share memory.
                hidden_states_by_layer.setdefault(f"layer_{layer}", []).append(padded_arr.reshape(-1))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:29<00:00,  3.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:21<00:00,  3.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:11<00:00,  3.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:58<00:00,  3.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:53<00:00,  3.41it/s]
100%|█████████████████████████████████████████████████████████████████

In [6]:
# Stack the per-story layer-4 vectors into one 2-D array (one row per story) and show its shape.
# Note: np.array copies the list contents, so the data is held twice for as long as
# hidden_states_by_layer keeps its list alive.
layer_hs_array = np.array(hidden_states_by_layer["layer_4"])
layer_hs_array.shape

(10000, 267264)

## Layer 4 Clustering

In [7]:
# Use original vectors for clustering - uncomment next line and comment out last two lines

# dim_reduced_vecs = layer_hs_array

# Reduce dimensionality with a Gaussian random projection. n_components is not passed,
# so scikit-learn picks the target dimension itself; random_state pins the projection
# matrix so reruns give the same result. The output is cast to float32, the dtype
# FAISS expects.
random_projector = GaussianRandomProjection(random_state = 42)
dim_reduced_vecs = random_projector.fit_transform(layer_hs_array).astype('float32')

In [8]:
# L2-normalise every row so each projected vector has unit length. The row norms are
# computed in a single vectorised call (keepdims=True gives an (n, 1) column that
# broadcasts across each row) instead of a Python-level loop over all rows.
dim_reduced_vecs = dim_reduced_vecs / np.linalg.norm(dim_reduced_vecs, axis=1, keepdims=True)
dim_reduced_vecs.shape

(10000, 7894)

In [9]:
# K-means clustering with FAISS on the GPU: one centroid per prompt.

ncentroids = NUM_PROMPTS
niter = 20
verbose = True
dim = dim_reduced_vecs.shape[1]

kmeans = faiss.Kmeans(
    dim,
    ncentroids,
    niter=niter,
    verbose=verbose,
    gpu=True,
    nredo=10,          # repeat the whole clustering 10 times (the log reports "redo 10 times")
    spherical=True,    # NOTE(review): presumably keeps centroids at unit norm - the norm check below agrees
    max_points_per_centroid=1000,  # NOTE(review): presumably the per-centroid training sample cap - confirm
)

# train() is the last expression so the value it returns is shown as the cell output.
kmeans.train(dim_reduced_vecs)

Clustering 10000 points in 7894D to 10 clusters, redo 10 times, 20 iterations
  Preprocessing in 0.04 s
Outer iteration 0 / 10
  Iteration 19 (0.77 s, search 0.54 s): objective=4866.65 imbalance=1.316 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 10
  Iteration 19 (1.53 s, search 1.07 s): objective=4876.48 imbalance=1.133 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 10
  Iteration 19 (2.30 s, search 1.61 s): objective=4878.12 imbalance=1.140 nsplit=0       
Objective improved: keep new clusters
Outer iteration 3 / 10
  Iteration 19 (3.07 s, search 2.15 s): objective=4861.69 imbalance=1.060 nsplit=0       
Outer iteration 4 / 10
  Iteration 19 (3.84 s, search 2.69 s): objective=4860.95 imbalance=1.096 nsplit=0       
Outer iteration 5 / 10
  Iteration 19 (4.61 s, search 3.23 s): objective=4870.01 imbalance=1.141 nsplit=0       
Outer iteration 6 / 10
  Iteration 19 (5.37 s, search 3.77 s): objective=4850.31 imbalance=1.546 nsplit=

4878.115234375

In [10]:
# Display the centroid matrix produced by training (one row per cluster centre).
kmeans.centroids #cluster centers

array([[ 0.01702208, -0.01215687,  0.01227838, ..., -0.01178698,
         0.00448606,  0.00421717],
       [ 0.01664986, -0.00477508,  0.01191848, ..., -0.01003913,
         0.00400551,  0.0020383 ],
       [ 0.01001574, -0.00994997,  0.01187276, ..., -0.00653664,
         0.00602305,  0.01195088],
       ...,
       [ 0.01090497, -0.009727  ,  0.01221477, ..., -0.01464461,
         0.00740312, -0.0004668 ],
       [ 0.0158527 , -0.00545237,  0.0142204 , ..., -0.00379057,
         0.00624877,  0.0129192 ],
       [ 0.01427492, -0.00400123,  0.01382805, ..., -0.00471401,
         0.00441769,  0.00662025]], dtype=float32)

In [11]:
# Sanity check: print the L2 norm of every centroid. Values close to 1 are expected
# because the k-means above was configured with spherical = True.
for centroid in kmeans.centroids:
    print(np.linalg.norm(centroid))

1.0000001
0.9999998
1.0
1.0000001
1.0000001
1.0000004
1.0000005
1.0000001
0.9999998
0.9999998


In [12]:
# Objective value recorded after every k-means iteration. The array concatenates all
# restarts (niter entries per restart), which is why the values reset periodically.
# NOTE(review): the values increase within each run, so in this spherical setup this is
# presumably a similarity-style objective that is maximised rather than a classic
# minimised inertia - confirm against the FAISS docs.
kmeans.obj #objective at each iteration, across all restarts

array([2906.19750977, 4693.49658203, 4774.64208984, 4809.57470703,
       4822.40576172, 4828.45019531, 4832.28466797, 4834.97949219,
       4836.82910156, 4838.59375   , 4841.92333984, 4845.87597656,
       4849.95898438, 4855.76464844, 4862.03564453, 4864.10888672,
       4865.06152344, 4865.70068359, 4866.22802734, 4866.64746094,
       3063.81420898, 4671.39453125, 4762.78564453, 4822.44921875,
       4841.07226562, 4848.03417969, 4851.97949219, 4855.19042969,
       4857.89892578, 4860.75244141, 4863.51074219, 4865.75976562,
       4868.62988281, 4871.20410156, 4873.828125  , 4874.99755859,
       4875.44091797, 4875.77539062, 4876.078125  , 4876.484375  ,
       2860.95166016, 4692.15429688, 4788.91015625, 4818.21386719,
       4835.78369141, 4850.89306641, 4860.00976562, 4866.22998047,
       4869.26660156, 4871.32128906, 4872.52978516, 4873.05810547,
       4873.48876953, 4873.90771484, 4874.47607422, 4875.31347656,
       4876.35107422, 4877.46679688, 4878.04296875, 4878.11523

In [13]:
# Look up the single nearest centroid (k = 1) for every vector using the index held by
# the trained k-means object; element [1] of the search result is taken as the centroid
# ids, which are then counted to get the size of each cluster.
# The astype call is a no-op safeguard when the vectors are already float32.
pd.Series(kmeans.index.search(dim_reduced_vecs.astype(np.float32), 1)[1].flatten()).value_counts()

4    1666
2    1296
8    1251
0    1217
3    1127
5     980
7     942
1     573
6     528
9     420
dtype: int64

In [14]:
# Unit-length copy of the projected vectors as a proper 2-D ndarray (one row per story).
# The norms are computed in one vectorised call, and the result can be used directly in
# matrix products, instead of a Python list of 1-D arrays that NumPy has to re-pack.
normalized_vecs = dim_reduced_vecs / np.linalg.norm(dim_reduced_vecs, axis=1, keepdims=True)

In [15]:
# Dot product of every normalised vector with every centroid: entry [i, j] is the
# similarity between story i and centroid j (a cosine similarity when both are unit length).
cos_similarities = normalized_vecs @ kmeans.centroids.T
# Assign each story to the centroid with the highest similarity.
classifications = np.argmax(cos_similarities, axis=1)

In [16]:
# Cluster sizes from the manual cosine-similarity assignment; compare these counts with
# the ones obtained from kmeans.index.search to cross-check the two assignments.
pd.Series(classifications).value_counts()

4    1666
2    1296
8    1251
0    1217
3    1127
5     980
7     942
1     573
6     528
9     420
dtype: int64