## Install Packages

In [1]:
# Environment setup. Nothing here is version-pinned, so a re-run installs whatever
# is current on the channels at that time.
# FAISS with GPU support from the conda-forge channel; -y auto-confirms the plan.
!conda install -y -c conda-forge faiss-gpu
# Refresh the apt package index before installing system libraries.
!apt-get -y update
# ATLAS development package - presumably a BLAS/LAPACK dependency for FAISS;
# TODO confirm it is still required with the conda build.
!apt-get -y install libatlas-base-dev

done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - faiss-gpu


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.14.0               |   py37h89c1867_0        1010 KB  conda-forge
    toolz-0.12.1               |     pyhd8ed1ab_0          51 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.0 MB

The following NEW packages will be INSTALLED:

  toolz              conda-forge/noarch::toolz-0.12.1-pyhd8ed1ab_0

The following packages will be UPDATED:

  conda                               4.12.0-py37h89c1867_0 --> 4.14.0-py37h89c1867_0



Downloading and Extracting Packages
conda-4.14.0         | 1010 KB   | ##################################### | 100% 
toolz-0.12.1         | 51 KB     | ##################################### | 1

## Load Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.random_projection import GaussianRandomProjection

from tqdm import tqdm

import faiss

In [3]:
# Story metadata: one row per generated story, with its prompt id, prompt text,
# story text, the .npz archive holding its hidden states, and two length columns.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the
# CSV was written with its index included - confirm before relying on that column.
STORY_DATASET_PATH = "story_dataset.csv"

df = pd.read_csv(STORY_DATASET_PATH)
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Blaz...,./hidden_states/prompt_1.npz,270,271
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Spar...,./hidden_states/prompt_1.npz,349,350
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Scor...,./hidden_states/prompt_1.npz,278,278
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,117,118
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,129,130
...,...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,289,290
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,119,119
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,127,128
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,441,441


In [4]:
# Largest value in the len_generated_story column. It is used below as the number of
# rows every hidden-state matrix is zero-padded to, so all stories share one shape.
# int(...) keeps it a plain Python integer, as the builtin max() over the column gave.
max_story_len = int(df["len_generated_story"].max())
max_story_len

522

In [5]:
# Build, for each selected layer, a list holding one flat float32 vector per story.
# Each story's hidden-state matrix for that layer is zero-padded to max_story_len rows
# and then flattened, so every story yields a vector of identical length.
hidden_states_by_layer = {}
NUM_PROMPTS = 10
STORIES_PER_PROMPT = 1000      # arrays arr_0 ... arr_999 are read from every prompt archive
HIDDEN_DIM = 512               # number of columns in the padded per-story matrix
LAYERS_TO_LOAD = range(2, 3)   # only layer 2 is used by the clustering section

for prompt_id in range(1, NUM_PROMPTS + 1):
    # The context manager closes the archive once all of its stories have been read.
    with np.load(f'./hidden_states/prompt_{prompt_id}.npz') as loaded_data:
        for i in tqdm(range(STORIES_PER_PROMPT)):
            # Leading axis of each stored array is dropped with [0]
            # (presumably a batch axis of size 1 - TODO confirm).
            curr_hidden_states = loaded_data[f"arr_{i}"][0]
            for layer in LAYERS_TO_LOAD:
                layer_states = curr_hidden_states[layer][0]
                num_positions = len(layer_states)

                # Allocate the padded buffer directly as float32 (the dtype FAISS expects).
                # This avoids a float64 scratch array plus two full copies per story;
                # values are cast to float32 on assignment, exactly as a later astype would.
                padded_arr = np.zeros((max_story_len, HIDDEN_DIM), dtype='float32')
                padded_arr[:num_positions] = layer_states

                # reshape(-1) on the freshly allocated contiguous buffer is a view, not a copy.
                hidden_states_by_layer.setdefault(f"layer_{layer}", []).append(padded_arr.reshape(-1))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:46<00:00,  9.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:50<00:00,  9.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:41<00:00,  9.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:44<00:00,  9.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:04<00:00,  8.02it/s]
100%|█████████████████████████████████████████████████████████████████

In [6]:
# Stack the per-story layer-2 vectors into one 2-D matrix: one row per story,
# one column per (padded position, hidden unit) pair.
layer_2_vectors = hidden_states_by_layer["layer_2"]
layer_hs_array = np.stack(layer_2_vectors)
layer_hs_array.shape

(10000, 267264)

## Layer 2 Clustering

In [7]:
# Dimensionality reduction before clustering.
# To cluster the original vectors instead, enable the assignment below and
# disable the projector lines that follow it:
#
# dim_reduced_vecs = layer_hs_array

# Gaussian random projection with a fixed seed so the projection matrix is reproducible.
# No target size is passed, so scikit-learn's default choice of output dimension applies.
random_projector = GaussianRandomProjection(random_state=42)
random_projector.fit(layer_hs_array)
projected = random_projector.transform(layer_hs_array)

# FAISS works on float32 input.
dim_reduced_vecs = projected.astype('float32')

In [8]:
# L2-normalise every row so each story vector has unit length, as spherical k-means expects.
# The norm is computed for all rows in one vectorised call instead of a Python loop.
row_norms = np.linalg.norm(dim_reduced_vecs, axis=1, keepdims=True)
# An all-zero row would divide by zero and put NaNs into the clustering input;
# dividing by 1 instead leaves such a row as zeros.
row_norms[row_norms == 0] = 1.0
dim_reduced_vecs = dim_reduced_vecs / row_norms
dim_reduced_vecs.shape

(10000, 7894)

In [9]:
# Spherical k-means with FAISS: one cluster per prompt, to see whether the
# hidden-state vectors group by the prompt they were generated from.

ncentroids = NUM_PROMPTS
niter = 20
verbose = True
dim = dim_reduced_vecs.shape[-1]

kmeans_options = dict(
    niter=niter,                   # iterations per restart
    verbose=verbose,               # print per-iteration progress
    gpu=True,                      # run on the GPU build of FAISS
    nredo=10,                      # number of restarts; the best run is kept
    spherical=True,                # keep centroids on the unit sphere
    max_points_per_centroid=1000,  # cap on training points per centroid
)
kmeans = faiss.Kmeans(dim, ncentroids, **kmeans_options)

# train() returns the objective of the retained run, shown as the cell output.
kmeans.train(dim_reduced_vecs)

Clustering 10000 points in 7894D to 10 clusters, redo 10 times, 20 iterations
  Preprocessing in 0.05 s
Outer iteration 0 / 10
  Iteration 19 (1.59 s, search 1.07 s): objective=4922.71 imbalance=1.044 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 10
  Iteration 19 (3.17 s, search 2.13 s): objective=4920.71 imbalance=1.118 nsplit=0       
Outer iteration 2 / 10
  Iteration 19 (4.77 s, search 3.22 s): objective=4921.47 imbalance=1.100 nsplit=0       
Outer iteration 3 / 10
  Iteration 19 (6.37 s, search 4.30 s): objective=4928.03 imbalance=1.091 nsplit=0       
Objective improved: keep new clusters
Outer iteration 4 / 10
  Iteration 19 (7.98 s, search 5.39 s): objective=4919.51 imbalance=1.099 nsplit=0       
Outer iteration 5 / 10
  Iteration 19 (9.55 s, search 6.44 s): objective=4916.74 imbalance=1.264 nsplit=0       
Outer iteration 6 / 10
  Iteration 19 (11.41 s, search 7.68 s): objective=4901.01 imbalance=1.539 nsplit=0       
Outer iteration 7 / 10
  Ite

4928.03076171875

In [10]:
# Centroid matrix produced by kmeans.train(): a 2-D float32 array with one row per
# cluster and one column per projected dimension.
kmeans.centroids #cluster centers

array([[ 0.017112  , -0.00487969,  0.01473732, ...,  0.00087953,
         0.01218723,  0.0135426 ],
       [ 0.0214989 , -0.0013115 ,  0.01243807, ..., -0.00012607,
         0.00843788,  0.01333357],
       [ 0.01174534, -0.01089548,  0.01075566, ...,  0.00153992,
         0.01473929,  0.01566073],
       ...,
       [ 0.01232267, -0.00683162,  0.01197605, ..., -0.00328329,
         0.01713051,  0.01194394],
       [ 0.01191194, -0.00974948,  0.00945816, ..., -0.00131915,
         0.0162207 ,  0.01738647],
       [ 0.01478603, -0.00444456,  0.01175744, ...,  0.00415378,
         0.00707317,  0.01581967]], dtype=float32)

In [11]:
# Sanity check: print the L2 norm of each centroid. With spherical k-means every
# value should be 1 up to float32 rounding.
for centroid_norm in map(np.linalg.norm, kmeans.centroids):
    print(centroid_norm)

0.9999998
1.0000002
1.0000001
0.9999998
1.0
0.9999999
0.9999999
1.0000001
0.9999997
1.0000001


In [12]:
# Objective value recorded after every k-means iteration. The traces of the individual
# restarts (nredo) are stored back to back, niter values each, so this is several
# curves concatenated rather than one continuous curve.
# NOTE(review): the training log keeps a restart when its objective is HIGHER, so with
# spherical=True this looks like a similarity-style objective (larger is better), not
# classical inertia - confirm against the FAISS documentation.
kmeans.obj #objective at each iteration, concatenated across restarts

array([3079.79125977, 4755.37988281, 4824.86816406, 4859.81982422,
       4877.60888672, 4888.93847656, 4896.64746094, 4902.54150391,
       4907.16601562, 4910.2265625 , 4912.54931641, 4914.40722656,
       4916.00927734, 4917.26367188, 4918.47949219, 4919.53027344,
       4920.61962891, 4921.62109375, 4922.30419922, 4922.70654297,
       3239.22827148, 4741.60253906, 4794.84033203, 4850.12988281,
       4882.4140625 , 4893.60888672, 4900.72460938, 4904.76220703,
       4907.21630859, 4909.20214844, 4910.72949219, 4912.08496094,
       4913.23242188, 4914.20507812, 4915.14990234, 4916.14013672,
       4917.21435547, 4918.39697266, 4919.46728516, 4920.71142578,
       3029.66137695, 4754.37353516, 4835.2265625 , 4872.50537109,
       4886.5703125 , 4894.00830078, 4898.36425781, 4901.42236328,
       4904.24609375, 4907.52636719, 4910.31054688, 4912.36914062,
       4913.90771484, 4915.25      , 4916.45703125, 4917.63964844,
       4918.73388672, 4919.63916016, 4920.55712891, 4921.47070

In [13]:
# Assign every vector to its single nearest centroid using the index FAISS built
# during training, then count how many vectors fall in each cluster.
query_vecs = dim_reduced_vecs.astype(np.float32)
search_result = kmeans.index.search(query_vecs, 1)
nearest_centroid_ids = search_result[1]  # element 0 holds the scores, element 1 the centroid ids
pd.Series(nearest_centroid_ids.flatten()).value_counts()

2    1397
4    1251
3    1215
8    1213
9    1195
6    1088
5     806
7     802
1     594
0     439
dtype: int64

In [14]:
# Unit-length copy of the projected vectors for the manual cosine-similarity check.
# Computed in one vectorised step, giving a 2-D array rather than a Python list of rows;
# it still supports row iteration, indexing and matrix multiplication.
vec_norms = np.linalg.norm(dim_reduced_vecs, axis=1, keepdims=True)
# Guard against all-zero rows, which would otherwise turn into NaNs.
vec_norms[vec_norms == 0] = 1.0
normalized_vecs = dim_reduced_vecs / vec_norms

In [15]:
# Manual nearest-centroid assignment. The vectors are unit length and the centroids
# have norm ~1, so each dot product is the cosine similarity between a story vector
# and a centroid; every story is assigned to the centroid with the largest similarity.
centroids_transposed = kmeans.centroids.T
cos_similarities = np.matmul(normalized_vecs, centroids_transposed)
classifications = cos_similarities.argmax(axis=1)

In [16]:
# Cluster sizes from the manual cosine-similarity assignment. These should equal the
# counts obtained from the FAISS index search, which confirms the two assignment
# methods agree.
pd.Series(classifications).value_counts()

2    1397
4    1251
3    1215
8    1213
9    1195
6    1088
5     806
7     802
1     594
0     439
dtype: int64