In [1]:
!conda install -y -c conda-forge faiss-gpu
!apt-get -y update
!apt-get -y install libatlas-base-dev

done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - faiss-gpu


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.14.0               |   py37h89c1867_0        1010 KB  conda-forge
    toolz-0.12.1               |     pyhd8ed1ab_0          51 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.0 MB

The following NEW packages will be INSTALLED:

  toolz              conda-forge/noarch::toolz-0.12.1-pyhd8ed1ab_0

The following packages will be UPDATED:

  conda                               4.12.0-py37h89c1867_0 --> 4.14.0-py37h89c1867_0



Downloading and Extracting Packages
conda-4.14.0         | 1010 KB   | ##################################### | 100% 
toolz-0.12.1         | 51 KB     | ##################################### | 1

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.random_projection import GaussianRandomProjection

from tqdm import tqdm

import faiss

In [3]:
df = pd.read_csv("story_dataset.csv")
df

Unnamed: 0,prompt_id,prompt,story,hidden_state_file,len_generated_story,len_new_story
0,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Blaz...,./hidden_states/prompt_1.npz,270,271
1,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Spar...,./hidden_states/prompt_1.npz,349,350
2,1,Once upon a time there was a dragon,Once upon a time there was a dragon named Scor...,./hidden_states/prompt_1.npz,278,278
3,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,117,118
4,1,Once upon a time there was a dragon,Once upon a time there was a dragon. The drago...,./hidden_states/prompt_1.npz,129,130
...,...,...,...,...,...,...
9995,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,289,290
9996,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,119,119
9997,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,127,128
9998,10,Once upon a time there was a poor boy,Once upon a time there was a poor boy named Ti...,./hidden_states/prompt_10.npz,441,441


In [6]:
max_story_len = max(df["len_generated_story"])
max_story_len

522

In [7]:
hidden_states_by_layer = {}
NUM_PROMPTS = 10

# for prompt_id in range(1, 11):
for prompt_id in range(1, NUM_PROMPTS + 1):
    with np.load(f'./hidden_states/prompt_{prompt_id}.npz') as loaded_data:
        for i in tqdm(range(1000)):
            curr_hidden_states = loaded_data[f"arr_{i}"][0]
#             print(curr_hidden_states.shape)
            for layer in range(1):
                padded_arr = np.zeros((max_story_len, 512))
                padded_arr_len = len(curr_hidden_states[layer][0])
                
                padded_arr[:padded_arr_len] = curr_hidden_states[layer][0]
                
                padded_arr = padded_arr.flatten().astype('float32') #FAISS expects data in type float32 instead of float64 - saves memory too!
#                 print(padded_arr.shape)
                
                if(f"layer_{layer}" in hidden_states_by_layer):
                    hidden_states_by_layer[f"layer_{layer}"].append(padded_arr)
                else:
                    hidden_states_by_layer[f"layer_{layer}"] = [padded_arr]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:23<00:00, 42.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 45.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:04<00:00, 15.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:01<00:00, 16.16it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:00<00:00, 16.60it/s]
100%|█████████████████████████████████████████████████████████████████

In [10]:
layer_hs_array = np.array(hidden_states_by_layer["layer_0"])
layer_hs_array.shape

(10000, 267264)

In [28]:
# Use original vectors for clustering - uncomment this cell and comment out next cell
# dim_reduced_vecs = np.array(hidden_states_by_layer["layer_0"])

In [57]:
# Layer 0 clustering

random_projector = GaussianRandomProjection(random_state = 42)
dim_reduced_vecs = random_projector.fit_transform(layer_hs_array).astype('float32')

In [59]:
dim_reduced_vecs = np.array([v / np.linalg.norm(v) for v in dim_reduced_vecs])
dim_reduced_vecs.shape

(10000, 7894)

In [60]:
# K-means Clustering

ncentroids = NUM_PROMPTS
niter = 100
verbose = True
dim = dim_reduced_vecs.shape[1]
kmeans = faiss.Kmeans(dim, ncentroids, niter=niter, verbose=verbose, gpu=True, spherical = True)
kmeans.train(dim_reduced_vecs)


Sampling a subset of 2560 / 10000 for training
Clustering 2560 points in 7894D to 10 clusters, redo 1 times, 100 iterations
  Preprocessing in 0.08 s
  Iteration 99 (0.81 s, search 0.43 s): objective=754.796 imbalance=1.178 nsplit=0       

754.7957763671875

In [61]:
kmeans.centroids #cluster centers

array([[ 0.0153456 ,  0.00904646,  0.00191673, ...,  0.00585843,
        -0.01694074, -0.01097484],
       [ 0.02382143,  0.00869313,  0.00826571, ..., -0.00691274,
        -0.00911196, -0.00840147],
       [ 0.02215165,  0.00660776,  0.00428421, ..., -0.01263674,
        -0.00932354, -0.01416455],
       ...,
       [ 0.00592884,  0.01377043,  0.00696063, ..., -0.01714636,
        -0.01298237, -0.00092342],
       [ 0.00784142,  0.00629826, -0.00648828, ..., -0.0150449 ,
        -0.00577383,  0.00220145],
       [-0.0031441 ,  0.01533546, -0.00366792, ..., -0.01044317,
        -0.01152603, -0.01013506]], dtype=float32)

In [62]:
kmeans.obj #inertia at each iteration

array([276.28485107, 715.73626709, 738.60107422, 744.5791626 ,
       746.39337158, 748.50982666, 750.86102295, 752.63299561,
       753.18481445, 753.40441895, 753.60491943, 753.89709473,
       754.50030518, 754.69390869, 754.76245117, 754.78479004,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79577637,
       754.79577637, 754.79577637, 754.79577637, 754.79

In [63]:
pd.Series(kmeans.index.search(dim_reduced_vecs.astype(np.float32), 1)[1].flatten()).value_counts()

9    2140
1    1548
2    1244
4     903
3     886
6     860
5     855
0     817
7     659
8      88
dtype: int64

In [64]:
classifications = []
for dim_red_vec in dim_reduced_vecs:
    dist_from_first_centroid = np.linalg.norm((dim_red_vec - kmeans.centroids[0]))
    dist_from_second_centroid = np.linalg.norm((dim_red_vec - kmeans.centroids[1]))
    classification = 0 if dist_from_first_centroid < dist_from_second_centroid else 1
    
    classifications.append(classification)
    
pd.Series(classifications).value_counts()

1    8738
0    1262
dtype: int64

In [36]:
np.mean(classifications[:1000])

0.526

In [37]:
np.mean(classifications[1000:])

0.495

In [40]:
np.mean([i == 1 for i in classifications[:1000]])

0.526

In [41]:
np.mean([i == 0 for i in classifications[1000:]])

0.505

In [66]:
normalized_vecs = [v / np.linalg.norm(v) for v in dim_reduced_vecs]

In [67]:
cos_similarities = normalized_vecs @ kmeans.centroids.T
assignments = np.argmax(cos_similarities, axis=1)

In [68]:
pd.Series(assignments).value_counts()

9    2140
1    1548
2    1244
4     903
3     886
6     860
5     855
0     817
7     659
8      88
dtype: int64

In [69]:
for centroid in kmeans.centroids:
    print(np.linalg.norm(centroid))

1.0000001
1.0000001
0.9999999
1.0000004
1.0000005
0.99999976
1.0
1.0
1.0000001
1.0000001
