In [1]:
!git clone git@github.com:raynardj/ray.git
!yes | conda install bcolz

fatal: destination path 'ray' already exists and is not an empty directory.
Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

yes: standard output: Broken pipe


In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from ray.kmean_torch import kmeans_core

In [3]:
# Read the sentence table and preview its first rows.
sentences_csv = "data/guttenberg-sentences-base.csv"
df = pd.read_csv(sentences_csv)
df.head()

Unnamed: 0,text
0,"""Yessir, I do."
1,!+« riefen sie.
2,!chaleur du !
3,!« sagte Bebel.
4,!” I like that.


In [4]:
# Memory-map the .npy file and cast to float16 straight from the mapping, so a
# full-size in-memory copy in the file's stored dtype is never materialised
# next to the half-precision array. (This matters when the file stores a wider
# dtype such as float32; the stored dtype is not visible from this notebook.)
# subok=False yields a plain in-memory ndarray instead of an np.memmap subclass.
embeddings = np.load(
    "data/guttenberg-sentences-embeddings.npy", mmap_mode="r"
).astype(np.float16, subok=False)
embeddings.shape

(19153433, 384)

In [5]:
class DotKmeans(kmeans_core):
    """k-means variant whose distance is one minus the dot product with unit-length centroids."""

    def calc_distance(self, dt):
        """Return ``1.0 - dt @ c.T`` where ``c`` is ``self.cent`` with each row scaled to unit L2 norm.

        Only the centroids are rescaled here; ``dt`` is used as given.
        NOTE(review): presumably ``dt`` is (batch, dim) and ``self.cent`` is
        (k, dim), giving a (batch, k) result -- confirm against kmeans_core.
        """
        # Per-centroid L2 norm, kept as a trailing axis so it broadcasts in the division.
        squared = self.cent.pow(2)
        cent_norm = squared.sum(dim=-1, keepdim=True).sqrt()
        # The small epsilon guards against division by zero for an all-zero centroid.
        unit_cent = self.cent / (cent_norm + 1e-6)
        similarity = dt.matmul(unit_cent.T)
        return 1.0 - similarity

In [6]:
# Number of clusters: integer part of sqrt(n // 2), n being the number of embeddings.
half_rows = len(embeddings) // 2
cluster_count = int(half_rows ** 0.5)
# Batch size handed to the k-means implementation.
batch_size = 40000

In [7]:
# Best-effort reproducibility: seed the torch and NumPy global RNGs before
# fitting, using the same seed value as the sampling step's random_state.
# NOTE(review): kmeans_core's source of randomness is not visible here, so both
# generators are seeded -- confirm which one it actually draws from.
torch.manual_seed(42)
np.random.seed(42)

km = DotKmeans(
    k=cluster_count,
    data_array=embeddings,
    batch_size=batch_size,
    epochs=200,
    all_cuda=False,
)
# The value returned by run() is this cell's displayed output.
km.run(verbose=False)

tensor([ 229, 2494, 2902,  ..., 1763, 1928, 1659], device='cuda:0')

In [9]:
# Predict a cluster for every embedding; the float16 array is passed as a float32 tensor.
idx = km.predict(torch.from_numpy(embeddings).float())

In [11]:
# Rows per predicted cluster (value_counts orders them largest first).
cluster_sizes = pd.Series(idx).value_counts()
cluster_sizes

412     237101
1195    175202
1923    165921
2334    124235
564     119232
         ...  
2415        21
3009        19
2454        15
559         14
374          6
Length: 3094, dtype: int64

In [16]:
# Attach each sentence's predicted cluster id as a new "cluster" column.
df = df.assign(cluster=idx)
df.head()

Unnamed: 0,text,cluster
0,"""Yessir, I do.",229
1,!+« riefen sie.,2494
2,!chaleur du !,2902
3,!« sagte Bebel.,2902
4,!” I like that.,1887


In [27]:
def _sample_size(sdf):
    return int(np.ceil(np.sqrt(len(sdf)) ** 1.5))

In [28]:
def _sample_texts(sdf):
    """Draw ``_sample_size(sdf)`` rows from one cluster's frame (fixed seed) and keep their text."""
    return sdf.sample(_sample_size(sdf), random_state=42)["text"]


# One sample per cluster; resetting the index turns the cluster key into a
# column, and the leftover original-row level ("level_1") is dropped.
df_sampled = (
    df.groupby("cluster")
    .apply(_sample_texts)
    .reset_index()
    .drop(columns="level_1")
)
df_sampled.head()

Unnamed: 0,cluster,text
0,0,"""Gobryas is there?"""
1,0,His name's Gonzago.
2,0,"Goneril, gonəril."
3,0,"In discussing the character of Hlestakov, the ..."
4,0,Gomalco Productions.


In [31]:
# Sampled rows per cluster (value_counts orders them largest first).
sampled_cluster_sizes = df_sampled["cluster"].value_counts()
sampled_cluster_sizes

412     10745
1195     8564
1923     8222
2334     6618
564      6417
        ...  
2415       10
3009       10
559         8
2454        8
374         4
Name: cluster, Length: 3094, dtype: int64

In [30]:
# Write the sampled sentences to CSV without the index column.
sampled_csv = "data/guttenberg-sentences-sampled.csv"
df_sampled.to_csv(sampled_csv, index=False)