In [4]:
import dask 
import numpy as np
import dask.dataframe as dd
import pandas as pd
import time 

In [5]:
file = "../data/val/clone-detection-600k-5fold.parquet"
ddf = dd.read_parquet("../data/val/clone-detection-600k-5fold.parquet") 
# only 1000 random rows
ddf_100 = ddf.sample(frac=0.001)
# read only the code1_column
ddf_100_code1 = ddf["code1"]
# 10k random rows
ddf_500 = ddf.sample(frac=0.005)
# # read only the code2_column
ddf_500_code2 = ddf_500["code2"]

# Dask vs Normal Encoding Speed for 1000 and 5000 rows

### Dask

In [6]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('annakotarba/sentence-similarity')
def encode_code(code):
    code = code.to_list()
    return model.encode(code)


In [8]:
# dask
start = time.time()
embeddings_1 = ddf_100_code1.map_partitions(encode_code).compute(scheduler='processes')
print("Time taken: ", time.time()-start)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
# dask
start = time.time()
embeddings_1 = ddf_500_code2.map_partitions(encode_code).compute(scheduler='processes')
print("Time taken: ", time.time()-start)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Time taken:  57.59621000289917


## Without Dask

In [43]:
file = "../data/val/clone-detection-600k-5fold.parquet"
ddf = dd.read_parquet("../data/val/clone-detection-600k-5fold.parquet") 
print(ddf.shape)
# only 1000 random rows
ddf_100 = ddf.sample(frac=0.001)
ddf_100.compute()
# read only the code1_column
ddf_100_code1 = ddf_100["code1"]
ddf_500 = ddf.sample(frac=0.005)
ddf_500.compute()
# read only the code2_column
ddf_500_code2 = ddf_500["code2"]


(<dask_expr.expr.Scalar: expr=ReadParquetFSSpec(74e45e2).size() // 7, dtype=int64>, 7)


In [44]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('annakotarba/sentence-similarity')

start = time.time()
embeddings = model.encode(ddf_100_code1, show_progress_bar=True)
print(embeddings)
print("Time taken: ", time.time()-start)

start = time.time()
embeddings = model.encode(ddf_500_code2, show_progress_bar=True)
print(embeddings)
print("Time taken: ", time.time()-start)



Batches: 100%|██████████| 43/43 [27:35<00:00, 38.51s/it]


[[-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 ...
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]]
Time taken:  1659.4923441410065


Batches: 100%|██████████| 211/211 [2:23:21<00:00, 40.76s/it]  


[[-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 ...
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]
 [-0.12333419 -0.174767   -0.20818347 ...  0.13409105 -0.02637528
   0.09043723]]
Time taken:  8607.2845890522


In [None]:
model_code2_embeddings = model.encode(ddf_500_code2.to_list())

In [11]:
## Model 1

models = {}







# plit the code list into chunks


# parallelize the encoding

embeddings = dask.compute(*embeddings)

print(time.time() - start)
model = SentenceTransformer('annakotarba/sentence-similarity')
start = time.time()
embeddings = model.encode(code, show_progress_bar=True)
print(time.time() - start)

# use dask to parallelize






# similarity = cosine_similarity([embeddings[0]], [embeddings[1]])

0.06683897972106934


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.92it/s]

0.04278826713562012





In [52]:
model.similarity(embeddings, embeddings)
# plt heatmap


tensor([[1.0000, 0.7171, 0.3585,  ..., 0.6446, 0.5378, 0.6475],
        [0.7171, 1.0000, 0.3667,  ..., 0.6303, 0.6153, 0.6687],
        [0.3585, 0.3667, 1.0000,  ..., 0.4309, 0.6593, 0.3520],
        ...,
        [0.6446, 0.6303, 0.4309,  ..., 1.0000, 0.5942, 0.7079],
        [0.5378, 0.6153, 0.6593,  ..., 0.5942, 1.0000, 0.5126],
        [0.6475, 0.6687, 0.3520,  ..., 0.7079, 0.5126, 1.0000]])