In [1]:
from datasets import Dataset, load_dataset, load_from_disk
from typing import TypedDict, Literal, List

  from .autonotebook import tqdm as notebook_tqdm


# Utils

In [2]:
# Rows per batch; shared by dataset iteration, embedding generation and training.
batch_size = 100
# How many rows of the source split are taken for training.
train_samples_count = 50_000

In [3]:
# Language configurations that can be requested from the dataset.
CodeSearchNetLanguage = Literal['python', 'go', 'java', 'javascript', 'php', 'ruby']
# Split names that can be requested from the dataset.
CodeSearchNetSplit = Literal['train', 'test', 'validation']

class CodeSearchNetSample(TypedDict):
  """Typed view of a single dataset row.

  The keys mirror the column names this notebook reads from the dataset;
  in particular the documentation tokens live under
  `func_documentation_tokens`, which is the key the pre-processing code
  indexes.
  """
  repository_name: str
  func_path_in_repository: str
  func_name: str
  whole_func_string: str
  language: CodeSearchNetLanguage
  func_code_string: str
  # Tokenised code; joined with spaces during pre-processing.
  func_code_tokens: List[str]
  func_documentation_string: str
  # Tokenised documentation; joined with spaces during pre-processing.
  func_documentation_tokens: List[str]
  split_name: CodeSearchNetSplit
  func_code_url: str

In [4]:
from typing import cast

dataset_name = "code_search_net"

def load(language: CodeSearchNetLanguage, split: CodeSearchNetSplit, take: int) -> Dataset:
  """Load one language/split of the dataset and keep only its first `take` rows.

  Args:
    language: dataset configuration name to load.
    split: which split to load.
    take: number of leading rows to keep.

  Returns:
    A new `Dataset` built from the first `take` rows of the requested split.
  """
  whole_split = load_dataset(dataset_name, language, split=split)
  # Slicing a Dataset yields a dict of columns, which is re-wrapped below.
  leading_rows = cast(Dataset, whole_split)[:take]
  return Dataset.from_dict(leading_rows)

# Embedding models

In [5]:
from sentence_transformers import SentenceTransformer

# Encoder applied to the natural-language comments.
comment_model = SentenceTransformer('all-mpnet-base-v2')
# Encoder applied to the code snippets.
code_model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
# Plain int (parentheses alone do not create a tuple; that would be `(768,)`).
# NOTE(review): presumably the output width of both encoders - confirm, and
# confirm whether the comparator model expects an int or a 1-tuple here.
embedding_shape = 768

# Generate negative samples

In [6]:
from typing import Iterator
import numpy as np
from numpy.random import default_rng


# Seeded so the negative sampling is reproducible across runs.
random_generator = default_rng(seed=42)

# When True, the expensive dataset generation is skipped.
skip_generating_ds = True

def generate_negative_samples(iterator: Iterator, negative_samples_per_sample: int):
  """Yield (code, positive comment, negative comment) triplets batch by batch.

  For every sample of every batch, up to `negative_samples_per_sample`
  comments belonging to *other* samples of the same batch are drawn without
  replacement and paired with the sample's own code and comment.

  Args:
    iterator: yields batches as dicts of columns; each batch must provide
      'func_code_tokens' and 'func_documentation_tokens', both lists of
      token lists.
    negative_samples_per_sample: number of negative comments to draw for each
      sample. If a batch has fewer other samples than this, all of them are
      used; a batch with a single sample yields nothing.

  Yields:
    Dicts with the keys "code", "comment_positive" and "comment_negative".
  """
  for batched_sample in iterator:
    processed_codes = [' '.join(code_tokens) for code_tokens in batched_sample['func_code_tokens']]
    processed_comments = [' '.join(comment_tokens) for comment_tokens in batched_sample['func_documentation_tokens']]
    batch_indexes = range(len(processed_codes))
    # Built once per batch so the fancy-indexing below does not re-copy it
    # for every sample.
    comments_array = np.array(processed_comments)

    for index, code, comment in zip(batch_indexes, processed_codes, processed_comments):
      # Negatives are excluded by position, not by text: a different sample
      # with an identical comment can still be drawn.
      indexes = [i for i in batch_indexes if i != index]
      # Sampling without replacement cannot draw more items than exist, so
      # cap the request for short batches instead of raising ValueError.
      draw_count = min(negative_samples_per_sample, len(indexes))
      if draw_count <= 0:
        continue
      negative_indexes = random_generator.choice(indexes, draw_count, replace=False)
      for negative_comment in comments_array[negative_indexes]:
        yield {
          "code": code,
          "comment_positive": comment,
          "comment_negative": negative_comment,
        }

def pre_process_sample(sample):
  """Collapse a row's code and documentation token lists into two strings.

  Args:
    sample: mapping with 'func_code_tokens' and 'func_documentation_tokens',
      each an iterable of string tokens.

  Returns:
    Dict with "code" and "comment", each the tokens joined by single spaces.
  """
  separator = ' '
  joined_code = separator.join(sample['func_code_tokens'])
  joined_comment = separator.join(sample['func_documentation_tokens'])
  return {"code": joined_code, "comment": joined_comment}

# Only rebuild the triplet dataset when generation has not been switched off.
if not skip_generating_ds:
  # Take the first `train_samples_count` python training rows and add the
  # joined "code" / "comment" columns.
  pre_processed_ds: Dataset = (
    load('python', 'train', take=train_samples_count)
    .map(pre_process_sample, desc="Loading and pre-processing")
  )
  # Three negative comments are drawn for every sample, batch by batch.
  full_ds: Dataset = Dataset.from_generator(
    lambda: generate_negative_samples(pre_processed_ds.iter(batch_size=batch_size), 3)
  ) # type: ignore

# Train

In [7]:
def add_embeddings(batched_sample):
  """Encode one batch of triplets into embeddings.

  Args:
    batched_sample: batch with the columns "code", "comment_positive" and
      "comment_negative".

  Returns:
    Dict with "code_embedding" (from `code_model`) and
    "comment_positive_embedding" / "comment_negative_embedding"
    (both from `comment_model`).
  """
  code_vectors = code_model.encode(batched_sample["code"])
  positive_vectors = comment_model.encode(batched_sample["comment_positive"])
  negative_vectors = comment_model.encode(batched_sample["comment_negative"])
  return {
    "code_embedding": code_vectors,
    "comment_positive_embedding": positive_vectors,
    "comment_negative_embedding": negative_vectors,
  }

if not skip_generating_ds:
  # Replace the text columns with their embeddings, one batch at a time.
  train_ds = full_ds.map(
    add_embeddings,
    remove_columns=list(full_ds.features.keys()),
    batch_size=batch_size,
    batched=True,
    desc="Generating embeddings",
  )
else:
  # Nothing was generated in this run; None marks that state.
  train_ds = None

In [8]:
# Freshly generated embeddings are persisted; otherwise the saved copy is read.
if train_ds is not None:
  train_ds.save_to_disk('../datasets/train_with_negative_samples')
else:
  train_ds = load_from_disk('../datasets/train_with_negative_samples')

In [9]:
from models import embedding_comparator_model


# Build the comparator for inputs of size `embedding_shape`.
# NOTE(review): margin=0.1 is presumably the margin of a triplet-style loss - confirm in `models`.
model = embedding_comparator_model(
  input_shape=embedding_shape,
  margin=0.1,
)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-10-07 17:22:02.707217: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-07 17:22:02.707314: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
# Every third row among the first 30000 rows, i.e. 10000 rows.
# NOTE(review): the step of 3 presumably matches the three negatives generated
# per sample, keeping one triplet per original sample - confirm.
test_train_indexes = range(0, 30_000, 3)
new_train_ds = train_ds.select(test_train_indexes)

In [11]:
from datetime import datetime
from keras import callbacks

print(f"training with {len(test_train_indexes)} samples")

# Batching is configured here, on the tf.data pipeline itself.
train_tf_dataset = new_train_ds.to_tf_dataset(batch_size=batch_size)

# One log directory per run, keyed by the current timestamp.
# NOTE(review): str(datetime.now()) contains spaces and colons, which are not
# valid in paths on every platform.
tensor_board_callback = callbacks.TensorBoard(log_dir=f'../logs/embedding_comparator_{datetime.now()}')
# `batch_size` is deliberately not passed to fit(): the input is an already
# batched tf.data.Dataset, and Keras documents that `batch_size` must not be
# specified for dataset inputs because they generate their own batches.
model.fit(train_tf_dataset, epochs=50, callbacks=[tensor_board_callback])

training with 10000 samples
Epoch 1/50


2023-10-07 17:22:02.845433: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-10-07 17:22:02.894713: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x31532ce80>

In [12]:
# First 1000 elements of the tf.data view of the training subset.
# NOTE(review): no batch_size is given to to_tf_dataset() here, unlike in
# training - confirm that feeding elements this way is intended.
tests = (
  new_train_ds
  .to_tf_dataset()
  .take(1000)
)
predicts = model.predict(tests)

  39/1000 [>.............................] - ETA: 3s

2023-10-07 17:22:31.729830: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [13]:
first, second = predicts