# Install requirements
Make sure you are running this notebook with the project's local conda environment. If it has not been created yet, create one with Python 3.9 by running `conda create --name myenv python=3.9`

In [1]:
! pip install --upgrade pip



In [2]:
! pip install -r "../requirements.txt"

Collecting sentence-transformers (from -r ../requirements.txt (line 8))
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting torch>=1.6.0 (from sentence-transformers->-r ../requirements.txt (line 8))
  Downloading torch-2.1.0-cp39-none-macosx_11_0_arm64.whl.metadata (24 kB)
Collecting torchvision (from sentence-transformers->-r ../requirements.txt (line 8))
  Downloading torchvision-0.16.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting scikit-learn (from sentence-transformers->-r ../requirements.txt (line 8))
  Downloading scikit_learn-1.3.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers->-r ../requirements.txt (line 8))
  Downloading scipy-1.11.3-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Load dataset

In [3]:
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets
dataset_name = "code_search_net"

def load_from_cs_net(take: int) -> Dataset:
  """Return the first `take` rows of the Python train split as a new Dataset.

  The dataset named by `dataset_name` is loaded with the 'python'
  configuration and the 'train' split, sliced to its first `take` rows, and
  the sliced columns are wrapped back into a `Dataset`.
  """
  python_train_split = load_dataset(dataset_name, 'python', split='train')
  leading_rows = python_train_split[:take]
  return Dataset.from_dict(leading_rows) # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


# Embedding models

In [4]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model applied to natural-language text (docstrings and queries).
comment_model = SentenceTransformer('all-mpnet-base-v2')
# Sentence-embedding model applied to source-code strings.
code_model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
# Passed to build_dense_model as `input_shape`.
# NOTE(review): `(768)` is the plain int 768, not a 1-tuple -- confirm which form build_dense_model expects.
embedding_shape = (768)

# Generate negative samples

In [5]:
from typing import Iterator, Optional

from numpy.random import default_rng


# Shared, seeded generator so the negative-sample draws follow a fixed sequence.
random_generator = default_rng(seed=42)

def generate_negative_samples(iterator: Iterator, negative_samples_per_sample: int):
  """Yield one positive pair and randomly drawn negative pairs for every sample.

  Args:
    iterator: yields batches; each batch is indexed by 'code_embedding' and
      'comment_embedding' and the two entries are read position by position.
    negative_samples_per_sample: number of mismatched comments to pair with
      each code embedding. When a batch holds fewer other samples than this
      (for example a short final batch), every other sample of the batch is
      used instead of failing.

  Yields:
    Dicts with 'code_embedding', 'comment_embedding' and 'target', where
    target is 1 for the matching pair and 0 for a mismatched pair.
  """
  for batched_sample in iterator:
    codes_embeddings = batched_sample['code_embedding']
    comments_embeddings = batched_sample['comment_embedding']
    batch_indexes = range(len(codes_embeddings))

    for index in batch_indexes:
      # A sample may never be drawn as its own negative.
      indexes = [i for i in batch_indexes if i != index]
      # Sampling without replacement cannot draw more items than exist, so
      # cap the request at the number of candidates in this batch.
      sample_size = min(negative_samples_per_sample, len(indexes))
      if indexes:
        negative_indexes = random_generator.choice(indexes, sample_size, replace=False)
      else:
        # Single-row batch: there is nothing to contrast against.
        negative_indexes = []

      yield {
        "code_embedding": codes_embeddings[index],
        "comment_embedding": comments_embeddings[index],
        "target": 1
      }

      for negative_index in negative_indexes:
        yield {
          "code_embedding": codes_embeddings[index],
          "comment_embedding": comments_embeddings[negative_index],
          "target": 0
        }

def with_neg_samples(dataset: Dataset, negative_samples_per_sample: int, batch_size = 100) -> Dataset:
  """Expand `dataset` with negative (mismatched) code/comment pairs.

  Args:
    dataset: dataset iterated in batches via `dataset.iter`.
    negative_samples_per_sample: negatives to generate per sample; when it is
      zero or negative the input dataset is returned unchanged.
    batch_size: number of samples per batch from which negatives are drawn.

  Returns:
    The input dataset itself, or a new Dataset built from
    `generate_negative_samples`.

  Raises:
    AssertionError: if `negative_samples_per_sample` is not smaller than
      `batch_size`.
  """
  # Negatives are drawn from the OTHER samples of a batch, so at most
  # batch_size - 1 of them exist; equality must be rejected as well.
  assert negative_samples_per_sample < batch_size, "negative_samples_per_sample must be less than batch_size"
  if negative_samples_per_sample <= 0:
    return dataset

  dataset_with_negative_samples: Dataset = Dataset.from_generator(lambda: generate_negative_samples(dataset.iter(batch_size=batch_size), negative_samples_per_sample)) # type: ignore
  return dataset_with_negative_samples

# Generate embedding dataset for training

In [6]:
import os


# Number of rows taken from the CodeSearchNet Python train split.
train_count = 2000
# Directory used to cache the computed embeddings; the name encodes train_count.
train_dataset_path = f'../datasets/embeddings_python_train_{train_count}'
# Raw code/docstring pairs; loaded even when cached embeddings already exist.
train_pairs = load_from_cs_net(train_count)
# True when the cache directory is already present on disk.
is_embeddings_dataset_stored = os.path.isdir(train_dataset_path)

def generate_embeddings_in_batch(batched_sample):
  """Embed one batch of code/docstring pairs.

  Reads the 'func_code_string' and 'func_documentation_string' entries of the
  batch, encodes the former with `code_model` and the latter with
  `comment_model`, and returns both results under 'code_embedding' and
  'comment_embedding'.
  """
  code_texts = batched_sample['func_code_string']
  docstring_texts = batched_sample['func_documentation_string']

  code_vectors = code_model.encode(code_texts)
  docstring_vectors = comment_model.encode(docstring_texts)
  return dict(code_embedding=code_vectors, comment_embedding=docstring_vectors)

# Reuse the cached embeddings when the cache directory exists; otherwise
# compute them from the raw pairs and persist the result for later runs.
embeddings_dataset: Dataset
if is_embeddings_dataset_stored:
  cached_rows = load_from_disk(train_dataset_path)[:train_count]
  embeddings_dataset = Dataset.from_dict(cached_rows)
else:
  raw_column_names = list(train_pairs[0].keys())
  embeddings_dataset = train_pairs.map(
    generate_embeddings_in_batch,
    batched=True,
    batch_size=100,
    remove_columns=raw_column_names,
    desc="Generating embeddings"
  ) # type: ignore
  embeddings_dataset.save_to_disk(train_dataset_path)

Downloading builder script: 100%|██████████| 8.44k/8.44k [00:00<00:00, 15.2MB/s]
Downloading metadata: 100%|██████████| 18.5k/18.5k [00:00<00:00, 10.5MB/s]
Downloading readme: 100%|██████████| 12.9k/12.9k [00:00<00:00, 11.0MB/s]
Downloading data: 100%|██████████| 941M/941M [00:46<00:00, 20.0MB/s]
Downloading data files: 100%|██████████| 1/1 [00:48<00:00, 48.79s/it]
Extracting data files: 100%|██████████| 1/1 [00:08<00:00,  8.05s/it]
Extracting data files: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]
Generating train split: 100%|██████████| 412178/412178 [01:00<00:00, 6766.82 examples/s]
Generating test split: 100%|██████████| 22176/22176 [00:03<00:00, 6758.14 examples/s]
Generating validation split: 100%|██████████| 23107/23107 [00:03<00:00, 6553.27 examples/s]


# Train

In [7]:
# Number of training epochs passed to model.fit.
epoch = 100
# Batch size used when batching the tf.data training dataset for model.fit.
batch_size = 200

## Add negative samples to train dataset

In [8]:
def to_tf_dataset(negative_samples_per_sample: int):
  """Build the training dataset as (features, label) pairs.

  `embeddings_dataset` is shuffled, expanded through `with_neg_samples`,
  converted with `to_tf_dataset()`, and every sample is mapped to a tuple of
  a feature dict ('code_embedding', 'comment_embedding') and its 'target'.
  """
  def split_features_and_label(sample):
    features = {
      "code_embedding": sample["code_embedding"],
      "comment_embedding": sample["comment_embedding"],
    }
    return features, sample["target"]

  shuffled_embeddings = embeddings_dataset.shuffle()
  labelled_pairs = with_neg_samples(shuffled_embeddings, negative_samples_per_sample)
  return labelled_pairs.to_tf_dataset().map(split_features_and_label)

## Fit

In [9]:
from keras import callbacks
from models import build_dense_model

# Train one dense model per negative-sampling ratio and save each of them.
neg_samples_count = [1, 5, 15]
num_hidden_layers = 4
for neg_count in neg_samples_count:
  model = build_dense_model(num_hidden_layers=num_hidden_layers, input_shape=embedding_shape, model_name=f'dense_{num_hidden_layers}_neg_{neg_count}')
  tf_train_dataset = to_tf_dataset(neg_count)
  # One TensorBoard log directory per model, keyed by the model name.
  tensor_board_callback = callbacks.TensorBoard(log_dir=f'../logs/{model.name}')

  # The dataset is batched here, so `batch_size` is deliberately not passed
  # to fit(): Keras documents that it must be left unset for dataset inputs.
  model.fit(
    tf_train_dataset.batch(batch_size),
    epochs=epoch,
    callbacks=[tensor_board_callback]
  )
  model.save(f'../models/{model.name}')

2023-10-20 15:40:25.682503: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2023-10-20 15:40:25.682521: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-10-20 15:40:25.682526: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-10-20 15:40:25.682575: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-20 15:40:25.682788: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Generating train split: 4000 examples [00:00, 6523.18 examples/s]


Epoch 1/100


2023-10-20 15:40:28.218464: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2023-10-20 15:40:28.261700: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

INFO:tensorflow:Assets written to: ../models/dense_4_neg_1/assets
Generating train split: 12000 examples [00:00, 15392.73 examples/s]

Epoch 1/100



2023-10-20 16:07:03.326269: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

INFO:tensorflow:Assets written to: ../models/dense_4_neg_5/assets
Generating train split: 32000 examples [00:01, 24497.12 examples/s]


Epoch 1/100


2023-10-20 16:54:14.636278: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

INFO:tensorflow:Assets written to: ../models/dense_4_neg_15/assets


# Validation

## Generate pairs lookup dictionary

In [None]:
from tqdm import tqdm


# Load every split of the Python subset and concatenate them so that a single
# integer index addresses any sample.
python_splits = load_dataset(dataset_name, 'python', split=['train', 'test', 'validation']) # type: ignore
python_full_dataset = concatenate_datasets(python_splits)
# Total number of examples across all splits, used as the progress-bar total.
splits_info = python_splits[0].info.splits
python_full_dataset_count = sum(splits_info[key].num_examples for key in splits_info.keys())

# Map each function URL to its row index. Only the URL column is read, which
# avoids decoding every other column of every row just to pick one field.
full_dataset_url_index = { url: index for index, url in tqdm(enumerate(python_full_dataset['func_code_url']), desc="Generating dict lookup", total=python_full_dataset_count) }
def search_by_url(url: str) -> Optional[int]:
  """Return the row index stored for `url` in `full_dataset_url_index`.

  Args:
    url: function URL to look up.

  Returns:
    The index recorded for `url`, or None when the URL is not in the lookup.
  """
  # dict.get expresses "missing key -> None" directly; unlike a bare except it
  # does not hide unrelated errors. Optional[int] is used because `int | None`
  # is evaluated when the function is defined and fails before Python 3.10.
  return full_dataset_url_index.get(url)

## 1. CodeSearchNet queries

In [None]:
# Directory where the pre-processed query samples are saved and reloaded from.
query_samples_path = '../datasets/query_samples'

def remove_duplicates(dataset: Dataset) -> Dataset:
  """Return a copy of `dataset` without repeated query annotations.

  Rows are considered duplicates when they agree on 'Language', 'Query',
  'GitHubUrl' and 'Relevance'. The work is done on a pandas frame and the
  result is converted back to a `Dataset` with a fresh index.
  """
  identity_columns = ['Language', 'Query', 'GitHubUrl', 'Relevance']
  frame = dataset.to_pandas() # type: ignore
  unique_rows = frame.drop_duplicates(subset=identity_columns, ignore_index=True) # type: ignore
  return Dataset.from_pandas(unique_rows)

def remove_queries_without_code(dataset: Dataset) -> Dataset:
  """Keep only the rows whose 'GitHubUrl' is found by `search_by_url`."""
  def has_known_code(sample) -> bool:
    return search_by_url(sample['GitHubUrl']) is not None

  return dataset.filter(has_known_code, desc="Filtering queries with no corresponding code")

def pre_process_query_samples() -> Dataset:
  """Load the query CSV, drop duplicate rows, then drop rows without code.

  The CSV is read from '../datasets/code_search_net_queries.csv'; the two
  cleaning steps are `remove_duplicates` followed by
  `remove_queries_without_code`.
  """
  raw_queries: Dataset = Dataset.from_csv('../datasets/code_search_net_queries.csv') # type: ignore
  unique_queries = remove_duplicates(raw_queries)
  return remove_queries_without_code(unique_queries)

def get_query_samples() -> Dataset:
  """Return the pre-processed query samples, building the cache if needed.

  The samples are loaded from `query_samples_path`. When nothing has been
  saved there yet, they are produced by `pre_process_query_samples`, saved to
  that path, and returned.

  Returns:
    The cached or freshly built query samples.
  """
  try:
    return Dataset.load_from_disk(query_samples_path)
  except FileNotFoundError:
    # Only a missing cache triggers a rebuild; any other failure (and
    # KeyboardInterrupt) propagates instead of being silently swallowed.
    query_samples = pre_process_query_samples()
    query_samples.save_to_disk(query_samples_path)
    return query_samples

In [None]:
# Query samples returned by get_query_samples (loaded from disk, or built and saved on failure to load).
query_samples: Dataset = get_query_samples()

### Predict

In [None]:
def get_query_code_embeddings(samples) -> Dataset:
  """Embed each query and the code it points to.

  For every sample the 'Query' text is encoded with `comment_model`, and the
  'func_code_string' of the row found through `search_by_url` on 'GitHubUrl'
  is encoded with `code_model`. The result has one row per sample with
  'code_embedding' and 'comment_embedding' columns.
  """
  query_texts = [sample['Query'] for sample in samples]
  query_codes = []
  for sample in samples:
    row_index = search_by_url(sample['GitHubUrl'])
    query_codes.append(python_full_dataset[row_index]['func_code_string'])
  assert len(query_texts) == len(query_codes), "query_texts and query_codes arrays doesn't have the same length"

  query_embeddings = comment_model.encode(query_texts)
  code_embeddings = code_model.encode(query_codes)

  paired_rows = [
    {"code_embedding": code_embedding, "comment_embedding": query_embedding}
    for query_embedding, code_embedding in zip(query_embeddings, code_embeddings)
  ]
  return Dataset.from_list(paired_rows)

In [None]:
from keras.models import load_model

def validate(model, samples):
  """Score `samples` with `model` and pair the scores with their labels.

  Returns a dict with 'predictions' (the flattened output of
  `model.predict` over the embedded samples, batched by 10) and 'targets'
  (the 'Relevance' value of every sample, in order).
  """
  embedded_pairs = get_query_code_embeddings(samples)
  batched_inputs = embedded_pairs.to_tf_dataset(batch_size=10)
  scores = model.predict(batched_inputs, verbose=0).flatten()
  relevance_labels = [sample['Relevance'] for sample in samples]
  return {"predictions": scores, "targets": relevance_labels}


In [None]:
def is_prediction_correct(prediction, target) -> bool:
  """Tell whether a predicted score agrees with a relevance grade.

  Grades 0 and 1 count as a hit when the score is at most 0.5; grades 2 and 3
  count as a hit when the score is above 0.5.

  Raises:
    ValueError: if `target` is none of 0, 1, 2, 3.
  """
  low_grades, high_grades = (0, 1), (2, 3)
  if target not in low_grades and target not in high_grades:
    raise ValueError(f"target should be in range of [0, 3]. Instead, it has value of {target}")

  return prediction <= 0.5 if target in low_grades else prediction > 0.5

In [None]:
# Only Python queries are validated; the language match is case-insensitive.
validation_query_samples = [sample for sample in query_samples if sample['Language'].lower() == 'python']
validation_query_samples_count = len(validation_query_samples)
# Fail early with a clear message instead of a ZeroDivisionError further down.
assert validation_query_samples_count > 0, "no Python query samples available for validation"

# Sorted so the report order is stable between runs.
for model_name in sorted(os.listdir('../models/')):
  # Hidden entries (e.g. macOS '.DS_Store') are not saved models.
  if model_name.startswith('.'):
    continue

  model = load_model(f'../models/{model_name}')
  result = validate(model, validation_query_samples)

  hits = sum(is_prediction_correct(prediction, target) for prediction, target in zip(result['predictions'], result['targets']))
  success_percentage = hits / validation_query_samples_count

  print(f"model {model_name}: {success_percentage:.2%} - {hits} of {validation_query_samples_count}")