# Load dataset

In [1]:
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets
dataset_name = "code_search_net"

def load_from_cs_net(take: int) -> Dataset:
  ds = load_dataset(dataset_name, 'python', split='train')
  return Dataset.from_dict(ds[:take]) # type: ignore


  from .autonotebook import tqdm as notebook_tqdm


# Embedding models

In [2]:
from sentence_transformers import SentenceTransformer

comment_model = SentenceTransformer('all-mpnet-base-v2')
code_model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
embedding_shape = (768)

# Generate negative samples

In [3]:
from typing import Iterator
from numpy.random import default_rng


random_generator = default_rng(seed=42)

def generate_negative_samples(iterator: Iterator, negative_samples_per_sample: int):
  for batched_sample in iterator:
    codes_embeddings = batched_sample['code_embedding']
    comments_embeddings = batched_sample['comment_embedding']
    batch_indexes = range(len(codes_embeddings))

    for index in batch_indexes:
      indexes = [i for i in batch_indexes if i != index]
      negative_indexes = random_generator.choice(indexes, negative_samples_per_sample, replace=False)

      yield {
        "code_embedding": codes_embeddings[index],
        "comment_embedding": comments_embeddings[index],
        "target": 1
      }

      for negative_index in negative_indexes:
        yield {
          "code_embedding": codes_embeddings[index],
          "comment_embedding": comments_embeddings[negative_index],
          "target": 0
        }

def with_neg_samples(dataset: Dataset, negative_samples_per_sample: int, batch_size = 100) -> Dataset:
  assert negative_samples_per_sample <= batch_size, "negative_samples_per_sample must not be greater than batch_size"
  if negative_samples_per_sample <= 0:
    return dataset
  
  dataset_with_negative_samples: Dataset = Dataset.from_generator(lambda: generate_negative_samples(dataset.iter(batch_size=batch_size), negative_samples_per_sample)) # type: ignore
  return dataset_with_negative_samples

# Generate embedding dataset for training

In [4]:
def generate_embeddings_in_batch(batched_sample):
  codes = batched_sample['func_code_string']
  comments = batched_sample['func_documentation_string']

  return {
    "code_embedding": code_model.encode(codes),
    "comment_embedding": comment_model.encode(comments),
  }

In [5]:
import os


train_count = 2000
train_dataset_path = '../datasets/embeddings_python_train_10000'
train_pairs = load_from_cs_net(train_count)
is_embeddings_dataset_stored = os.path.isdir(train_dataset_path)

embeddings_dataset: Dataset = Dataset.from_dict(load_from_disk(train_dataset_path)[:train_count]) if is_embeddings_dataset_stored else train_pairs.map(
  generate_embeddings_in_batch, 
  batched=True, 
  batch_size=100,
  remove_columns=list(train_pairs[0].keys()),
  desc="Generating embeddings"
) # type: ignore

if is_embeddings_dataset_stored == False:
  embeddings_dataset.save_to_disk(train_dataset_path)

# Train

In [None]:
epoch = 100
batch_size = 200

## Add negative samples to train dataset

In [None]:
def to_tf_dataset(negative_samples_per_sample: int):
  tf_train_dataset = with_neg_samples(embeddings_dataset.shuffle(), negative_samples_per_sample).to_tf_dataset().map(lambda sample: ({
    "code_embedding": sample["code_embedding"],
    "comment_embedding": sample["comment_embedding"],
  }, sample["target"]))
  
  return tf_train_dataset

## Fit

In [None]:
from keras import callbacks
from models import build_dense_model

neg_samples_count = [1]
for neg_count in neg_samples_count:
  model = build_dense_model(num_hidden_layers=4, input_shape=embedding_shape, model_name=f'dense_4_neg_{neg_count}')
  tf_train_dataset = to_tf_dataset(neg_count)
  tensor_board_callback = callbacks.TensorBoard(log_dir=f'../logs/{model.name}')

  model.fit(
    tf_train_dataset.batch(batch_size),
    batch_size=batch_size,
    epochs=epoch,
    callbacks=[tensor_board_callback]
  )
  model.save(f'../models/{model.name}')

# Validate using CodeSearchNet queries

## Preprocess CodeSearchNet queries

In [6]:
from tqdm import tqdm


queries = Dataset.from_csv('../datasets/code_search_net_queries.csv')
assert isinstance(queries, Dataset), f"invalid type for queries. Expected Dataset found {type(queries)}"

python_splits = load_dataset(dataset_name, 'python', split=['train', 'test', 'validation'])
python_full_dataset = concatenate_datasets(python_splits)
splits_info = python_splits[0].info.splits
python_full_dataset_count = sum([splits_info[key].num_examples for key in splits_info.keys()])

full_dataset_url_index = { sample['func_code_url']: index  for index, sample in tqdm(enumerate(python_full_dataset), desc="Generating dict lookup", total=python_full_dataset_count) }
def search_by_url(url: str) -> int | None:
  try:
    return full_dataset_url_index[url]
  except:
    return None
  

queries_set = set()
def filter_queries(sample):
  if sample['Language'].lower() != 'python':
    return False
  
  if sample['Query'] in queries_set:
    return False
  else:
    queries_set.add(sample['Query'])

  
  if search_by_url(sample['GitHubUrl']) is None:
    return False

  return True

python_queries = queries.filter(filter_queries)

Generating dict lookup: 100%|██████████| 457461/457461 [00:50<00:00, 9072.99it/s] 


## Generate query/code embedding pairs

In [7]:
query_texts = [sample['Query'] for sample in python_queries]
query_codes = [python_full_dataset[search_by_url(sample['GitHubUrl'])] for sample in python_queries]
assert len(query_texts) == len(query_codes), "query_texts and query_codes arrays doesn't have the same length"

query_embeddings = comment_model.encode(query_texts)
code_embeddings = code_model.encode(query_codes)

validation_dataset = []
for query_embedding, code_embedding in zip(query_embeddings, code_embeddings):
  validation_dataset.append({
    "code_embedding": code_embedding,
    "comment_embedding": query_embedding,
  })
validation_dataset = Dataset.from_list(validation_dataset).to_tf_dataset(batch_size=5)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-10-14 11:39:21.432999: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-14 11:39:21.433480: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Run CodeSearchNet queries validation

In [8]:
from keras.models import load_model

validation_model = load_model('../models/dense_4_neg_1/')
predictions = validation_model.predict(validation_dataset).flatten()
targets = [sample['Relevance'] for sample in python_queries]



2023-10-14 11:39:21.905192: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-10-14 11:39:21.957371: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [9]:
def is_prediction_correct(prediction, target) -> bool:
  if target in [0, 1]:
    return prediction <= 0.5
  
  if target in [2, 3]:
    return prediction > 0.5
  
  raise ValueError(f"target should be in range of [0, 3]. Instead, it has value of {target}")
  
hits = [is_prediction_correct(prediction, target) for prediction, target in zip(predictions, targets)].count(True)
hits

25