# Install requirements
Make sure you're using the local conda env for running this notebook. If is not created yet, create one with python 3.9 by running `conda create --name myenv python=3.9`

In [1]:
! pip install --upgrade pip



In [2]:
! pip install -r "../requirements.txt"



# Load dataset

In [3]:
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets
dataset_name = "code_search_net"

def load_from_cs_net(take: int) -> Dataset:
  ds = load_dataset(dataset_name, 'python', split='train')
  return Dataset.from_dict(ds[:take]) # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


# Embedding models

In [4]:
from sentence_transformers import SentenceTransformer

comment_model = SentenceTransformer('all-mpnet-base-v2')
code_model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
embedding_shape = (768)

# Generate negative samples

In [5]:
from typing import Iterator
from numpy.random import default_rng


random_generator = default_rng(seed=42)

def generate_negative_samples(iterator: Iterator, negative_samples_per_sample: int):
  for batched_sample in iterator:
    codes_embeddings = batched_sample['code_embedding']
    comments_embeddings = batched_sample['comment_embedding']
    batch_indexes = range(len(codes_embeddings))

    for index in batch_indexes:
      indexes = [i for i in batch_indexes if i != index]
      negative_indexes = random_generator.choice(indexes, negative_samples_per_sample, replace=False)

      yield {
        "code_embedding": codes_embeddings[index],
        "comment_embedding": comments_embeddings[index],
        "target": 1
      }

      for negative_index in negative_indexes:
        yield {
          "code_embedding": codes_embeddings[index],
          "comment_embedding": comments_embeddings[negative_index],
          "target": 0
        }

def with_neg_samples(dataset: Dataset, negative_samples_per_sample: int, batch_size = 100) -> Dataset:
  assert negative_samples_per_sample <= batch_size, "negative_samples_per_sample must not be greater than batch_size"
  if negative_samples_per_sample <= 0:
    return dataset
  
  dataset_with_negative_samples: Dataset = Dataset.from_generator(lambda: generate_negative_samples(dataset.iter(batch_size=batch_size), negative_samples_per_sample)) # type: ignore
  return dataset_with_negative_samples

# Generate embedding dataset

In [6]:
import os


train_count = 2000
train_dataset_path = f'../datasets/embeddings_python_train_{train_count}'
train_pairs = load_from_cs_net(train_count)
is_embeddings_dataset_stored = os.path.isdir(train_dataset_path)

def generate_embeddings_in_batch(batched_sample):
  codes = batched_sample['func_code_string']
  comments = batched_sample['func_documentation_string']

  return {
    "code_embedding": code_model.encode(codes),
    "comment_embedding": comment_model.encode(comments),
  }

embeddings_dataset: Dataset = Dataset.from_dict(load_from_disk(train_dataset_path)[:train_count]) if is_embeddings_dataset_stored else train_pairs.map(
  generate_embeddings_in_batch, 
  batched=True, 
  batch_size=100,
  remove_columns=list(train_pairs[0].keys()),
  desc="Generating embeddings"
) # type: ignore

if is_embeddings_dataset_stored == False:
  embeddings_dataset.save_to_disk(train_dataset_path)

# Train

In [None]:
epoch = 100
batch_size = 200

## Add negative samples to train dataset

In [None]:
def to_tf_dataset(negative_samples_per_sample: int):
  tf_train_dataset = with_neg_samples(embeddings_dataset.shuffle(), negative_samples_per_sample).to_tf_dataset().map(lambda sample: ({
    "code_embedding": sample["code_embedding"],
    "comment_embedding": sample["comment_embedding"],
  }, sample["target"]))
  
  return tf_train_dataset

## Fit

In [None]:
from keras import callbacks
from models import build_dense_model

neg_samples_count = [1, 5, 15]
num_hidden_layers = 4
for neg_count in neg_samples_count:
  model = build_dense_model(num_hidden_layers=num_hidden_layers, input_shape=embedding_shape, model_name=f'dense_{num_hidden_layers}_neg_{neg_count}')
  tf_train_dataset = to_tf_dataset(neg_count)
  tensor_board_callback = callbacks.TensorBoard(log_dir=f'../logs/{model.name}')

  model.fit(
    tf_train_dataset.batch(batch_size),
    batch_size=batch_size,
    epochs=epoch,
    callbacks=[tensor_board_callback]
  )
  model.save(f'../models/{model.name}')

# Validation

In [7]:
from typing import Optional
from tqdm import tqdm
from keras.models import load_model

## 1. CodeSearchNet queries

In [None]:
python_splits = load_dataset(dataset_name, 'python', split=['train', 'test', 'validation']) # type: ignore
python_full_dataset = concatenate_datasets(python_splits)
splits_info = python_splits[0].info.splits
python_full_dataset_count = sum([splits_info[key].num_examples for key in splits_info.keys()])

full_dataset_url_index = { sample['func_code_url']: index  for index, sample in tqdm(enumerate(python_full_dataset), desc="Generating dict lookup", total=python_full_dataset_count) }
def search_by_url(url: str) -> Optional[int]:
  try:
    return full_dataset_url_index[url]
  except:
    return None

In [None]:
query_samples_path = '../datasets/query_samples'

def remove_duplicates(dataset: Dataset) -> Dataset:
  pandas_dataset = dataset.to_pandas().drop_duplicates(subset=['Language', 'Query', 'GitHubUrl', 'Relevance'], ignore_index=True) # type: ignore
  dedup_dataset = Dataset.from_pandas(pandas_dataset)
  return dedup_dataset

def remove_queries_without_code(dataset: Dataset) -> Dataset:
  return dataset.filter(lambda sample: search_by_url(sample['GitHubUrl']) is not None, desc="Filtering queries with no corresponding code")

def pre_process_query_samples() -> Dataset:
  cs_net_queries_dataset: Dataset = Dataset.from_csv('../datasets/code_search_net_queries.csv') # type: ignore
  
  return remove_queries_without_code(remove_duplicates(cs_net_queries_dataset))

def get_query_samples() -> Dataset:
  try:
    return Dataset.load_from_disk(query_samples_path)
  except:
    query_samples = pre_process_query_samples()
    query_samples.save_to_disk(query_samples_path)
    return query_samples

In [None]:
query_samples: Dataset = get_query_samples()

### Predict

In [None]:
def get_query_code_embeddings(samples) -> Dataset:
  query_texts = [sample['Query'] for sample in samples]
  query_codes = [python_full_dataset[search_by_url(sample['GitHubUrl'])]['func_code_string'] for sample in samples]
  assert len(query_texts) == len(query_codes), "query_texts and query_codes arrays doesn't have the same length"

  query_embeddings = comment_model.encode(query_texts)
  code_embeddings = code_model.encode(query_codes)

  validation_dataset = []
  for query_embedding, code_embedding in zip(query_embeddings, code_embeddings):
    validation_dataset.append({
      "code_embedding": code_embedding,
      "comment_embedding": query_embedding,
    })

  return Dataset.from_list(validation_dataset)

In [None]:
from keras.models import load_model

def validate(model, samples):
  validation_dataset = get_query_code_embeddings(samples).to_tf_dataset(batch_size=10)

  return {
    "predictions": model.predict(validation_dataset, verbose=0).flatten(),
    "targets": [sample['Relevance'] for sample in samples]
  }


In [None]:
def is_prediction_correct(prediction, target) -> bool:
  if target in [0, 1]:
    return prediction <= 0.5
  
  if target in [2, 3]:
    return prediction > 0.5
  
  raise ValueError(f"target should be in range of [0, 3]. Instead, it has value of {target}")

In [None]:
validation_query_samples = [sample for sample in query_samples if sample['Language'].lower() == 'python']
validation_query_samples_count = len(validation_query_samples)

for model_name in os.listdir('../models/'):
  model = load_model(f'../models/{model_name}')
  result = validate(model, validation_query_samples)
  
  hits = sum([is_prediction_correct(prediction, target) for prediction, target in zip(result['predictions'], result['targets'])])
  success_percentage = hits / validation_query_samples_count

  print(f"model {model_name}: {success_percentage:.2%} - {hits} of {validation_query_samples_count}")

## 2. Generalization experiment

In [35]:
from typing import List


def search(query, model) -> List:
  query_embedding = comment_model.encode([query]).flatten()
  samples = Dataset.from_list([{ "code_embedding": embedding_pair["code_embedding"], 'comment_embedding': query_embedding } for embedding_pair in embeddings_dataset]).to_tf_dataset(batch_size=10)

  predictions = model.predict(samples, verbose=0).flatten()
  results = [{ "prediction": prediction, "index": index } for index, prediction in enumerate(predictions)]

  return results

def top_k(k: int, results: List):
  return [train_pairs[result['index']] for result in results[:k]]

def bottom_k(k: int, results: List):
  return [train_pairs[result['index']] for result in results[-k:]]

In [36]:
search_model = load_model('../models/dense_4_neg_5/')
max_words_to_remove = 30
experiment_results = []
samples_count = 10

for sample in tqdm(train_pairs.to_iterable_dataset().take(samples_count), total=samples_count, desc="Running generalization experiment"):
  comment_tokens: list = sample['func_documentation_tokens']
  words_to_remove_count = min(len(comment_tokens), max_words_to_remove)
  word_indexes_to_remove = list(range(words_to_remove_count))
  random_generator.shuffle(word_indexes_to_remove)
  
  for word_index in word_indexes_to_remove:
    comment_tokens_copy = comment_tokens.copy()
    word_removed = comment_tokens_copy.pop(word_index)
    query = ' '.join(comment_tokens_copy)
    search_results = search(
      query=query,
      model=search_model
    )

    experiment_results.append({
      "query": query,
      "word_removed": word_removed,
      "top_1_code": top_k(1, search_results)[0]['func_code_string'],
      "top_1_prediction_score": search_results[0]['prediction']
    })


Running generalization experiment: 100%|██████████| 10/10 [02:52<00:00, 17.28s/it]


In [38]:
Dataset.from_list(experiment_results).to_csv('../results/generalization_1.csv')

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 364.28ba/s]


86283