# Install requirements
Make sure you are running this notebook inside the local conda environment. If it has not been created yet, create one with Python 3.9 by running `conda create --name myenv python=3.9`

In [None]:
! pip install --upgrade pip

In [None]:
! pip install -r "../requirements.txt"

# Load dataset

In [None]:
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets
# Name of the dataset handed to `load_dataset` throughout the notebook.
dataset_name = "code_search_net"

def load_from_cs_net(take: int) -> Dataset:
  """Load the 'python' train split of `dataset_name` and keep only its first `take` rows.

  The sliced rows are wrapped into a fresh `Dataset` through `Dataset.from_dict`.
  """
  python_train_split = load_dataset(dataset_name, 'python', split='train')
  first_rows = python_train_split[:take] # type: ignore
  return Dataset.from_dict(first_rows) # type: ignore

# Embedding models

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for natural-language text (documentation strings and queries).
comment_model = SentenceTransformer('all-mpnet-base-v2')
# Sentence-embedding model used for source code strings.
code_model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
# NOTE(review): `(768)` evaluates to the plain int 768, not a 1-tuple; if `build_dense_model`
# expects a shape tuple this should be `(768,)` — confirm against `models.build_dense_model`.
embedding_shape = (768)

# Generate negative samples

In [None]:
from typing import Iterator
from numpy.random import default_rng


# Shared random generator; the fixed seed makes the sampling below repeatable on a fresh kernel.
random_generator = default_rng(seed=42)

def generate_negative_samples(iterator: Iterator, negative_samples_per_sample: int):
  """Yield positive and negative (code, comment) embedding pairs for every batch.

  For each item of a batch, the matching pair is yielded with `target` 1, followed by pairs
  that combine the same code embedding with the comment embedding of other, randomly chosen
  items of the same batch, each with `target` 0.

  Args:
    iterator: iterable of batches. Each batch maps "code_embedding" and "comment_embedding"
      to sequences that are indexed with the same positions.
    negative_samples_per_sample: number of negative pairs requested per positive pair. The
      number is capped at the count of other items in the batch, so a short (e.g. last)
      batch produces fewer negatives instead of raising inside `choice`.

  Yields:
    Dicts with the keys "code_embedding", "comment_embedding" and "target".
  """
  for batched_sample in iterator:
    codes_embeddings = batched_sample['code_embedding']
    comments_embeddings = batched_sample['comment_embedding']
    batch_indexes = range(len(codes_embeddings))

    for index in batch_indexes:
      # Negatives can only come from the other items of the batch.
      candidate_indexes = [i for i in batch_indexes if i != index]
      # Sampling without replacement fails when more samples than candidates are requested.
      negatives_count = min(negative_samples_per_sample, len(candidate_indexes))
      if negatives_count > 0:
        negative_indexes = random_generator.choice(candidate_indexes, negatives_count, replace=False)
      else:
        negative_indexes = []

      yield {
        "code_embedding": codes_embeddings[index],
        "comment_embedding": comments_embeddings[index],
        "target": 1
      }

      for negative_index in negative_indexes:
        yield {
          "code_embedding": codes_embeddings[index],
          "comment_embedding": comments_embeddings[negative_index],
          "target": 0
        }

def with_neg_samples(dataset: Dataset, negative_samples_per_sample: int, batch_size = 100) -> Dataset:
  """Build a dataset of labelled pairs by adding in-batch negative samples to `dataset`.

  Args:
    dataset: dataset whose rows are consumed in batches by `generate_negative_samples`.
    negative_samples_per_sample: negatives to generate per row; when it is zero or negative
      the input dataset is returned unchanged.
    batch_size: number of rows per batch from which negatives are drawn.

  Returns:
    A dataset created from the rows produced by `generate_negative_samples`, or `dataset`
    itself when no negatives are requested.

  Raises:
    AssertionError: if `negative_samples_per_sample` is not smaller than `batch_size`.
  """
  if negative_samples_per_sample <= 0:
    return dataset

  # Negatives come from the *other* rows of a batch, so at most `batch_size - 1` are available.
  assert negative_samples_per_sample < batch_size, "negative_samples_per_sample must be less than batch_size"

  dataset_with_negative_samples: Dataset = Dataset.from_generator(lambda: generate_negative_samples(dataset.iter(batch_size=batch_size), negative_samples_per_sample)) # type: ignore
  return dataset_with_negative_samples

# Generate embedding dataset

In [None]:
import os


# Number of (code, comment) pairs taken from the train split.
train_count = 2000
# Directory used to cache the computed embeddings between runs.
train_dataset_path = f'../datasets/embeddings_python_train_{train_count}'
# Raw pairs; also read later by the experiments to get the documentation tokens.
train_pairs = load_from_cs_net(train_count)
# True when `train_dataset_path` already exists as a directory.
is_embeddings_dataset_stored = os.path.isdir(train_dataset_path)

def generate_embeddings_in_batch(batched_sample):
  """Encode one batch of pairs into code and comment embeddings.

  Reads the 'func_code_string' and 'func_documentation_string' columns of the batch and
  returns the outputs of `code_model.encode` and `comment_model.encode` under the keys
  "code_embedding" and "comment_embedding".
  """
  code_strings, documentation_strings = batched_sample['func_code_string'], batched_sample['func_documentation_string']

  code_vectors = code_model.encode(code_strings)
  comment_vectors = comment_model.encode(documentation_strings)
  return dict(code_embedding=code_vectors, comment_embedding=comment_vectors)

embeddings_dataset: Dataset
if is_embeddings_dataset_stored:
  # Reuse the embeddings stored by a previous run, keeping at most `train_count` rows.
  embeddings_dataset = Dataset.from_dict(load_from_disk(train_dataset_path)[:train_count]) # type: ignore
else:
  # Compute the embeddings in batches; the original text columns are dropped from the result.
  embeddings_dataset = train_pairs.map(
    generate_embeddings_in_batch,
    batched=True,
    batch_size=100,
    remove_columns=list(train_pairs[0].keys()),
    desc="Generating embeddings"
  ) # type: ignore
  # Store the freshly computed embeddings so the next run can load them from disk.
  embeddings_dataset.save_to_disk(train_dataset_path)

# Train

In [None]:
# Number of epochs passed to `model.fit`.
epoch = 100
# Batch size applied to the train and validation datasets before fitting.
batch_size = 200

## Fit

In [None]:
def to_tf_fit_dataset(negative_samples_per_sample: int):
  """Create the (train, validation) TensorFlow datasets used for fitting.

  The embeddings are shuffled, extended with negative samples, converted to a TensorFlow
  dataset and mapped to `(features, target)` tuples. The first 20% of the elements become
  the validation dataset and the remaining elements the train dataset.

  Returns:
    A tuple `(train_ds, validation_ds)`.
  """
  def split_features_and_target(sample):
    features = {
      "code_embedding": sample["code_embedding"],
      "comment_embedding": sample["comment_embedding"],
    }
    return (features, sample["target"])

  shuffled_embeddings = embeddings_dataset.shuffle()
  labelled_pairs = with_neg_samples(shuffled_embeddings, negative_samples_per_sample)
  tf_dataset = labelled_pairs.to_tf_dataset().map(split_features_and_target)

  dataset_size = tf_dataset.cardinality().numpy() # type: ignore
  print(f"training with {dataset_size} samples")

  # 20% of the elements are held out for validation.
  validation_samples_count = int(dataset_size * 0.2)
  validation_ds = tf_dataset.take(validation_samples_count)
  train_ds = tf_dataset.skip(validation_samples_count)
  return (train_ds, validation_ds)

In [None]:
from keras import callbacks
from models import build_dense_model


# Negative-sample counts to train with; one model is built, fitted and saved per entry.
neg_samples_count = [5]
num_hidden_layers = 4
for neg_count in neg_samples_count:
  model = build_dense_model(num_hidden_layers=num_hidden_layers, input_shape=embedding_shape, model_name=f'dense_{num_hidden_layers}_neg_{neg_count}-dropout20')
  train, validation = to_tf_fit_dataset(neg_count)
  tensor_board_callback = callbacks.TensorBoard(log_dir=f'../logs/{model.name}')

  # Both datasets are batched explicitly, so `batch_size` is not passed to `fit`: Keras asks
  # for it to be left unset when the input is a dataset that already yields batches.
  model.fit(
    train.batch(batch_size),
    validation_data=validation.batch(batch_size),
    epochs=epoch,
    callbacks=[tensor_board_callback]
  )
  # NOTE(review): assuming `model.name` equals the `model_name` given above, this saves to
  # '../models/dense_4_neg_5-dropout20', while the experiment cells load
  # '../models/dense_4_neg_5' — confirm which directory is intended.
  model.save(f'../models/{model.name}')

# Experiments

## Shared code

In [None]:
from typing import Any, List, Optional, TypedDict, List
import numpy as np
from tqdm import tqdm
from keras.models import load_model
import os
import pandas as pd
import plotly.express as px


class SimilarityInput(TypedDict):
  """One (code, comment) embedding pair given to the model by `get_similarities`."""
  # Embedding of the code side of the pair.
  code_embedding: Any
  # Embedding of the comment or query side of the pair.
  comment_embedding: Any

class SimilaryResult(TypedDict):
  """Score produced by `get_similarities` for one input pair."""
  # Model prediction for the pair.
  similarity: np.float32
  # Position of the pair in the list of inputs that was scored.
  pair_index: int

class SearchResult(TypedDict):
  """Score of one stored code embedding for a search query, as returned by `search`."""
  # Model prediction for the (code, query) pair.
  similarity: np.float32
  # Position of the code embedding in `embeddings_dataset`.
  pair_index: int
  # Query the similarity was computed for.
  query: str

def get_similarities(inputs: List[SimilarityInput], model) -> List[SimilaryResult]:
  """Score every input pair with `model`.

  The inputs are converted to a TensorFlow dataset with batches of 10 and the flattened
  predictions are returned together with the position of each input.
  """
  prediction_batches = Dataset.from_list(inputs).to_tf_dataset(batch_size=10)
  scores = model.predict(prediction_batches, verbose=0).flatten()

  results = []
  for position, score in enumerate(scores):
    results.append({ "similarity": score, "pair_index": position })
  return results # type: ignore


def search(query, model) -> List[SearchResult]:
  """Score `query` against every code embedding stored in `embeddings_dataset`.

  The query is encoded with `comment_model`; the returned list keeps the order of
  `embeddings_dataset` (it is not sorted by similarity).
  """
  query_embedding = comment_model.encode([query]).flatten()

  candidate_pairs = []
  for embedding_pair in embeddings_dataset:
    candidate_pairs.append({ "code_embedding": embedding_pair['code_embedding'], "comment_embedding": query_embedding })

  scored_pairs = list(get_similarities(inputs=candidate_pairs, model=model))

  search_results = []
  for scored_pair in scored_pairs:
    search_results.append({
      "similarity": scored_pair['similarity'],
      "pair_index": scored_pair['pair_index'],
      "query": query
    })
  return search_results

In [None]:
class CommentQuery(TypedDict):
  """Query produced by `generate_comment_queries` by dropping one token from a comment."""
  # Remaining tokens joined with single spaces.
  comment_query: str
  # Token that was removed to build `comment_query`.
  removed_word: str

def generate_comment_queries(comment_tokens: List[str], max_words_to_remove = 30) -> Iterator[CommentQuery]:
  """Yield variations of a comment, each one missing a single token.

  Only the first `min(len(comment_tokens), max_words_to_remove)` positions are candidates
  for removal; they are visited in an order shuffled by `random_generator`. The input list
  is not modified.
  """
  candidate_count = min(len(comment_tokens), max_words_to_remove)
  removal_order = list(range(candidate_count))
  random_generator.shuffle(removal_order)

  for position in removal_order:
    # Every token except the one at `position`.
    remaining_tokens = comment_tokens[:position] + comment_tokens[position + 1:]
    yield {
      "comment_query": ' '.join(remaining_tokens),
      "removed_word": comment_tokens[position]
    }

In [None]:
import more_itertools

def wrap(value: str, max_width = 20) -> str:
  """Split `value` on spaces and join groups of at most `max_width` words with "<br>"."""
  words = value.split(' ')
  wrapped_lines = []
  for words_chunk in more_itertools.chunked(words, n=max_width):
    wrapped_lines.append(' '.join(words_chunk))
  return "<br>".join(wrapped_lines)

In [None]:
import plotly.graph_objects as go

class ExperimentLog:
  """Stores the artifacts of one experiment under `../results/<experiment_name>`."""

  def __init__(self, experiment_name: str) -> None:
    """Remember the experiment name and make sure its results directory exists."""
    self.experiment_name = experiment_name
    self.experiment_results_dir = f'../results/{self.experiment_name}'
    # `exist_ok=True` replaces the separate `isdir` check, which could race with another
    # process creating the directory between the check and `makedirs`.
    os.makedirs(self.experiment_results_dir, exist_ok=True)

  @staticmethod
  def _with_extension(name: str, extension: str) -> str:
    """Return `name` unchanged when it already ends with `extension`, else append it."""
    return name if name.endswith(extension) else f'{name}{extension}'

  def save_figure(self, figure: go.Figure, figure_name: str):
    """Write `figure` as an image into the results directory.

    '.png' is appended to `figure_name` unless it already ends with it.
    """
    figure_name_path = self._with_extension(figure_name, '.png')
    figure.write_image(os.path.join(self.experiment_results_dir, figure_name_path))

  def save_file(self, data: str, file_name: str):
    """Write `data` as a text file into the results directory.

    '.txt' is appended to `file_name` unless it already ends with it. An existing file
    with the same name is overwritten.
    """
    file_name_path = self._with_extension(file_name, '.txt')
    # Explicit encoding keeps the output independent of the platform's default encoding.
    with open(os.path.join(self.experiment_results_dir, file_name_path), 'w', encoding='utf-8') as file:
      file.write(data)



## 1. Generalization with a single comment

In [None]:
def run_generalization_experiment(search_model, samples_count):
  """Yield, for each of the first `samples_count` pairs, the similarities of its comment variations.

  Every variation produced by `generate_comment_queries` is scored against the code
  embedding of the same sample. One list of result dicts is yielded per sample, with the
  keys "removed_word", "similarity", "original_query" and "index".
  """
  for sample_index in range(samples_count):
    comment_tokens: list = train_pairs[sample_index]['func_documentation_tokens']
    original_query = ' '.join(comment_tokens)

    yield [
      {
        "removed_word": comment_query['removed_word'],
        "similarity": get_similarity(
          query=comment_query['comment_query'],
          code_embedding=embeddings_dataset[sample_index]['code_embedding'],
          model=search_model,
        ),
        "original_query": original_query,
        "index": index,
      }
      for index, comment_query in enumerate(generate_comment_queries(comment_tokens))
    ]

def get_similarity(query, code_embedding, model) -> np.float32:
  """Return the model's similarity between `query` and a single code embedding.

  The query is encoded with `comment_model` and scored through `get_similarities`.
  """
  query_embedding = comment_model.encode([query]).flatten()
  single_pair = { "code_embedding": code_embedding, 'comment_embedding': query_embedding }
  first_prediction = get_similarities(inputs=[single_pair], model=model)[0]
  return first_prediction['similarity']

### Running the experiment and plotting the results

In [None]:
# Load the trained model and prepare the output directory for the figures of experiment 1.
search_model = load_model('../models/dense_4_neg_5/')
experiment_1_log = ExperimentLog('experiment_1_article')
# NOTE(review): `bad_sample_threshold` is not read anywhere in the visible cells — confirm it is still needed.
bad_sample_threshold = 0.2

# One bar chart per sample: similarity obtained after removing each word of its comment.
for index, results in enumerate(run_generalization_experiment(search_model=search_model, samples_count=100)):
  # Highest similarity first, so the bars are drawn in descending order.
  sorted_results = sorted(results, key=lambda x: x['similarity'], reverse=True)
  # NOTE(review): `n_samples` is not used below — confirm it can be dropped.
  n_samples = len(sorted_results)

  data_frame = pd.DataFrame({
    "removed_word": [result['removed_word'] for result in sorted_results],
    "similarity": [result['similarity'] for result in sorted_results],
  })

  # All results of a sample share the same original query; it is wrapped for the chart title.
  original_query = wrap(sorted_results[0]['original_query'], 18)
  # The x axis uses the row index so repeated words get separate bars; the tick labels show the words.
  fig = px.bar(data_frame, x=data_frame.index, y="similarity", text_auto=True, title=f"<sup>Comment: {original_query}</sup>")
  fig.update_xaxes(title_text='Removed word', tickvals=data_frame.index, ticktext=data_frame["removed_word"].tolist())
  fig.update_yaxes(title_text='Similarity')
  experiment_1_log.save_figure(fig, f'sample_{index}')

## 2. Generalization with comments + search

In [None]:
class Experiment2Result(TypedDict):
  """Outcome of one search performed by `run_experiment_2`."""
  # Index of the pair whose comment was used to build the query.
  sample_index: int
  # Zero-based position of that pair in the ranking sorted by descending similarity.
  search_match_index: int
  # Full comment (all tokens joined with spaces) the query was derived from.
  original_query: str
  # Token removed from the comment to build the query.
  removed_word: str

def get_search_match_index(search_ranking: List[SearchResult], sample_index: int):
  """Return the position in `search_ranking` of the entry whose 'pair_index' is `sample_index`.

  Raises:
    ValueError: if no entry of the ranking refers to `sample_index`.
  """
  match_position = next(
    (position for position, entry in enumerate(search_ranking) if entry['pair_index'] == sample_index),
    None,
  )
  if match_position is None:
    raise ValueError("Pair not found in search ranking")
  return match_position


def run_experiment_2(search_model, pairs_count: int) -> Iterator[List[Experiment2Result]]:
  """Search with every comment variation of the first `pairs_count` pairs.

  For each pair, every query produced by `generate_comment_queries` is run through `search`,
  the results are ranked by descending similarity, and the rank at which the pair's own
  code appears is recorded.

  Args:
    search_model: model handed to `search` to score the (code, query) pairs.
    pairs_count: number of pairs, taken from the start of `train_pairs`, to evaluate.

  Yields:
    One list of `Experiment2Result` per pair, with one entry per generated query.

  Raises:
    ValueError: propagated from `get_search_match_index` when the pair is missing from a ranking.
  """
  for sample_index in tqdm(range(pairs_count), total=pairs_count, desc="Running experiment 2"):
    experiment_results: List[Experiment2Result] = []
    pair = train_pairs[sample_index]
    comment_tokens = pair["func_documentation_tokens"]
    original_query = ' '.join(comment_tokens)

    for comment_query in generate_comment_queries(comment_tokens):
      # `search` expects the query text, so pass the string rather than the whole CommentQuery dict.
      ranking = search(query=comment_query["comment_query"], model=search_model)
      similarity_rank = sorted(ranking, key=lambda it: float(it['similarity']), reverse=True)

      experiment_results.append({
        "sample_index": sample_index,
        "search_match_index": get_search_match_index(similarity_rank, sample_index),
        "original_query": original_query,
        "removed_word": comment_query["removed_word"]
      })

    yield experiment_results

### Metrics and experiment run

In [None]:
def success_rate(results: List[Experiment2Result], k: int) -> float:
  """Return the fraction of results whose 'search_match_index' is below `k`.

  Returns 0 when no result qualifies, which includes the case of an empty `results` list.
  """
  hits = sum(1 for result in results if result['search_match_index'] < k)
  if hits == 0:
    return 0
  return hits / len(results)

def mean_reciprocal_rank(results: List[Experiment2Result]) -> float:
  """Return the mean of `1 / (search_match_index + 1)` over `results`.

  Returns 0.0 for an empty `results` list instead of dividing by zero, matching
  `success_rate`, which also reports 0 when there is nothing to count.
  """
  if not results:
    return 0.0

  # 'search_match_index' is zero-based, hence the + 1 to obtain the rank.
  reciprocal_ranks_sum = sum(1 / (result['search_match_index'] + 1) for result in results)
  return reciprocal_ranks_sum / len(results)

In [None]:
# Load the trained model and run experiment 2 over the first 100 pairs. The generator is
# materialized into a list because the cells below iterate over the results several times.
search_model = load_model('../models/dense_4_neg_5/')
experiment_2_log = ExperimentLog(experiment_name='experiment_2')
experiment_2_results = list(run_experiment_2(search_model=search_model, pairs_count=100))

### Plotting results

In [None]:

# Cut-offs for SuccessRate@k; one line is plotted per value.
k_values = [1, 5, 10]
k_results = []
for k in k_values:
  # Sorted ascending so each line reads as a distribution over the samples.
  success_rates = sorted(success_rate(results, k) for results in experiment_2_results)
  k_results.extend(
    { "Sample": sample_index, "SuccessRate@k": s_rate, "k": k }
    for sample_index, s_rate in enumerate(success_rates)
  )

df = pd.DataFrame(k_results)
fig = px.line(df, x="Sample", y="SuccessRate@k", color="k")
experiment_2_log.save_figure(fig, 'success-rates')

In [None]:

# Mean reciprocal rank of every sample, sorted ascending for plotting.
mrrs = sorted(mean_reciprocal_rank(results) for results in experiment_2_results)

mrr_results = [
  { "Sample": sample_index, "MRR": mrr }
  for sample_index, mrr in enumerate(mrrs)
]

df = pd.DataFrame(mrr_results)
fig = px.line(df, x="Sample", y="MRR")
experiment_2_log.save_figure(fig, 'mrr')

In [None]:
def success_rate_k_mean(k):
  """Return the mean SuccessRate@k over all samples of `experiment_2_results`."""
  per_sample_rates = [success_rate(results, k) for results in experiment_2_results]
  return sum(per_sample_rates) / len(experiment_2_results)

# Mean success rate at k = 1, 5 and 10.
tuple(success_rate_k_mean(cutoff) for cutoff in (1, 5, 10))

In [None]:
def success_rate_edges(k):
  """Return the lowest and highest SuccessRate@k found among the samples of `experiment_2_results`."""
  per_sample_rates = [success_rate(results, k) for results in experiment_2_results]
  lowest, highest = min(per_sample_rates), max(per_sample_rates)
  return { "min": lowest, "max": highest }

# Extremes of the success rate at k = 1, 5 and 10.
tuple(success_rate_edges(cutoff) for cutoff in (1, 5, 10))

In [None]:
def mrr_mean():
  """Return the mean of the per-sample mean reciprocal ranks of `experiment_2_results`."""
  per_sample_mrrs = [mean_reciprocal_rank(results) for results in experiment_2_results]
  return sum(per_sample_mrrs) / len(experiment_2_results)

mrr_mean()

In [None]:
# Number of entries shown from each end of the ranking.
top_n = 5
# (sample index, MRR) tuples ordered by ascending MRR.
mrrs_rank = sorted(
  enumerate(mean_reciprocal_rank(results) for results in experiment_2_results),
  key=lambda index_and_mrr: index_and_mrr[1]
)
# Same data as a mapping from sample index to MRR.
mrr_dict = dict(mrrs_rank)

In [None]:
# `mrrs_rank` is sorted by ascending MRR: show the `top_n` lowest and the `top_n` highest entries.
mrrs_rank[:top_n], mrrs_rank[-top_n:]

## 3. Generalization with CodeSearchNet queries + search

In [None]:
# Load the three python splits and merge them into a single dataset.
python_splits = load_dataset(dataset_name, 'python', split=['train', 'test', 'validation']) # type: ignore
python_full_dataset = concatenate_datasets(python_splits) # type: ignore
splits_info = python_splits[0].info.splits # type: ignore
# Total number of examples across the splits; only used as the progress-bar total below.
python_full_dataset_count = sum(splits_info[split_name].num_examples for split_name in splits_info) # type: ignore

# Maps every 'func_code_url' to the row index of its sample in `python_full_dataset`.
pairs_dataset_lookup = {
  sample['func_code_url']: index
  for index, sample in tqdm(
    enumerate(python_full_dataset),
    desc="Generating dict lookup",
    total=python_full_dataset_count
  )
}
def search_by_url(url: str) -> Optional[int]:
  """Return the index stored in `pairs_dataset_lookup` for `url`, or None when it is unknown."""
  # `dict.get` covers the missing-key case without a bare `except`, which would also hide
  # unrelated errors such as KeyboardInterrupt.
  return pairs_dataset_lookup.get(url)

In [None]:
# Directory where `get_query_samples` stores and reloads the pre-processed query samples.
query_samples_path = '../datasets/query_samples'

def remove_duplicates(dataset: Dataset) -> Dataset:
  """Return a dataset without rows that repeat the same Language, Query, GitHubUrl and Relevance.

  The de-duplication is done through pandas and the index is rebuilt afterwards.
  """
  duplicate_key_columns = ['Language', 'Query', 'GitHubUrl', 'Relevance']
  samples_frame = dataset.to_pandas() # type: ignore
  unique_samples_frame = samples_frame.drop_duplicates(subset=duplicate_key_columns, ignore_index=True) # type: ignore
  return Dataset.from_pandas(unique_samples_frame)

def remove_queries_without_code(dataset: Dataset) -> Dataset:
  """Keep only the rows whose 'GitHubUrl' is known to `search_by_url`."""
  def has_matching_code(sample) -> bool:
    return search_by_url(sample['GitHubUrl']) is not None

  return dataset.filter(has_matching_code, desc="Filtering queries with no corresponding code")

def pre_process_query_samples() -> Dataset:
  """Load the queries CSV, drop duplicated rows and drop rows without a matching code sample."""
  cs_net_queries_dataset: Dataset = Dataset.from_csv('../datasets/code_search_net_queries.csv') # type: ignore

  unique_queries = remove_duplicates(cs_net_queries_dataset)
  return remove_queries_without_code(unique_queries)

def get_query_samples() -> Dataset:
  """Return the pre-processed query samples, using `query_samples_path` as a cache.

  The samples are loaded from disk when possible. If loading fails they are rebuilt with
  `pre_process_query_samples`, saved to `query_samples_path` and returned.
  """
  try:
    return Dataset.load_from_disk(query_samples_path)
  except Exception:
    # Best-effort cache: any loading error falls back to rebuilding the samples. Unlike a
    # bare `except`, this lets KeyboardInterrupt and SystemExit propagate.
    query_samples = pre_process_query_samples()
    query_samples.save_to_disk(query_samples_path)
    return query_samples

### Predict

In [None]:
def get_query_code_embeddings(samples) -> Dataset:
  """Build a dataset of (code, query) embedding pairs for the given query samples.

  Each sample's 'Query' is encoded with `comment_model`; the code found through the
  sample's 'GitHubUrl' is encoded with `code_model`.
  """
  query_texts = [sample['Query'] for sample in samples]

  query_codes = []
  for sample in samples:
    code_row_index = search_by_url(sample['GitHubUrl'])
    query_codes.append(python_full_dataset[code_row_index]['func_code_string'])

  assert len(query_texts) == len(query_codes), "query_texts and query_codes arrays doesn't have the same length"

  query_embeddings = comment_model.encode(query_texts)
  code_embeddings = code_model.encode(query_codes)

  embedding_pairs = [
    { "code_embedding": code_embedding, "comment_embedding": query_embedding }
    for query_embedding, code_embedding in zip(query_embeddings, code_embeddings)
  ]
  return Dataset.from_list(embedding_pairs)

In [None]:
from keras.models import load_model

def validate(model, samples):
  """Predict the similarity of every query sample and pair it with its 'Relevance' value.

  Returns:
    A dict with "predictions" (flattened model output) and "targets" (the 'Relevance' of
    each sample, in the order of `samples`).
  """
  validation_dataset = get_query_code_embeddings(samples).to_tf_dataset(batch_size=10)

  predictions = model.predict(validation_dataset, verbose=0).flatten()
  targets = [sample['Relevance'] for sample in samples]
  return { "predictions": predictions, "targets": targets }


In [None]:
def is_prediction_correct(prediction, target) -> bool:
  """Tell whether `prediction` falls on the side of 0.5 that `target` calls for.

  Targets 0 and 1 require a prediction of at most 0.5; targets 2 and 3 require a
  prediction above 0.5.

  Raises:
    ValueError: if `target` is not one of 0, 1, 2, 3.
  """
  if target not in (0, 1, 2, 3):
    raise ValueError(f"target should be in range of [0, 3]. Instead, it has value of {target}")

  expects_high_score = target in (2, 3)
  return prediction > 0.5 if expects_high_score else prediction <= 0.5

In [None]:
# Only the python queries are evaluated.
validation_query_samples = list(filter(
  lambda query_sample: query_sample['Language'].lower() == 'python',
  get_query_samples()
))
validation_query_samples_count = len(validation_query_samples)

model = load_model('../models/dense_4_neg_5')
search_result = validate(model, validation_query_samples)

# A prediction counts as a hit when it lands on the side of 0.5 expected for its relevance.
hits = sum(
  is_prediction_correct(prediction, target)
  for prediction, target in zip(search_result['predictions'], search_result['targets'])
)
success_percentage = hits / validation_query_samples_count

In [None]:
experiment_3_log = ExperimentLog('experiment_3')
# Persist a one-line summary of the experiment outcome next to the other results.
experiment_3_log.save_file(
  data=f'{hits} correct predictions out of {validation_query_samples_count} samples - {success_percentage:.2%} success rate',
  file_name="cs_net_queries_result.txt"
)