# Load dataset

In [1]:
from datasets import Dataset, load_dataset, load_from_disk
from typing import Literal

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Static type restricting a language name to one of the six values listed
# (presumably the language configurations of the CodeSearchNet dataset — confirm).
CodeSearchNetLanguage = Literal['python', 'go', 'java', 'javascript', 'php', 'ruby']
# Static type restricting a split name to 'train', 'test' or 'validation'.
CodeSearchNetSplit = Literal['train', 'test', 'validation']

In [3]:
# Name of the dataset passed to `load_dataset`.
dataset_name = "code_search_net"

def load_from_cs_net(
  take: int,
  language: "CodeSearchNetLanguage" = 'python',
  split: "CodeSearchNetSplit" = 'train',
) -> Dataset:
  """Load the first `take` rows of one split of the `dataset_name` dataset.

  Args:
    take: Number of leading rows to keep (applied as the slice `ds[:take]`).
    language: Dataset configuration to load. Defaults to 'python', which is
      the value that used to be hard-coded.
    split: Dataset split to load. Defaults to 'train', which is the value
      that used to be hard-coded.

  Returns:
    A new `Dataset` built from the sliced columns.
  """
  ds = load_dataset(dataset_name, language, split=split)
  # Slicing a Dataset returns a dict of columns, so it is wrapped back into a Dataset.
  return Dataset.from_dict(ds[:take]) # type: ignore


# Embedding models

In [4]:
from sentence_transformers import SentenceTransformer

# Sentence encoder applied to the documentation strings (see generate_embeddings_in_batch).
comment_model = SentenceTransformer('all-mpnet-base-v2')
# Sentence encoder applied to the function source code (see generate_embeddings_in_batch).
code_model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
# Passed as `input_shape` to build_dense_model.
# NOTE(review): `(768)` is a parenthesized int, not a 1-tuple. Write `(768,)` if a
# shape tuple is intended — confirm against what build_dense_model expects.
# Presumably 768 is the output dimension of both encoders above — TODO confirm.
embedding_shape = (768)

# Generate negative samples

In [5]:
from typing import Iterator
import numpy as np
from numpy.random import default_rng


# Seeded generator so the negative-sample draws are reproducible across runs.
random_generator = default_rng(seed=42)

def generate_negative_samples(iterator: Iterator, negative_samples_per_sample: int):
  """Yield labelled (code, comment) embedding pairs for every batch in `iterator`.

  For each row of a batch this yields one positive pair (the row's own code
  and comment embeddings, target 1) followed by negative pairs (the row's code
  embedding with the comment embedding of another row of the same batch,
  target 0). Negative rows are drawn without replacement using the module-level
  `random_generator`.

  Args:
    iterator: Iterable of batches. Each batch is a mapping with the keys
      'code_embedding' and 'comment_embedding' holding equally long sequences.
    negative_samples_per_sample: Number of negative pairs wanted per row. It
      is capped at the number of other rows in the batch, so a short batch
      (for example the last one) yields fewer negatives instead of raising.

  Yields:
    Dicts with the keys 'code_embedding', 'comment_embedding' and 'target'.
  """
  for batched_sample in iterator:
    codes_embeddings = batched_sample['code_embedding']
    comments_embeddings = batched_sample['comment_embedding']
    batch_length = len(codes_embeddings)
    # Sampling without replacement cannot draw more rows than the batch has
    # besides the current one.
    negatives_count = min(negative_samples_per_sample, batch_length - 1)

    for index in range(batch_length):
      if negatives_count == 0:
        # Nothing to draw (single-row batch or zero negatives requested).
        negative_indexes = []
      else:
        candidates = [i for i in range(batch_length) if i != index]
        negative_indexes = random_generator.choice(candidates, negatives_count, replace=False)

      yield {
        "code_embedding": codes_embeddings[index],
        "comment_embedding": comments_embeddings[index],
        "target": 1
      }

      for negative_index in negative_indexes:
        yield {
          "code_embedding": codes_embeddings[index],
          "comment_embedding": comments_embeddings[negative_index],
          "target": 0
        }

def with_neg_samples(dataset: Dataset, negative_samples_per_sample: int, batch_size = 100) -> Dataset:
  """Build a labelled dataset of positive and negative embedding pairs.

  Args:
    dataset: Dataset iterated in batches and fed to `generate_negative_samples`.
    negative_samples_per_sample: Number of negative pairs to add per row.
    batch_size: Size of the batches negatives are drawn from. Negatives for a
      row always come from that row's own batch.

  Returns:
    A dataset produced by `generate_negative_samples`. When fewer than one
    negative per sample is requested, the input `dataset` is returned
    unchanged (note: in that case no 'target' column is added).
  """
  # Only skip generation when no negatives are requested. The previous `<= 1`
  # guard also skipped a request for exactly one negative per sample.
  if negative_samples_per_sample < 1:
    return dataset

  dataset_with_negative_samples: Dataset = Dataset.from_generator(lambda: generate_negative_samples(dataset.iter(batch_size=batch_size), negative_samples_per_sample)) # type: ignore
  return dataset_with_negative_samples

# Generate embedding dataset for training

In [6]:
def generate_embeddings_in_batch(batched_sample):
  """Encode one batch of functions and their documentation into embeddings.

  Reads the 'func_code_string' and 'func_documentation_string' columns of
  `batched_sample`, encodes the former with `code_model` and the latter with
  `comment_model`, and returns both results under the keys 'code_embedding'
  and 'comment_embedding'.
  """
  source_texts, doc_texts = (
    batched_sample['func_code_string'],
    batched_sample['func_documentation_string'],
  )

  code_vectors = code_model.encode(source_texts)
  comment_vectors = comment_model.encode(doc_texts)
  return {"code_embedding": code_vectors, "comment_embedding": comment_vectors}

In [7]:
import os
# Number of (code, comment) pairs to embed; also part of the cache directory name.
train_count = 10000
train_dataset_path = f'../datasets/embeddings_python_train_{train_count}'
# NOTE(review): the raw pairs are loaded even when the cached embeddings exist.
# Kept because `cs_net_pairs` is a notebook-level name other cells may rely on.
cs_net_pairs = load_from_cs_net(train_count)
is_embeddings_dataset_stored = os.path.isdir(train_dataset_path)

embeddings_dataset: Dataset
if is_embeddings_dataset_stored:
  # Reuse the embeddings computed by a previous run.
  embeddings_dataset = load_from_disk(train_dataset_path) # type: ignore
else:
  embeddings_dataset = cs_net_pairs.map(
    generate_embeddings_in_batch,
    batched=True,
    batch_size=100,
    # Drop every original column so only the embedding columns remain.
    # `column_names` also works on an empty dataset, unlike indexing row 0.
    remove_columns=cs_net_pairs.column_names,
    desc="Generating embeddings"
  ) # type: ignore
  # Cache the freshly computed embeddings for later runs.
  embeddings_dataset.save_to_disk(train_dataset_path)

# Train

## Add negative samples to train dataset

In [8]:
negative_samples_per_sample = 5
# Shuffle buffer sized as 40% of the number of positive training pairs.
shuffle_buffer = int(train_count * 0.4)


def _split_features_and_target(row):
  """Turn one labelled row into the (features dict, target) pair used for training."""
  features = {name: row[name] for name in ("code_embedding", "comment_embedding")}
  return features, row["target"]


# Add negatives, convert to a tf.data pipeline, shuffle, then split inputs from the label.
tf_train_dataset = (
  with_neg_samples(embeddings_dataset, negative_samples_per_sample)
  .to_tf_dataset()
  .shuffle(shuffle_buffer)
  .map(_split_features_and_target)
)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-10-12 01:12:30.288570: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-12 01:12:30.288947: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
from models import build_dense_model

# Training hyper-parameters consumed by the fit cell.
batch_size = 200
epoch = 100

# Dense model with four hidden layers, built by the project's `models` module.
model = build_dense_model(
  num_hidden_layers=4,
  input_shape=embedding_shape,
  model_name='dense_4',
)

## Fit embeddings

In [10]:
from datetime import datetime
from keras import callbacks

# Filesystem-safe timestamp: str(datetime.now()) contains a space and colons,
# which are awkward in shell paths and invalid in Windows directory names.
run_timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
tensor_board_callback = callbacks.TensorBoard(log_dir=f'../logs/{model.name}-{run_timestamp}')
# The dataset is batched here, so `batch_size` is not passed to fit():
# Keras documents that it must not be specified for dataset inputs.
model.fit(
  tf_train_dataset.batch(batch_size),
  epochs=epoch,
  callbacks=[tensor_board_callback]
)

Epoch 1/100


2023-10-12 01:12:30.929095: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-10-12 01:12:30.932731: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x30b47b2b0>

In [11]:
# Persist the trained model under ../models/ in a directory named "<model name>-new".
saved_model_path = f'../models/{model.name}-new'
model.save(saved_model_path)

INFO:tensorflow:Assets written to: ../models/dense_4-new/assets


In [12]:
# model.summary()