## Managing the dataset

In [1]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the training split from the mounted Drive folder into the working directory.
!scp /content/drive/MyDrive/codebert/train.jsonl ./

In [3]:
# Copy the test split, the validation split and the retrieval codebase
# from the mounted Drive folder into the working directory.
!scp /content/drive/MyDrive/codebert/test.jsonl ./
!scp /content/drive/MyDrive/codebert/valid.jsonl ./
!scp /content/drive/MyDrive/codebert/codebase.jsonl ./

In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 32.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling 

In [6]:
import os
import pickle
import random
import torch
import json
import numpy as np
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                  RobertaConfig, RobertaModel, RobertaTokenizer)

In [7]:
class InputFeatures(object):
    """Holds the tokenized code, the tokenized natural-language text and the url of one example."""

    def __init__(self, code_tokens, code_ids, nl_tokens, nl_ids, url):
        # Code side: token strings and their vocabulary ids.
        self.code_tokens, self.code_ids = code_tokens, code_ids
        # Natural-language side: token strings and their vocabulary ids.
        self.nl_tokens, self.nl_ids = nl_tokens, nl_ids
        # Value stored alongside the pair to identify the example.
        self.url = url

In [8]:
def _tokenize_and_pad(text, tokenizer, max_length):
    """Tokenize `text`, wrap it in cls/sep tokens and pad the ids to `max_length`."""
    # Two positions are reserved for the cls and sep tokens; longer inputs are
    # truncated (not dropped) so every example yields exactly `max_length` ids.
    tokens = tokenizer.tokenize(text)[:max_length - 2]
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    ids += [tokenizer.pad_token_id] * (max_length - len(ids))
    return tokens, ids


def convert_examples_to_features(js, tokenizer, code_length=256, nl_length=128):
    """Turn one JSON record into an `InputFeatures` with fixed-length id lists.

    Args:
        js: record providing 'code_tokens', 'docstring_tokens' and 'url'.
        tokenizer: object providing `tokenize`, `convert_tokens_to_ids`,
            `cls_token`, `sep_token` and `pad_token_id`.
        code_length: length of the padded code id list (default 256).
        nl_length: length of the padded natural-language id list (default 128).

    Returns:
        InputFeatures holding the unpadded token strings, the padded ids and the url.

    Raises:
        ValueError: if a length leaves no room for the cls and sep tokens.
    """
    if code_length < 2 or nl_length < 2:
        raise ValueError("code_length and nl_length must be at least 2")

    code_tokens, code_ids = _tokenize_and_pad(' '.join(js['code_tokens']), tokenizer, code_length)
    nl_tokens, nl_ids = _tokenize_and_pad(' '.join(js['docstring_tokens']), tokenizer, nl_length)

    return InputFeatures(code_tokens, code_ids, nl_tokens, nl_ids, js['url'])

In [19]:
class TextDataset(Dataset):
    """Dataset of (code_ids, nl_ids) tensor pairs built from a JSON-lines file."""

    def __init__(self, tokenizer, file_path=None):
        """Read `file_path` and convert every JSON record into input features.

        Args:
            tokenizer: tokenizer forwarded to `convert_examples_to_features`.
            file_path: path of a JSON-lines file holding one example per line.
        """
        self.examples = []
        # Explicit encoding so parsing does not depend on the platform default.
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not records and
                    # would otherwise make json.loads raise.
                    continue
                self.examples.append(convert_examples_to_features(json.loads(line), tokenizer))

        # Show the first few converted examples when the path names a training file.
        if 'train' in file_path:
            for idx, example in enumerate(self.examples[:3]):
                print("*** Example ***")
                print("idx: {}".format(idx))
                # '\u0120' is swapped for '_' purely to make the printed tokens readable.
                print("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
                print("code_ids: {}".format(' '.join(map(str, example.code_ids))))
                print("nl_tokens: {}".format([x.replace('\u0120','_') for x in example.nl_tokens]))
                print("nl_ids: {}".format(' '.join(map(str, example.nl_ids))))

    def __len__(self):
        """Number of converted examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the i-th example as a (code_ids, nl_ids) pair of tensors."""
        example = self.examples[i]
        return (torch.tensor(example.code_ids), torch.tensor(example.nl_ids))

In [10]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer used for every generator (default 42).
    """
    random.seed(seed)
    # Python reads PYTHONHASHSEED at interpreter start-up, so this cannot change
    # hashing in the already-running kernel; it is inherited by processes
    # launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Ask cuDNN to pick deterministic algorithms.
    torch.backends.cudnn.deterministic = True

In [20]:
def train(model, tokenizer, train_data_file="train.jsonl", eval_data_file="valid.jsonl",
          output_dir="./saved_models/python", learning_rate=2e-5, num_train_epochs=10,
          train_batch_size=32, max_grad_norm=1):
    """Train the model and keep the checkpoint with the best validation MRR.

    Each batch is scored by taking the dot product of every natural-language
    vector with every code vector; cross-entropy against the diagonal makes the
    paired code the target for its query. After every epoch `evaluate` is run
    on `eval_data_file` and the model is saved when 'eval_mrr' improves.

    Args:
        model: module called as `model(code_inputs=...)` / `model(nl_inputs=...)`
            that exposes an `encoder` with `save_pretrained`.
        tokenizer: tokenizer handed to `TextDataset` and `evaluate`.
        train_data_file: JSON-lines file with the training examples.
        eval_data_file: JSON-lines file with the queries used for model selection.
        output_dir: directory under which 'checkpoint-best-mrr' is created.
        learning_rate: AdamW learning rate.
        num_train_epochs: number of passes over the training data.
        train_batch_size: examples per optimisation step.
        max_grad_norm: gradient-norm clipping threshold.
    """
    print("starting")
    print("getting ready for training !!")

    # Training data, shuffled on every epoch.
    train_dataset = TextDataset(tokenizer, train_data_file)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=train_batch_size, num_workers=4)

    print("data loader is ready")

    # Optimizer plus a linear decay schedule without warm-up.
    total_steps = len(train_dataloader) * num_train_epochs
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=total_steps)

    # print() does not interpolate %-style arguments like a logger, so format explicitly.
    print("***** Running training *****")
    print("  Num examples = {}".format(len(train_dataset)))
    print("  Num Epochs = {}".format(num_train_epochs))
    print("  Total train batch size  = {}".format(train_batch_size))
    print("  Total optimization steps = {}".format(total_steps))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_fct = CrossEntropyLoss()

    # Checkpoint locations are computed once; reusing `output_dir` for the file
    # path would nest a new directory under 'model.bin' on the next improvement.
    checkpoint_dir = os.path.join(output_dir, 'checkpoint-best-mrr')
    checkpoint_file = os.path.join(checkpoint_dir, 'model.bin')

    model.zero_grad()
    model.train()
    tr_num, tr_loss, best_mrr = 0, 0, 0
    for idx in range(num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            # Batch is (code_ids, nl_ids) as produced by TextDataset.__getitem__.
            code_inputs = batch[0].to(device)
            nl_inputs = batch[1].to(device)

            code_vec = model(code_inputs=code_inputs)
            nl_vec = model(nl_inputs=nl_inputs)

            # scores[a, c] = <nl_vec[a], code_vec[c]>; row a should peak at column a.
            scores = torch.einsum("ab,cb->ac", nl_vec, code_vec)
            targets = torch.arange(code_inputs.size(0), device=scores.device)
            loss = loss_fct(scores, targets)

            # Report the running mean loss every 100 steps, then reset it.
            tr_loss += loss.item()
            tr_num += 1
            if (step + 1) % 100 == 0:
                print("epoch {} step {} loss {}".format(idx, step + 1, round(tr_loss / tr_num, 5)))
                tr_loss = 0
                tr_num = 0

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # The query file is passed explicitly: `evaluate` needs it to build the query set.
        results = evaluate(model, tokenizer, eval_data_file, eval_when_training=True)
        for key, value in results.items():
            print("  {} = {}".format(key, round(value, 4)))

        # Save whenever the validation MRR improves.
        if results['eval_mrr'] > best_mrr:
            best_mrr = results['eval_mrr']
            print("  " + "*" * 20)
            print("  Best mrr:{}".format(round(best_mrr, 4)))
            print("  " + "*" * 20)

            os.makedirs(checkpoint_dir, exist_ok=True)
            # Unwrap a DataParallel-style wrapper before saving.
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.encoder.save_pretrained(checkpoint_dir)
            torch.save(model_to_save.state_dict(), checkpoint_file)
            print("Saving model checkpoint to {}".format(checkpoint_file))

In [None]:
def _mean_reciprocal_rank(sort_ids, query_urls, code_urls, top_k=1000):
    """Average 1/rank of the candidate whose url equals the query url (0 when not in the top_k)."""
    reciprocal_ranks = []
    for url, ranked in zip(query_urls, sort_ids):
        reciprocal = 0
        for position, code_idx in enumerate(ranked[:top_k], start=1):
            if code_urls[code_idx] == url:
                reciprocal = 1 / position
                # Only the first hit determines the rank.
                break
        reciprocal_ranks.append(reciprocal)
    return float(np.mean(reciprocal_ranks))


def evaluate(model, tokenizer, file_name="valid.jsonl", eval_when_training=False):
    """Rank every codebase entry for every query and report the mean reciprocal rank.

    Args:
        model: module called as `model(code_inputs=...)` / `model(nl_inputs=...)`.
        tokenizer: tokenizer handed to `TextDataset`.
        file_name: JSON-lines file with the queries (default "valid.jsonl").
        eval_when_training: when False and more than one GPU is visible, the
            model is wrapped in `torch.nn.DataParallel` for the evaluation.

    Returns:
        dict with the single key 'eval_mrr'.
    """
    eval_batch_size = 64
    codebase_file = "codebase.jsonl"

    # Sequential samplers keep the embeddings aligned with `dataset.examples`.
    query_dataset = TextDataset(tokenizer, file_name)
    query_sampler = SequentialSampler(query_dataset)
    query_dataloader = DataLoader(query_dataset, sampler=query_sampler,
                                  batch_size=eval_batch_size, num_workers=4)

    code_dataset = TextDataset(tokenizer, codebase_file)
    code_sampler = SequentialSampler(code_dataset)
    code_dataloader = DataLoader(code_dataset, sampler=code_sampler,
                                 batch_size=eval_batch_size, num_workers=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    # multi-gpu evaluate
    if n_gpu > 1 and eval_when_training is False:
        model = torch.nn.DataParallel(model)

    # print() does not interpolate %-style arguments like a logger, so format explicitly.
    print("***** Running evaluation *****")
    print("  Num queries = {}".format(len(query_dataset)))
    print("  Num codes = {}".format(len(code_dataset)))
    print("  Batch size = {}".format(eval_batch_size))

    model.eval()
    code_vecs = []
    nl_vecs = []
    with torch.no_grad():
        # Queries use the natural-language half of each example ...
        for batch in query_dataloader:
            nl_inputs = batch[1].to(device)
            nl_vecs.append(model(nl_inputs=nl_inputs).cpu().numpy())

        # ... candidates use the code half.
        for batch in code_dataloader:
            code_inputs = batch[0].to(device)
            code_vecs.append(model(code_inputs=code_inputs).cpu().numpy())
    # The model is always left in training mode afterwards.
    model.train()

    code_vecs = np.concatenate(code_vecs, 0)
    nl_vecs = np.concatenate(nl_vecs, 0)

    # Dot-product similarity; each row is sorted from best to worst candidate.
    scores = np.matmul(nl_vecs, code_vecs.T)
    sort_ids = np.argsort(scores, axis=-1, kind='quicksort', order=None)[:, ::-1]

    nl_urls = [example.url for example in query_dataset.examples]
    code_urls = [example.url for example in code_dataset.examples]

    result = {
        "eval_mrr": _mean_reciprocal_rank(sort_ids, nl_urls, code_urls, top_k=1000)
    }

    return result

In [15]:
import torch.nn as nn
class Model(nn.Module):
    """Wraps an encoder and returns element [1] of its output for code or text ids."""

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, code_inputs=None, nl_inputs=None):
        """Encode `code_inputs` when given, otherwise `nl_inputs`."""
        # Code takes precedence when both arguments are supplied.
        inputs = nl_inputs if code_inputs is None else code_inputs
        # Positions whose id equals 1 are excluded from attention.
        outputs = self.encoder(inputs, attention_mask=inputs.ne(1))
        return outputs[1]

In [13]:
#set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# print() does not interpolate %-style arguments like a logger, so format explicitly.
print("device: {}, n_gpu: {}".format(device, n_gpu))

device: %s, n_gpu: %s cuda 1


In [16]:
# Seed every RNG before any weights are created
seed = 123456
set_seed(seed)

# Load the pretrained config, tokenizer and encoder, then wrap the encoder
config, tokenizer = (
    RobertaConfig.from_pretrained("microsoft/codebert-base"),
    RobertaTokenizer.from_pretrained("microsoft/codebert-base"),
)
model = Model(RobertaModel.from_pretrained("microsoft/codebert-base"))
model.to(device)

Model(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

## Training

In [21]:
# Start training with the model and tokenizer built above.
train(model, tokenizer)

starting
getting ready for training !!
*** Example ***
idx: 0
code_tokens: ['<s>', 'def', '_split', '_', 'ph', 'yl', 'ogen', 'y', '_(', '_p', '_,', '_level', '_=', '_"', 's', '"', '_)', '_:', '_level', '_=', '_level', '_+', '_"', '__', '"', '_result', '_=', '_p', '_.', '_split', '_(', '_level', '_)', '_return', '_result', '_[', '_0', '_]', '_+', '_level', '_+', '_result', '_[', '_1', '_]', '_.', '_split', '_(', '_"', ';"', '_)', '_[', '_0', '_]', '</s>']
code_ids: 0 9232 3462 1215 3792 4360 11575 219 36 181 2156 672 5457 22 29 113 4839 4832 672 5457 672 2055 22 30529 113 898 5457 181 479 3462 36 672 4839 671 898 646 321 27779 2055 672 2055 898 646 112 27779 479 3462 36 22 42777 4839 646 321 27779 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

  cpuset_checked))


epoch 0 step 100 loss 2.52234
epoch 0 step 200 loss 0.29045
epoch 0 step 300 loss 0.23485
epoch 0 step 400 loss 0.21591
epoch 0 step 500 loss 0.21159
epoch 0 step 600 loss 0.22245
epoch 0 step 700 loss 0.17431
epoch 0 step 800 loss 0.19053
epoch 0 step 900 loss 0.15714
epoch 0 step 1000 loss 0.17104
epoch 0 step 1100 loss 0.18189
epoch 0 step 1200 loss 0.16132
epoch 0 step 1300 loss 0.16131
epoch 0 step 1400 loss 0.15319
epoch 0 step 1500 loss 0.14891
epoch 0 step 1600 loss 0.17394
epoch 0 step 1700 loss 0.14501
epoch 0 step 1800 loss 0.15337
epoch 0 step 1900 loss 0.15881
epoch 0 step 2000 loss 0.14745
epoch 0 step 2100 loss 0.14857
epoch 0 step 2200 loss 0.15736
epoch 0 step 2300 loss 0.13985
epoch 0 step 2400 loss 0.15026
epoch 0 step 2500 loss 0.15878
epoch 0 step 2600 loss 0.14545
epoch 0 step 2700 loss 0.12214
epoch 0 step 2800 loss 0.15591
epoch 0 step 2900 loss 0.14117
epoch 0 step 3000 loss 0.13655
epoch 0 step 3100 loss 0.10615
epoch 0 step 3200 loss 0.13867
epoch 0 step 3300

KeyboardInterrupt: ignored

## Evaluation

In [None]:
# evaluation on dataset
results = {}
output_dir = "./saved_models/python"
checkpoint_prefix = 'checkpoint-best-mrr/model.bin'
eval_dataset = "valid.jsonl"
test_dataset = "test.jsonl"
output_dir = os.path.join(output_dir, checkpoint_prefix)
# map_location lets a checkpoint saved on a GPU load on whatever device is in use now.
# NOTE(review): strict=False silently ignores missing/unexpected keys - confirm this is intended.
model.load_state_dict(torch.load(output_dir, map_location=device), strict=False)
model.to(device)

# Score the validation split first, then the test split, with the same reporting.
for split_name, split_file in (("Eval", eval_dataset), ("Test", test_dataset)):
    result = evaluate(model, tokenizer, split_file)
    results[split_name] = result
    print("***** {} results *****".format(split_name))
    for key in sorted(result.keys()):
        # print() does not interpolate %-style arguments like a logger, so format explicitly.
        print("  {} = {}".format(key, round(result[key], 4)))

# `result` holds the metrics of the last split evaluated, i.e. the test split.
print("==========Result=============")
print(result)