# AI for Software Engineering: Automated Source Code Defect Detection with CodeBERT

In [1]:
# Róisín Luo

In [2]:
import sys
import os
import random
import math

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm import tqdm

import csv
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline

# GPU acceleration just in case

In [3]:
def get_hwacc_device_v3():
    """Select the torch device to run on: CUDA first, then Apple MPS, else CPU.

    When CUDA is selected, the device count, the name of device 0 and its
    currently allocated / reserved memory (in GB) are printed. The selected
    device is always printed before it is returned.

    Returns:
        torch.device: the selected device.
    """
    bytes_per_gb = 1024**3

    if torch.cuda.is_available():
        print('# of CUDA devices:', torch.cuda.device_count())
        print(torch.cuda.get_device_name(0))
        print('CUDA memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0)/bytes_per_gb,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(0)/bytes_per_gb,1), 'GB')

        selected = torch.device('cuda')
    else:
        # MacOS: the mps backend attribute is missing on torch builds that
        # predate it, so it is looked up defensively.
        mps_backend = getattr(getattr(torch, "backends", None), "mps", None)

        if mps_backend is not None and mps_backend.is_available():
            selected = torch.device('mps')
        else:
            selected = torch.device('cpu')

    print("GPU device is: ", selected)

    return selected

In [4]:
device = get_hwacc_device_v3()
#device = torch.device("cpu")
device

Tesla V100-PCIE-16GB
CUDA memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
GPU device is:  cuda


device(type='cuda')

# Loading dataset

In [None]:
from datasets import list_datasets

datasets_list = list_datasets()
len(datasets_list)

In [None]:
for ds in datasets_list:
    if "defect" in ds:
        print(ds)

In [5]:
cache_dir = "dataset_cache"

In [None]:
# Nothing to load here: calling `load_dataset` on the bare cache directory ran
# before `load_dataset` was imported (NameError on a fresh kernel) and did not
# name a dataset. The dataset is downloaded with `load_dataset` and read back
# with `load_from_disk` in the cells below.

In [None]:
from datasets import load_dataset

# One-time download: fetch the dataset from the hub and persist it to disk.
# Skipped when the saved copy already exists, so that re-running the notebook
# neither needs network access nor rewrites the saved files.
saved_dataset_dir = 'dataset_cache/code-code-DefectDetection'

if not os.path.isdir(saved_dataset_dir):
    dataset = load_dataset(path = "semeru/code-code-DefectDetection",
                           cache_dir = cache_dir, 
                           download_mode = "reuse_dataset_if_exists")
    dataset.save_to_disk(saved_dataset_dir)

In [5]:
from datasets import load_from_disk

dataset = load_from_disk("dataset_cache/code-code-DefectDetection")

## Investigating dataset shape

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['project', 'commit_id', 'target', 'func', 'idx'],
        num_rows: 21854
    })
    validation: Dataset({
        features: ['project', 'commit_id', 'target', 'func', 'idx'],
        num_rows: 2732
    })
    test: Dataset({
        features: ['project', 'commit_id', 'target', 'func', 'idx'],
        num_rows: 2732
    })
})

In [7]:
dataset['train'][0]

{'project': 'FFmpeg',
 'commit_id': '973b1a6b9070e2bf17d17568cbaf4043ce931f51',
 'target': 0,
 'func': 'static av_cold int vdadec_init(AVCodecContext *avctx)\n\n{\n\n    VDADecoderContext *ctx = avctx->priv_data;\n\n    struct vda_context *vda_ctx = &ctx->vda_ctx;\n\n    OSStatus status;\n\n    int ret;\n\n\n\n    ctx->h264_initialized = 0;\n\n\n\n    /* init pix_fmts of codec */\n\n    if (!ff_h264_vda_decoder.pix_fmts) {\n\n        if (kCFCoreFoundationVersionNumber < kCFCoreFoundationVersionNumber10_7)\n\n            ff_h264_vda_decoder.pix_fmts = vda_pixfmts_prior_10_7;\n\n        else\n\n            ff_h264_vda_decoder.pix_fmts = vda_pixfmts;\n\n    }\n\n\n\n    /* init vda */\n\n    memset(vda_ctx, 0, sizeof(struct vda_context));\n\n    vda_ctx->width = avctx->width;\n\n    vda_ctx->height = avctx->height;\n\n    vda_ctx->format = \'avc1\';\n\n    vda_ctx->use_sync_decoding = 1;\n\n    vda_ctx->use_ref_buffer = 1;\n\n    ctx->pix_fmt = avctx->get_format(avctx, avctx->codec->pix_f

In [8]:
print(dataset['train'][0]['func'])

static av_cold int vdadec_init(AVCodecContext *avctx)

{

    VDADecoderContext *ctx = avctx->priv_data;

    struct vda_context *vda_ctx = &ctx->vda_ctx;

    OSStatus status;

    int ret;



    ctx->h264_initialized = 0;



    /* init pix_fmts of codec */

    if (!ff_h264_vda_decoder.pix_fmts) {

        if (kCFCoreFoundationVersionNumber < kCFCoreFoundationVersionNumber10_7)

            ff_h264_vda_decoder.pix_fmts = vda_pixfmts_prior_10_7;

        else

            ff_h264_vda_decoder.pix_fmts = vda_pixfmts;

    }



    /* init vda */

    memset(vda_ctx, 0, sizeof(struct vda_context));

    vda_ctx->width = avctx->width;

    vda_ctx->height = avctx->height;

    vda_ctx->format = 'avc1';

    vda_ctx->use_sync_decoding = 1;

    vda_ctx->use_ref_buffer = 1;

    ctx->pix_fmt = avctx->get_format(avctx, avctx->codec->pix_fmts);

    switch (ctx->pix_fmt) {

    case AV_PIX_FMT_UYVY422:

        vda_ctx->cv_pix_fmt_type = '2vuy';

        break;

    case AV_PIX_FMT_YUYV422:

## Setting dataset and splitting dataset

In [9]:
#Setting format to torch or tensorflow
dataset.set_format(type='torch', columns=['func', 'target'])

In [10]:
dataset_train_ = dataset['train']
dataset_val_ = dataset['validation']
dataset_test_ = dataset['test']

In [11]:
dataset_train_[0]['target']

tensor(0)

# Make dataset class

In [12]:
len(dataset_train_)

21854

In [13]:
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

class CodeDefectDataset(torch.utils.data.Dataset):
    """Dataset wrapper that tokenizes each function and splits it into blocks.

    Each item of the wrapped dataset is read through its 'func' (source code)
    and 'target' (label) fields. The code is tokenized with the CodeBERT
    tokenizer and cut into blocks of at most `block_max_len` code tokens.

    Every block is framed with its own CLS and EOS token. The prediction model
    reads the hidden state at position 0 of each block as that block's
    embedding, so position 0 must hold the CLS token in every block, not only
    in the first one.

    Args:
        dataset: indexable dataset whose items expose 'func' and 'target'.
        random_seed: seed passed to `np.random.seed` on construction.
        block_max_len: maximum number of code tokens per block, excluding the
            two special tokens. The default of 500 keeps each framed block
            (502 tokens) below the 512-token limit noted for BERT-style models.

    Raises:
        ValueError: if `block_max_len` is not positive.
    """

    def __init__(self, 
                 dataset, 
                 random_seed = 42,
                 block_max_len = 500):
        
        if block_max_len <= 0:
            raise ValueError("block_max_len must be a positive integer")
        
        self.dataset = dataset
        self.dataset_size = len(dataset)
        self.block_max_len = block_max_len
        self.tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
        
        np.random.seed(random_seed)
        
    def __getitem__(self, index):
        """Return ((code, tokens, block_tokens_list, block_token_ids_list), target).

        `tokens` is the whole function framed once as CLS + code tokens + EOS.
        `block_tokens_list` holds the per-block token lists, each framed as
        CLS + chunk + EOS, and `block_token_ids_list` holds the matching ids.
        """
        #Fetch the row once; each indexing of the wrapped dataset builds the row again.
        sample = self.dataset[index]
        code = sample['func']
        target = sample['target']
        
        cls_token = self.tokenizer.cls_token
        eos_token = self.tokenizer.eos_token
        
        code_tokens = self.tokenizer.tokenize(code)
        tokens = [cls_token] + code_tokens + [eos_token]
        
        block_tokens_list = []
        block_token_ids_list = []
        
        #max(..., 1) guarantees one (CLS, EOS) block even for an empty function.
        for start in range(0, max(len(code_tokens), 1), self.block_max_len):
            chunk = code_tokens[start:start + self.block_max_len]
            block_tokens = [cls_token] + chunk + [eos_token]
            
            block_tokens_list.append(block_tokens)
            block_token_ids_list.append(self.tokenizer.convert_tokens_to_ids(block_tokens))
        
        #Every code token must land in exactly one block (2 special tokens per block).
        assert sum([len(t) - 2 for t in block_tokens_list]) == len(code_tokens)
        assert sum([len(t) - 2 for t in block_token_ids_list]) == len(code_tokens)
        
        return (code, tokens, block_tokens_list, block_token_ids_list), target
    
    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return self.dataset_size

In [14]:
dataset_train = CodeDefectDataset(dataset_train_)
dataset_val = CodeDefectDataset(dataset_val_)
dataset_test = CodeDefectDataset(dataset_test_)

In [15]:
for i in range(0, 3):
    (code, tokens, block_tokens_list, block_token_ids_list), target = dataset_train[i]
    print("len(tokens): ", len(tokens))
    print("len(block_tokens_list): ", len(block_tokens_list))
    print("len(block_token_ids_list): ", len(block_token_ids_list))
    print("target: ", target)
    print()

len(tokens):  1123
len(block_tokens_list):  3
len(block_token_ids_list):  3
target:  tensor(0)

len(tokens):  20598
len(block_tokens_list):  42
len(block_token_ids_list):  42
target:  tensor(0)

len(tokens):  277
len(block_tokens_list):  1
len(block_token_ids_list):  1
target:  tensor(0)



In [16]:
batch_size=10

In [17]:
#making data into batch
def dataset_collate_fn(batch):
    """Collate dataset samples into one batch.

    Each sample is ((code, tokens, block_tokens_list, block_token_ids_list), target).
    The four feature fields are gathered into plain Python lists (they have
    variable length, so they are not stacked); the targets are stacked into a
    single tensor.

    Returns:
        ((code_batch, tokens_batch, block_tokens_list_batch,
          block_token_ids_list_batch), target_batch)
    """
    code_batch = [features[0] for features, _ in batch]
    tokens_batch = [features[1] for features, _ in batch]
    block_tokens_list_batch = [features[2] for features, _ in batch]
    block_token_ids_list_batch = [features[3] for features, _ in batch]

    target_batch = torch.stack([target for _, target in batch])

    return (code_batch, tokens_batch, block_tokens_list_batch, block_token_ids_list_batch), target_batch

In [18]:
dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn = dataset_collate_fn)
dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, shuffle=True, collate_fn = dataset_collate_fn)
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, shuffle=True, collate_fn = dataset_collate_fn)

In [19]:
batch = next(iter(dataloader_train))
len(batch)

2

In [20]:
(code, tokens, block_tokens_list, block_token_ids_list), target = batch

In [21]:
len(block_tokens_list)

10

In [22]:
len(block_token_ids_list)

10

In [23]:
block_token_ids = block_token_ids_list[0]

In [24]:
len(block_token_ids[0])

398

In [25]:
#code
print(code[0])

void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,

                        uint64_t mcg_status, uint64_t addr, uint64_t misc,

                        int abort_on_error)

{

#ifdef KVM_CAP_MCE

    struct kvm_x86_mce mce = {

        .bank = bank,

        .status = status,

        .mcg_status = mcg_status,

        .addr = addr,

        .misc = misc,

    };

    struct kvm_x86_mce_data data = {

            .env = cenv,

            .mce = &mce,

    };



    if (!cenv->mcg_cap) {

        fprintf(stderr, "MCE support is not enabled!\n");

        return;

    }



    run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);

#else

    if (abort_on_error)

        abort();

#endif

}



# Loading pre-trained code-understandable model

We can use CodeBERT or Transformer-XL to convert source code into embeddings, since both are trained on source-code datasets.

In [26]:
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
embed_model = RobertaModel.from_pretrained("microsoft/codebert-base")

In [27]:
tokens_ids = block_token_ids_list[0][0]
print(tokens_ids)
tokens_ids = torch.tensor(tokens_ids)
tokens_ids = tokens_ids.unsqueeze(0) #into a batch
tokens_ids.shape

[0, 47908, 449, 38486, 1215, 179, 21517, 1215, 1178, 5334, 1215, 119, 1755, 1640, 47378, 13360, 1009, 438, 41124, 6, 6979, 827, 6, 49315, 4027, 1215, 90, 2194, 6, 50140, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 49315, 4027, 1215, 90, 44355, 571, 1215, 29552, 6, 49315, 4027, 1215, 90, 49649, 6, 49315, 4027, 1215, 90, 29526, 6, 50140, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 6979, 34771, 1215, 261, 1215, 44223, 43, 50118, 50118, 45152, 50118, 50118, 10431, 1594, 9232, 229, 20954, 1215, 28494, 1215, 448, 8041, 50140, 1437, 1437, 1437, 29916, 449, 38486, 1215, 1178, 5334, 1215, 119, 1755, 475, 1755, 5457, 25522, 50140, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 479, 5760, 5457, 827, 6, 50140, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 479, 29552, 5457, 2194, 6, 50140, 1437, 1437, 1437, 1437, 1437, 143

torch.Size([1, 398])

In [28]:
outputs = embed_model(tokens_ids)#[0]

embed = outputs.last_hidden_state[:, 0, :] #CLS position.
embed.shape

torch.Size([1, 768])

# Building code defect prediction model

In [29]:
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

class CodeDefectPredictionModel(nn.Module):
    """CodeBERT encoder followed by a small trainable defect-probability head.

    The encoder is only used for inference (it is put in eval mode and run
    under `torch.no_grad()` on every forward pass); the prediction head is the
    part that receives gradients.

    Args:
        device: device the token id tensors are moved to before encoding.
    """

    def __init__(self, device = torch.device("cpu")):
        
        super().__init__()
        
        self.device = device
    
        self.embed_model = RobertaModel.from_pretrained("microsoft/codebert-base")

        #Prediction head: 768-d code embedding -> defect probability.
        head_layers = [
            #The code embedding is a sum over block embeddings, so it is
            #re-normalised first. LayerNorm rather than BatchNorm, as is
            #usual for NLP models.
            nn.LayerNorm(normalized_shape = (768)),
            nn.Linear(in_features = 768, out_features = 100, bias = True),
            nn.ReLU(),
            nn.Linear(in_features = 100, out_features = 1, bias = True),
            nn.Sigmoid(),
        ]
        self.pred_head = nn.Sequential(*head_layers)
    
    def forward(self, batch):
        """Return one defect probability per sample of `batch`.

        `batch` has the layout produced by the collate function:
        (code, tokens, block_tokens_list, block_token_ids_list), target.
        Only the per-sample lists of block token ids are used.
        """
        self.embed_model.eval()
        
        (_, _, _, ids_per_sample), _ = batch
        
        sample_embeddings = []
        for sample_blocks in ids_per_sample:
            #Encode every block of this sample on its own.
            cls_vectors = []
            for block_ids in sample_blocks:
                #1 x seq_len batch holding a single block.
                ids_tensor = torch.tensor(block_ids).unsqueeze(0).to(self.device)
                
                with torch.no_grad():
                    hidden = self.embed_model(ids_tensor).last_hidden_state
                    #Position 0 (CLS) of the last layer is taken as the
                    #block representation; 1x768 -> 768.
                    cls_vectors.append(hidden[:, 0, :].squeeze(0))
            
            #The code embedding is the plain sum of its block embeddings.
            sample_embeddings.append(torch.sum(torch.stack(cls_vectors), dim = 0))
        
        #No gradients flow back into the encoder.
        features = torch.stack(sample_embeddings).detach()
        
        return self.pred_head(features).squeeze(1)

In [30]:
batch = next(iter(dataloader_train))

In [31]:
#testing the model simply.
model = CodeDefectPredictionModel(device)
model.to(device)
batch = next(iter(dataloader_train))
probs = model(batch)

In [32]:
probs.shape

torch.Size([10])

In [33]:
probs

tensor([0.4883, 0.4809, 0.4813, 0.4857, 0.4789, 0.4774, 0.4810, 0.4797, 0.4848,
        0.4814], device='cuda:0', grad_fn=<SqueezeBackward1>)

In [34]:
probs >= 0.5

tensor([False, False, False, False, False, False, False, False, False, False],
       device='cuda:0')

In [35]:
torch.mean(probs >= 0.5, dtype = torch.float).cpu()

tensor(0.)

In [36]:
_, labels = batch
labels.shape

torch.Size([10])

In [37]:
labels.shape

torch.Size([10])

In [38]:
labels

tensor([1, 1, 0, 0, 0, 1, 1, 1, 0, 0])

In [39]:
criterion = torch.nn.BCELoss()

#sending to GPU.
probs = probs.to(device)
labels = labels.float().to(device)

criterion(probs, labels)

tensor(0.6948, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)

# Training one epoch

In [40]:
%matplotlib inline

#from IPython.display import display, clear_output
from IPython import display

def train_one_epoch(
          model, 
          device, 
          dataloader, 
          optimizer, 
          criterion,
          epoch,
          max_batches = None):
    """Train `model` for one pass over `dataloader`, showing live statistics.

    Args:
        model: module called as `model(batch)`, returning one probability per sample.
        device: device the model and the labels are moved to.
        dataloader: iterable of batches shaped
            ((code, tokens, block_tokens_list, block_token_ids_list), labels).
        optimizer: optimizer stepped once per batch.
        criterion: loss called as `criterion(preds, labels)` with float labels.
        epoch: epoch number, used only in the progress display.
        max_batches: stop after this many batches; None means the whole dataloader.

    Returns:
        (epoch_loss, epoch_accuracy): mean batch loss and fraction of correctly
        classified samples over the batches that were processed.
    """
    
    # Enable gradient computing
    model.to(device)
    model.train()
    
    if max_batches is None:
        max_batches = len(dataloader)
    
    #some statistics
    
    #averaged loss in current epoch.
    epoch_loss = 0.0
    total_loss = 0.0
    
    #accuracy in current batch
    batch_accuracy = 0.0
    #accuracy in current epoch
    epoch_accuracy = 0.0
    
    #how many samples predicted correct.
    epoch_corrects = 0.0
    #how many samples trained in this epoch
    epoch_total = 0.0
    
    for batch_idx, batch in enumerate(dataloader, 1):
        
        #only the labels are needed here; the model unpacks the features itself.
        _, labels_ = batch
        
        #the loss expects float labels, on the same device as the predictions.
        labels = labels_.float().to(device)

        optimizer.zero_grad()
        
        #predictions.
        preds = model(batch)
        
        #computing BCE
        loss = criterion(preds, labels)
           
        #computing gradients
        loss.backward()
        
        #optimizing the classifier head. Notice: the CodeBERT encoder is fixed.
        optimizer.step()
        
        #thresholding probabilities at 0.5 to get hard predictions
        preds_ = (preds >= 0.5).int().cpu().data
        
        #computing the total loss and average loss in one epoch
        total_loss += loss.detach().cpu().numpy()
        epoch_loss = total_loss / batch_idx
        
        #computing the correct and total samples
        batch_corrects = torch.sum(labels_.cpu().data == preds_, dtype = torch.int)
        batch_accuracy = batch_corrects / len(labels_)
        epoch_corrects += batch_corrects
        epoch_total += len(labels_)
        epoch_accuracy = epoch_corrects / epoch_total         

        #Updating training displays.
        display.clear_output(wait=True)
        
        display.display('Epoch {} [{}/{} ({:.0f}%)]'.format(
                    epoch, batch_idx, 
                    len(dataloader), 
                    100. * (batch_idx / len(dataloader))))
        
        display.display('* batch accuracy {:.2f}% epoch accuracy {:.2f}%'.format(
                    100. * batch_accuracy, 100. * epoch_accuracy))
        
        display.display('* batch loss {:.6f} epoch loss {:.6f}'.format(
                    loss.item(), epoch_loss))
        display.display('* batch_corrects {}'.format(batch_corrects))
        
        #">=" so that exactly max_batches batches are trained (same stopping
        #rule as evaluate); ">" would train one batch too many.
        if batch_idx >= max_batches:
            break
    
    return epoch_loss, epoch_accuracy

In [41]:
model = CodeDefectPredictionModel(device = device)
_ = model.to(device)

In [48]:
batch_size = 200

dataloader_train = torch.utils.data.DataLoader(dataset_train, 
                                               batch_size=batch_size, 
                                               shuffle=True, 
                                               collate_fn = dataset_collate_fn)

learning_rate = 0.001

#For language models, Adam is a good option. The learning rate
#is typically less than 0.001 for stability.
optimizer = torch.optim.AdamW(
                        model.pred_head.parameters(), 
                        lr = learning_rate,
                        #momentum = 0.9, 
                        #weight_decay = 5e-4
                      )

#Loss function
criterion = torch.nn.BCELoss()

In [None]:
epoch_loss, epoch_accuracy = train_one_epoch(
          model, 
          device, 
          dataloader_train, 
          optimizer, 
          criterion,
          epoch = 1,
          max_batches = 500)



'* batch accuracy 61.00% epoch accuracy 57.02%'

'* batch loss 0.666603 epoch loss 0.677467'

'* batch_corrects 122'

# Save/load model

In [50]:
import os

def save_model(model, model_path):
    """Save `model.state_dict()` to `model_path`, creating parent directories.

    Args:
        model: object exposing `state_dict()`.
        model_path: destination file path; may be a bare file name.
    """
    
    save_dir = os.path.dirname(model_path)
    
    #dirname is "" for a bare file name, in which case there is nothing to
    #create. exist_ok avoids the check-then-create race and works for any
    #existing directory, including the filesystem root.
    if save_dir:
        os.makedirs(save_dir, exist_ok = True)
        
    print("Save model weights to: ", model_path)
    torch.save(model.state_dict(), model_path)  

In [51]:
save_model(model, "models/code_defect_detection_model.pth")

Save model weights to:  models/code_defect_detection_model.pth


In [52]:
def load_model(model_path, device):
    """Build a CodeDefectPredictionModel on `device`, restoring saved weights if present.

    Args:
        model_path: path of a state dict written with `torch.save`.
        device: device the weights are mapped to and the model is moved to.

    Returns:
        The model, placed on `device`. If `model_path` does not exist a message
        is printed and the model keeps its freshly initialised weights.
    """
    model = CodeDefectPredictionModel(device)
    
    if os.path.exists(model_path):
        #re-loading
        #NOTE(security): torch.load unpickles the file; only load checkpoints
        #from a trusted source.
        model.load_state_dict(torch.load(model_path, map_location = device)) 
        print("Loaded model weights from: ", model_path)
    else:
        print("Model weights not found.")
    
    #The model is constructed without being moved, while its forward pass sends
    #the inputs to `device`; move the weights too so both sit on the same device.
    model.to(device)
        
    return model

In [53]:
model = load_model("models/code_defect_detection_model.pth", device)

Loaded model weights from:  models/code_defect_detection_model.pth


# Complete training

In [None]:
#from atomicwrites import atomic_write #we must guarantee the automicity of write operation.
#import pickle

def train(model, 
          device, 
          dataloader, 
          optimizer,
          criterion,
          epochs,
          scheduler = None,
          checkpoint = None):
    """Train `model` for `epochs` epochs and collect per-epoch statistics.

    Args:
        model, device, dataloader, optimizer, criterion: forwarded unchanged to
            `train_one_epoch`.
        epochs: number of epochs to run (numbered from 1).
        scheduler: optional learning-rate scheduler, stepped once per epoch.
        checkpoint: optional file path; when given, the model state dict is
            written there after every epoch (parent directories are created).

    Returns:
        (loss_hist, accuracy_hist): lists with one entry per epoch.
    """
    
    loss_hist = []
    accuracy_hist = []
    
    for epoch in range(1, epochs + 1):
        
        epoch_loss, epoch_accuracy = train_one_epoch(
          model, 
          device, 
          dataloader, 
          optimizer, 
          criterion,
          epoch,
          max_batches = None)
    
        if scheduler is not None:
            #adjusting LR once per epoch
            scheduler.step()
            
        loss_hist.append(epoch_loss)
        accuracy_hist.append(epoch_accuracy)
   
        if checkpoint is not None:
            #dirname is "" for a bare file name: nothing to create then.
            checkpoint_dir = os.path.dirname(checkpoint)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok = True)
        
            torch.save(model.state_dict(), checkpoint) 
            
    #Return the list that was actually filled in above.
    return loss_hist, accuracy_hist

In [None]:
batch_size = 64

dataloader_train = torch.utils.data.DataLoader(dataset_train, 
                                               batch_size=batch_size, 
                                               shuffle=True, 
                                               collate_fn = dataset_collate_fn)

learning_rate = 0.001

#For language models, Adam is a good option. The learning rate
#is typically less than 0.001 for stability.
optimizer = torch.optim.Adam(
                        model.pred_head.parameters(), 
                        lr = learning_rate,
                        #momentum = 0.9, 
                        #weight_decay = 5e-4
                      )

#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 
#                                                step_size = 10, #dropping learning-rate every 10 steps. 
#                                                gamma = 0.1)
scheduler = None

#Loss function
criterion = torch.nn.BCELoss()

In [None]:
loss_hist, laccuracy_hist = train(model, 
          device, 
          dataloader_train, 
          optimizer,
          criterion,
          epochs = 50,
          scheduler = scheduler)

In [None]:
#plt.plot(loss_hist)

In [None]:
#plt.plot(laccuracy_hist)

# Evaluation

In [133]:
def evaluate(model, 
             device, 
             dataloader,
             max_batches = None):
    """Measure classification accuracy of `model` over `dataloader`.

    Args:
        model: module called as `model(batch)`, returning one probability per sample.
        device: device the model is moved to before evaluation.
        dataloader: iterable of batches shaped
            ((code, tokens, block_tokens_list, block_token_ids_list), labels).
        max_batches: stop after this many batches; None means the whole dataloader.

    Returns:
        Fraction of correctly classified samples over the evaluated batches
        (probabilities are thresholded at 0.5).
    """
    
    #Same placement as in training, then switch to inference mode.
    model.to(device)
    model.eval()
    
    if max_batches is None:
        max_batches = len(dataloader)
    
    batch_accuracy = 0.0
    total_accuracy = 0.0
    
    total_corrects = 0.0
    total_entries = 0.0
    
    for batch_idx, batch in enumerate(dataloader, 1):
        
        #only the labels are needed here; the model unpacks the features itself.
        _, labels_ = batch
        
        #no need to track gradients
        with torch.no_grad():
            preds = model(batch)
                        
        #computing accuracy in a batch
        preds_ = (preds >= 0.5).int().cpu().data
        
        #computing the correct and total samples, against the labels of THIS
        #batch (not a `labels` variable left over in the notebook namespace).
        batch_corrects = torch.sum(labels_.cpu().data == preds_, dtype = torch.int)
        batch_accuracy = batch_corrects / len(labels_)
        
        total_corrects += batch_corrects
        total_entries += len(labels_)
        total_accuracy = total_corrects / total_entries         

        #Updating evaluation displays.
        display.clear_output(wait=True)
        
        display.display('Evaluation: [{}/{} ({:.0f}%)]'.format(
                    batch_idx, 
                    len(dataloader), 
                    100. * (batch_idx / len(dataloader))))
        
        display.display('* batch accuracy {:.2f}% total accuracy {:.2f}%'.format(
                    100. * batch_accuracy, 100. * total_accuracy))
        
        display.display('* total_corrects {} total_entries {}'.format(total_corrects, total_entries))
        
        if batch_idx >= max_batches:
            break
                
    return total_accuracy

In [134]:
batch_size = 64

# Sample order does not change accuracy, so the test loader is left unshuffled:
# with max_batches set, a shuffled loader would score a different random subset
# on every run and the reported number would not be reproducible.
dataloader_test = torch.utils.data.DataLoader(dataset_test, 
                                               batch_size=batch_size, 
                                               shuffle=False, 
                                               collate_fn = dataset_collate_fn)

total_accuracy = evaluate(model, 
             device, 
             dataloader_test,
             max_batches = 10)

print(total_accuracy)



'* batch accuracy 56.25% total accuracy 50.00%'

'* total_corrects 64.0 total_entries 128.0'

KeyboardInterrupt: 

# Predict

In [135]:
batch_size = 100

dataloader_test = torch.utils.data.DataLoader(dataset_test, 
                                               batch_size=batch_size, 
                                               shuffle=True, 
                                               collate_fn = dataset_collate_fn)

In [136]:
batch = next(iter(dataloader_test))

In [143]:
(code_batch, tokens_batch, block_tokens_list_batch, block_token_ids_list_batch), labels = batch

In [138]:
preds = model(batch)

In [139]:
#prediction probability.
preds

tensor([0.5516, 0.4518, 0.4650, 0.5386, 0.4599, 0.4969, 0.5494, 0.5296, 0.4370,
        0.5281, 0.4360, 0.4644, 0.4944, 0.4150, 0.5380, 0.4639, 0.4540, 0.5154,
        0.4979, 0.4472, 0.3965, 0.4357, 0.4526, 0.5105, 0.4816, 0.5293, 0.5432,
        0.5546, 0.4991, 0.5123, 0.5693, 0.5572, 0.5470, 0.5298, 0.5256, 0.5055,
        0.4457, 0.5336, 0.4651, 0.4156, 0.5618, 0.4192, 0.5007, 0.5477, 0.4318,
        0.5616, 0.5493, 0.5303, 0.4759, 0.5097, 0.5208, 0.6093, 0.4448, 0.4474,
        0.4505, 0.4704, 0.4651, 0.5663, 0.5273, 0.4797, 0.4846, 0.4990, 0.5117,
        0.4823, 0.5345, 0.4652, 0.4526, 0.4058, 0.3860, 0.5298, 0.5474, 0.4386,
        0.4566, 0.5452, 0.5097, 0.4501, 0.4545, 0.4192, 0.4863, 0.4925, 0.4404,
        0.4630, 0.5202, 0.4838, 0.4471, 0.4613, 0.4261, 0.5766, 0.4899, 0.5026,
        0.5173, 0.5254, 0.5192, 0.5498, 0.4550, 0.5099, 0.4647, 0.4765, 0.4908,
        0.4760], device='mps:0', grad_fn=<SqueezeBackward1>)

In [144]:
labels

tensor([0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
        1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
        1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
        1, 0, 0, 1])

In [145]:
(preds > 0.5).int()

tensor([1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
        0, 0, 0, 0], device='mps:0', dtype=torch.int32)

In [146]:
#average accuracy
torch.mean((preds > 0.5).int().cpu() == labels.int().cpu(), dtype = torch.float).cpu().item()

0.5299999713897705