In [1]:
import os

import torch
import tables
import datasets
import numpy as np
import transformers
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt

import sklearn.model_selection

In [2]:
# Load the CodeSearchNet dataset by its Hugging Face Hub id. Later cells read
# the 'train', 'validation' and 'test' splits and the 'language' /
# 'whole_func_string' columns from it.
dataset = datasets.load_dataset('code-search-net/code_search_net')

In [3]:
# Pretrained CodeBERT encoder, used by process_dataset() to embed source code.
# NOTE: the name `model` is rebound further down to the linear classifier, so
# this cell must be re-run before calling process_dataset() again.
model = transformers.AutoModel.from_pretrained('microsoft/codebert-base')

In [4]:
# Tokenizer loaded from the same checkpoint id as the encoder above.
tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/codebert-base')

In [5]:
# Integer class id -> language name. np.unique returns the distinct values in
# sorted order, so the ids follow the sorted order of the language names.
id_to_label = dict(enumerate(map(str, np.unique(dataset['train']['language']))))

In [6]:
# Inverse lookup of id_to_label: language name -> integer class id.
label_to_id = dict(zip(id_to_label.values(), id_to_label.keys()))

In [7]:
def _encode_language(language):
    """Map a language name to its integer class id.

    Values that are already integers are passed through unchanged, so this
    cell can be re-run on an already-encoded dataset without raising KeyError.
    Unknown language names still raise KeyError.
    """
    if isinstance(language, int):
        return {'language': language}
    return {'language': label_to_id[language]}


# Replace the string 'language' column with integer class ids in every split.
dataset = dataset.map(_encode_language, input_columns=['language'])

In [8]:
# Move the encoder to the GPU; this cell requires a CUDA-capable device.
model = model.cuda()

In [9]:
def process_dataset(dataset: datasets.Dataset, tokenizer: transformers.PreTrainedTokenizer, model: torch.nn.Module, path: str, title: str | None = None, batch_size: int = 64, max_length: int = 512):
    """Embed every function in ``dataset`` and store the result in an HDF5 file.

    The file at ``path`` is (over)written with two extendable, compressed arrays:

    * ``/input``  -- float32, one row per example, holding ``pooler_output``
    * ``/target`` -- int32, the value of the ``language`` column per example

    Args:
        dataset: Split providing the ``whole_func_string`` and ``language``
            columns; ``language`` must already be integer-encoded.
        tokenizer: Tokenizer applied to ``whole_func_string``.
        model: Encoder called as ``model(**inputs)``. It must expose ``.device``
            and return an object with a ``pooler_output`` attribute.
        path: Destination HDF5 file; opened in write mode, so existing content
            is replaced.
        title: Optional title stored in the file.
        batch_size: Number of examples encoded per forward pass.
        max_length: Every sequence is padded/truncated to this many tokens.

    Note:
        For an empty ``dataset`` the file is created but contains no arrays,
        because the embedding width is only known after the first batch.
    """
    # `title or ''` keeps None away from PyTables, whose own default is ''.
    with tables.open_file(path, mode='w', title=title or '') as file, torch.no_grad():
        filters = tables.Filters(complevel=9, complib='blosc:lz4hc')
        n_rows = len(dataset)

        data = target = None

        for i in tqdm.trange(0, n_rows, batch_size):
            # Slice the rows first and pick columns from the small batch dict.
            # `dataset[column][i:j]` goes through the whole column on every
            # iteration, which is far more expensive on large splits.
            batch = dataset[i:i + batch_size]

            inputs = tokenizer(
                batch['whole_func_string'],
                padding='max_length',
                max_length=max_length,
                truncation=True,
                return_tensors='pt',
            ).to(model.device)

            outputs = model(**inputs).pooler_output.cpu().numpy()

            if data is None:
                # Created lazily: the embedding width comes from the first batch.
                data   = file.create_earray(file.root, 'input',  tables.Float32Atom(), (0, outputs.shape[-1]), 'Input',  filters=filters, expectedrows=n_rows)
                target = file.create_earray(file.root, 'target', tables.Int32Atom(),   (0, ),                  'Target', filters=filters, expectedrows=n_rows)

            data.append(outputs)
            target.append(batch['language'])

            # Drop the device tensors before the next batch is tokenized.
            del inputs

In [10]:
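# One-off embedding extraction, left disabled so the notebook does not recompute it.
# The cells below read train_dataset.hdf5 / validation_dataset.hdf5 / test_dataset.hdf5
# from disk, so uncomment and run these lines once if those files do not exist yet.
#process_dataset(dataset['train'], tokenizer, model, 'train_dataset.hdf5', 'Train', batch_size=512)
#process_dataset(dataset['validation'], tokenizer, model, 'validation_dataset.hdf5', 'Validation', batch_size=512)
#process_dataset(dataset['test'], tokenizer, model, 'test_dataset.hdf5', 'Test', batch_size=512)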

In [11]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over an HDF5 file with ``/input`` and ``/target`` arrays.

    The file handle is opened lazily and once per process. An HDF5 handle that
    was opened in the parent must not be reused by forked DataLoader workers,
    and it cannot be pickled for spawned workers, so each process opens its own
    read-only handle on first access.
    """

    def __init__(self, path: str):
        """Remember ``path`` and cache the number of rows in ``/input``.

        Args:
            path: HDF5 file to read; it must contain ``/input`` and ``/target``.
        """
        super().__init__()
        self.path = path
        self._file = None  # per-process handle, opened on first use
        self._pid = None   # pid of the process that opened `_file`

        # The file is only ever opened read-only here, so the length is read
        # once and the handle is released again straight away.
        with tables.open_file(self.path, "r") as file:
            self._length = file.root.input.shape[0]

    @property
    def dataset(self):
        """Read-only PyTables handle owned by the current process."""
        pid = os.getpid()
        if self._file is None or self._pid != pid:
            # Either nothing is open yet, or the handle was inherited from
            # another process; open a fresh one for this process.
            self._file = tables.open_file(self.path, "r")
            self._pid = pid
        return self._file

    def close(self) -> None:
        """Close the handle opened by this process, if any."""
        if self._file is not None and self._pid == os.getpid() and self._file.isopen:
            self._file.close()
        self._file = None
        self._pid = None

    def __getstate__(self) -> dict:
        """Exclude the open handle from pickling; it is reopened on demand."""
        state = self.__dict__.copy()
        state["_file"] = None
        state["_pid"] = None
        return state

    def __len__(self) -> int:
        """Number of rows in ``/input``, as read when the dataset was created."""
        return self._length

    def __getitem__(self, item: int) -> tuple[np.ndarray, np.ndarray]:
        """Return ``(input[item], target[item])`` read from the HDF5 file."""
        file = self.dataset
        return file.root.input[item], file.root.target[item]

In [12]:
# Open the cached embedding files, one dataset object per split.
train_dataset, test_dataset, validation_dataset = map(
    CustomDataset,
    ("train_dataset.hdf5", "test_dataset.hdf5", "validation_dataset.hdf5"),
)

In [20]:
# Mini-batch size shared by all DataLoaders below.
batch_size = 512
# Prefer the GPU, but fall back to the CPU so the training cells still run on
# machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Pinned host memory is only requested when batches are copied to the GPU.
pin_memory = device == 'cuda'

In [21]:
# Options shared by all three loaders; only the training split is shuffled.
_loader_options = dict(
    batch_size=batch_size,
    num_workers=8,
    pin_memory=pin_memory,
    pin_memory_device=device if pin_memory else '',
)

train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **_loader_options)
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **_loader_options)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, shuffle=False, **_loader_options)

In [22]:
class ModelTorch(torch.nn.Module):
    """Linear classifier: a single fully connected layer producing class logits."""

    def __init__(self, n_feature: int, num_class: int):
        """Create the layer.

        Args:
            n_feature: Size of each input feature vector.
            num_class: Number of output classes (logits per example).
        """
        super().__init__()
        self.linear = torch.nn.Linear(in_features=n_feature, out_features=num_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return unnormalised class scores for ``x``."""
        logits = self.linear(x)
        return logits

In [27]:
# Rebinds `model`: from here on it is the linear classifier, not the encoder.
# NOTE(review): 768 presumably matches the width of the cached embeddings and
# 6 the number of language labels -- confirm against the HDF5 files / id_to_label.
model = ModelTorch(n_feature=768, num_class=6)
model = model.to(device)

In [28]:
# Adam over the classifier's parameters; no hyperparameters are passed, so the
# library defaults apply.
optimizer = torch.optim.Adam(model.parameters())

In [29]:
# Cross-entropy on raw logits; the training loop passes integer class ids as targets.
loss_func = torch.nn.CrossEntropyLoss()

In [30]:
# Number of full passes over the training set.
epochs = 1

pbar = tqdm.trange(epochs)
for epoch in pbar:
    # ---- training pass ----
    model.train()
    # Loss and accuracy are accumulated as per-sample sums and divided by the
    # number of samples seen, so a smaller final batch does not bias the mean.
    train_acc, train_loss, train_seen = 0.0, 0.0, 0
    epbar = tqdm.tqdm(train_dataloader, leave=False)
    for X, y in epbar:
        X = X.to(device, non_blocking=True)
        # Targets must be int64 for CrossEntropyLoss.
        y = y.to(device, non_blocking=True).long()

        optimizer.zero_grad()
        logits = model(X)
        loss = loss_func(logits, y)

        loss.backward()

        optimizer.step()

        n_samples = y.shape[0]
        train_seen += n_samples
        # loss_func averages over the batch; scale back to a per-sample sum.
        train_loss += loss.item() * n_samples
        train_acc += (logits.argmax(dim=-1) == y).sum().item()

        epbar.set_description(f'loss: {train_loss / train_seen:.3f}, acc: {train_acc / train_seen:.3f}')

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_acc /= max(train_seen, 1)
    train_loss /= max(train_seen, 1)

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    valid_acc, valid_loss, valid_seen = 0.0, 0.0, 0
    with torch.no_grad():
        epbar = tqdm.tqdm(validation_dataloader, leave=False)
        for X, y in epbar:
            X = X.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True).long()

            logits = model(X)
            loss = loss_func(logits, y)

            n_samples = y.shape[0]
            valid_seen += n_samples
            valid_loss += loss.item() * n_samples
            valid_acc += (logits.argmax(dim=-1) == y).sum().item()

            epbar.set_description(f'loss: {valid_loss / valid_seen:.3f}, acc: {valid_acc / valid_seen:.3f}')

    valid_acc /= max(valid_seen, 1)
    valid_loss /= max(valid_seen, 1)

    pbar.set_description(f'loss: {train_loss:.3f}, acc: {train_acc:.3f}, val_loss: {valid_loss:.3f}, val_acc: {valid_acc:.3f}')

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3674 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]