In [1]:
import torch
from accelerate import Accelerator
from src import datasets_loader
from src import models
from src.constants import GFG_DATA_PATH

import sys

sys.path.append("../")

from exp_configs import EXP_GROUPS

In [2]:
# Accelerator built with its default arguments; it is handed to the model factory
# in a later cell.
accelerator = Accelerator()

# Take a shallow copy of the first "base" experiment config instead of aliasing
# it. A later cell writes a "vocab_size" key into exp_dict; without the copy that
# write would land in the object owned by EXP_GROUPS, so the imported config
# would be silently modified and re-running this cell would no longer yield the
# pristine configuration.
exp_dict = dict(EXP_GROUPS["base"][0])

In [3]:
# Training data: the "train" split of code_search_net, wrapped in a DataLoader.
# NOTE(review): the cache location is an absolute, machine-specific path;
# consider moving it next to GFG_DATA_PATH in the project constants.
CODE_SEARCH_NET_CACHE = "/mnt/colab_public/datasets/joao/CodeSearchNet"

train_data = datasets_loader.get_dataset(
    dataset_name="code_search_net",
    path_to_cache=CODE_SEARCH_NET_CACHE,
    split="train",
    maximum_raw_length=exp_dict["maximum_raw_length"],
)

# The collator takes its tokenizer path, maximum input length and both masking
# probabilities from the experiment config.
train_collator = datasets_loader.TrainCollator(
    tokenizer_path=exp_dict["tokenizer_path"],
    maximum_length=exp_dict["maximum_input_length"],
    mlm_masking_probability=exp_dict["mlm_masking_probability"],
    contrastive_masking_probability=exp_dict["contrastive_masking_probability"],
)

# NOTE(review): `shuffle` is not passed, so the DataLoader default applies;
# confirm the sample order is randomised elsewhere if that is wanted for training.
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=exp_dict["train_batch_size"],
    num_workers=exp_dict["n_workers"],
    collate_fn=train_collator,
    drop_last=True,  # a trailing incomplete batch is discarded
)

No config specified, defaulting to: code_search_net/all
Reusing dataset code_search_net (/mnt/colab_public/datasets/joao/CodeSearchNet/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /mnt/colab_public/datasets/joao/CodeSearchNet/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27/cache-43e4016a25a27a85.arrow


In [4]:
# Evaluation data: the "test" split of the "gfg" dataset, cached under GFG_DATA_PATH.
test_data = datasets_loader.get_dataset(
    dataset_name="gfg",
    path_to_cache=GFG_DATA_PATH,
    split="test",
    maximum_raw_length=exp_dict["maximum_raw_length"],
)

# The test collator only needs the tokenizer path and the maximum input length.
test_collator = datasets_loader.TestCollator(
    tokenizer_path=exp_dict["tokenizer_path"],
    maximum_length=exp_dict["maximum_input_length"],
)

# NOTE(review): evaluation reuses the *training* batch size and drops the final
# incomplete batch, so some test examples are never scored. Confirm this is
# intentional (e.g. if the retrieval metrics are computed per batch and need a
# fixed batch size).
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=exp_dict["train_batch_size"],
    num_workers=exp_dict["n_workers"],
    collate_fn=test_collator,
    drop_last=True,
)

Loading cached processed dataset at resources/data/transcoder_evaluation_gfg/cache-8e11c09ed2c1f653.arrow


In [5]:
# Store the size of the training collator's tokenizer vocabulary in the
# experiment config, which is passed to models.get_model in a later cell.
# NOTE(review): presumably used to size the model's embedding/output layers — confirm.
train_tokenizer = train_loader.collate_fn.tokenizer
exp_dict["vocab_size"] = len(train_tokenizer.vocab)

In [6]:
# Fetch the first collated batch from a fresh iterator over the training loader.
# The iterator is a temporary, so it is not kept alive after this statement.
# NOTE(review): train_batch is not referenced by any later cell visible here;
# presumably it is kept for manual inspection.
train_batch = next(iter(train_loader))

In [7]:
# Fetch the first collated batch from a fresh iterator over the test loader.
# NOTE(review): test_batch is not referenced by any later cell visible here;
# presumably it is kept for manual inspection.
test_batch = next(iter(test_loader))

In [8]:
# Build the model from the experiment config (which by now also carries
# "vocab_size") and the shared Accelerator instance.
model = models.get_model(exp_dict=exp_dict, accelerator=accelerator)

In [None]:
# Run the model's training routine over the training loader, labelled as epoch 0.
# NOTE(review): in the saved notebook this cell has no execution count while the
# evaluation cell after it does, so the stored evaluation output may not reflect
# a completed run of this cell — re-run the notebook top to bottom to confirm.
model.train_on_loader(train_loader, epoch=0)

In [10]:
# Evaluate on the test loader; the returned value is displayed as the cell output.
# In the saved output it is a dict with the keys 'test_contrastive_loss', 'R@1',
# 'R@5' and 'MRR'.
# NOTE(review): test_loader was built with drop_last=True, so a trailing
# incomplete batch is excluded from these numbers.
model.eval_on_loader(test_loader)

{'test_contrastive_loss': 2.4658589363098145,
 'R@1': 0.0052083334885537624,
 'R@5': 0.0243055559694767,
 'MRR': 0.02183467149734497}