In [1]:
# Switch the kernel's working directory to the project's src folder.
# NOTE(review): the later imports (logger_config, config, trainers, ...) are by bare
# module name, so they presumably resolve relative to this directory — confirm.
import os
os.chdir("/traindata/maksim/repos/unilm/simlm/src")
# IPython shell escape: print the working directory to confirm the chdir.
!pwd

/traindata/maksim/repos/unilm/simlm/src


In [2]:
import torch
# Sanity check that PyTorch can see a CUDA device before any model is built.
torch.cuda.is_available()

True

In [3]:
import logging

import torch
from typing import Dict
from transformers.utils.logging import enable_explicit_format
from transformers.trainer_callback import PrinterCallback
from transformers import (
    AutoTokenizer,
    HfArgumentParser,
    EvalPrediction,
    Trainer,
    set_seed,
    PreTrainedTokenizerFast
)

from logger_config import logger, LoggerCallback
from config import Arguments
from trainers.reranker_trainer import RerankerTrainer
from loaders import CrossEncoderDataLoader
from collators import CrossEncoderCollator
from metrics import accuracy
from models import Reranker


def _common_setup(args: Arguments):
    """Apply process-wide logging and seeding settings taken from ``args``.

    Processes whose ``process_index`` is greater than zero have the project
    logger raised to WARNING; every process then enables the explicit
    transformers log format and seeds the RNGs with ``args.seed``.
    """
    is_secondary_process = args.process_index > 0
    if is_secondary_process:
        logger.setLevel(logging.WARNING)

    enable_explicit_format()
    set_seed(args.seed)


def _compute_metrics(eval_pred: EvalPrediction) -> Dict:
    """Turn an ``EvalPrediction`` into the ``{'acc': ...}`` metrics dict.

    When the predictions arrive as a tuple, only its last element is used.
    Predictions are cast to float and labels to long before being passed to
    ``accuracy``; the first element of its result is reported as ``acc``.
    """
    raw_predictions = eval_pred.predictions
    # A tuple of outputs carries the scores in its final slot.
    scores = raw_predictions[-1] if isinstance(raw_predictions, tuple) else raw_predictions

    score_tensor = torch.tensor(scores).float()
    label_tensor = torch.tensor(eval_pred.label_ids).long()
    first_accuracy = accuracy(output=score_tensor, target=label_tensor)[0]

    return {'acc': first_accuracy}

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import sys
# Emulate the command line of the training script so HfArgumentParser can
# read it from sys.argv; one flag (and its value, if any) per line.
sys.argv = [
    'src/train_cross_encoder.py',
    # Model and runtime
    '--deepspeed', '/traindata/maksim/repos/unilm/simlm/ds_config.json',
    '--model_name_or_path', 'google/electra-base-discriminator',
    '--per_device_train_batch_size', '2',
    '--per_device_eval_batch_size', '2',
    '--gradient_accumulation_steps', '1',
    '--do_train',
    '--fp16',
    '--seed', '987',
    # Data
    '--train_file', '/traindata/maksim/repos/unilm/simlm/data/msmarco_reranker/train.jsonl',
    '--validation_file', '/traindata/maksim/repos/unilm/simlm/data/msmarco_reranker/dev.jsonl',
    # Reranker-specific options
    '--rerank_max_length', '192',
    '--rerank_use_rdrop', 'True',
    '--train_n_passages', '64',
    '--rerank_forward_factor', '4',
    # Optimisation schedule
    '--dataloader_num_workers', '1',
    '--num_train_epochs', '3',
    '--learning_rate', '3e-5',
    '--warmup_steps', '1000',
    '--logging_steps', '50',
    # Output, checkpointing and evaluation
    '--output_dir', '/traindata/maksim/repos/unilm/simlm/data/checkpoint/cross_encoder_reranker/',
    '--data_dir', '/traindata/maksim/repos/unilm/simlm/data/msmarco_reranker/',
    '--save_total_limit', '5',
    '--save_strategy', 'epoch',
    '--evaluation_strategy', 'epoch',
    '--load_best_model_at_end',
    '--metric_for_best_model', 'acc',
    '--greater_is_better', 'True',
    '--remove_unused_columns', 'False',
    '--overwrite_output_dir',
    '--disable_tqdm', 'True',
    '--report_to', 'none',
]

In [5]:
# Parse the process arguments (sys.argv when none are passed explicitly) into
# the project's Arguments object, then apply logging/seed setup.
parser = HfArgumentParser((Arguments,))
args: Arguments = parser.parse_args_into_dataclasses()[0]
_common_setup(args)
logger.info('Args={}'.format(str(args)))

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(args.model_name_or_path)

# NOTE(review): num_labels=1 presumably means one relevance score per
# (query, passage) pair — confirm against the Reranker implementation.
model: Reranker = Reranker.from_pretrained(
    all_args=args,
    pretrained_model_name_or_path=args.model_name_or_path,
    num_labels=1)

logger.info(model)
logger.info('Vocab size: {}'.format(len(tokenizer)))

# Padding to a multiple of 8 is requested only when fp16 is enabled.
data_collator = CrossEncoderCollator(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if args.fp16 else None)

# The project loader exposes both splits as attributes.
rerank_data_loader = CrossEncoderDataLoader(args=args, tokenizer=tokenizer)
train_dataset = rerank_data_loader.train_dataset
eval_dataset = rerank_data_loader.eval_dataset

# A dataset is handed to the trainer only when the matching do_train /
# do_eval flag is set; otherwise None is passed.
trainer: Trainer = RerankerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset if args.do_train else None,
    eval_dataset=eval_dataset if args.do_eval else None,
    data_collator=data_collator,
    compute_metrics=_compute_metrics,
    tokenizer=tokenizer,
)
# Swap the default PrinterCallback for the project's LoggerCallback.
trainer.remove_callback(PrinterCallback)
trainer.add_callback(LoggerCallback)
# Give the data loader a back-reference to the trainer.
# NOTE(review): what the loader does with it is not visible here — confirm.
rerank_data_loader.trainer = trainer

[2024-11-19 12:40:56,083 INFO] Args=Arguments(
_n_gpu=8,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
add_pooler=False,
all_use_mask_token=<ALL_USE_MASK_TOKEN>,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_dir=/traindata/maksim/repos/unilm/simlm/data/msmarco_reranker/,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=1,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=/traindata/maksim/repos/unilm/simlm/ds_config.json,
disable_tqdm=True,
dispatch_batches=None,
do_encode=False,
do_eval=True,
do_kd_biencoder=False,
do_kd_gen_score=False,
do_predict=False,
do_rerank=False,
do_sea

In [6]:
# Take the first training example and list the fields it exposes.
example = train_dataset[0]
list(example.keys())

['input_ids', 'token_type_ids', 'attention_mask']

In [7]:
# (number of sequences in the example, token count of the first sequence).
# NOTE(review): presumably one sequence per candidate passage
# (--train_n_passages 64) — confirm against the dataset transform.
len(example['input_ids']), len(example['input_ids'][0])

(64, 72)

In [8]:
# Decode the first sequence of the example back to text.
# The tokenizer created alongside `args` in the trainer-setup cell is reused:
# reloading the same checkpoint here would only rebind `tokenizer` to a second,
# identical instance that differs from the one held by the collator.
tokenizer.decode(example["input_ids"][0])



'[CLS] ) what was the immediate impact of the success of the manhattan project? [SEP] introduction : the presence of communication amid scientific minds was equally important to the success of the manhattan project as scientific intellect was. the only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant ; hundreds of thousands of innocent lives obliterated. [SEP]'

In [9]:
# Print the decoded text of every sequence in the example, one per line.
for sequence_token_ids in example['input_ids']:
    decoded_text = tokenizer.decode(sequence_token_ids)
    print(decoded_text)


[CLS] ) what was the immediate impact of the success of the manhattan project? [SEP] introduction : the presence of communication amid scientific minds was equally important to the success of the manhattan project as scientific intellect was. the only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant ; hundreds of thousands of innocent lives obliterated. [SEP]
[CLS] ) what was the immediate impact of the success of the manhattan project? [SEP] introduction : abstract. the pivotal engineering and scientific success of the twentieth century was the manhattan project. the manhattan project assimilated concepts and leaders from all scientific fields and engineering disciplines to construct the first two atomic bombs. [SEP]
[CLS] ) what was the immediate impact of the success of the manhattan project? [SEP] truman is briefed on manhattan project : america ’ s secret development of the atomic bomb began in 1939 with then -

In [10]:
from torch.utils.data import DataLoader

# Collect the DataLoader keyword arguments from the trainer's own settings.
dataloader_params = dict(
    batch_size=trainer._train_batch_size,
    collate_fn=data_collator,
    num_workers=trainer.args.dataloader_num_workers,
    pin_memory=trainer.args.dataloader_pin_memory,
    persistent_workers=trainer.args.dataloader_persistent_workers,
)

# Sampler, drop_last and prefetch_factor are only supplied for datasets that
# are not IterableDataset instances.
if not isinstance(train_dataset, torch.utils.data.IterableDataset):
    dataloader_params.update(
        sampler=trainer._get_train_sampler(),
        drop_last=trainer.args.dataloader_drop_last,
        prefetch_factor=trainer.args.dataloader_prefetch_factor,
    )

train_dataloader = DataLoader(train_dataset, **dataloader_params)

In [11]:
# Fetch exactly one collated batch for inspection.
# next(iter(...)) raises StopIteration on an empty loader, whereas a
# `for ... break` loop would leave `batch` unbound — or, in a long-lived
# kernel, silently keep a stale value from an earlier run.
batch = next(iter(train_dataloader))
batch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'input_ids': tensor([[  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2054,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  7051, 23054,  ...,     0,     0,     0],
        [  101,  7051, 23054,  ...,     0,     0,     0],
        [  101,  7051, 23054,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [12]:
import torch
import torch.distributed as dist
import os

def init_distributed_single_gpu(device_index=1, master_addr="localhost", master_port="12356"):
    """Create a one-process NCCL process group and select a CUDA device.

    The function is safe to call more than once in the same kernel: the
    process group is only created when no default group exists yet, so
    re-running the notebook cell no longer raises.

    Args:
        device_index: CUDA device passed to ``torch.cuda.set_device``.
            Defaults to 1, the value that was previously hard-coded.
        master_addr: Value written to the ``MASTER_ADDR`` environment variable.
        master_port: Value written to the ``MASTER_PORT`` environment variable
            (converted to ``str``).
    """
    # Rendezvous settings picked up by the environment-variable init method.
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)

    # init_process_group refuses to create the default group twice, which is
    # exactly what happens when this cell is executed again.
    if not dist.is_initialized():
        dist.init_process_group(
            backend="nccl",  # Use NCCL backend for GPU
            rank=0,          # Only one process, so rank is 0
            world_size=1     # Total number of processes is 1
        )

    # Select the CUDA device for this process.
    torch.cuda.set_device(device_index)

# Set up the process group for this kernel.
# NOTE(review): presumably needed because the trainer is configured with
# DeepSpeed and expects torch.distributed to be initialised — confirm.
init_distributed_single_gpu()

In [29]:
# Put the model and every tensor in the inspected batch on the same device;
# non-tensor entries of the batch are passed through untouched.
DEVICE = "cpu"
model = model.to(DEVICE)
batch = {
    field_name: (field_value.to(DEVICE) if isinstance(field_value, torch.Tensor) else field_value)
    for field_name, field_value in batch.items()
}

In [30]:
# (rows, padded sequence length) of the collated batch.
# NOTE(review): the second dimension matches --rerank_max_length 192; rows are
# presumably queries x train_n_passages — confirm against the collator.
batch["input_ids"].shape

torch.Size([1024, 192])

In [33]:
# NOTE(review): presumably compute_loss reads trainer.state.epoch, which is
# not populated when calling it outside trainer.train() — confirm.
trainer.state.epoch = 0

In [39]:
# Run the trainer's loss computation once on the inspected batch and display
# the result; this cell itself does not call backward() or step an optimizer.
loss = trainer.compute_loss(model, batch)
loss

tensor(2.7864, grad_fn=<AddBackward0>)

: 