### For Colab

In [None]:
from google.colab import drive

# Mount under /content/gdrive so that BASE_DIR and the TensorBoard --logdir
# used in the cells below (both rooted at /content/gdrive/MyDrive) really live
# on Google Drive. Mounting at /content/drive would leave those paths on the
# ephemeral VM disk and checkpoints would be lost with the session.
drive.mount('/content/gdrive')

In [None]:
!git clone https://github.com/azrails/diplom
%cd diplom
%pip install -r requirements.txt
# download
!mkdir datasets 
%cd datasets
!wget http://images.cocodataset.org/zips/train2014.zip

# unzip
!unzip train2014.zip -d images/ 
!rm -rf train2014.zip
# download
!wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip
# unzip
!unzip refcoco.zip
!rm -rf refcoco.zip

!bash ../inititalizing_utils/start.sh

# clean
!rm -rf refcoco

In [None]:
import os
# Root folder for this training stage; the "From Checkpoint" cell reads
# checkpoints from here and the TensorBoard cell below points at its `tb`
# sub-folder.
BASE_DIR = '/content/gdrive/MyDrive/stage_one'
# exist_ok=True makes the cell safe to re-run.
os.makedirs(BASE_DIR, exist_ok=True)

In [None]:
# Start an inline TensorBoard. The log directory is the `tb` sub-folder of
# BASE_DIR; keep it in sync with the paths handed to Trainer further down.
%load_ext tensorboard
%tensorboard --logdir /content/gdrive/MyDrive/stage_one/tb

### Imports

In [6]:
import random
import torch
import os
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from utils import checkpoints, config
from data_utils import tokenizer, dataset
from model import vit, bert
from torch.utils.tensorboard import SummaryWriter

#sets random
random_seed=42
random.seed(42)
torch.manual_seed(random_seed)

device="cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu" )

match device:
    case "cuda":
        torch.cuda.manual_seed_all(random_seed)
    case "mps":
        torch.mps.manual_seed(random_seed)

### From Start

In [7]:
# Fresh run: build model, optimizer and LR schedule from the YAML config.
conf = config.load_config("configs/stage_one.yaml")

model = vit.StageOneEncoder(**conf['model']['VITEncoder'])
optimizer = torch.optim.AdamW(model.parameters(), **conf['optimizer_params'])

# The learning rate is multiplied by `lr_decay` at every milestone.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    gamma=conf['train_settings']['lr_decay'],
    milestones=conf['train_settings']['milestones'],
)

# Histories that the training loop appends to once per epoch.
losses, val_scores = [], []

### From Checkpoint

In [None]:
# Alternative to the "From Start" cell: restore model, optimizer, scheduler,
# config and the loss/validation histories from a saved checkpoint.
# "checkpoint_name" is a placeholder - replace it with a real checkpoint name.
# BASE_DIR is only defined in the Colab setup cell at the top of the notebook.
model, optimizer, scheduler, conf, losses, val_scores = checkpoints.load_checkpoint(BASE_DIR, "checkpoint_name", device)

### Prepare data

In [8]:

# Text encoder and its matching tokenizer, both selected by
# conf['model']['text_backbone'].
text_model = bert.BertEmbedding(conf['model']['text_backbone'], pool=True)
bert_tokenizer = tokenizer.get_bert_tokenizer(conf['model']['text_backbone'])

train_dataset = dataset.ReferenceDataset(
    **conf['data']['train'],
    tokenizer=bert_tokenizer
)
train_data = DataLoader(
    train_dataset,
    batch_size=conf['train_settings']['batch_size'],
    shuffle=True,
    pin_memory=True,
    drop_last=True,
    #num_workers=8
)

val_dataset = dataset.ReferenceDataset(
    **conf['data']['val'],
    tokenizer=bert_tokenizer
)
val_data = DataLoader(
    val_dataset,
    batch_size=conf['train_settings']['batch_size'],
    shuffle=False,
    pin_memory=True,
    # Keep the trailing partial batch: validation must score every sample,
    # and the metric is normalised by the full dataset size.
    drop_last=False,
    #num_workers=8
)

# NOTE: despite its name this is the (first_epoch, last_epoch_exclusive) pair
# consumed by Trainer.train, not a single epoch index.
epoch = (conf['train_settings']['start_epoch'], conf['train_settings']['epochs'])


In [12]:
class Trainer:
    """Triplet-loss training loop for the stage-one image encoder.

    The text model's output is the anchor, the encoder output for
    ``mask_batch`` is the positive and the encoder output for
    ``negative_batch`` is the negative. Only ``model`` is optimised here; the
    text model is used purely as a fixed embedding function.

    Args:
        model: Image encoder; ``model(x)`` must return a 2-tuple whose first
            item is the embedding.
        text_model: Callable ``text_model(token_ids, attention_mask)``
            returning the anchor embedding.
        optimizer: Optimizer that updates ``model``.
        checkpoint_path: Directory handed to ``checkpoints.save_checkpoint``.
        scheduler: Learning-rate scheduler stepped from ``train``.
        device: Device on which both models and all batches are placed.
        tb_path: TensorBoard sub-directory, joined onto ``checkpoint_path``.
            ``None`` disables TensorBoard logging.
    """

    def __init__(self, model, text_model, optimizer, checkpoint_path, scheduler, device="cpu", tb_path=None):
        self.model = model.to(device)
        self.text_model = text_model.to(device)
        # The text model is never optimised by this class, so keep it in
        # inference mode to get deterministic anchors.
        self.text_model.eval()
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.tb_path = tb_path
        self.checkpoint_path = checkpoint_path
        self.loss_fn = torch.nn.TripletMarginLoss()
        if self.tb_path is not None:
            # SummaryWriter's second positional argument is `comment`, which is
            # ignored whenever log_dir is given - so build the real log
            # directory explicitly instead of passing tb_path positionally.
            self.writer = SummaryWriter(os.path.join(self.checkpoint_path, self.tb_path))

    @staticmethod
    def _drop_singleton_dim(batch):
        """Turn a (B, 1, L) tensor into (B, L); return any other shape untouched."""
        if batch.dim() == 3 and batch.size(1) == 1:
            return batch.squeeze(1)
        return batch

    def _embed_text(self, sentence_batch, att_mask_batch):
        """Return the anchor embedding for a batch of tokenised sentences.

        The notebook's inspection cells show the attention mask arriving as
        (batch, 1, seq) and the text model being exercised with squeezed
        (batch, seq) inputs, so the singleton dimension is removed here.
        Gradients are not tracked because no optimizer in this class updates
        the text model.
        """
        sentence_batch = self._drop_singleton_dim(sentence_batch.to(self.device))
        att_mask_batch = self._drop_singleton_dim(att_mask_batch.to(self.device))
        with torch.no_grad():
            return self.text_model(sentence_batch, att_mask_batch)

    def train(self, train_data, val_data, epochs, losses, val_scores, checkpoint_step=2, scheduler_step=1):
        """Run the full training schedule.

        Args:
            train_data: Iterable of training batches (see ``train_epoch``).
            val_data: Iterable of validation batches (see ``validate``).
            epochs: ``(first_epoch, last_epoch_exclusive)`` pair.
            losses: List extended in place with each epoch's training loss.
            val_scores: List extended in place with each epoch's validation MSE.
            checkpoint_step: Save a checkpoint when ``epoch % checkpoint_step == 0``.
            scheduler_step: Step the scheduler when ``epoch % scheduler_step == 0``.
        """
        first_epoch, last_epoch = epochs[0], epochs[1]
        for epoch in tqdm(range(first_epoch, last_epoch), desc='Epochs'):
            train_loss = self.train_epoch(train_data, epoch)
            val_acc = self.validate(val_data)
            if self.tb_path is not None:
                self.writer.add_scalar("Loss_epoche/train", train_loss, epoch)
                self.writer.add_scalar("MSE/val", val_acc, epoch)
                # Make the scalars visible even if the run is interrupted.
                self.writer.flush()
            print(f"Epoch: {epoch}/{last_epoch} - Loss: {train_loss:.4f} - Val MSE: {val_acc:.4f}")
            losses.append(train_loss)
            val_scores.append(val_acc)
            if epoch % scheduler_step == 0:
                self.scheduler.step()
            if epoch % checkpoint_step == 0:
                # NOTE(review): `conf` is the notebook-level config dict, not an
                # attribute of the trainer; it is updated so that a restored
                # checkpoint resumes from the next epoch.
                conf['train_settings']['start_epoch'] = epoch + 1
                checkpoints.save_checkpoint(
                    self.checkpoint_path,
                    f'epoch_{epoch}',
                    conf,
                    self.model,
                    self.optimizer,
                    self.scheduler,
                    losses,
                    val_scores
                    )

    def train_epoch(self, train_data, epoch):
        """Train for one pass over ``train_data``.

        Each batch is a 5-tuple ``(_, mask_batch, negative_batch,
        sentence_batch, att_mask_batch)``.

        Returns:
            Mean triplet loss per processed sample (0.0 if no batch was seen).
        """
        self.model.train()
        loss_sum = 0.0
        samples_seen = 0
        steps_per_epoch = len(train_data)
        for step, (_, mask_batch, negative_batch, sentence_batch, att_mask_batch) in enumerate(tqdm(train_data, desc="Training", leave=False)):
            mask_batch = mask_batch.to(self.device)
            # The negative must come from negative_batch; reusing mask_batch
            # makes positive == negative and pins the loss at the margin.
            negative_batch = negative_batch.to(self.device)
            anchor_predictions = self._embed_text(sentence_batch, att_mask_batch)

            self.optimizer.zero_grad()
            positive_predictions, _ = self.model(mask_batch)
            negative_predictions, _ = self.model(negative_batch)
            step_loss = self.loss_fn(anchor_predictions, positive_predictions, negative_predictions)
            step_loss.backward()
            self.optimizer.step()

            step_loss = step_loss.item()
            batch_size = len(mask_batch)
            # Weight the batch mean by the batch size exactly once.
            loss_sum += step_loss * batch_size
            samples_seen += batch_size
            if self.tb_path is not None:
                # Global step counts optimiser steps, not dataset samples.
                self.writer.add_scalar("Loss_per_step/train", step_loss, epoch * steps_per_epoch + step)
        # Normalise by the samples actually processed: a loader built with
        # drop_last=True sees fewer samples than len(dataset).
        return loss_sum / samples_seen if samples_seen else 0.0

    @torch.no_grad()
    def validate(self, val_data):
        """Return the mean MSE between image and text embeddings on ``val_data``.

        Batches have the same 5-tuple layout as in ``train_epoch``; the
        negative sample is not used. Returns 0.0 if no batch was seen.
        """
        self.model.eval()
        mse = torch.nn.MSELoss()
        error_sum = 0.0
        samples_seen = 0
        for _, mask_batch, _, sentence_batch, att_mask_batch in tqdm(val_data, desc="Validating", leave=False):
            mask_batch = mask_batch.to(self.device)
            positive_predictions, _ = self.model(mask_batch)
            anchor_predictions = self._embed_text(sentence_batch, att_mask_batch)
            batch_size = len(mask_batch)
            error_sum += mse(positive_predictions, anchor_predictions).item() * batch_size
            samples_seen += batch_size
        return error_sum / samples_seen if samples_seen else 0.0

In [13]:
# Checkpoints are written under ./runs; passing 'tb' as tb_path (non-None)
# switches on TensorBoard logging inside Trainer.
trainer = Trainer(model, text_model, optimizer, './runs', scheduler, device, 'tb')

In [14]:
# `epoch` is the (start_epoch, epochs) tuple built in the data-preparation
# cell; `losses` and `val_scores` are extended in place, one entry per epoch.
trainer.train(train_data, val_data, epoch, losses, val_scores)

Epochs:   0%|          | 0/50 [00:00<?, ?it/s]

Training:   0%|          | 0/1413 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [6]:
# Scratch/debug: pull one training batch to inspect the text inputs.
_, _, _, sentence_batch, att_mask_batch = next(iter(train_data))

In [19]:
# Scratch/debug: compare shapes - per the output below the attention mask
# still carries a singleton middle dimension.
sentence_batch.squeeze().size(), att_mask_batch.size()

(torch.Size([30, 512]), torch.Size([30, 1, 512]))

In [23]:
# Scratch/debug: run the text model on squeezed inputs.
# NOTE(review): a bare .squeeze() would also drop the batch dimension for a
# batch of size 1; .squeeze(1) is the safer spelling if this is reused.
res = text_model(sentence_batch.squeeze(), att_mask_batch.squeeze())

In [24]:
# Scratch/debug: shape of the text-model output for the batch above.
res.size()

torch.Size([30, 768])