In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the later cells
# copy the ALBEF assets from it and write checkpoints/results back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy everything stored under the Drive folder ALBEF into the current working
# directory. Both paths are relative, so this must run from the directory that
# contains the `drive` mount point.
!cp -r drive/MyDrive/ALBEF/* .

In [None]:
# Switch the kernel's working directory to the data folder so the archives
# below are extracted in place.
%cd /content/data

/content/data


In [None]:
# Extract the three image archives in parallel. Every `unzip` is started as a
# background job and the trailing `wait` blocks until all of them have
# finished, so the cell cannot return while an archive is still being extracted.
!unzip -q train2014.zip & unzip -q test2015.zip & unzip -q val2014.zip & wait

In [None]:
# Return to /content; the relative paths used by the remaining cells
# (./configs, ./output, drive/MyDrive/...) are resolved from here.
%cd /content

/content


In [None]:
# Pinned transformers release used by this notebook. `%pip` (rather than
# `!pip`) installs into the environment of the running kernel.
%pip install transformers==4.25.1

Collecting transformers==4.25.1
  Downloading transformers-4.25.1-py3-none-any.whl.metadata (93 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.25.1)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m128.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: 

In [None]:
# ruamel.yaml is kept on the 0.17 series; the notebook calls the module-level
# yaml.load / yaml.dump functions. The specifier is quoted so the shell cannot
# glob-expand the `*`, and `%pip` targets the running kernel's environment.
%pip install "ruamel.yaml==0.17.*"

Collecting ruamel.yaml==0.17.*
  Downloading ruamel.yaml-0.17.40-py3-none-any.whl.metadata (19 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml==0.17.*)
  Downloading ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Downloading ruamel.yaml-0.17.40-py3-none-any.whl (113 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.7/113.7 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (739 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.1/739.1 kB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ruamel.yaml.clib, ruamel.yaml
Successfully installed ruamel.yaml-0.17.40 ruamel.yaml.clib-0.2.12


In [None]:
import argparse
import os
import ruamel.yaml as yaml
import numpy as np
import random
import time
import datetime
import json
from pathlib import Path
import subprocess

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn
import torch.distributed as dist

from models.model_vqa import ALBEF
from models.vit import interpolate_pos_embed
from models.tokenization_bert import BertTokenizer

import utils
from dataset.utils import save_result
from dataset import create_dataset, create_sampler, create_loader, vqa_collate_fn

from scheduler import create_scheduler
from optim import create_optimizer

In [None]:
# Re-import edited project modules automatically before each cell runs.
# `%reload_ext` loads the extension when it is not loaded yet and reloads it
# otherwise, so re-running this cell does not print the "already loaded" notice.
%reload_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def train(model, data_loader, optimizer, tokenizer, epoch, warmup_steps, device, scheduler, config):
    """Run one training epoch and return the averaged meter values.

    For every batch the questions and answers are tokenized, the model is
    called with ``train=True`` to obtain the loss, and a single optimizer
    step is taken.

    Args:
        model: callable returning the loss; put into train mode first.
        data_loader: sized iterable yielding
            ``(image, question, answer, weights, n)`` batches; ``n`` is
            forwarded to the model as ``k``.
        optimizer: optimizer stepped once per batch.
        tokenizer: callable applied to the question and answer batches.
        epoch: index of the current epoch.
        warmup_steps: number of scheduler warm-up steps; the scheduler is
            stepped every 100 iterations of epoch 0 until iteration
            ``warmup_steps * 100`` (inclusive).
        device: target device for the batch tensors and tokenized inputs.
        scheduler: learning-rate scheduler, only stepped here during epoch 0.
        config: mapping read for ``'alpha'`` and ``'warm_up'``.

    Returns:
        dict mapping each meter name (``'lr'``, ``'loss'``) to its global
        average formatted with three decimals.
    """
    model.train()

    logger = utils.MetricLogger(delimiter="  ")
    for meter_name, meter_fmt in (('lr', '{value:.6f}'), ('loss', '{value:.4f}')):
        logger.add_meter(meter_name, utils.SmoothedValue(window_size=1, fmt=meter_fmt))

    header = 'Train Epoch: [{}]'.format(epoch)
    print_freq = 50
    step_size = 100
    warmup_iterations = warmup_steps * step_size

    batches = logger.log_every(data_loader, print_freq, header)
    for step, (image, question, answer, weights, n) in enumerate(batches):
        image = image.to(device, non_blocking=True)
        weights = weights.to(device, non_blocking=True)
        question_input = tokenizer(question, padding='longest', truncation=True, max_length=25, return_tensors="pt").to(device)
        answer_input = tokenizer(answer, padding='longest', return_tensors="pt").to(device)

        # With warm-up enabled, alpha grows linearly from 0 towards its
        # configured value over the course of the first epoch.
        alpha = config['alpha']
        if not (epoch > 0) and config['warm_up']:
            alpha = alpha * min(1, step / len(data_loader))

        loss = model(image, question_input, answer_input, train=True, alpha=alpha, k=n, weights=weights)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        logger.update(loss=loss.item())
        logger.update(lr=optimizer.param_groups[0]["lr"])

        # Learning-rate warm-up: one scheduler step per `step_size` iterations.
        at_warmup_step = epoch == 0 and step % step_size == 0 and step <= warmup_iterations
        if at_warmup_step:
            scheduler.step(step // step_size)

    # gather the stats from all processes
    logger.synchronize_between_processes()
    print("Averaged stats:", logger.global_avg())

    averaged = {}
    for name, meter in logger.meters.items():
        averaged[name] = "{:.3f}".format(meter.global_avg)
    return averaged

In [None]:
@torch.no_grad()
def evaluation(model, data_loader, tokenizer, device, config):
    """Predict one answer per question of ``data_loader``.

    The candidate answers are taken from ``data_loader.dataset.answer_list``,
    suffixed with ``config['eos']`` and tokenized once. For each batch the
    model is called with ``train=False`` and ``k=config['k_test']``; from the
    returned top-k ids/probabilities the candidate with the highest
    probability is kept.

    Args:
        model: callable returning ``(topk_ids, topk_probs)``; put into eval
            mode first.
        data_loader: iterable yielding ``(image, question, question_id)``
            batches, with a ``dataset.answer_list`` attribute.
        tokenizer: callable applied to the answer list and question batches.
        device: target device for the images and tokenized inputs.
        config: mapping read for ``'eos'`` and ``'k_test'``.

    Returns:
        list of ``{"question_id": int, "answer": str}`` dicts, one per
        question, in iteration order.
    """
    model.eval()

    logger = utils.MetricLogger(delimiter="  ")
    header = 'Generate VQA test result:'
    print_freq = 50

    # The candidate answers are the same for every batch: tokenize them once.
    candidates = data_loader.dataset.answer_list
    answer_input = tokenizer([text + config['eos'] for text in candidates],
                             padding='longest', return_tensors='pt').to(device)

    predictions = []
    for image, question, question_id in logger.log_every(data_loader, print_freq, header):
        image = image.to(device, non_blocking=True)
        question_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)

        topk_ids, topk_probs = model(image, question_input, answer_input, train=False, k=config['k_test'])

        for ques_id, topk_id, topk_prob in zip(question_id, topk_ids, topk_probs):
            # Position of the most probable entry among the k candidates.
            best = topk_prob.max(dim=0)[1]
            predictions.append({
                "question_id": int(ques_id.item()),
                "answer": data_loader.dataset.answer_list[topk_id[best]],
            })

    return predictions

In [None]:
# Run settings, collected in an argparse.Namespace so the rest of the notebook
# can read them as `args.<name>`.
args = argparse.Namespace(
    config='./configs/VQA.yaml',        # YAML file with the training/evaluation settings
    checkpoint='./ALBEF_4M.pth',        # weights loaded before training starts
    output_dir='./output/vqa',          # checkpoints, log and results are written here
    evaluate=False,                     # True skips training and only runs the evaluation pass
    text_encoder='bert-base-uncased',
    text_decoder='bert-base-uncased',
    device='cuda',
    seed=42,
    distributed=False,
)

# Read the config through a context manager so the file handle is closed
# deterministically.
# NOTE(review): yaml.Loader is not the safe loader; only point args.config at
# a trusted, local file.
with open(args.config, 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.Loader)

In [None]:
# Prediction files are written to a `result` sub-folder of the output directory.
args.result_dir = os.path.join(args.output_dir, 'result')

Path(args.output_dir).mkdir(parents=True, exist_ok=True)
Path(args.result_dir).mkdir(parents=True, exist_ok=True)

# Keep a copy of the loaded config next to the outputs. The context manager
# flushes and closes the file as soon as the dump is done.
with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as config_snapshot:
    yaml.dump(config, config_snapshot)

In [None]:
utils.init_distributed_mode(args)
device = torch.device(args.device)

# fix the seed for reproducibility (offset by the process rank)
seed = args.seed + utils.get_rank()
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(seed)
cudnn.benchmark = True

# Epoch range and warm-up length come from the 'schedular' section of the config.
schedule_cfg = config['schedular']
start_epoch = 0
max_epoch = schedule_cfg['epochs']
warmup_steps = schedule_cfg['warmup_epochs']


#### Dataset ####
print("Creating vqa datasets")
datasets = create_dataset('vqa', config)

# No sampler is used unless the run is distributed, in which case one sampler
# per dataset is created for this process.
samplers = [None, None]
if args.distributed:
    num_tasks = utils.get_world_size()
    global_rank = utils.get_rank()
    samplers = create_sampler(datasets, [True, False], num_tasks, global_rank)

# Each list argument carries one entry for the train loader and one for the
# test loader, in that order.
train_loader, test_loader = create_loader(
    datasets,
    samplers,
    batch_size=[config['batch_size_train'], config['batch_size_test']],
    num_workers=[4, 4],
    is_trains=[True, False],
    collate_fns=[vqa_collate_fn, None],
)

tokenizer = BertTokenizer.from_pretrained(args.text_encoder)

#### Model ####
print("Creating model")
model = ALBEF(
    config=config,
    text_encoder=args.text_encoder,
    text_decoder=args.text_decoder,
    tokenizer=tokenizer,
).to(device)

# Optimizer and LR scheduler are built from the 'optimizer' and 'schedular'
# sections of the config, wrapped for attribute-style access.
arg_opt = utils.AttrDict(config['optimizer'])
arg_sche = utils.AttrDict(config['schedular'])
optimizer = create_optimizer(arg_opt, model)
lr_scheduler, _ = create_scheduler(arg_sche, optimizer)

# Initialise the model from a checkpoint file, if one is configured.
if args.checkpoint:
    # Load onto the CPU first; load_state_dict copies the values into the model.
    checkpoint = torch.load(args.checkpoint, map_location='cpu')
    if args.evaluate:
        # In evaluate mode the checkpoint file is used directly as the state dict.
        state_dict = checkpoint
    else:
        # Otherwise the weights are stored under the 'model' key.
        state_dict = checkpoint['model']

    # reshape positional embedding to accommodate image resolution change
    pos_embed_reshaped = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
    state_dict['visual_encoder.pos_embed'] = pos_embed_reshaped

    if not args.evaluate:
        if config['distill']:
            # Same reshaping for the second visual encoder (`visual_encoder_m`),
            # which is only touched when distillation is enabled.
            m_pos_embed_reshaped = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],model.visual_encoder_m)
            state_dict['visual_encoder_m.pos_embed'] = m_pos_embed_reshaped

        # Iterate over a snapshot of the keys: the dict is modified inside the loop.
        for key in list(state_dict.keys()):
            if 'bert' in key:
                # Add a copy of the entry under the same name with 'bert.' removed.
                encoder_key = key.replace('bert.','')
                state_dict[encoder_key] = state_dict[key]
            # initialize text decoder as multimodal encoder (last 6 layers of model.text_encoder)
            if 'text_encoder' in key:
                if 'layer' in key:
                    # The layer index is read from the fifth dot-separated component of the key.
                    encoder_keys = key.split('.')
                    layer_num = int(encoder_keys[4])
                    if layer_num<6:
                        # Layers 0-5 get no decoder counterpart: drop this entry
                        # (a 'bert.'-stripped copy added above is left in place).
                        del state_dict[key]
                        continue
                    else:
                        # Layers 6 and up are renumbered from 0 for the decoder.
                        decoder_layer_num = (layer_num-6)
                        encoder_keys[4] = str(decoder_layer_num)
                        encoder_key = '.'.join(encoder_keys)
                else:
                    encoder_key = key
                # Store the value under the matching text_decoder name ...
                decoder_key = encoder_key.replace('text_encoder','text_decoder')
                state_dict[decoder_key] = state_dict[key]

                # ... and remove the entry under its original name.
                del state_dict[key]

    # strict=False: keys that do not match are reported in `msg` instead of raising.
    msg = model.load_state_dict(state_dict,strict=False)
    print('load checkpoint from %s'%args.checkpoint)
    print(msg)


# Wrap the model for distributed training when requested. `model_without_ddp`
# always refers to the bare module, whose state dict is what gets checkpointed.
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module if args.distributed else model


print("Start training")
start_time = time.time()

# Pre-bind `epoch` so the result filename built after the loop is defined even
# when range(start_epoch, max_epoch) is empty.
epoch = start_epoch
for epoch in range(start_epoch, max_epoch):
    if epoch>0:
        lr_scheduler.step(epoch+warmup_steps)

    # Evaluation-only runs skip training and go straight to the evaluation
    # pass after the loop.
    if args.evaluate:
        break

    if args.distributed:
        train_loader.sampler.set_epoch(epoch)

    train_stats = train(model, train_loader, optimizer, tokenizer, epoch, warmup_steps, device, lr_scheduler, config)

    if utils.is_main_process():
        # Append one JSON line per epoch to the training log.
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                      'epoch': epoch,
                    }
        with open(os.path.join(args.output_dir, "log.txt"),"a") as f:
            f.write(json.dumps(log_stats) + "\n")

        # Checkpoint the model, optimizer and scheduler state after every epoch.
        save_obj = {
            'model': model_without_ddp.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'config': config,
            'epoch': epoch,
        }
        checkpoint_path = os.path.join(args.output_dir, 'checkpoint_%02d.pth'%epoch)
        torch.save(save_obj, checkpoint_path)

        # Best-effort backup to Drive: a failed copy must not abort training,
        # but it is reported instead of being silently ignored.
        backup = subprocess.run(['cp', checkpoint_path, 'drive/MyDrive/ALBEF'],
                                capture_output=True, text=True)
        if backup.returncode != 0:
            print('WARNING: could not copy %s to drive/MyDrive/ALBEF: %s'
                  % (checkpoint_path, backup.stderr.strip()))

# Generate answers for the test split with the final model state.
vqa_result = evaluation(model, test_loader, tokenizer, device, config)
# NOTE(review): the recorded run of this cell ended with "Default process group
# has not been initialized" and the next cell re-runs from this call, so
# save_result presumably makes a torch.distributed call - confirm it is guarded
# for non-distributed runs.
result_file = save_result(vqa_result, args.result_dir, 'vqa_result_epoch%d'%epoch)

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))

Not using distributed mode
Creating vqa datasets




Creating model
reshape position embedding from 256 to 576
reshape position embedding from 256 to 576
load checkpoint from ./ALBEF_4M.pth
_IncompatibleKeys(missing_keys=[], unexpected_keys=['temp', 'image_queue', 'text_queue', 'queue_ptr', 'vision_proj.weight', 'vision_proj.bias', 'text_proj.weight', 'text_proj.bias', 'itm_head.weight', 'itm_head.bias', 'vision_proj_m.weight', 'vision_proj_m.bias', 'text_proj_m.weight', 'text_proj_m.bias', 'visual_encoder.blocks.6.norm1.weight', 'visual_encoder.blocks.6.norm1.bias', 'visual_encoder.blocks.6.attn.qkv.weight', 'visual_encoder.blocks.6.attn.qkv.bias', 'visual_encoder.blocks.6.attn.proj.weight', 'visual_encoder.blocks.6.attn.proj.bias', 'visual_encoder.blocks.6.norm2.weight', 'visual_encoder.blocks.6.norm2.bias', 'visual_encoder.blocks.6.mlp.fc1.weight', 'visual_encoder.blocks.6.mlp.fc1.bias', 'visual_encoder.blocks.6.mlp.fc2.weight', 'visual_encoder.blocks.6.mlp.fc2.bias', 'visual_encoder.blocks.7.norm1.weight', 'visual_encoder.blocks.7.no

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [0]  [    0/20565]  eta: 20:49:13  lr: 0.000010  loss: 31.4302  time: 3.6447  data: 1.3974  max mem: 11759


  offset = -low * scale


Train Epoch: [0]  [   50/20565]  eta: 2:10:36  lr: 0.000010  loss: 9.2759  time: 0.3159  data: 0.0002  max mem: 13312
Train Epoch: [0]  [  100/20565]  eta: 1:59:07  lr: 0.000010  loss: 6.5193  time: 0.3162  data: 0.0002  max mem: 13312
Train Epoch: [0]  [  150/20565]  eta: 1:55:07  lr: 0.000013  loss: 5.2038  time: 0.3165  data: 0.0003  max mem: 13312
Train Epoch: [0]  [  200/20565]  eta: 1:52:58  lr: 0.000013  loss: 7.9894  time: 0.3157  data: 0.0002  max mem: 13312
Train Epoch: [0]  [  250/20565]  eta: 1:51:41  lr: 0.000015  loss: 6.0901  time: 0.3173  data: 0.0002  max mem: 13312
Train Epoch: [0]  [  300/20565]  eta: 1:50:40  lr: 0.000015  loss: 6.1345  time: 0.3159  data: 0.0002  max mem: 13312
Train Epoch: [0]  [  350/20565]  eta: 1:49:48  lr: 0.000018  loss: 5.0508  time: 0.3151  data: 0.0002  max mem: 13312
Train Epoch: [0]  [  400/20565]  eta: 1:49:05  lr: 0.000018  loss: 5.6330  time: 0.3164  data: 0.0002  max mem: 13312
Train Epoch: [0]  [  450/20565]  eta: 1:48:30  lr: 0.000

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [1]  [    0/20565]  eta: 8:53:59  lr: 0.000019  loss: 4.8544  time: 1.5580  data: 1.1907  max mem: 14556


  offset = -low * scale


Train Epoch: [1]  [   50/20565]  eta: 1:57:16  lr: 0.000019  loss: 2.6268  time: 0.3178  data: 0.0003  max mem: 14556
Train Epoch: [1]  [  100/20565]  eta: 1:52:59  lr: 0.000019  loss: 3.2184  time: 0.3187  data: 0.0003  max mem: 14556
Train Epoch: [1]  [  150/20565]  eta: 1:51:01  lr: 0.000019  loss: 3.7658  time: 0.3168  data: 0.0002  max mem: 14556
Train Epoch: [1]  [  200/20565]  eta: 1:49:58  lr: 0.000019  loss: 2.6658  time: 0.3159  data: 0.0002  max mem: 14556
Train Epoch: [1]  [  250/20565]  eta: 1:49:12  lr: 0.000019  loss: 3.3373  time: 0.3173  data: 0.0002  max mem: 14556
Train Epoch: [1]  [  300/20565]  eta: 1:48:37  lr: 0.000019  loss: 3.1671  time: 0.3146  data: 0.0002  max mem: 14556
Train Epoch: [1]  [  350/20565]  eta: 1:48:07  lr: 0.000019  loss: 3.2101  time: 0.3188  data: 0.0002  max mem: 14556
Train Epoch: [1]  [  400/20565]  eta: 1:47:39  lr: 0.000019  loss: 3.5128  time: 0.3190  data: 0.0002  max mem: 14556
Train Epoch: [1]  [  450/20565]  eta: 1:47:17  lr: 0.000

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [2]  [    0/20565]  eta: 8:18:44  lr: 0.000017  loss: 3.5431  time: 1.4551  data: 1.0996  max mem: 14698


  offset = -low * scale


Train Epoch: [2]  [   50/20565]  eta: 1:56:13  lr: 0.000017  loss: 3.5246  time: 0.3142  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  100/20565]  eta: 1:51:59  lr: 0.000017  loss: 3.8082  time: 0.3172  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  150/20565]  eta: 1:49:59  lr: 0.000017  loss: 3.5631  time: 0.3156  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  200/20565]  eta: 1:49:07  lr: 0.000017  loss: 3.1120  time: 0.3154  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  250/20565]  eta: 1:48:17  lr: 0.000017  loss: 3.3682  time: 0.3119  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  300/20565]  eta: 1:47:42  lr: 0.000017  loss: 3.5582  time: 0.3150  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  350/20565]  eta: 1:47:14  lr: 0.000017  loss: 4.6747  time: 0.3165  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  400/20565]  eta: 1:46:48  lr: 0.000017  loss: 2.6109  time: 0.3164  data: 0.0002  max mem: 14698
Train Epoch: [2]  [  450/20565]  eta: 1:46:22  lr: 0.000

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [3]  [    0/20565]  eta: 8:15:22  lr: 0.000014  loss: 3.0966  time: 1.4453  data: 1.0913  max mem: 14698
Train Epoch: [3]  [   50/20565]  eta: 1:56:25  lr: 0.000014  loss: 3.0405  time: 0.3184  data: 0.0002  max mem: 14698
Train Epoch: [3]  [  100/20565]  eta: 1:51:58  lr: 0.000014  loss: 2.3241  time: 0.3155  data: 0.0002  max mem: 14698
Train Epoch: [3]  [  150/20565]  eta: 1:50:17  lr: 0.000014  loss: 3.1893  time: 0.3148  data: 0.0002  max mem: 14698
Train Epoch: [3]  [  200/20565]  eta: 1:49:19  lr: 0.000014  loss: 2.8479  time: 0.3151  data: 0.0002  max mem: 14698
Train Epoch: [3]  [  250/20565]  eta: 1:48:31  lr: 0.000014  loss: 2.6318  time: 0.3126  data: 0.0002  max mem: 14698
Train Epoch: [3]  [  300/20565]  eta: 1:47:56  lr: 0.000014  loss: 2.2076  time: 0.3152  data: 0.0002  max mem: 14698
Train Epoch: [3]  [  350/20565]  eta: 1:47:28  lr: 0.000014  loss: 2.4531  time: 0.3156  data: 0.0002  max mem: 14698
Train Epoch: [3]  [  400/20565]  eta: 1:47:05  lr: 0.000

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [4]  [    0/20565]  eta: 8:51:34  lr: 0.000011  loss: 3.2468  time: 1.5509  data: 1.1587  max mem: 14771
Train Epoch: [4]  [   50/20565]  eta: 1:56:55  lr: 0.000011  loss: 3.9336  time: 0.3192  data: 0.0002  max mem: 14771
Train Epoch: [4]  [  100/20565]  eta: 1:52:20  lr: 0.000011  loss: 2.8361  time: 0.3152  data: 0.0002  max mem: 14771
Train Epoch: [4]  [  150/20565]  eta: 1:50:37  lr: 0.000011  loss: 3.3046  time: 0.3159  data: 0.0002  max mem: 14771
Train Epoch: [4]  [  200/20565]  eta: 1:49:35  lr: 0.000011  loss: 3.0243  time: 0.3183  data: 0.0002  max mem: 14771
Train Epoch: [4]  [  250/20565]  eta: 1:48:47  lr: 0.000011  loss: 3.3898  time: 0.3153  data: 0.0002  max mem: 14771
Train Epoch: [4]  [  300/20565]  eta: 1:48:10  lr: 0.000011  loss: 2.2508  time: 0.3144  data: 0.0002  max mem: 14771
Train Epoch: [4]  [  350/20565]  eta: 1:47:38  lr: 0.000011  loss: 3.7915  time: 0.3133  data: 0.0002  max mem: 14771
Train Epoch: [4]  [  400/20565]  eta: 1:47:10  lr: 0.000

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [5]  [    0/20565]  eta: 8:51:52  lr: 0.000007  loss: 3.1205  time: 1.5518  data: 1.1831  max mem: 14940


  offset = -low * scale


Train Epoch: [5]  [   50/20565]  eta: 1:57:17  lr: 0.000007  loss: 3.4401  time: 0.3202  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  100/20565]  eta: 1:52:49  lr: 0.000007  loss: 2.2705  time: 0.3167  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  150/20565]  eta: 1:50:46  lr: 0.000007  loss: 2.5417  time: 0.3177  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  200/20565]  eta: 1:49:37  lr: 0.000007  loss: 2.6248  time: 0.3170  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  250/20565]  eta: 1:48:50  lr: 0.000007  loss: 4.1115  time: 0.3157  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  300/20565]  eta: 1:48:09  lr: 0.000007  loss: 2.6093  time: 0.3149  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  350/20565]  eta: 1:47:36  lr: 0.000007  loss: 2.9761  time: 0.3141  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  400/20565]  eta: 1:47:11  lr: 0.000007  loss: 3.0725  time: 0.3164  data: 0.0002  max mem: 14940
Train Epoch: [5]  [  450/20565]  eta: 1:46:46  lr: 0.000

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [6]  [    0/20565]  eta: 8:37:59  lr: 0.000004  loss: 2.3024  time: 1.5113  data: 1.1586  max mem: 14940
Train Epoch: [6]  [   50/20565]  eta: 1:56:55  lr: 0.000004  loss: 2.8837  time: 0.3168  data: 0.0002  max mem: 14940
Train Epoch: [6]  [  100/20565]  eta: 1:52:24  lr: 0.000004  loss: 3.3647  time: 0.3177  data: 0.0002  max mem: 14940
Train Epoch: [6]  [  150/20565]  eta: 1:50:42  lr: 0.000004  loss: 3.7999  time: 0.3167  data: 0.0002  max mem: 14940
Train Epoch: [6]  [  200/20565]  eta: 1:49:29  lr: 0.000004  loss: 2.4904  time: 0.3126  data: 0.0002  max mem: 14940
Train Epoch: [6]  [  250/20565]  eta: 1:48:43  lr: 0.000004  loss: 2.3861  time: 0.3126  data: 0.0002  max mem: 14940
Train Epoch: [6]  [  300/20565]  eta: 1:48:08  lr: 0.000004  loss: 2.3969  time: 0.3147  data: 0.0002  max mem: 14940
Train Epoch: [6]  [  350/20565]  eta: 1:47:33  lr: 0.000004  loss: 3.3599  time: 0.3134  data: 0.0002  max mem: 14940
Train Epoch: [6]  [  400/20565]  eta: 1:47:10  lr: 0.000

  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


Train Epoch: [7]  [    0/20565]  eta: 8:28:09  lr: 0.000002  loss: 2.8332  time: 1.4826  data: 1.1055  max mem: 14940


  offset = -low * scale


Train Epoch: [7]  [   50/20565]  eta: 1:56:39  lr: 0.000002  loss: 3.6876  time: 0.3179  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  100/20565]  eta: 1:52:23  lr: 0.000002  loss: 3.8490  time: 0.3169  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  150/20565]  eta: 1:50:40  lr: 0.000002  loss: 2.6062  time: 0.3170  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  200/20565]  eta: 1:49:40  lr: 0.000002  loss: 2.7558  time: 0.3147  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  250/20565]  eta: 1:48:59  lr: 0.000002  loss: 2.6288  time: 0.3150  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  300/20565]  eta: 1:48:21  lr: 0.000002  loss: 3.7193  time: 0.3156  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  350/20565]  eta: 1:47:48  lr: 0.000002  loss: 2.6352  time: 0.3166  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  400/20565]  eta: 1:47:16  lr: 0.000002  loss: 2.5980  time: 0.3121  data: 0.0002  max mem: 14940
Train Epoch: [7]  [  450/20565]  eta: 1:46:51  lr: 0.000

ValueError: Default process group has not been initialized, please make sure to call init_process_group.

In [None]:
# Write the predictions still held in `vqa_result` to the result directory,
# named after the last epoch that ran.
result_name = 'vqa_result_epoch%d' % epoch
result_file = save_result(vqa_result, args.result_dir, result_name)

# Report the wall-clock time elapsed since `start_time` was taken.
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))

result file saved to ./output/vqa/result/vqa_result_epoch7.json
Training time 16:10:15


In [None]:
# Copy the whole output directory (config snapshot, log, checkpoints, results)
# to the ALBEF folder on Drive so it outlives the Colab VM.
!cp -r output drive/MyDrive/ALBEF/