## Base Inference

In [1]:
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
from PIL import Image

model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-infographics-vqa-base").to("cuda")
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-infographics-vqa-base")


In [2]:
image = Image.open("/home/jjh/level3-cv-productserving-cv-10/data/images/10065.jpeg")
question = "Which market crash had the lowest impact on the S&P 500, Dot-com crash, Coronavirus crash, or Great recession ?"
inputs = processor(images=image, text=question, return_tensors="pt").to("cuda")

In [3]:
inputs["flattened_patches"].shape

torch.Size([1, 2048, 770])

In [4]:
ins = processor(images = image,text=question,return_tensors='pt').to('cuda')

In [5]:
predictions = model.generate(**inputs)
pred = processor.decode(predictions[0], skip_special_tokens=True)
print(pred)

Great recession




## train

In [6]:
import os
import torch
from torch.utils.data import Dataset
import json
from PIL import Image

In [7]:
from transformers import AutoProcessor
auto_processor = AutoProcessor.from_pretrained("google/pix2struct-infographics-vqa-base")

In [21]:
class Pix2StructDataset(Dataset):
    def __init__(self, image_dir, json_dir, processor, train):
        self.img_dir = image_dir
        with open(json_dir) as f:
            self.json_data = json.load(f)
        self.processor = processor
        self.file_list = os.listdir(image_dir)
        self.train = train
        
    def __getitem__(self, index): 
        data = self.json_data["data"][index]
        image_name = data["image_local_name"]
        img = Image.open(os.path.join(self.img_dir, image_name))
        q = data["question"]
        inputs = self.processor(images=img, text=q, return_tensors="pt").to('cuda')
        if self.train:
            a = data["answers"][0]
            label = self.processor.tokenizer(text=a, padding="max_length", return_tensors="pt", add_special_tokens=True, max_length=20).input_ids.to("cuda")
            
            return inputs, label
        return inputs
  
    
    def __len__(self): 
        return len(self.file_list)

In [38]:
def collator(batch):
  new_batch = {"inputs":[], "labels":[]}
  
  for item in batch:
    new_batch["inputs"].append(item[0])
    new_batch["labels"].append(item[1])
  
  new_batch["inputs"] = torch.stack(new_batch["inputs"])
  new_batch["labels"] = torch.stack(new_batch["labels"])
  return new_batch

In [39]:
img_dir = '/home/jjh/level3-cv-productserving-cv-10/data/images/'
train_dataset = Pix2StructDataset(image_dir=img_dir, json_dir='../data/qas/infographicsVQA_train_v1.0.json', processor=auto_processor, train=True)
val_dataset = Pix2StructDataset(image_dir=img_dir, json_dir='../data/qas/infographicsVQA_val_v1.0_withQT.json', processor=auto_processor, train=True)
test_dataset = Pix2StructDataset(image_dir=img_dir, json_dir='../data/qas/infographicsVQA_test_v1.0.json', processor=auto_processor, train=False)


In [40]:
train_dataset[0]

({'flattened_patches': tensor([[[ 1.0000,  1.0000,  0.6817,  ..., -1.7981, -1.9302, -2.0035],
          [ 1.0000,  2.0000,  0.6817,  ..., -1.7981, -1.9302, -2.0035],
          [ 1.0000,  3.0000,  0.6817,  ..., -1.7981, -1.9302, -2.0035],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
        device='cuda:0'), 'attention_mask': tensor([[1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:0')},
 tensor([[ 4712,   935, 36637,     1,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
        device='cuda:0'))

In [41]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=4, collate_fn=collator)

In [138]:
for i, batch in enumerate(train_dataloader):
    inputs = batch[0]
    labels = batch[1]
    if i>3:
        break

In [42]:
import requests
from PIL import Image
from transformers import AutoProcessor, Pix2StructVisionModel
image_processor = AutoProcessor.from_pretrained("google/pix2struct-infographics-vqa-base")
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-infographics-vqa-base")
image = Image.open("/home/jjh/level3-cv-productserving-cv-10/data/images/10065.jpeg")
question = "Which market crash had the lowest impact on the S&P 500, Dot-com crash, Coronavirus crash, or Great recession ?"
labels = image_processor.tokenizer(text="Dot-com crash", padding="max_length", return_tensors="pt", add_special_tokens=True, max_length=20).input_ids.to("cuda")
inputs = image_processor(images=image, text=question, return_tensors="pt").to("cuda")
print(inputs)
labels = train_dataset[0][1]
inputs = train_dataset[0][0]
model.to("cuda")
model.train()
with torch.no_grad():
    outputs = model(**inputs, labels = labels)
loss = outputs.loss

print("Loss:", loss.item())

{'flattened_patches': tensor([[[ 1.0000,  1.0000,  2.6019,  ...,  2.2636,  2.2636,  2.2636],
         [ 1.0000,  2.0000,  2.6019,  ..., -0.6839, -0.6839, -0.6839],
         [ 1.0000,  3.0000,  2.6019,  ...,  2.1186,  2.1186,  2.1186],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0'), 'attention_mask': tensor([[1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:0')}
Loss: 35.865047454833984


In [43]:
EPOCHS = 5

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

model.train()

for epoch in range(EPOCHS):
  print("Epoch:", epoch)
  for idx, batch in enumerate(train_dataloader):
    # labels = batch.pop("labels").to(device)
    # flattened_patches = batch.pop("flattened_patches").to(device)
    # attention_mask = batch.pop("attention_mask").to(device)
    # inputs["flattened_patches"] = batch[0]["flattened_patches"]
    # inputs["attention_mask"] = batch[0]["attention_mask"]
    inputs = batch[0]          
    labels = batch[1]
    with torch.no_grad():
      outputs = model(**inputs, labels = labels)
    
    loss = outputs.loss

    
    loss.requires_grad_(True)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
  
    if (epoch + 1) % 20 == 0:
        model.eval()

        predictions = model.generate(**inputs)        
        print("Predictions:", processor.batch_decode(predictions, skip_special_tokens=True))

        model.train()
  print("Loss:", loss.item())

Epoch: 0


TypeError: expected Tensor as element 0 in argument 0, but got BatchFeature

In [30]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="pix2struct_1",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [31]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collator
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maurantiacus1220[0m ([33mai_tech_6th_cv_level1[0m). Use [1m`wandb login --relogin`[0m to force relogin


({'flattened_patches': tensor([[[ 1.0000,  1.0000,  0.7127,  ...,  0.7127,  0.7127,  0.7127],
         [ 1.0000,  2.0000,  0.7127,  ..., -2.4389, -2.4389, -2.4389],
         [ 1.0000,  3.0000,  0.7127,  ..., -0.9633, -0.9633, -0.9633],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0'), 'attention_mask': tensor([[1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:0')}, tensor([[ 9208,   334,   307,   287,  8642,  4538, 23511, 26471,     1,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0'))


RuntimeError: cannot pin 'torch.cuda.FloatTensor' only dense CPU tensors can be pinned