Let's extract our vision model and work out how we can begin retraining its weights. We have some label information for our images (container present or not), which we can probably use to fine-tune the vision model so that it better understands the type of pictures that will be thrown at it in this application.

*Effectively, we're trying to overfit the vision model on the images that we eventually want to label.*

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from pathlib import Path
import pandas as pd
from PIL import Image
import itertools
import torch
import os

In [None]:
def get_model(model_name, device='cuda'):
    """Load a pretrained BLIP-2 model together with its processor.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``.
        device: Device the model is moved to once it has been loaded.

    Returns:
        A ``(model, processor)`` tuple.
    """
    blip_processor = Blip2Processor.from_pretrained(model_name)
    blip_model = Blip2ForConditionalGeneration.from_pretrained(model_name)
    blip_model = blip_model.to(device)
    return blip_model, blip_processor


def get_file_names(path, extension='.png'):
    """List the entries of *path* whose name ends with *extension*.

    The previous implementation ignored ``extension`` and always filtered on
    ``'.png'``; the argument is now honoured. The default is ``'.png'`` so
    that calls relying on the default keep returning the same files.

    Args:
        path: Directory to list (anything ``os.listdir`` accepts).
        extension: Suffix to keep, including the leading dot. A tuple of
            suffixes also works because it is passed to ``str.endswith``.

    Returns:
        A list of matching file names (not full paths), in the order
        ``os.listdir`` yields them.
    """
    return [file_name for file_name in os.listdir(path)
            if file_name.endswith(extension)]


def get_img_paths(folder_path, img_names):
    """Prefix every name in *img_names* with the folder it lives in.

    Args:
        folder_path: Folder containing the images; converted with ``str``.
        img_names: Iterable of file names inside that folder.

    Returns:
        A list of ``'<folder>/<name>'`` strings, one per input name.
    """
    return [f'{str(folder_path)}/{name}' for name in img_names]
    

def get_imgs(img_paths):
    """Open each path in *img_paths* with ``Image.open`` and return the results as a list."""
    opened = []
    for img_path in img_paths:
        opened.append(Image.open(img_path))
    return opened


def get_lbls(images, processor, model, device=None):
    """Generate text for a batch of images and decode it to strings.

    Args:
        images: Images passed to the processor as ``images=``.
        processor: Callable that builds the model inputs and also provides
            ``batch_decode``.
        model: Model exposing ``generate``.
        device: Target device for the inputs; when ``None``, ``'cuda'`` is
            used if available, otherwise ``'cpu'``.

    Returns:
        The result of ``processor.batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    batch = processor(images=images, return_tensors='pt')
    batch = batch.to(device, torch.float32)
    token_ids = model.generate(**batch)
    return processor.batch_decode(token_ids, skip_special_tokens=True)

def get_batch_idxs(img_paths, batch_size=10):
    """Split ``range(len(img_paths))`` into consecutive ``(start, stop)`` slices.

    Args:
        img_paths: Sequence to be batched; only its length is used.
        batch_size: Maximum number of items per batch.

    Returns:
        A list of ``(start, stop)`` tuples covering the sequence in order.
        The last tuple is shorter when the length is not a multiple of
        ``batch_size``; an empty sequence gives an empty list.
    """
    total = len(img_paths)
    # Each batch starts on a multiple of batch_size and is clipped at the end.
    return [(start, min(start + batch_size, total))
            for start in range(0, total, batch_size)]

def get_batch(img_paths, idx_tup):
    """Open the images for one batch.

    Args:
        img_paths: Full list of image paths.
        idx_tup: Indexable whose first two items are the slice start and stop.

    Returns:
        Whatever ``get_imgs`` returns for ``img_paths[start:stop]``.
    """
    start = idx_tup[0]
    stop = idx_tup[1]
    return get_imgs(img_paths[start:stop])
    

def run_inf(img_paths, processor, model, device=None, batch_size=10):
    """Run batched inference over *img_paths* and return one flat list of labels.

    The previous implementation accepted ``batch_size`` but never forwarded
    it, so batches were always built with ``get_batch_idxs``'s default size,
    and the batch indices were computed twice. ``batch_size`` is now honoured.

    Args:
        img_paths: Paths of the images to label.
        processor: Processor forwarded to ``get_lbls``.
        model: Model forwarded to ``get_lbls``.
        device: Device forwarded to ``get_lbls``; when ``None``, ``'cuda'`` is
            used if available, otherwise ``'cpu'``.
        batch_size: Number of images opened and labelled per batch.

    Returns:
        A list with the labels of every batch concatenated in batch order.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    batch_idxs = get_batch_idxs(img_paths, batch_size=batch_size)
    # Generator: each batch is opened and labelled only as the chain is consumed.
    per_batch_lbls = (
        get_lbls(get_batch(img_paths, idx_tup), processor, model, device)
        for idx_tup in batch_idxs
    )
    return list(itertools.chain.from_iterable(per_batch_lbls))

In [None]:
# Folder holding the images used in this notebook
img_folder = Path('data/campaign2/0021')
# Collect the image file names in that folder, then build their full paths
img_names = get_file_names(img_folder)
img_paths = get_img_paths(img_folder, img_names)
# Preview the first three paths
img_paths[:3]

['data/campaign2/0021/image_0021835.png',
 'data/campaign2/0021/image_0021801.png',
 'data/campaign2/0021/background_0021520.png']

In [None]:
# BLIP-2 checkpoint to load; get_model moves the model to its default device ('cuda')
model_name = 'Salesforce/blip2-flan-t5-xl'
model, processor = get_model(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Keep a handle on the vision sub-model of the loaded BLIP-2 model
vision_model = model.vision_model

In [None]:
# Display the vision model's configuration (image size, patch size, hidden size, ...)
vision_model.config

Blip2VisionConfig {
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "gelu",
  "hidden_size": 1408,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 1e-10,
  "intermediate_size": 6144,
  "layer_norm_eps": 1e-05,
  "model_type": "blip_2_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 39,
  "patch_size": 14,
  "projection_dim": 512,
  "qkv_bias": true,
  "transformers_version": "4.27.0.dev0"
}

Now let's extract a single image.

In [None]:
import numpy as np
# Load the first image as a NumPy array; its displayed shape is (720, 720, 3)
np_img = np.array(Image.open(img_paths[0]))
# Move the last axis to the front so the tensor's shape becomes (3, 720, 720)
tns_img = torch.tensor(np_img.transpose(2, 0,1))
# Show both shapes side by side
np_img.shape, tns_img.shape

((720, 720, 3), torch.Size([3, 720, 720]))

In [None]:
# Indexing with None inserts a new leading axis, which serves as the batch dimension
tns_img[None,...].shape

torch.Size([1, 3, 720, 720])

In [None]:
#if you want to look inside the model
#vision_model??

In [None]:
# The first embedding layer: a Conv2d (14x14 kernel, stride 14, 3 -> 1408 channels)
# that reduces the image feature space down.
#vision_model.embeddings??
vision_model.embeddings.patch_embedding

Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))

In [None]:
# Take the first 224 rows and columns of the image (the config's image_size is 224),
# add a batch dimension, cast to float and move to the GPU for a forward pass.
# NOTE(review): this slices one corner of the image rather than resizing it, and it
# skips whatever preprocessing the processor applies — confirm this is only meant
# as a shape check before using it for training.
out = vision_model(tns_img[:,:224, :224][None, ...].float().cuda())
out

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.4659, -0.8540,  0.2021,  ..., -0.1645, -0.2268, -0.3574],
         [-0.7202, -1.2308, -0.1918,  ...,  0.5010, -0.6158, -0.6872],
         [-0.8480, -0.6770, -0.5201,  ..., -0.1174, -0.0808, -1.1502],
         ...,
         [-0.3844, -2.3725, -0.2407,  ...,  0.1078, -0.9313, -0.1320],
         [-0.0489, -2.1740, -0.3484,  ...,  0.1823, -0.5096,  0.0825],
         [-0.9160, -1.4308, -0.4508,  ...,  0.8405, -0.6817,  0.6056]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9260, -1.5141,  0.2457,  ..., -0.6644, -0.2725, -0.7160]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [None]:
# Shape is (batch, tokens, hidden_size); presumably 257 = 16*16 patches (224 / 14 per side)
# plus one extra token — TODO confirm against the model source.
out['last_hidden_state'].shape

torch.Size([1, 257, 1408])