Let's extract our vision model and work out how we can begin retraining its weights. We have some label information for our images (container present or not), which we can probably use to fine-tune the vision model so that it better understands the type of pictures that will be thrown at it in this application.

*Effectively, we're trying to overfit the vision model on the images that we eventually want to label.*

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from pathlib import Path
import pandas as pd
from PIL import Image
import itertools
import torch
import os

In [None]:
def get_model(model_name, device='cuda'):
    """Load the BLIP-2 processor and conditional-generation model.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``.
        device: Device the model is moved to after loading.

    Returns:
        A ``(model, processor)`` tuple -- note the model comes first.
    """
    blip_processor = Blip2Processor.from_pretrained(model_name)
    blip_model = Blip2ForConditionalGeneration.from_pretrained(model_name)
    blip_model = blip_model.to(device)
    return blip_model, blip_processor


def get_file_names(path, extension='.png'):
    """Return the names of the entries in ``path`` that end with ``extension``.

    The ``extension`` argument used to be ignored (the filter was hard-coded
    to ``'.png'``). It is now honoured, and the default is ``'.png'`` so that
    calling the function without an extension returns exactly what it always
    did.

    Args:
        path: Directory to list.
        extension: Suffix to keep, including the leading dot. The match is
            case-sensitive. A tuple of suffixes is also accepted.

    Returns:
        A list of bare file names (no directory part), in ``os.listdir`` order.
    """
    return [file_name for file_name in os.listdir(path)
            if file_name.endswith(extension)]


def get_img_paths(folder_path, img_names):
    """Prefix each name in ``img_names`` with ``folder_path`` and a ``'/'``.

    ``folder_path`` is converted with ``str`` so a ``Path`` works as well as
    a plain string; the result is always a list of strings.
    """
    prefix = str(folder_path) + '/'
    return [prefix + name for name in img_names]
    

def get_imgs(img_paths):
    """Open every path in ``img_paths`` with ``Image.open``; return the images as a list."""
    return [Image.open(img_path) for img_path in img_paths]


def get_lbls(images, processor, model, device=None):
    """Generate one decoded text label per image.

    Args:
        images: Images passed to ``processor`` as its ``images`` argument.
        processor: Callable that builds the model inputs and provides
            ``batch_decode``.
        model: Object exposing ``generate``.
        device: Target device; when ``None`` it is ``'cuda'`` if available,
            otherwise ``'cpu'``.

    Returns:
        The result of ``processor.batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Inputs are moved to the device and cast to float32 in one call.
    batch = processor(images=images, return_tensors='pt')
    batch = batch.to(device, torch.float32)
    token_ids = model.generate(**batch)
    return processor.batch_decode(token_ids, skip_special_tokens=True)

def get_batch_idxs(img_paths, batch_size=10):
    """Return ``(start, stop)`` slice bounds that cut ``img_paths`` into batches.

    Every batch has ``batch_size`` items except possibly the last, which
    holds the remainder. An empty input yields an empty list.

    Args:
        img_paths: Any sized sequence; only its length is used.
        batch_size: Maximum number of items per batch.

    Returns:
        A list of ``(start, stop)`` tuples usable as ``seq[start:stop]``.
    """
    n_items = len(img_paths)
    # Clamp the stop index so the final, partial batch ends at the last item.
    return [(start, min(start + batch_size, n_items))
            for start in range(0, n_items, batch_size)]

def get_batch(img_paths, idx_tup):
    """Open the images whose paths lie in ``img_paths[idx_tup[0]:idx_tup[1]]``."""
    start, stop = idx_tup[0], idx_tup[1]
    batch_paths = img_paths[start:stop]
    return get_imgs(batch_paths)
    

def run_inf(img_paths, processor, model, device=None, batch_size=10):
    """Run label generation over all images, one batch at a time.

    ``batch_size`` is now forwarded to ``get_batch_idxs``; it used to be
    accepted but ignored, so every call ran with that helper's default.

    Args:
        img_paths: Sequence of image paths.
        processor: Processor forwarded to ``get_lbls``.
        model: Model forwarded to ``get_lbls``.
        device: Target device; when ``None`` it is ``'cuda'`` if available,
            otherwise ``'cpu'``.
        batch_size: Number of images opened and processed per batch.

    Returns:
        A flat list with the labels of every batch, in input order.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Each batch is opened and labelled lazily as the chain is consumed.
    batch_lbls = (
        get_lbls(get_batch(img_paths, idx_tup), processor, model, device)
        for idx_tup in get_batch_idxs(img_paths, batch_size)
    )
    return list(itertools.chain.from_iterable(batch_lbls))

In [None]:
# Folder holding the campaign-2 images used throughout this notebook.
img_folder = Path('data/campaign2/0021')
# File names in that folder, then the same names prefixed with the folder path.
img_names = get_file_names(img_folder)
img_paths = get_img_paths(img_folder, img_names)
# Peek at the first three paths as a sanity check.
img_paths[:3]

['data/campaign2/0021/image_0021835.png',
 'data/campaign2/0021/image_0021801.png',
 'data/campaign2/0021/background_0021520.png']

In [None]:
# Checkpoint to load; get_model is called with its default device ('cuda').
model_name = 'Salesforce/blip2-flan-t5-xl'
# get_model returns (model, processor), in that order.
model, processor = get_model(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Keep a handle on the model's vision sub-module so it can be run on its own.
vision_model = model.vision_model

In [None]:
vision_model.config

Blip2VisionConfig {
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "gelu",
  "hidden_size": 1408,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 1e-10,
  "intermediate_size": 6144,
  "layer_norm_eps": 1e-05,
  "model_type": "blip_2_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 39,
  "patch_size": 14,
  "projection_dim": 512,
  "qkv_bias": true,
  "transformers_version": "4.27.0.dev0"
}

Now let's extract a single image.

In [None]:
import numpy as np
# Load the first image as a NumPy array.
np_img = np.array(Image.open(img_paths[0]))
# Move the last (channel) axis to the front so the tensor is channels-first.
tns_img = torch.tensor(np_img.transpose(2, 0,1))
# Compare the array and tensor shapes.
np_img.shape, tns_img.shape

((720, 720, 3), torch.Size([3, 720, 720]))

In [None]:
#this is how we can add the batch dimension
tns_img[None,...].shape

torch.Size([1, 3, 720, 720])

In [None]:
#if you want to look inside the model
#vision_model??

In [None]:
#the first embedding layer (with a conv2d) that reduces the image feature space down
#vision_model.embeddings??
vision_model.embeddings.patch_embedding

Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))

In [None]:
# Smoke test of the vision model on a single image:
#   [:, :224, :224]  keeps only the top-left 224x224 crop (no resize),
#   [None, ...]      adds the batch dimension,
#   .float().cuda()  casts the raw pixel values and moves them to the GPU.
# NOTE(review): no rescaling or mean/std normalisation is applied here, so
# these activations are for shape exploration only.
out = vision_model(tns_img[:,:224, :224][None, ...].float().cuda())
out

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.4659, -0.8540,  0.2021,  ..., -0.1645, -0.2268, -0.3574],
         [-0.7202, -1.2308, -0.1918,  ...,  0.5010, -0.6158, -0.6872],
         [-0.8480, -0.6770, -0.5201,  ..., -0.1174, -0.0808, -1.1502],
         ...,
         [-0.3844, -2.3725, -0.2407,  ...,  0.1078, -0.9313, -0.1320],
         [-0.0489, -2.1740, -0.3484,  ...,  0.1823, -0.5096,  0.0825],
         [-0.9160, -1.4308, -0.4508,  ...,  0.8405, -0.6817,  0.6056]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9260, -1.5141,  0.2457,  ..., -0.6644, -0.2725, -0.7160]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [None]:
out['last_hidden_state'].shape

torch.Size([1, 257, 1408])

In [None]:
# Take index 0 along the sequence axis of the (1, 257, 1408) hidden state.
# 257 = 1 + (224 // 14) ** 2, i.e. one extra token on top of the 256 patches.
# NOTE(review): presumably index 0 is that extra (class) token -- confirm.
out['last_hidden_state'][:, 0, :].shape

torch.Size([1, 1408])

Let's just use the first token's activations for now and see how we can add a simple head on top of them to generate a positive or negative result for our container being present.

In [None]:
# Throwaway, untrained 1408 -> 2 linear layer applied to the first token.
# The activations are moved to the CPU to match the freshly created layer.
torch.nn.Linear(1408, 2)(out['last_hidden_state'][:, 0, :].cpu())

tensor([[ 0.2849, -0.7905]], grad_fn=<AddmmBackward0>)

Obviously we'll use something more complex than a single linear layer alone, but it does allow us to flesh out what's needed.

In [None]:
def get_head(activation=True, normalization=True, in_features=1408, n_classes=2):
    """Build a small classification head as a ``torch.nn.Sequential``.

    The layers are, in order: an optional ``ReLU``, an optional ``LayerNorm``
    over ``in_features``, and a final ``Linear(in_features, n_classes)``.

    Args:
        activation: Prepend a ``ReLU`` when true.
        normalization: Add a ``LayerNorm`` before the linear layer when true.
        in_features: Width of the incoming feature vector. The default, 1408,
            is the vision model's ``hidden_size``.
        n_classes: Number of output logits (2 for container present / absent).

    Returns:
        The assembled ``torch.nn.Sequential`` with freshly initialised weights.
    """
    layers = []
    if activation:
        layers.append(torch.nn.ReLU())
    if normalization:
        layers.append(torch.nn.LayerNorm(in_features))
    layers.append(torch.nn.Linear(in_features, n_classes))
    return torch.nn.Sequential(*layers)

In [None]:
# Same check with the default head: first-token activations (on CPU) in, two logits out.
get_head()(out['last_hidden_state'][:, 0, :].cpu())

tensor([[ 0.2658, -0.3907]], grad_fn=<AddmmBackward0>)

### Now let's incorporate the fastai learner so that we can quickly try out new ideas and implement them in a training loop!

Maybe it's better to use the model's own image processor to generate the input images --> we can add augmentations after the fact if necessary?

In [None]:
processor.attributes

['image_processor', 'tokenizer']

In [None]:
processor.image_processor

BlipImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "BlipImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Blip2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [None]:
# Run the first image through the processor (no return_tensors argument) and
# check the shape of the resulting pixel array.
processor(images=get_imgs([img_paths[0],]))['pixel_values'][0].shape

(3, 224, 224)

Let's parse our JSON to determine how we can extract our label class.

In [None]:
# Derive the JSON path for an image by swapping its extension.
# NOTE(review): this keeps everything before the FIRST '.', so it only works
# while the path contains no other dots.
(img_paths[0].split(sep='.')[0] + '.json')

'data/campaign2/0021/image_0021835.json'

In [None]:
import json
# Read the JSON file that sits next to the first image as raw text.
with open((img_paths[0].split(sep='.')[0] + '.json'), 'r') as f:
    data = f.read()

In [None]:
json.loads(data)['annotations'][0]['category']

'synthetic'

In [None]:
json.loads(data)#['annotations']

{'annotations': [{'category': 'synthetic'},
  {'category': '30B',
   'contour_mode': 'XY_ABS',
   'contours': [[[553.0, 233.1333302754254],
     [553.4239452464909, 233.0],
     [554.0, 232.82463911810595],
     [555.0, 232.6024527481582],
     [556.0, 232.53644850887912],
     [557.0, 232.39485150466885],
     [558.0, 232.11329594945667],
     [558.1634417063007, 232.0],
     [559.0, 231.57412769282575],
     [559.8515886810253, 231.0],
     [560.0, 230.85852512707737],
     [560.7388848860415, 230.0],
     [561.0, 229.69263866623342],
     [561.4824412485552, 229.0],
     [562.0, 228.24800990604362],
     [562.211911154025, 228.0],
     [562.6020802285308, 227.0],
     [563.0, 226.33878197382913],
     [563.2402223766401, 226.0],
     [563.5522057652987, 225.0],
     [563.9975581841112, 224.0],
     [564.0, 223.99690496748354],
     [564.4744195287193, 223.0],
     [564.7354455176578, 222.0],
     [565.0, 221.4656056730678],
     [565.2365786200631, 221.0],
     [565.5094373657602, 2

In [None]:
def get_img_lbl(img_path):
    """Return the category of the first annotation in an image's JSON file.

    The JSON file is expected next to the image, with the same name and a
    ``.json`` extension. Only the final extension is replaced, so paths that
    contain other dots (``./data/img.png``, ``run.v2/img.png``) resolve
    correctly; splitting on the first dot used to truncate them.

    Args:
        img_path: Path to the image file.

    Returns:
        ``annotations[0]['category']`` from the parsed JSON.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        KeyError, IndexError: If the JSON lacks the expected structure.
    """
    # NOTE(review): in the sample file the first annotation is the image-level
    # 'synthetic' tag and the container class ('30B') comes in a later entry --
    # confirm that annotations[0] is really the label wanted for training.
    json_path = os.path.splitext(img_path)[0] + '.json'
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)['annotations'][0]['category']

In [None]:
#[get_img_lbl(img_paths[x]) for x in range(500)]

In [None]:
from torch.utils.data import Dataset, DataLoader

class imgDataset(Dataset):
    """Map-style dataset yielding ``(pixel_values, label)`` for each image path.

    Items are built on access. ``__getitem__`` relies on the module-level
    ``processor`` and on the ``get_imgs`` / ``get_img_lbl`` helpers.
    """

    def __init__(self, img_paths):
        # Sequence of image paths; indexed directly by __getitem__.
        self.img_paths = img_paths

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        # The processor gets a one-image list, so take element 0 of its output.
        processed = processor(images=get_imgs([img_path]))
        pixel_values = processed['pixel_values'][0]
        return pixel_values, get_img_lbl(img_path)
    
# Build the dataset over every image path, then spot-check its length and one sample.
ds = imgDataset(img_paths)
len(ds), ds[100]

(1100,
 (array([[[ 1.1128243 ,  1.1274228 ,  1.1274228 , ...,  1.9303361 ,
            1.9303361 ,  1.9303361 ],
          [ 1.098226  ,  1.1128243 ,  1.1128243 , ...,  1.9303361 ,
            1.9303361 ,  1.9303361 ],
          [ 1.098226  ,  1.098226  ,  1.1128243 , ...,  1.9303361 ,
            1.9303361 ,  1.9303361 ],
          ...,
          [-0.405412  , -0.42001042, -0.42001042, ...,  0.42669836,
            0.44129676,  0.44129676],
          [-0.42001042, -0.42001042, -0.43460885, ...,  0.42669836,
            0.42669836,  0.42669836],
          [-0.43460885, -0.43460885, -0.43460885, ...,  0.41209993,
            0.41209993,  0.42669836]],
  
         [[ 1.2344488 ,  1.2494565 ,  1.2494565 , ...,  2.0748837 ,
            2.0748837 ,  2.0748837 ],
          [ 1.219441  ,  1.2344488 ,  1.2344488 , ...,  2.0748837 ,
            2.0748837 ,  2.0748837 ],
          [ 1.219441  ,  1.219441  ,  1.2344488 , ...,  2.0748837 ,
            2.0748837 ,  2.0748837 ],
          ...,
     

In [None]:
# Collect the label of every image, then show how many there are and which
# distinct values occur.
all_lbls = [get_img_lbl(img_path) for img_path in img_paths]
len(all_lbls), pd.Series(all_lbls).unique()

(1100, array(['synthetic'], dtype=object))

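All of our labels are `synthetic`??? The campaign documentation (https://limbo-ml.readthedocs.io/en/latest/campaign2.html) seems to indicate they should not be. Note that in the sample JSON above, `'synthetic'` is only the first entry of `annotations`; the container class (`'30B'`) appears in a later entry, so reading `annotations[0]` may simply be picking up the wrong annotation. Also, we only have 1100 of the approximately 55k image/label pairs, so the full campaign data is not available locally on the personal rig.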

Looking at some of the other campaigns, maybe we could turn this into a multi-category classification task for retraining the vision weights? Campaign 8 shows a number of potential labels that could be used (https://limbo-ml.readthedocs.io/en/latest/campaign8.html).


Unrelated to the vision model, but it came to mind when looking at campaign 8: could we use the specific labels from the dataset to prioritize our generated labels, by injecting them into the BLIP-2 prompt?