## Label compilation pipeline

- This notebook generates text labels for a folder of images with BLIP-2 models and compiles them into CSV file(s).

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from pathlib import Path
import pandas as pd
from PIL import Image
import itertools
import torch
import os

In [None]:
def get_model(model_name, device='cuda'):
    """Load a pretrained BLIP-2 model and its matching processor.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        device: Device the model is moved to after loading.

    Returns:
        A ``(model, processor)`` tuple.
    """
    blip_processor = Blip2Processor.from_pretrained(model_name)
    blip_model = Blip2ForConditionalGeneration.from_pretrained(model_name)
    blip_model = blip_model.to(device)
    return blip_model, blip_processor


def get_file_names(path, extension='.png'):
    """List the entries of ``path`` whose name ends with ``extension``.

    Args:
        path: Directory to scan (anything ``os.listdir`` accepts).
        extension: Case-sensitive suffix to keep, including the leading dot.
            Defaults to ``'.png'``, which is what this function effectively
            filtered on when the argument is omitted.

    Returns:
        A list of matching file names (not full paths), in the order
        ``os.listdir`` yields them.
    """
    # endswith handles suffixes of any length ('.jpeg', '.tiff', ...), unlike a fixed 4-char slice.
    return [file_name for file_name in os.listdir(path) if file_name.endswith(extension)]


def get_img_paths(folder_path, img_names):
    """Build a path string for every image name inside ``folder_path``.

    Args:
        folder_path: Folder containing the images; converted with ``str``.
        img_names: Iterable of file-name strings.

    Returns:
        A list of ``'<folder_path>/<img_name>'`` strings, in input order.
    """
    prefix = str(folder_path) + '/'
    joined = []
    for name in img_names:
        joined.append(prefix + name)
    return joined
    

def get_imgs(img_paths):
    """Open each path in ``img_paths`` with ``Image.open`` and return the results as a list, in order."""
    opened = []
    for img_path in img_paths:
        opened.append(Image.open(img_path))
    return opened


def get_lbls(images, processor, model, device=None):
    """Run one batch of images through the model and decode the generated ids.

    Args:
        images: Batch of images handed to ``processor``.
        processor: Callable that prepares model inputs and provides ``batch_decode``.
        model: Object exposing ``generate``.
        device: Target device for the inputs; when ``None``, 'cuda' is used
            if available, otherwise 'cpu'.

    Returns:
        The result of ``processor.batch_decode`` with special tokens skipped.
    """
    if device is None:
        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'
    model_inputs = processor(images=images, return_tensors='pt')
    # Inputs are moved to the target device and cast to float32 before generation.
    model_inputs = model_inputs.to(device, torch.float32)
    token_ids = model.generate(**model_inputs)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded

def get_batch_idxs(img_paths, batch_size=10):
    """Split ``range(len(img_paths))`` into consecutive ``(start, stop)`` slices.

    Args:
        img_paths: Sequence whose length determines the index range.
        batch_size: Maximum number of items per slice.

    Returns:
        A list of ``(start, stop)`` tuples covering the sequence in order;
        the final slice is shorter when the length is not a multiple of
        ``batch_size``. An empty sequence yields an empty list.
    """
    total = len(img_paths)
    bounds = []
    for start in range(0, total, batch_size):
        # Clamp the last slice to the end of the sequence.
        stop = min(start + batch_size, total)
        bounds.append((start, stop))
    return bounds

def get_batch(img_paths, idx_tup):
    """Open the images for one batch.

    Args:
        img_paths: Full sequence of image paths.
        idx_tup: Pair whose first two items are the slice start and stop.

    Returns:
        Whatever ``get_imgs`` returns for ``img_paths[start:stop]``.
    """
    start = idx_tup[0]
    stop = idx_tup[1]
    batch_paths = img_paths[start:stop]
    return get_imgs(batch_paths)
    

def run_inf(img_paths, processor, model, device=None, batch_size=10):
    """Label every image in ``img_paths``, processing ``batch_size`` images at a time.

    Args:
        img_paths: Sequence of image paths to label.
        processor: Processor forwarded to ``get_lbls``.
        model: Model forwarded to ``get_lbls``.
        device: Device forwarded to ``get_lbls``; when ``None``, 'cuda' is
            used if available, otherwise 'cpu'.
        batch_size: Number of images per batch. This value is forwarded to
            ``get_batch_idxs`` so it actually controls the batching.

    Returns:
        A flat list with the labels of all batches, in the order of ``img_paths``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    lbls = []
    for idx_tup in get_batch_idxs(img_paths, batch_size=batch_size):
        # Only one batch of opened images is referenced at a time.
        batch = get_batch(img_paths, idx_tup)
        lbls.extend(get_lbls(batch, processor, model, device))
    return lbls

In [None]:
# Folder holding the images to label.
img_folder = Path('data/campaign2/0021')
# Collect the image file names found in that folder, then build their path strings.
img_names = get_file_names(img_folder)
img_paths = get_img_paths(img_folder, img_names)
# Peek at the first few paths as a sanity check.
img_paths[:3]

['data/campaign2/0021/image_0021835.png',
 'data/campaign2/0021/image_0021801.png',
 'data/campaign2/0021/background_0021520.png']

In [None]:
#model_name = 'Salesforce/blip2-flan-t5-xl-coco'
#model_name = 'Salesforce/blip2-opt-2.7b'
#device = 'cuda'
#model, processor = get_model(model_name, device=device)

In [None]:
#lbls = run_inf(img_paths, processor,model)
#len(lbls), lbls[:3]

In [None]:
#model_base = model_name.split('/')[1]
#model_base

In [None]:
#pd.DataFrame(zip(img_paths[:73], lbls), columns=['img_path', 'label'] \
#            ).to_csv(f'{model_base}.csv', index=False)

In [None]:
#pd.read_csv(model_base + '.csv').head(3)

Looks like it works! Let's wrap all of this into a single function and start compiling some labels!

*The models are HUGE, so the notebook has to be restarted between model runs.*

In [None]:
def compile_lbls(img_paths, model_name, batch_size=10):
    """Label all images with one model and write the result to ``<model_base>.csv``.

    Args:
        img_paths: Sequence of image paths to label.
        model_name: Checkpoint identifier of the form ``'<org>/<name>'``; the
            part after the first slash names the output CSV.
        batch_size: Number of images per inference batch, forwarded to ``run_inf``.

    Raises:
        IndexError: If ``model_name`` contains no ``'/'``.

    Side effects:
        Writes ``<model_base>.csv`` in the current working directory with the
        columns ``img_path`` and ``label`` (no index column).
    """
    # Derive the output stem first so a malformed model_name fails before the model is loaded.
    model_base = model_name.split('/')[1]
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, processor = get_model(model_name, device=device)
    # Forward the caller's batch_size instead of a hard-coded value.
    lbls = run_inf(img_paths, processor, model, device, batch_size=batch_size)
    lbl_df = pd.DataFrame(list(zip(img_paths, lbls)), columns=['img_path', 'label'])
    lbl_df.to_csv(f'{model_base}.csv', index=False)

In [None]:
# Checkpoint to label with; the commented-out line below is an alternative checkpoint.
model_name = 'Salesforce/blip2-flan-t5-xl'
#model_name = 'Salesforce/blip2-opt-2.7b'
# Labels every path in img_paths and writes the result to '<model_base>.csv'.
compile_lbls(img_paths, model_name)

Downloading (…)rocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)"spiece.model";:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/7.68k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/128k [00:00<?, ?B/s]

Downloading (…)00001-of-00002.bin";:   0%|          | 0.00/9.44G [00:00<?, ?B/s]

Downloading (…)00002-of-00002.bin";:   0%|          | 0.00/6.33G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Recompute the CSV stem the same way compile_lbls does, then load the saved labels.
model_base = model_name.split('/')[1]
df = pd.read_csv(model_base + '.csv')
# Preview the first rows.
df.head(3)

Unnamed: 0,img_path,label
0,data/campaign2/0021/image_0021835.png,a green cylinder sitting on a dirt road
1,data/campaign2/0021/image_0021801.png,a green pipe is sitting on the ground in a dir...
2,data/campaign2/0021/background_0021520.png,a wooden planter with plants in it


In [None]:
# (rows, columns) of the loaded label table.
df.shape

(1100, 2)

## TO-DO:

- Let's add nbdev/Quarto support!
- ^ This will allow for a cleaner code base and for running things from the terminal instead of in notebooks, which in turn allows multiple runs back to back without having to restart the notebook because of memory being held.