In [1]:
import torch
import pickle
import sys
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoProcessor, BlipForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image

# Extend the import search path with the sentence-transformers directory under
# /kaggle/input; presumably this is what lets the import below resolve — verify
# against the attached dataset.
sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer, models

# Root directory of the competition data. Later cells read the `images`
# folder and `sample_submission.csv` from here.
comp_path = Path('/kaggle/input/stable-diffusion-image-to-prompts/')

In [2]:
# Pick the device name used for the model and its inputs: GPU if torch can
# see one, otherwise CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Both the processor and the captioning model are loaded from the same
# checkpoint identifier, so name it once.
BLIP_CHECKPOINT = 'Salesforce/blip-image-captioning-base'

processor = AutoProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

Downloading (…)rocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [5]:
# Serialize the model and processor into /kaggle/working as pickle files.
# The `with` blocks guarantee each file is flushed and closed once written;
# a bare `open(...)` passed straight to pickle.dump leaves the handle open
# until garbage collection, which can leave a truncated file behind.
with open('/kaggle/working/model_pretrained.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('/kaggle/working/processor_pretrained.pkl', 'wb') as processor_file:
    pickle.dump(processor, processor_file)

In [33]:
# Restore the pickled model and processor from the attached input dataset.
# NOTE(security): pickle.load executes arbitrary code embedded in the file,
# so only load pickles whose origin you trust.
# The `with` blocks close each file handle as soon as loading finishes
# instead of leaking it for the lifetime of the kernel.
with open('/kaggle/input/blip-image-captioning/model_pretrained.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('/kaggle/input/blip-image-captioning/processor_pretrained.pkl', 'rb') as processor_file:
    processor = pickle.load(processor_file)

In [None]:
# Move the model to the selected device. The captioning loop sends its inputs
# to the same `device`, so this cell has to run before that loop.
# As the last expression in the cell, the returned module is also displayed.
model.to(device)

In [34]:
# File names found in the competition's image folder, in whatever order the
# filesystem reports them.
image_dir = comp_path / 'images'
images = os.listdir(image_dir)

# An image id is the part of the file name before its first dot.
imgIds = [file_name.partition('.')[0] for file_name in images]

In [35]:
# Generate one caption per image, in the same order as `images` (and hence
# `imgIds`), so the flattened embeddings later line up with the ids.
prompts = []

# Resolve files against the same directory that was listed to build `images`,
# rather than a separate path relative to the current working directory.
images_path = comp_path / 'images'

for image_name in images:
    # The context manager closes the underlying file handle; convert()
    # returns a new image, so it is still usable after the file is closed.
    with Image.open(images_path / image_name) as raw_image:
        image = raw_image.convert('RGB')

    # Preprocess into PyTorch tensors and place them on the target device.
    inputs = processor(images=image, return_tensors='pt').to(device)
    pixel_values = inputs.pixel_values

    # Generate caption token ids with max_length=50.
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

    # Decode to text without special tokens; one image in, so take the first result.
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    prompts.append(generated_caption)

In [36]:
# Load the sentence-embedding model from the local dataset copy.
ST_MODEL_PATH = '/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2'
st_model = SentenceTransformer(ST_MODEL_PATH)

# Encode every generated caption, then flatten the result into one 1-D array.
# The submission index is built image by image, so this assumes the encoded
# output is row-per-prompt in `prompts` order — TODO confirm.
embedding_matrix = st_model.encode(prompts)
prompt_embeddings = embedding_matrix.flatten()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [37]:
# Read the sample submission and key it by its `imgId_eId` column; it is used
# below to validate the ids this notebook generates.
sample_submission_file = comp_path / 'sample_submission.csv'
df_submission = pd.read_csv(sample_submission_file).set_index('imgId_eId')

In [38]:
# Number of embedding components emitted per image.
EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))

# Row ids of the form "<imgId>_<eId>": all components of the first image,
# then all components of the second, and so on. This image-major ordering
# mirrors how the per-prompt embeddings are flattened.
imgId_eId = [
    f'{img_id}_{e_id}'
    for img_id in imgIds
    for e_id in eIds
]

# The generated ids must match the sample submission's index as a multiset.
assert sorted(imgId_eId) == sorted(df_submission.index)

In [39]:
# Assemble the submission table: a single `val` column holding the flattened
# embedding values, indexed by the matching `imgId_eId` labels.
submission_index = pd.Index(imgId_eId, name='imgId_eId')
submission = pd.DataFrame({'val': prompt_embeddings}, index=submission_index)

In [None]:
# Write the submission table to `submission.csv` in the current working
# directory; the `imgId_eId` index is written as the first column.
submission.to_csv('submission.csv')