In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
import pickle
import sys
from pathlib import Path
import os
import numpy as np
import pandas as pd
import io

# sentence_transformers is imported from a directory under /kaggle/input
# (presumably an attached dataset, since the package is not pip-installed here);
# that directory has to be on sys.path before the import below is executed.
sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer, models

# Root directory of the competition data; later cells read `images/` and
# `sample_submission.csv` from it.
comp_path = Path('/kaggle/input/stable-diffusion-image-to-prompts/')

In [None]:
# Identifier of the pretrained image-captioning checkpoint; the model, its image
# processor and its tokenizer must all come from the same checkpoint.
CAPTION_MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_ID)
feature_extractor = ViTImageProcessor.from_pretrained(CAPTION_MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(CAPTION_MODEL_ID)

# Persist the three objects as pickles in the working directory so a later cell
# can reload them. Each file is written inside a `with` block so the handle is
# flushed and closed before anything tries to read the pickle back.
for pickle_path, pickled_object in (
    ('/kaggle/working/model.pkl', model),
    ('/kaggle/working/feature_extractor.pkl', feature_extractor),
    ('/kaggle/working/tokenizer.pkl', tokenizer),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(pickled_object, pickle_file)

In [None]:
# Run on the GPU when torch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Settings forwarded to `model.generate` by predict_step.
max_length, num_beams = 16, 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

def predict_step(image_names, batch_size=None):
    """Generate one caption per image file.

    Relies on the module-level `images_path`, `feature_extractor`, `model`,
    `tokenizer`, `device` and `gen_kwargs` being defined before it is called.

    Args:
        image_names: Iterable of image file names, each resolved against
            `images_path`.
        batch_size: Maximum number of images sent through the model at once.
            ``None`` (the default) processes every image in a single batch.

    Returns:
        A list of whitespace-stripped caption strings in the same order as
        `image_names`; an empty list when no names are given.

    Raises:
        ValueError: If `batch_size` is given and is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # Materialise once so generators are supported and the length is known.
    image_names = list(image_names)
    if not image_names:
        # The image processor cannot handle an empty batch; nothing to caption.
        return []
    if batch_size is None:
        batch_size = len(image_names)

    preds = []
    for start in range(0, len(image_names), batch_size):
        batch_images = []
        for image_name in image_names[start:start + batch_size]:
            # Image.open is lazy and keeps the file open; convert() loads the
            # pixels and returns an independent copy, so the handle can be
            # closed as soon as the `with` block exits.
            with Image.open(os.path.join(images_path, image_name)) as raw_image:
                batch_images.append(raw_image.convert(mode="RGB"))

        pixel_values = feature_extractor(images=batch_images, return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device)

        output_ids = model.generate(pixel_values, **gen_kwargs)

        decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds.extend(caption.strip() for caption in decoded)
    return preds

In [None]:
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that maps pickled torch storages onto the CPU while loading."""

    def find_class(self, module, name):
        """Resolve a pickled global, swapping in a CPU-mapped storage loader.

        Every lookup other than ``torch.storage._load_from_bytes`` is delegated
        to the standard Unpickler unchanged.
        """
        wants_storage_loader = (module, name) == ('torch.storage', '_load_from_bytes')
        if not wants_storage_loader:
            return super().find_class(module, name)

        def _load_on_cpu(raw_bytes):
            # Same call as torch's own loader, but with map_location='cpu'.
            return torch.load(io.BytesIO(raw_bytes), map_location='cpu')

        return _load_on_cpu

def _load_cpu_pickle(path):
    """Unpickle `path` with CPU_Unpickler and close the file afterwards."""
    # NOTE: unpickling executes code embedded in the file; only use this on
    # pickles this notebook wrote itself.
    with open(path, 'rb') as pickle_file:
        return CPU_Unpickler(pickle_file).load()

# Reload the objects pickled earlier in the notebook.
model = _load_cpu_pickle('/kaggle/working/model.pkl')
feature_extractor = _load_cpu_pickle('/kaggle/working/feature_extractor.pkl')
tokenizer = _load_cpu_pickle('/kaggle/working/tokenizer.pkl')

# Move the model to the selected device before running generation.
model.to(device)

In [None]:
# List and load images from the same absolute directory derived from comp_path,
# so the cell does not depend on the current working directory.
images_dir = comp_path / 'images'
# Sorted so the row order of the output is reproducible between runs.
images = sorted(os.listdir(images_dir))
imgIds = [file_name.split('.')[0] for file_name in images]

# Kept as a string with a trailing separator so it can be used as a plain
# prefix for file names.
images_path = f"{images_dir}/"

prompts = predict_step(images)

# Number of embedding values expected per image in the submission.
EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))

st_model = SentenceTransformer('/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2')
prompt_embedding_matrix = st_model.encode(prompts)
# Fail early if the encoder output does not line up with the ids built below;
# flattening a wrongly shaped matrix would silently misalign values and ids.
assert prompt_embedding_matrix.shape == (len(imgIds), EMBEDDING_LENGTH), (
    f"unexpected embedding shape {prompt_embedding_matrix.shape}"
)
prompt_embeddings = prompt_embedding_matrix.flatten()

df_submission = pd.read_csv(comp_path / 'sample_submission.csv', index_col='imgId_eId')

# One "<imgId>_<eId>" key per embedding value, in the same row-major order as
# the flattened embedding matrix (all eIds of image 0, then image 1, ...).
imgId_eId = [
    '_'.join(map(str, pair)) for pair in zip(
        np.repeat(imgIds, EMBEDDING_LENGTH),
        np.tile(eIds, len(imgIds)))]

# The generated keys must be exactly the keys of the sample submission.
assert sorted(imgId_eId) == sorted(df_submission.index)

submission = pd.DataFrame(
                index=imgId_eId,
                data=prompt_embeddings,
                columns=['val']).rename_axis('imgId_eId')

In [None]:
# Write the frame (index `imgId_eId`, column `val`) to the working directory.
submission.to_csv(path_or_buf='submission.csv')