Useful Links  
A fine-tuning of the Idefics3-8B-Llama3 model with the ROCO dataset

*   https://huggingface.co/eltorio/IDEFICS3_ROCO
*   https://colab.research.google.com/#scrollTo=8F3w0kcbAMtC&fileId=https%3A//huggingface.co/eltorio/IDEFICS3_ROCO/blob/main/ROCO-idefics3.ipynb  


Example notebook provided with the BioViL-T model  
https://notebooks.gesis.org/binder/jupyter/user/microsoft-hi-ml-w1rabu8m/doc/tree/hi-ml-multimodal/notebooks/phrase_grounding.ipynb

ROCO dataset  
https://huggingface.co/datasets/eltorio/ROCO-radiology


In [None]:
# Package name that the next cell passes to `%pip install`.
pip_source = "hi-ml-multimodal"

In [None]:
# Install the package named by `pip_source`; `{pip_source}` is interpolated by IPython.
%pip install {pip_source}

In [None]:
!pip install datasets

In [None]:
import tempfile
from pathlib import Path

import torch

from health_multimodal.common.visualization import plot_phrase_grounding_similarity_map
from health_multimodal.text import get_bert_inference
from health_multimodal.text.utils import BertEncoderType
from health_multimodal.image import get_image_inference
from health_multimodal.image.utils import ImageModelType
from health_multimodal.vlp import ImageTextInferenceEngine

In [None]:
# Build the text and image inference engines for the BioViL-T encoder types.
# These two objects are combined into a single image-text engine in the next cell.
text_inference = get_bert_inference(BertEncoderType.BIOVIL_T_BERT)
image_inference = get_image_inference(ImageModelType.BIOVIL_T)

In [None]:
# Pick the accelerator when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Join the two single-modality engines into one image-text engine and move it
# to the selected device.
image_text_inference = ImageTextInferenceEngine(
    text_inference_engine=text_inference,
    image_inference_engine=image_inference,
)
image_text_inference.to(device)

In [None]:
from transformers import Trainer, TrainingArguments

# NOTE(review): an identical `training_args` is built again in a later cell, so this
# definition is overwritten before the Trainer is created.
# NOTE(review): `resume_from_checkpoint` on TrainingArguments appears to be informational
# only; `Trainer.train()` takes its own `resume_from_checkpoint` argument - confirm.
training_args = TrainingArguments(
    output_dir="./output_directory",
    report_to=[],  # empty list: no reporting integrations
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=10,
    resume_from_checkpoint=True,
)

In [None]:
import datasets
from datasets import load_dataset

# Despite its name, `train_dataset` holds every split of the dataset: later cells
# index it with "train", "validation" and "test" and iterate over `.items()`.
train_dataset = load_dataset("eltorio/ROCO-radiology")

In [None]:
def tokenize(input):
  """Embeds one dataset row so that it can be reduced to model-ready vectors

  The image is written to a fixed file in the system temp directory because the image
  engine is called with a file path; every call overwrites that same file.

  Args:
    input: A dictionary from the dataset containing the keys 'image' (an object with a
      `save(path)` method, presumably a PIL image - TODO confirm) and 'caption'

  Returns:
    A dictionary with the key 'input' containing the image embedding, and the key
    'labels' containing the text embedding of the caption
  """
  image = input['image']
  caption = input['caption']
  # `tempfile.tempdir` stays None until `gettempdir()` has been called at least once,
  # and `Path(None, ...)` raises a TypeError; `gettempdir()` always yields a directory
  # (and still honours a user-assigned `tempfile.tempdir`).
  image_path = Path(tempfile.gettempdir(), "downloaded_chest_xray.jpg")
  image.save(image_path)
  image_tokens = image_text_inference.image_inference_engine.get_projected_global_embedding(image_path=image_path)
  text_tokens = image_text_inference.text_inference_engine.get_embeddings_from_prompt(caption)
  return {
    "input": image_tokens,
    "labels": text_tokens
  }

In [None]:
from datasets import DatasetDict
# Number of leading rows kept from each split, so that the pipeline can be tried out
# without embedding the whole dataset.
SMALL_SUBSET_SIZE = 5
# `min(...)` keeps the requested indices inside splits that have fewer rows than that.
small_dataset = DatasetDict({
    split: dataset.select(range(min(SMALL_SUBSET_SIZE, len(dataset))))
    for split, dataset in train_dataset.items()
})

In [None]:
small = False  # Set to True for testing, False for production

# Choose between the truncated splits and the complete dataset.
if small:
    used_dataset = small_dataset
else:
    used_dataset = train_dataset

# Run `tokenize` over every row of the chosen dataset.
tokenized_dataset = used_dataset.map(tokenize)

In [None]:
# remove unused columns when training
# The names of the original columns are read from the "test" split and dropped from the
# mapped dataset.
# NOTE(review): assumes every split carries the same original columns as "test" - confirm.
# NOTE(review): the cell rebinds `tokenized_dataset`, so re-running it presumably fails
# because the columns are already gone - confirm.
tokenized_dataset = tokenized_dataset.remove_columns(train_dataset["test"].column_names)

In [None]:
# Training configuration handed to the Trainer: where checkpoints are written, how
# often they are saved, batch sizes and the number of epochs.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output_directory",
    save_steps=10,
    resume_from_checkpoint=True,
    report_to=[],  # empty list: no reporting integrations
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

In [None]:
import torch
from torch import nn
# wrapper model for training
class FineTunedModel(nn.Module):
    """Module wrapper whose `forward` returns a cross-entropy loss in a dictionary.

    NOTE(review): `forward` never calls `original_model`; it wraps `input` in a newly
    created tensor, so it looks like no parameter of the wrapped model takes part in
    the loss - confirm this is intended.
    """

    def __init__(self, original_model):
        """Stores the wrapped model and creates the loss function."""
        super().__init__()
        self.original_model = original_model
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input=None, labels=None):
        """Computes the loss between `input` and `labels`.

        Args:
            input: values converted to a new tensor that requires gradients
            labels: tensor of targets; its dimension 1 is squeezed before the loss

        Returns:
            A dictionary with the single key "loss"
        """
        predictions = torch.tensor(input, requires_grad=True)
        targets = labels.squeeze(1)
        loss = self.criterion(predictions, targets)
        return {"loss": loss}

In [None]:
from transformers import Trainer
# Wrap the image-text inference engine in the module defined above and hand it to the
# Trainer together with the "train" and "validation" splits of the tokenized dataset.
trainable_model = FineTunedModel(image_text_inference)
trainer = Trainer(
  model=trainable_model,
  args=training_args,
  train_dataset=tokenized_dataset["train"],
  eval_dataset=tokenized_dataset["validation"],
)

In [None]:
# training takes a very long time
import os

from transformers.trainer_utils import get_last_checkpoint

# The `resume_from_checkpoint` field of TrainingArguments is not acted on by the Trainer
# itself; the checkpoint has to be handed to `train()`. Look for the newest checkpoint
# in the output directory and resume from it, or start from scratch when the flag is
# off or nothing has been saved yet.
last_checkpoint = None
if training_args.resume_from_checkpoint and os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
def getImageEmbeddings(input):
  """Transforms an image to its embedded form using the model

  The image is written to a fixed file in the system temp directory because the image
  engine is called with a file path; every call overwrites that same file.

  Args:
    input: A dictionary from the dataset containing an 'image' key whose value has a
      `save(path)` method (presumably a PIL image - TODO confirm)

  Returns:
    The embedding returned by the image engine for the saved image
  """
  image = input['image']
  # `tempfile.tempdir` stays None until `gettempdir()` has been called at least once,
  # and `Path(None, ...)` raises a TypeError; `gettempdir()` always yields a directory
  # (and still honours a user-assigned `tempfile.tempdir`).
  image_path = Path(tempfile.gettempdir(), "downloaded_chest_xray.jpg")
  image.save(image_path)
  image_tokens = image_text_inference.image_inference_engine.get_projected_global_embedding(image_path=image_path)
  return image_tokens

In [None]:
def getImageEmbeddingsFromImage(image_path):
  """Embeds the image stored at a given location using the model

  Args:
    image_path: A file path (string or path object) to the image to embed

  Returns:
    The embedding returned by the image engine for that file
  """
  resolved_path = Path(image_path)
  image_engine = image_text_inference.image_inference_engine
  return image_engine.get_projected_global_embedding(image_path=resolved_path)

In [None]:
import torch.nn.functional as F
def GetBestLabel(image_embeddings, labels):
  """Chooses the training caption whose stored text embedding best matches an image

  Args:
    image_embeddings: the embedding tensor of the image of interest, as obtained from
      getImageEmbeddings
    labels: the collection of stored text embeddings to choose from; the first entry of
      each element is used as the embedding vector, and element `i` is taken to belong
      to row `i` of train_dataset['train']

  Returns:
    The caption of the training row whose embedding has the highest cosine similarity
    with the image embedding (the first such row on ties)

  Raises:
    ValueError: if `labels` is empty
  """
  if len(labels) == 0:
    raise ValueError("labels must contain at least one embedding")
  # Create the label vectors with the dtype and on the device of the image embedding so
  # that the comparison also works when the image embedding is not on the CPU.
  label_matrix = torch.stack([
    torch.as_tensor(label[0], dtype=image_embeddings.dtype, device=image_embeddings.device)
    for label in labels
  ])
  # One broadcasted call scores the image against every label at once.
  similarities = F.cosine_similarity(image_embeddings.reshape(1, -1), label_matrix, dim=-1)
  best = int(torch.argmax(similarities))
  return train_dataset['train'][best]['caption']



In [None]:
def scoreLabels(caption, label):
  """Scores how well a caption and a label agree using the text engine's pairwise similarity

  Args:
    caption: A text based caption to score
    label: The text based label to compare with caption

  Returns:
    The value returned by the text engine's get_pairwise_similarities for the pair;
    presumably a cosine similarity in the range [-1, 1] - TODO confirm the exact type
  """
  text_engine = image_text_inference.text_inference_engine
  similarity = text_engine.get_pairwise_similarities(caption, label)
  return similarity

In [None]:
def GetCaptionForImage(file_path):
  """Captions an arbitrary image with the best matching training caption

  Args:
    file_path: the path to the image to caption

  Returns:
    The chosen caption with surrounding whitespace removed
  """
  candidate_embeddings = tokenized_dataset['train']['labels']
  best_caption = GetBestLabel(getImageEmbeddingsFromImage(file_path), candidate_embeddings)
  return best_caption.strip()

Runs the evaluation on the model. For each validation image it prints the label chosen by the model, the original caption, and the similarity score between the two. A score closer to 1 is better.

In [None]:
# evaluation
# Candidate text embeddings come from the tokenized training split; the rows checked
# are validation rows `start` (inclusive) to `end` (exclusive).
labels = tokenized_dataset['train']['labels']
start, end = 0, 5
for i in range(start, end):
  curr = train_dataset['validation'][i]
  image, caption = curr['image'], curr['caption']
  embeddings = getImageEmbeddings(curr)
  label = GetBestLabel(embeddings, labels)
  score = scoreLabels(caption, label)
  # One report per row, followed by an empty line.
  print(
    f"image: {image}",
    f"caption: {caption.strip()}",
    f"label: {label.strip()}",
    f"score: {score}\n",
    sep="\n",
  )


Upload a file and get a caption back using the model

In [None]:
# only works in google colab
from google.colab import files
# Name under which the uploaded file is requested to be stored.
file_name = 'uploaded_file.png'
# NOTE(review): assumes the upload ends up in /content (presumably the notebook's
# working directory on Colab) - confirm.
file_path = '/content/' + file_name
# Ask the user to upload a file; the return value is kept but not used below.
uploaded = files.upload_file(file_name)

# Caption the uploaded image and show the result.
caption = GetCaptionForImage(file_path)
print(caption)