<a href="https://colab.research.google.com/github/Zain506/Similarity/blob/main/notebooks/MedCLIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load API keys



In [38]:
import os
from dotenv import load_dotenv
# Resolve the Hugging Face token: prefer the Colab secrets store, otherwise
# fall back to a local .env file / the process environment.
try:
    from google.colab import userdata
    tok = userdata.get('HF_TOKEN')
except Exception:
    # Either we are not running in Colab (the import fails) or the secret could
    # not be read. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit still propagate.
    load_dotenv()
    tok = os.getenv("HF_TOKEN")

if tok:
    os.environ["HF_TOKEN"] = tok
else:
    # os.environ only accepts str values; assigning None raises TypeError.
    # Warn instead of crashing so the notebook can continue without a token.
    import warnings
    warnings.warn("HF_TOKEN was not found in Colab secrets, .env, or the environment.")

## Text Encoder class

In [39]:
from transformers import AutoTokenizer, AutoModel
import torch

class textEncoder:
    """Sentence embedder built on a Hugging Face tokenizer/model pair.

    The checkpoint is loaded with ``AutoTokenizer`` and ``AutoModel``;
    it defaults to BioClinicalBERT.
    """

    def __init__(self, link="emilyalsentzer/Bio_ClinicalBERT"):
        """Load the tokenizer and the network for the checkpoint ``link``."""
        self.tokenizer = AutoTokenizer.from_pretrained(link)
        self.model = AutoModel.from_pretrained(link)  # Load neural network

    def encode(self, texts):
        """Return one mean-pooled embedding per input text.

        Args:
            texts: Input passed straight to the tokenizer (a string or a
                list of strings).

        Returns:
            A tensor with one row per text: the average of the model's
            ``last_hidden_state`` over the positions where the attention
            mask is 1. A row whose mask is entirely 0 yields a zero vector.
        """
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

        # Inference only: skip gradient tracking for speed and memory.
        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden_states = outputs.last_hidden_state

        # Broadcast the mask over the feature axis so padded positions
        # contribute nothing to the sum.
        mask = inputs["attention_mask"].unsqueeze(-1)
        summed = (hidden_states * mask).sum(dim=1)

        # Clamp the token count to at least 1: a fully masked row would
        # otherwise be 0 / 0 = NaN. Rows with real tokens are unaffected.
        token_counts = mask.sum(dim=1).clamp(min=1)
        return summed / token_counts

## Image Encoder class

In [40]:
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import torch

class imageEncoder:
    """Run a Hugging Face image-classification checkpoint on single images.

    The processor and model are loaded once at construction; the model is
    placed on CUDA when available (CPU otherwise) and put in eval mode.
    """

    def __init__(self, model="microsoft/swinv2-tiny-patch4-window16-256"):
        """Load processor and network for the checkpoint named ``model``."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = AutoImageProcessor.from_pretrained(model)
        network = AutoModelForImageClassification.from_pretrained(model)
        self.model = network.to(self.device)
        self.model.eval()

    def encode(self, img: Image.Image):
        """Return the model's logits for ``img``.

        The image is converted to RGB first (the dataset
        "X-iZhang/CheXpert-plus-RRG", "findings_section" supplies
        greyscale images), preprocessed into tensors, moved to the model's
        device, and passed through the network without gradient tracking.
        """
        rgb = img.convert("RGB")
        features = self.processor(images=rgb, return_tensors="pt")

        # The processor output has to be moved to the same device as the model.
        on_device = {name: tensor.to(self.device) for name, tensor in features.items()}

        with torch.no_grad():
            result = self.model(**on_device)
        return result.logits


## Batch process multiple datapoints with GPU

In [None]:
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torch
from typing import Dict, List
from datasets import load_dataset

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint id, kept under its own name so that `model` only ever refers to
# the loaded network (it used to be a string first and a network later).
model_name = "microsoft/swinv2-tiny-patch4-window16-256"
batch_size = 8  # Number of validation images in the demo batch

ds = load_dataset("X-iZhang/CheXpert-plus-RRG", "findings_section")  # Load dataset

# Build the batch; every image is converted to RGB before preprocessing.
images = [ds["valid"][i]["main_image"].convert("RGB") for i in range(batch_size)]

processor = AutoImageProcessor.from_pretrained(model_name)  # Initialise preprocessor

# Loading weights does not need the no_grad context, so do it up front.
model = AutoModelForImageClassification.from_pretrained(model_name).to(device)
model.eval()  # Inference mode, as in imageEncoder

encoded = processor(images=images, return_tensors="pt")  # Preprocess batch into tensors
# Move every tensor to the model's device.
inputs: Dict[str, torch.Tensor] = {k: v.to(device) for k, v in encoded.items()}

with torch.no_grad():  # No gradients needed for a forward pass only
    outputs: torch.Tensor = model(**inputs).logits