<a href="https://colab.research.google.com/github/bogdart/phi-multimodal/blob/master/YT_Phi_4_Multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install torch==2.5.1
!pip -q install flash_attn==2.7.4.post1
!pip -q install transformers==4.48.2
!pip -q install accelerate==1.3.0
!pip -q install soundfile==0.13.1
!pip -q install pillow==11.1.0
!pip -q install scipy==1.15.2
!pip -q install torchvision==0.21.0
!pip -q install backoff==2.2.1
!pip -q install peft==0.13.2
!pip -q install hf_transfer

In [None]:
import io
import os
import sys

import requests
import soundfile as sf
import torch
from IPython.display import Audio, Markdown, display
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

In [None]:
# Ask huggingface_hub to download through the hf_transfer backend installed above.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# huggingface_hub reads this variable once, when it is first imported, and the
# transformers import in the earlier cell has already triggered that import.
# Mirror the setting onto the loaded constants module so that download code which
# looks the flag up through that module still sees it.
_hf_constants = sys.modules.get("huggingface_hub.constants")
if _hf_constants is not None:
    _hf_constants.HF_HUB_ENABLE_HF_TRANSFER = True

In [None]:
# Load the model and processor
# Hugging Face Hub id of the checkpoint used for both the model and the processor.
model_path = "microsoft/Phi-4-multimodal-instruct"

# Keyword arguments for loading the weights:
#   - trust_remote_code=True runs the modelling code shipped in the checkpoint repo.
#   - _attn_implementation='eager' selects the plain (non-flash) attention path.
model_load_kwargs = dict(
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation='eager',
)
model = AutoModelForCausalLM.from_pretrained(model_path, **model_load_kwargs).cuda()

# The processor turns text/image/audio inputs into model tensors and decodes the output ids.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

In [None]:
# Load the checkpoint's default generation settings from the same Hub repo as the model.
generation_config = GenerationConfig.from_pretrained(model_path)

In [None]:
# Print GPU status (device, memory in use) now that the model has been loaded.
!nvidia-smi

In [None]:
import io
from PIL import Image
from google.colab import files

# Upload the image through Colab's file picker; `uploaded` maps file name -> raw bytes.
uploaded = files.upload()
if not uploaded:
    # Without this guard a cancelled upload fails later with an opaque IndexError.
    raise ValueError("No file was uploaded; re-run this cell and choose an image.")

# Only the first uploaded file is used.
file_name, image_data = next(iter(uploaded.items()))

# Decode the bytes and normalise to RGB, matching how the other images in this
# notebook are loaded before being passed to the processor.
image = Image.open(io.BytesIO(image_data)).convert("RGB")

# Render inline explicitly instead of relying on Image.show(), whose behaviour
# depends on which viewer PIL finds on the host.
display(image)

In [None]:
# Show the current image inline; a bare expression on the last line of a cell is rendered.
image

In [None]:
# Single user turn asking the model to transcribe a notebook photo into Markdown.
# The text opens with the <|image_1|> placeholder, followed by the conversion rules.
_transcription_rules = """<|image_1|>\nYou are a service that converts images of notebooks in mixed English and Russian to Markdown format.
Convert line by line.
Keep formatting like underscore, bold etc.
If there is a picture, describe it.
If there is a table, make in Markdown format.
If there is a schema, show it in pseudocode.
If there is a separator (it is usually ''), replace it with ***.
If there is a header as a date, put it as #
Don't write any technical text, don't add ```, it will be wrapped in other service.
"""

messages = [dict(role="user", content=_transcription_rules)]


In [None]:
# Render the chat messages into one prompt string; add_generation_prompt=True asks
# the template to append the opening of the assistant turn.
prompt = processor.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenise the prompt together with the image and move the tensors to the GPU.
inputs = processor(text=prompt, images=[image], return_tensors="pt").to("cuda:0")

In [None]:
# Decoding settings passed to every model.generate() call in this notebook.
generation_args = dict(
    max_new_tokens=512,  # upper bound on the number of tokens produced per reply
    do_sample=False,  # sampling disabled, so no temperature is set
)

In [None]:
# Use the checkpoint's GenerationConfig (loaded in an earlier cell) for stop tokens.
# Passing eos_token_id=tokenizer.eos_token_id instead would replace whatever
# end-of-sequence ids that config declares with the tokenizer's single id.
output_ids = model.generate(
    **inputs,
    generation_config=generation_config,
    **generation_args,
)

# generate() returns prompt + continuation; keep only the newly generated tokens.
prompt_length = inputs["input_ids"].shape[1]
generate_ids = output_ids[:, prompt_length:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]



In [None]:
# Render the model's reply as formatted Markdown in the cell output.
Markdown(response)

## Images

In [None]:
def vqa_phi4(image, prompt):
    """Answer a question about one image with the loaded Phi-4 multimodal model.

    Relies on the notebook-level ``processor``, ``model``, ``generation_config`` and
    ``generation_args`` objects created in earlier cells.

    Args:
        image: Image handed to the processor as the single entry of ``images``.
        prompt: Question or instruction text; it is placed after the ``<|image_1|>``
            placeholder in a single user turn.

    Returns:
        str: The decoded reply with the prompt tokens removed. The reply is also printed.
    """
    messages = [
        {"role": "user", "content": f"<|image_1|>\n{prompt}"},
    ]
    # Kept separate from `prompt` so the caller's text is not shadowed by the templated string.
    chat_prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Put the inputs on whatever device the model lives on instead of hard-coding "cuda:0".
    inputs = processor(text=chat_prompt, images=[image], return_tensors="pt").to(model.device)

    # Use the checkpoint's GenerationConfig for stop tokens. Passing
    # eos_token_id=tokenizer.eos_token_id would replace whatever end-of-sequence ids
    # that config declares with the tokenizer's single id.
    output_ids = model.generate(
        **inputs,
        generation_config=generation_config,
        **generation_args,
    )

    # generate() returns prompt + continuation; keep only the newly generated tokens.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = output_ids[:, prompt_length:]
    response = processor.batch_decode(
        new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    print(response)
    return response


In [None]:
img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/bee.JPG?download=true"

# Named http_response so the model's text `response` from the earlier cell is not
# overwritten by an HTTP response object. The timeout keeps a stalled connection
# from hanging the notebook indefinitely.
http_response = requests.get(img_url, timeout=30)
http_response.raise_for_status()
image = Image.open(io.BytesIO(http_response.content))
image

In [None]:
prompt = "What is shown in this image?"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
prompt = "What kind of bee is in the image?"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
prompt = "Apart from pink what bright color is another flower?"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
# Local file expected in the Colab workspace; it must be uploaded before running this cell.
img_path = "/content/planes_phi4.png"

# Decode and convert to RGB, then keep only the converted copy.
with Image.open(img_path) as source_image:
    image = source_image.convert("RGB")
image

In [None]:
prompt = "What is shown in this image?"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
prompt = "How many planes are there?"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
prompt = "Give me the bounding boxes for the planes"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
prompt = "what airport is it?"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

## OCR

In [None]:
# Local screenshot expected in the Colab workspace; it must be uploaded before running this cell.
img_path = "/content/blog_text.png"

# Decode and convert to RGB, then keep only the converted copy.
with Image.open(img_path) as source_image:
    image = source_image.convert("RGB")
image

In [None]:
prompt = "Please transcribe the text in this"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
prompt = "please summarize this text"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

In [None]:
prompt = "How big is the model?"

# vqa_phi4 already prints the answer; the trailing ';' stops the cell from echoing it again.
vqa_phi4(image, prompt);

## Audio

In [None]:
# Local recording expected in the Colab workspace; it must be uploaded before running this cell.
audio_path = "/content/mark_zuckerberg_30_01.mp3"

# Decode the file into a sample array plus its sample rate. soundfile opens the path
# itself, so the manual open()/BytesIO round-trip is not needed.
audio, samplerate = sf.read(audio_path)

# soundfile returns multi-channel audio as (frames, channels) while IPython's Audio
# expects (channels, samples); the transpose is a no-op for mono (1-D) audio.
Audio(audio.T, rate=samplerate)


In [None]:
def process_audio(prompt, audio, samplerate):
    """Run an instruction against one audio clip with the loaded Phi-4 multimodal model.

    Relies on the notebook-level ``processor``, ``model``, ``generation_config`` and
    ``generation_args`` objects created in earlier cells.

    Args:
        prompt: Instruction text; it is placed after the ``<|audio_1|>`` placeholder
            in a single user turn.
        audio: Audio samples, passed to the processor paired with ``samplerate``.
        samplerate: Sample rate belonging to ``audio``.

    Returns:
        str: The decoded reply with the prompt tokens removed (not printed).
    """
    messages = [
        {"role": "user", "content": f"<|audio_1|>\n{prompt}"},
    ]
    # Kept separate from `prompt` so the caller's text is not shadowed by the templated string.
    chat_prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Put the inputs on whatever device the model lives on instead of hard-coding "cuda:0".
    inputs = processor(
        text=chat_prompt, audios=[(audio, samplerate)], return_tensors='pt'
    ).to(model.device)

    # Use the checkpoint's GenerationConfig for stop tokens. Passing
    # eos_token_id=tokenizer.eos_token_id would replace whatever end-of-sequence ids
    # that config declares with the tokenizer's single id.
    output_ids = model.generate(
        **inputs,
        generation_config=generation_config,
        **generation_args,
    )

    # generate() returns prompt + continuation; keep only the newly generated tokens.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = output_ids[:, prompt_length:]
    response = processor.batch_decode(
        new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return response

In [None]:
# Plain transcription request; the returned string is shown as the cell's output.
prompt = "Transcribe the audio"
process_audio(prompt=prompt, audio=audio, samplerate=samplerate)


In [None]:
# Replay the clip. soundfile returns multi-channel audio as (frames, channels) while
# IPython's Audio expects (channels, samples); the transpose is a no-op for mono audio.
Audio(audio.T, rate=samplerate)

In [None]:
# The Audio(...) expression that used to open this cell was dropped: it was not the
# last expression, so the notebook never rendered it.
prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
process_audio(prompt, audio, samplerate)

## Just Text

In [None]:
def just_text(prompt):
    """Send a text-only prompt to the loaded Phi-4 multimodal model.

    Relies on the notebook-level ``processor``, ``model``, ``generation_config`` and
    ``generation_args`` objects created in earlier cells.

    Args:
        prompt: User message text, sent as a single user turn with no image or audio.

    Returns:
        str: The decoded reply with the prompt tokens removed. The reply is also printed.
    """
    messages = [
        {"role": "user", "content": f"{prompt}"},
    ]
    # Kept separate from `prompt` so the caller's text is not shadowed by the templated string.
    chat_prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Put the inputs on whatever device the model lives on instead of hard-coding "cuda:0".
    inputs = processor(text=chat_prompt, return_tensors="pt").to(model.device)

    # Use the checkpoint's GenerationConfig for stop tokens. Passing
    # eos_token_id=tokenizer.eos_token_id would replace whatever end-of-sequence ids
    # that config declares with the tokenizer's single id.
    output_ids = model.generate(
        **inputs,
        generation_config=generation_config,
        **generation_args,
    )

    # generate() returns prompt + continuation; keep only the newly generated tokens.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = output_ids[:, prompt_length:]
    response = processor.batch_decode(
        new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    print(response)
    return response

In [None]:
# just_text already prints the answer; the trailing ';' stops the cell from echoing it again.
just_text("What is difference between Llama, Vicunas and Alpacas?");