In [None]:
# Uncomment and run once to install the notebook's dependencies into the kernel's environment.
# %pip install transformers einops diffusers accelerate scipy safetensors datasets[audio] TTS
# %pip install --upgrade protobuf

In [1]:
import warnings
warnings.filterwarnings("ignore")

# Multi-modal models

Multimodality means that the inputs and outputs of a machine learning model come in different forms (modalities), such as text, images, or audio. For example, a captioning model takes an image as input and generates a text description of that image as output.


## Image question answering

Visual large language models can be used to answer questions about the content of an image.

In [2]:
# Hugging Face Hub repository to load, and the revision it is pinned to.
# Alternative repository left by the author: "dumbequation/llama3.2-medical-visualqa-11B"
# NOTE(review): the revision is passed together with model_name when loading,
# so it presumably has to change as well if the repository is switched — confirm.
model_revision = "2024-08-26"
model_name = "vikhyatk/moondream2"

Every model is paired with its own tokenizer. The tokenizer splits a string into sub-words and is trained specifically for that model; therefore, we use the same model name to load it.

In [3]:
# Load the model and its tokenizer through the transformers Auto classes
from transformers import AutoModelForCausalLM, AutoTokenizer

# trust_remote_code=True allows transformers to execute the modelling code
# shipped in the Hub repository; pinning `revision` keeps the download fixed
# to one known snapshot of that repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    revision=model_revision,
    trust_remote_code=True,
)

# The tokenizer is fetched from the same repository and revision as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    revision=model_revision,
)

PhiForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [None]:
# Optionally, move the model to an accelerator if one is available.
# A single hard-coded backend raises on machines that lack it, so probe for
# CUDA first, then Apple's MPS, and fall back to the CPU otherwise.
import torch

if torch.cuda.is_available():
    vqa_device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # The getattr guard keeps this working on torch builds without an MPS backend.
    vqa_device = "mps"
else:
    vqa_device = "cpu"

model = model.to(vqa_device)

We will be asking questions about an image. For this purpose we will load a test image generated using AI.

In [None]:
from PIL import Image

# Open the test picture; ending the cell with the object lets the notebook
# render it inline.
test_image_path = "test.jpeg"
image = Image.open(test_image_path)
image

The first step towards answering questions is encoding the image with the transformer

In [None]:
# Notice how this architecture has a method called encode_image.
# It takes the PIL image loaded above and returns its encoded representation.
enc_image = model.encode_image(image)

# We get a tensor as a result; ending the cell with it displays its contents
enc_image

In [None]:
# Inspect the dimensions of the encoded-image tensor
enc_image.size()

Now, the second modality, text, is used to present a question

In [None]:
# The text modality: a natural-language question about the encoded image
question = "What is the color of the eyes of this individual?"

# answer_question is another method of this architecture: it takes the tensor
# that represents the encoded image, the question, and the tokenizer.
answer = model.answer_question(enc_image, question, tokenizer)
answer

In [None]:
# Ask a second question about the same encoded image (no need to re-encode it)
question = "What is the boy eating and drinking?"

answer = model.answer_question(enc_image, question, tokenizer)
answer

In [None]:
# A question about the picture as a whole rather than a detail in it
question = "What is the overall color of this picture?"

answer = model.answer_question(enc_image, question, tokenizer)
answer

# Diffusion models

Diffusion models go in the opposite direction. You pass as input a prompt, describing the image. This prompt is encoded with a transformer to generate an embedding representation.

The embedding representation is used to generate, or decode, a noisy image.
This image then goes through a _diffusion_ cycle, where each iteration reduces the noise in the image, eventually leading to the result.

In [None]:
# Import the dependencies
from diffusers import DiffusionPipeline
import torch

# Choose where to run the pipeline. A hard-coded "cuda" raises on machines
# without an NVIDIA GPU, so probe for CUDA, then Apple's MPS, then fall back
# to the CPU. Half precision is kept for CUDA only; float32 is the
# conservative choice on the other backends.
if torch.cuda.is_available():
    sd_device, sd_dtype = "cuda", torch.float16
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    sd_device, sd_dtype = "mps", torch.float32
else:
    sd_device, sd_dtype = "cpu", torch.float32

# Load the model and send it to the selected device
pipe = DiffusionPipeline.from_pretrained(
    "stablediffusionapi/realistic-vision-v51", torch_dtype=sd_dtype
)
pipe.to(sd_device)


In [None]:

# Text-to-image: the pipeline is called with a prompt and returns a list of
# generated images; keep the first one and display it.
prompt = "a cat eating a bag of chips"
image = pipe(prompt).images[0]
image

In [None]:
# Generate and display an image for a second prompt
prompt = "A young couple playing pickleball"
image = pipe(prompt).images[0]
image

In [None]:
# Text-to-image once more, this time with a scene unlikely to exist in real photos
prompt = "a turtle riding a bicycle"
image = pipe(prompt).images[0]
image

# Automatic speech recognition

An automatic speech recognition (ASR) model takes a sound file containing speech as input and transcribes what is said into text.

In [None]:
# Play a sample recording that contains multiple phrases
from IPython.display import Audio, display

harvard_audio = Audio("harvard.wav", autoplay=True)
display(harvard_audio)

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Use the first CUDA GPU with half precision when one is present; otherwise
# run on the CPU in full precision.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

model_id = "openai/whisper-large-v3"

# The processor bundles the tokenizer and the audio feature extractor that
# are handed to the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# NOTE: this rebinds `model` (the visual question answering model loaded
# earlier) and, below, `pipe` (the diffusion pipeline).
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

In [None]:
# Run the recording through the speech-recognition pipeline; the transcript
# is stored under the "text" key of the returned result.
result = pipe("harvard.wav")
transcript = result["text"]
print(transcript)

In [None]:
# Play the second sample recording before transcribing it
twister_audio = Audio("twister.wav", autoplay=True)
display(twister_audio)

In [None]:
# Transcribe the second recording with the same pipeline
result = pipe("twister.wav")
transcript = result["text"]
print(transcript)

# Text to Speech

TTS goes in the opposite direction. It takes a prompt and potentially other parameters, such as a voice sample or the target language, and then generates a sound file that utters the input prompt.

In [None]:
# This section uses Coqui TTS rather than Hugging Face
import torch
from pprint import pprint
from TTS.api import TTS

# Prefer the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# List available 🐸TTS models
available_models = TTS().list_models()
pprint(available_models)


In [None]:
# Instantiate a model. Let's use a multilingual one (used below for English,
# French and Portuguese) and move it to the device selected in the previous
# cell; without the .to() call the computed `device` is never used.
tts = TTS("tts_models/multilingual/multi-dataset/your_tts").to(device)

In [None]:
# Synthesize an English sentence into output.wav, using sample.m4a as the
# reference voice.
tts.tts_to_file(
    "This is voice cloning.",
    speaker_wav="sample.m4a",
    language="en",
    file_path="output.wav",
)

# Play back the generated file
english_audio = Audio("output.wav", autoplay=True)
display(english_audio)

In [None]:
# Same reference voice, this time speaking French; output.wav is overwritten.
tts.tts_to_file(
    "C'est le clonage de la voix.",
    speaker_wav="sample.m4a",
    language="fr-fr",
    file_path="output.wav",
)

# Play back the generated file
french_audio = Audio("output.wav", autoplay=True)
display(french_audio)

In [None]:
# Same reference voice, this time speaking Brazilian Portuguese; output.wav
# is overwritten again.
tts.tts_to_file(
    "Isso é clonagem de voz.",
    speaker_wav="sample.m4a",
    language="pt-br",
    file_path="output.wav",
)

# Play back the generated file
portuguese_audio = Audio("output.wav", autoplay=True)
display(portuguese_audio)

In [None]:
# This is a model for a female Spanish voice. Move it to the device selected
# when the TTS section was set up; without the .to() call the computed
# `device` is never used.
tts = TTS("tts_models/es/mai/tacotron2-DDC").to(device)

# Generate some speech (no speaker sample or language argument is passed here)
tts.tts_to_file(
    "Si seis sables son seis sables, ¿por qué seis sabios no son seis sabios?",
    file_path="output.wav",
)

display(Audio("output.wav", autoplay=True))