# Creating "Text Embed" from research

In [1]:
from transformers import AutoTokenizer, AutoModel

# Load frozen LLM as the text encoder.
# NOTE: meta-llama/Meta-Llama-3-8B is a gated repo. Request access on its model page and
# authenticate (e.g. `huggingface-cli login` or the HF_TOKEN environment variable) before
# running this cell, otherwise from_pretrained fails with a 403 "gated repo" OSError.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
text_encoder = AutoModel.from_pretrained("meta-llama/Meta-Llama-3-8B")

# The Llama tokenizer defines no pad token, but batched calls with padding=True need one.
# Reusing EOS is safe because padded positions are excluded through the attention mask.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Freeze the model to prevent updates
for param in text_encoder.parameters():
    param.requires_grad = False

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B.
403 Client Error. (Request ID: Root=1-67cd8e33-05fafe382e6cb5526fb38a4c;220ff58f-76f7-4245-baaa-2395ed96dbb7)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Meta-Llama-3-8B to ask for access.

In [2]:
import torch
import torch.nn as nn
from transformers import CLIPVisionModel

# Load image encoder (CLIP vision backbone)
image_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")

# Freeze the backbone: only the projection layers are handed to the optimizer, so tracking
# gradients for the encoder weights would only cost memory and compute.
for param in image_encoder.parameters():
    param.requires_grad = False

# Projection layer to match LLM text space
class ImageProjection(nn.Module):
    """Single linear layer that maps encoder features to another embedding width.

    Args:
        input_dim: Size of the last dimension of the incoming features.
        output_dim: Size of the last dimension of the projected output.
    """

    def __init__(self, input_dim, output_dim):
        super(ImageProjection, self).__init__()
        # Stored as ``fc``, so its parameters appear as ``fc.weight`` / ``fc.bias``
        # in the module's state dict.
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear projection to ``x`` and return the result."""
        projected = self.fc(x)
        return projected

# Read the input width from the encoder config instead of hard-coding 1024, so the projector
# stays correct if a different CLIP checkpoint is loaded.
image_projector = ImageProjection(input_dim=image_encoder.config.hidden_size, output_dim=text_encoder.config.hidden_size)

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
from datasets import load_dataset

# Load a sample of LAION-400M
dataset = load_dataset("laion/laion400m", split="train[:1%]")

# Get image-text pairs in a single pass; iterating the dataset once per column would
# read every row twice.
# NOTE(review): assumes each row exposes "jpg" and "TEXT" columns; confirm against the dataset card.
images = []
captions = []
for item in dataset:
    images.append(item["jpg"])
    captions.append(item["TEXT"])

In [None]:
from transformers import Wav2Vec2Model

# Load pretrained audio encoder
audio_encoder = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")

# Freeze the backbone: only the projection layers are handed to the optimizer, so tracking
# gradients for the encoder weights would only cost memory and compute.
for param in audio_encoder.parameters():
    param.requires_grad = False

# Projection layer for audio features (maps the audio hidden size to the text hidden size)
audio_projector = ImageProjection(input_dim=audio_encoder.config.hidden_size, output_dim=text_encoder.config.hidden_size)

In [None]:
clotho = load_dataset("cdav/Clotho", split="train")

# Extract audio-text pairs in a single pass; every iteration materialises the full row,
# so looping once per column would process each clip twice.
# NOTE(review): assumes each row exposes "audio" and "caption" columns; confirm against the dataset card.
audio_files = []
audio_captions = []
for item in clotho:
    audio_files.append(item["audio"])
    audio_captions.append(item["caption"])

In [None]:
librispeech = load_dataset("librispeech_asr", split="train.clean.100")

# Extract speech-text pairs in a single pass; every iteration materialises the full row,
# so looping once per column would process each utterance twice.
speech_audio = []
speech_text = []
for item in librispeech:
    speech_audio.append(item["audio"]["array"])
    speech_text.append(item["text"])

In [None]:
import torch.nn.functional as F

def contrastive_loss(embeddings_a, embeddings_b, temperature=0.07):
    """Symmetric InfoNCE (CLIP-style) loss between two aligned embedding sets.

    Row ``i`` of ``embeddings_a`` and row ``i`` of ``embeddings_b`` form the positive
    pair; every other row in the batch acts as a negative.

    Args:
        embeddings_a: Tensor of shape (batch, dim).
        embeddings_b: Tensor of shape (batch, dim), same shape as ``embeddings_a``.
        temperature: Positive scale dividing the cosine similarities.

    Returns:
        Scalar tensor: the mean of the a->b and b->a cross-entropy losses.

    Raises:
        ValueError: If the inputs are not 2-D tensors of identical shape, or if
            ``temperature`` is not positive.
    """
    if embeddings_a.dim() != 2 or embeddings_a.shape != embeddings_b.shape:
        raise ValueError(
            "contrastive_loss expects two (batch, dim) tensors of identical shape, got "
            f"{tuple(embeddings_a.shape)} and {tuple(embeddings_b.shape)}"
        )
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    # L2-normalise so the logits are cosine similarities in [-1, 1]; raw dot products of
    # high-dimensional encoder states divided by a small temperature saturate the softmax.
    normed_a = F.normalize(embeddings_a, dim=-1)
    normed_b = F.normalize(embeddings_b, dim=-1)

    logits = torch.matmul(normed_a, normed_b.T) / temperature
    # Build the targets directly on the logits' device instead of on CPU and then moving.
    labels = torch.arange(logits.shape[0], device=logits.device)

    # Score both retrieval directions (a->b over rows, b->a over columns).
    loss_a_to_b = F.cross_entropy(logits, labels)
    loss_b_to_a = F.cross_entropy(logits.T, labels)
    return 0.5 * (loss_a_to_b + loss_b_to_a)

In [None]:
import torch.optim as optim
from transformers import CLIPImageProcessor, Wav2Vec2FeatureExtractor

BATCH_SIZE = 32   # pairs per modality per optimisation step
NUM_EPOCHS = 10

# The vision and audio encoders take preprocessed tensors, not raw images or waveforms.
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
audio_processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-960h")

# padding=True needs a pad token and the Llama tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def _embed_text(texts):
    """Return one text-encoder embedding per string, taken at the last real token."""
    inputs = tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # the text encoder is frozen
        hidden = text_encoder(**inputs).last_hidden_state
    # With causal attention, position 0 only sees the BOS token and is therefore the same for
    # every caption; the last non-padding position is the one that has seen the whole text.
    mask = inputs["attention_mask"]
    # Index of the last 1 in each mask row; valid for left- and right-padded batches alike.
    last_idx = mask.shape[1] - 1 - mask.flip(dims=[1]).argmax(dim=1)
    return hidden[torch.arange(hidden.shape[0]), last_idx]


def _embed_images(batch_images):
    """Project the CLIP [CLS] feature of each image into the text embedding space."""
    # NOTE(review): assumes the items are PIL images / arrays accepted by CLIPImageProcessor.
    pixel_values = image_processor(images=list(batch_images), return_tensors="pt")["pixel_values"]
    with torch.no_grad():  # the encoder is not trained, only the projector is
        cls_features = image_encoder(pixel_values=pixel_values).last_hidden_state[:, 0, :]
    return image_projector(cls_features)


def _embed_audio(batch_audio):
    """Project the time-averaged wav2vec2 features of each clip into the text embedding space."""
    # NOTE(review): assumes each item is a dict with "array" and "sampling_rate" and that all
    # clips in a batch share one rate. The extractor raises if that rate is not the one the
    # checkpoint was trained on, so resample the dataset beforehand if needed.
    waveforms = [clip["array"] for clip in batch_audio]
    inputs = audio_processor(
        waveforms,
        sampling_rate=batch_audio[0]["sampling_rate"],
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():  # the encoder is not trained, only the projector is
        frames = audio_encoder(inputs["input_values"]).last_hidden_state
    # Frame 0 covers only the first few milliseconds; average over time to summarise the clip.
    return audio_projector(frames.mean(dim=1))


# Optimizer (projection layers only)
optimizer = optim.AdamW(list(image_projector.parameters()) + list(audio_projector.parameters()), lr=1e-4)

# The two pair sets are walked in lockstep, so an epoch covers as many pairs as the smaller holds.
# NOTE(review): speech_audio / speech_text are loaded earlier but not used in this loop.
pairs_per_epoch = min(len(captions), len(audio_captions))

for epoch in range(NUM_EPOCHS):
    epoch_loss = 0.0
    num_steps = 0

    for start in range(0, pairs_per_epoch, BATCH_SIZE):
        stop = min(start + BATCH_SIZE, pairs_per_epoch)

        # Image <-> caption pairs
        image_text_embeddings = _embed_text(captions[start:stop])
        image_embeddings = _embed_images(images[start:stop])

        # Audio <-> caption pairs (each clip is matched with its own caption, not an image caption)
        audio_text_embeddings = _embed_text(audio_captions[start:stop])
        audio_embeddings = _embed_audio(audio_files[start:stop])

        # Compute contrastive loss
        loss = contrastive_loss(image_text_embeddings, image_embeddings) + contrastive_loss(audio_text_embeddings, audio_embeddings)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_steps += 1

    # Report the mean step loss; max() guards the empty-dataset case.
    print(f"Epoch {epoch}: Loss {epoch_loss / max(num_steps, 1)}")