In [9]:
from datasets import load_dataset
import json
from tqdm import tqdm

from itertools import islice

# Open the Hugging Face dataset in streaming mode so records are pulled lazily
# instead of downloading the whole split up front.
dataset = load_dataset("omegalabsinc/omega-multimodal", split="train", streaming=True)

# Upper bound on the number of records to export, and the destination file.
# The file is written as JSON Lines (one JSON object per line) even though the
# name ends in ".json"; the later cells read it line by line.
num_samples = 50000
output_file = "omega_50k_samples.json"

# islice() stops the stream after num_samples records, so the loop ends
# naturally and tqdm records the final item (a manual `break` leaves the bar
# one short of the total).
saved_count = 0
with open(output_file, "w", encoding="utf-8") as f:
    for sample in tqdm(islice(dataset, num_samples), total=num_samples, desc="Saving Dataset"):
        json.dump(sample, f)
        f.write("\n")  # New line for each JSON object
        saved_count += 1

# Report what was actually written: the stream may end before num_samples.
print(f"Saved {saved_count} samples to {output_file}.")


Saving Dataset: 100%|█████████▉| 49999/50000 [02:52<00:00, 290.63it/s]

Saved 50000 samples to omega_50k_samples.json.





In [14]:
import json

# Path to saved dataset
dataset_path = "omega_50k_samples.json"

# Load and inspect dataset
def inspect_dataset_structure(file_path, num_samples=5):
    """Print the Python type of each top-level field for the first lines of a JSON Lines file.

    Args:
        file_path: Path of a UTF-8 text file with one JSON object per line.
        num_samples: Maximum number of lines to read from the start of the file.

    Blank lines and lines that fail to parse are reported and still count
    towards ``num_samples``. Reading stops quietly at end of file, so a file
    shorter than ``num_samples`` lines does not produce "empty line" reports
    for lines that do not exist.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # zip() with range() first stops after num_samples lines without
        # pulling an extra line, and also stops as soon as the file runs out.
        for i, raw_line in zip(range(num_samples), f):
            line = raw_line.strip()
            if not line:
                print(f"🔴 Empty line detected at entry {i+1}")
                continue
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                print(f"🔴 JSON Decode Error in line {i+1}")
                continue
            print(f"\n🔹 **Sample {i+1} Field Types:**")
            for key, value in sample.items():
                print(f"  - {key}: {type(value)}")
            print("\n" + "="*50 + "\n")

# Run preview function
inspect_dataset_structure(dataset_path)



🔹 **Sample 1 Field Types:**
  - video_id: <class 'str'>
  - youtube_id: <class 'str'>
  - description: <class 'str'>
  - views: <class 'int'>
  - start_time: <class 'int'>
  - end_time: <class 'int'>
  - video_embed: <class 'list'>
  - audio_embed: <class 'list'>
  - description_embed: <class 'list'>
  - description_relevance_score: <class 'float'>
  - query_relevance_score: <class 'float'>
  - query: <class 'str'>
  - submitted_at: <class 'int'>



🔹 **Sample 2 Field Types:**
  - video_id: <class 'str'>
  - youtube_id: <class 'str'>
  - description: <class 'str'>
  - views: <class 'int'>
  - start_time: <class 'int'>
  - end_time: <class 'int'>
  - video_embed: <class 'list'>
  - audio_embed: <class 'list'>
  - description_embed: <class 'list'>
  - description_relevance_score: <class 'float'>
  - query_relevance_score: <class 'float'>
  - query: <class 'str'>
  - submitted_at: <class 'int'>



🔹 **Sample 3 Field Types:**
  - video_id: <class 'str'>
  - youtube_id: <class 'str'>
  - d

In [16]:
import json

dataset_path = "omega_50k_samples.json"

def inspect_vector_sizes(file_path, num_samples=5):
    """Print the length of each embedding list for the first lines of a JSON Lines file.

    Args:
        file_path: Path of a UTF-8 text file with one JSON object per line.
        num_samples: Maximum number of lines to read from the start of the file.

    For every parsed line the sizes of ``video_embed``, ``description_embed``
    and ``audio_embed`` are printed. A field that is missing or JSON ``null``
    is reported with size 0. Blank or unparsable lines are reported and still
    count towards ``num_samples``. Reading stops quietly at end of file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # zip() with range() first caps the number of lines read and also ends
        # the loop at EOF, so EOF is never mistaken for an empty line.
        for i, raw_line in zip(range(num_samples), f):
            line = raw_line.strip()
            if not line:
                print(f"Empty line at sample {i+1}")
                continue
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                print(f"JSON Decode Error in sample {i+1}")
                continue
            # `or []` covers both a missing key and an explicit null value;
            # len(None) would otherwise raise TypeError.
            video_size = len(sample.get("video_embed") or [])
            description_size = len(sample.get("description_embed") or [])
            audio_size = len(sample.get("audio_embed") or [])
            print(f"Sample {i+1}:")
            print(f"  video_embed size: {video_size}")
            print(f"  description_embed size: {description_size}")
            print(f"  audio_embed size: {audio_size}")
            print("="*40)

inspect_vector_sizes(dataset_path)


Sample 1:
  video_embed size: 1024
  description_embed size: 1024
  audio_embed size: 1024
Sample 2:
  video_embed size: 1024
  description_embed size: 1024
  audio_embed size: 1024
Sample 3:
  video_embed size: 1024
  description_embed size: 1024
  audio_embed size: 1024
Sample 4:
  video_embed size: 1024
  description_embed size: 1024
  audio_embed size: 1024
Sample 5:
  video_embed size: 1024
  description_embed size: 1024
  audio_embed size: 1024


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
from tqdm import tqdm

# Define MLP model for modality transformation
class MLPHead(nn.Module):
    """Feed-forward head: three linear layers with ReLU between them.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
        hidden_dim: Width of both hidden layers.
            NOTE(review): the default 2028 may be a typo for 2048 — confirm
            before changing, since it determines saved checkpoint shapes.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=2028):
        super().__init__()
        widths = [input_dim, hidden_dim, hidden_dim, output_dim]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # The final projection is left linear: drop the trailing activation.
        stack.pop()
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.model(x)

# Custom dataset class for loading preprocessed embeddings
class OmegaMultimodalDataset(Dataset):
    """In-memory dataset of audio/text/video embedding triples from a JSON Lines file.

    Each usable line becomes a dict with keys ``"audio"``, ``"text"`` and
    ``"video"``, holding float32 tensors built from the record's
    ``audio_embed``, ``description_embed`` and ``video_embed`` fields.

    Args:
        file_path: Path of a UTF-8 text file with one JSON object per line.
        num_samples: Loading stops once this many usable records are collected.

    Raises:
        ValueError: If no usable record was found.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    # (output key, JSON field) pairs, in the order the tensors are built.
    _FIELD_MAP = (
        ("audio", "audio_embed"),
        ("text", "description_embed"),
        ("video", "video_embed"),
    )

    def __init__(self, file_path, num_samples=50000):
        self.data = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in tqdm(f, total=num_samples, desc="Loading Dataset"):
                line = line.strip()
                if not line:
                    # Blank lines carry no record; json.loads would raise on them.
                    continue
                sample = json.loads(line)
                if not isinstance(sample, dict):
                    continue

                # A field may be present but null; a membership test alone would
                # let None through to torch.tensor(), which cannot convert it.
                vectors = {name: sample.get(field) for name, field in self._FIELD_MAP}
                if any(vec is None for vec in vectors.values()):
                    continue

                self.data.append({
                    name: torch.tensor(vec, dtype=torch.float32)
                    for name, vec in vectors.items()
                })

                if len(self.data) >= num_samples:
                    break

        if len(self.data) == 0:
            raise ValueError("🔴 No valid samples found! Check JSON structure.")

    def __len__(self):
        """Return the number of records held in memory."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the dict of ``audio``/``text``/``video`` tensors at ``idx``."""
        return self.data[idx]

# Read the exported embeddings into memory and wrap them in a shuffling loader
dataset_path = "omega_50k_samples.json"
dataset = OmegaMultimodalDataset(dataset_path)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)

# Every head uses the same width for input and output, taken from the first
# audio vector
first_sample = dataset[0]
embedding_dim = len(first_sample["audio"])

# Prefer the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# One MLP head per modality hop, created and then placed on the chosen device
audio_to_text = MLPHead(embedding_dim, embedding_dim).to(device)
text_to_video = MLPHead(embedding_dim, embedding_dim).to(device)
video_to_audio = MLPHead(embedding_dim, embedding_dim).to(device)

# A separate Adam optimizer per head, all with the same learning rate
optimizer_a2t, optimizer_t2v, optimizer_v2a = (
    optim.Adam(head.parameters(), lr=1e-4)
    for head in (audio_to_text, text_to_video, video_to_audio)
)

# Cosine-similarity based loss shared by all three heads
criterion = nn.CosineEmbeddingLoss(margin=0.1)

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    # Running sums of the per-batch losses, reported as epoch averages below
    total_loss_a2t = 0.0
    total_loss_t2v = 0.0
    total_loss_v2a = 0.0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move data to device
        audio_emb = batch["audio"].to(device)
        text_emb = batch["text"].to(device)
        video_emb = batch["video"].to(device)

        # Chained forward pass: each head consumes the previous head's
        # prediction, so all three outputs share one autograd graph.
        pred_text = audio_to_text(audio_emb)  # Audio → Text
        pred_video = text_to_video(pred_text)  # Text → Video
        pred_audio = video_to_audio(pred_video)  # Video → Audio

        # All pairs are positives (target = 1); created directly on the device
        positive_labels = torch.ones(audio_emb.shape[0], device=device)

        # One loss per hop, each against the real embedding of the target modality
        loss_a2t = criterion(pred_text, text_emb, positive_labels)  # Audio → Text Loss
        loss_t2v = criterion(pred_video, video_emb, positive_labels)  # Text → Video Loss
        loss_v2a = criterion(pred_audio, audio_emb, positive_labels)  # Video → Audio Loss

        # Zero gradients for each optimizer
        optimizer_a2t.zero_grad()
        optimizer_t2v.zero_grad()
        optimizer_v2a.zero_grad()

        # Backpropagate once through the summed loss. Calling backward() on each
        # loss separately fails on the second call: the losses share the
        # audio_to_text part of the graph, whose buffers autograd frees after
        # the first backward. Summing gives the same accumulated gradients as
        # three separate calls with retain_graph=True.
        # NOTE(review): downstream losses also push gradients into upstream
        # heads; feed `pred_*.detach()` to the next head if each head should
        # learn only from its own loss — confirm the intended behaviour.
        (loss_a2t + loss_t2v + loss_v2a).backward()

        # Update parameters
        optimizer_a2t.step()
        optimizer_t2v.step()
        optimizer_v2a.step()

        # Accumulate loss
        total_loss_a2t += loss_a2t.item()
        total_loss_t2v += loss_t2v.item()
        total_loss_v2a += loss_v2a.item()

    # Compute average loss per epoch (mean over batches)
    avg_loss_a2t = total_loss_a2t / len(dataloader)
    avg_loss_t2v = total_loss_t2v / len(dataloader)
    avg_loss_v2a = total_loss_v2a / len(dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  🔹 Audio → Text Loss: {avg_loss_a2t:.4f}")
    print(f"  🔹 Text → Video Loss: {avg_loss_t2v:.4f}")
    print(f"  🔹 Video → Audio Loss: {avg_loss_v2a:.4f}")

# Save trained models (weights only, one file per head)
torch.save(audio_to_text.state_dict(), "audio_to_text.pth")
torch.save(text_to_video.state_dict(), "text_to_video.pth")
torch.save(video_to_audio.state_dict(), "video_to_audio.pth")

print("✅ Models saved successfully.")


Loading Dataset:  11%|█         | 5568/50000 [00:03<00:27, 1636.99it/s]

In [None]:
import numpy as np
import umap
import matplotlib.pyplot as plt

# Inputs expected in the kernel namespace: audio_embed, text_embed, video_embed
# (raw embeddings) and audio_embed_aligned, text_embed_aligned,
# video_embed_aligned (transformed embeddings), each assumed to be a 2D array
# of shape (N, 1024).
# NOTE(review): none of these names are defined in the visible cells — confirm
# where they are produced before running this cell on a fresh kernel.
# Rows are stacked in the order audio, text, video.
raw_embeddings = np.vstack([audio_embed, text_embed, video_embed])
aligned_embeddings = np.vstack([audio_embed_aligned, text_embed_aligned, video_embed_aligned])

# One modality label per stacked row, used to colour the scatter plots
labels = np.array(
    ['Audio'] * len(audio_embed)
    + ['Text'] * len(text_embed)
    + ['Video'] * len(video_embed)
)

# Plot styling shared by the 2D and 3D figures
modalities = ['Audio', 'Text', 'Video']
colors = {'Audio': 'red', 'Text': 'green', 'Video': 'blue'}

# 2D UMAP: the reducer is fitted separately on each matrix, so the two layouts
# are independent of each other
umap_2d = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
raw_2d = umap_2d.fit_transform(raw_embeddings)
aligned_2d = umap_2d.fit_transform(aligned_embeddings)

# Raw and aligned projections side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels_2d = (
    (axes[0], raw_2d, 'Raw Embeddings (UMAP 2D)'),
    (axes[1], aligned_2d, 'Aligned Embeddings (UMAP 2D)'),
)
for ax, data, title in panels_2d:
    for mod in modalities:
        idx = labels == mod
        ax.scatter(data[idx, 0], data[idx, 1], s=10, color=colors[mod], label=mod, alpha=0.7)
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

# 3D UMAP, again fitted once per matrix
umap_3d = umap.UMAP(n_components=3, n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
raw_3d = umap_3d.fit_transform(raw_embeddings)
aligned_3d = umap_3d.fit_transform(aligned_embeddings)

# One 3D figure per projection: raw first, then aligned
figures_3d = (
    (raw_3d, 'Raw Embeddings (UMAP 3D)'),
    (aligned_3d, 'Aligned Embeddings (UMAP 3D)'),
)
for cloud, title in figures_3d:
    fig = plt.figure(figsize=(6,5))
    ax = fig.add_subplot(111, projection='3d')
    for mod in modalities:
        idx = labels == mod
        ax.scatter(cloud[idx, 0], cloud[idx, 1], cloud[idx, 2], s=15, color=colors[mod], label=mod, alpha=0.7)
    ax.set_title(title)
    ax.legend()
    plt.show()


In [None]:
from sklearn.manifold import TSNE

# 2D t-SNE of the raw and the aligned embedding matrices. The same estimator
# object is re-fitted for each matrix, so the two layouts are independent.
tsne = TSNE(n_components=2, perplexity=30, init='pca', random_state=42)
raw_tsne_2d, aligned_tsne_2d = (
    tsne.fit_transform(matrix) for matrix in (raw_embeddings, aligned_embeddings)
)

# Raw and aligned t-SNE projections side by side, coloured by modality
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels_tsne = (
    (axes[0], raw_tsne_2d, 'Raw Embeddings (t-SNE 2D)'),
    (axes[1], aligned_tsne_2d, 'Aligned Embeddings (t-SNE 2D)'),
)
for ax, data, title in panels_tsne:
    for mod in modalities:
        idx = labels == mod
        ax.scatter(data[idx, 0], data[idx, 1], s=10, color=colors[mod], label=mod, alpha=0.7)
    ax.set_title(title)
    ax.legend()
plt.tight_layout()
plt.show()

# (Optional) For a 3D t-SNE, pass n_components=3 to TSNE() and draw a 3D scatter plot as in the UMAP cell.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

# Pairwise cosine distance (1 - cosine similarity) between every pair of rows,
# for the raw and for the aligned embedding matrices.
# NOTE(review): each matrix is (rows x rows); with many stacked embeddings this
# can exceed available memory — confirm the row count or subsample first.
cos_sim_raw = cosine_similarity(raw_embeddings)
cos_sim_aligned = cosine_similarity(aligned_embeddings)
dist_matrix_raw = 1 - cos_sim_raw
dist_matrix_aligned = 1 - cos_sim_aligned

# Shared colour scale so both heatmaps can be compared directly
max_val = max(dist_matrix_raw.max(), dist_matrix_aligned.max())
min_val = min(dist_matrix_raw.min(), dist_matrix_aligned.min())  # (should be ~0)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Draw the raw heatmap on the left and the aligned one on the right
heatmap_panels = (
    (axes[0], dist_matrix_raw, 'Pairwise Distances (Raw)'),
    (axes[1], dist_matrix_aligned, 'Pairwise Distances (Aligned)'),
)
for ax, matrix, title in heatmap_panels:
    sns.heatmap(
        matrix,
        ax=ax,
        vmin=min_val,
        vmax=max_val,
        cmap="YlGnBu",
        xticklabels=False,
        yticklabels=False,
        cbar_kws={'label': 'Cosine Distance'},
    )
    ax.set_title(title)

# (Optional) White lines at the audio/text and text/video row boundaries
N_a, N_t, N_v = len(audio_embed), len(text_embed), len(video_embed)
block_edges = (N_a, N_a + N_t)
for ax in axes:
    for draw_line in (ax.axvline, ax.axhline):
        for edge in block_edges:
            draw_line(edge, color='white')

plt.tight_layout()
plt.show()
