# Multi-Modal Model Development

In [2]:
import torch.nn as nn
from transformers import BertModel
from torchvision.models import resnet50
import torch.optim as optim




In [3]:
class MultiModalModel(nn.Module):
    def __init__(self, text_encoder, image_encoder, hidden_dim=512, output_dim=1):
        super(MultiModalModel, self).__init__()
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self.classifier = nn.Sequential(
            nn.Linear(768 + 2048, hidden_dim),  # BERT (768) + ResNet (2048)
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask, images):
        # Text features
        text_embedding = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        
        # Image features
        image_embedding = self.image_encoder(images).squeeze()
        
        # Combine features
        combined = torch.cat((text_embedding, image_embedding), dim=1)
        
        # Classifier
        return self.classifier(combined)



In [4]:
# Initialize models
text_encoder = BertModel.from_pretrained("bert-base-uncased")
image_encoder = resnet50(pretrained=True)
image_encoder = nn.Sequential(*list(image_encoder.children())[:-1])  # Remove final layer

# Create the multi-modal model
model = MultiModalModel(text_encoder, image_encoder)



## Loss & Optimizer

In [5]:

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

## Model Training

In [6]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for input_ids, attention_mask, images, labels in dataloader:
        # Move data to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        images = images.to(device)
        labels = labels.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Update running loss
        running_loss += loss.item()
    
    # Print epoch loss
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader):.4f}")

NameError: name 'torch' is not defined

## Model Evaluation

In [7]:
model.eval()
val_loss = 0.0
with torch.no_grad():
    for input_ids, attention_mask, images, labels in dataloader:
        # Move data to GPU if available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        val_loss += loss.item()

# Print validation loss
print(f"Validation Loss: {val_loss/len(dataloader):.4f}")

NameError: name 'torch' is not defined