In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import math

# Define Vision Transformer from scratch
class VisionTransformer(nn.Module):
    """Patch-based transformer image encoder producing one vector per image.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches, a learnable class token is prepended, learnable positional
    embeddings are added, and the sequence is run through a stack of
    ``nn.TransformerEncoderLayer`` modules.

    Args:
        image_size: Side length used to size the positional embedding.
        patch_size: Side length of each square patch.
        num_classes: Accepted for interface compatibility; not used here.
        embed_dim: Width of the token embeddings and of the returned features.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.

    Note:
        ``mlp_head`` maps ``embed_dim`` to ``embed_dim``, so ``forward``
        returns features rather than class logits.
    """

    def __init__(self, image_size=224, patch_size=16, num_classes=10, embed_dim=768, num_heads=8, num_layers=12):
        super().__init__()
        patches_per_side = image_size // patch_size

        self.patch_size = patch_size
        self.num_patches = patches_per_side ** 2
        self.embed_dim = embed_dim

        # A convolution whose stride equals its kernel size embeds each
        # patch independently of its neighbours.
        self.patch_embedding = nn.Conv2d(
            in_channels=3,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        # One position per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.mlp_head = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Encode a batch of images and return the projected class-token state."""
        patch_grid = self.patch_embedding(x)
        # Collapse the spatial grid into a sequence: (batch, patches, embed_dim).
        tokens = patch_grid.flatten(2).transpose(1, 2)
        cls_tokens = self.cls_token.expand(tokens.shape[0], -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens = tokens + self.pos_embedding
        encoded = self.transformer(tokens)
        # Index 0 along the sequence axis is the class token.
        return self.mlp_head(encoded[:, 0])

# Define Transformer Encoder for text from scratch
class TransformerEncoder(nn.Module):
    """Text encoder: token + positional embeddings, transformer stack, mean pool.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of the embeddings and of the returned features.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        max_length: Number of learnable positions; inputs must not be longer.
    """

    def __init__(self, vocab_size=30522, embed_dim=256, num_heads=8, num_layers=6, max_length=10):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_length, embed_dim))

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, input_ids):
        """Return one ``embed_dim`` vector per sequence in ``input_ids``."""
        seq_len = input_ids.shape[1]
        token_vectors = self.embedding(input_ids)
        # Only the first ``seq_len`` positions are needed for this batch.
        positions = self.pos_embedding[:, :seq_len, :]
        hidden = self.transformer(token_vectors + positions)
        # Average over the sequence axis; every position is included.
        pooled = hidden.mean(dim=1)
        return self.fc(pooled)

# Define Multimodal Fusion Model
class MultiModalClassifier(nn.Module):
    """Classifier that fuses image and text features by concatenation.

    Each modality is encoded separately, the two feature vectors are
    concatenated, passed through a ReLU-activated fusion layer and then
    projected to ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super(MultiModalClassifier, self).__init__()
        self.image_encoder = VisionTransformer()
        self.text_encoder = TransformerEncoder()
        # Read the fusion width from the encoders' output layers instead of
        # repeating their default sizes (768 + 256) as magic numbers, so the
        # layer stays correct if an encoder's width changes.
        fused_dim = self.image_encoder.mlp_head.out_features + self.text_encoder.fc.out_features
        self.fusion = nn.Linear(fused_dim, 512)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, image, input_ids):
        """Return class logits for a batch of (image, token-id) pairs."""
        img_features = self.image_encoder(image)
        txt_features = self.text_encoder(input_ids)
        # Join the modalities along the feature axis.
        fused = torch.cat((img_features, txt_features), dim=1)
        fused = torch.relu(self.fusion(fused))
        output = self.classifier(fused)
        return output

# Define Tokenizer and Data Preprocessing
def simple_tokenizer(texts, vocab, max_length=10):
    """Convert whitespace-separated texts into fixed-length lists of ids.

    Args:
        texts: Iterable of strings.
        vocab: Mapping from word to integer id.
        max_length: Length every returned sequence is cut or padded to.

    Returns:
        A list with one id list per text. Words missing from ``vocab`` map
        to 0, and 0 is also the padding value.
    """
    batch = []
    for text in texts:
        # Look up ids, then truncate anything beyond max_length.
        ids = [vocab.get(word, 0) for word in text.split()][:max_length]
        # Right-pad short sequences with zeros (no-op when already full).
        ids.extend([0] * (max_length - len(ids)))
        batch.append(ids)
    return batch

# Image preprocessing pipeline: resize to 224x224 (the VisionTransformer
# default image_size), convert to a tensor, then normalise each channel.
# NOTE(review): the mean/std values match the commonly used ImageNet channel
# statistics - confirm they suit the dataset actually used.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Example Usage
if __name__ == "__main__":
    # Smoke test: push one dummy batch through the model and check the
    # shape of the returned logits.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultiModalClassifier(num_classes=10).to(device)

    # Dummy inputs
    image = torch.randn(2, 3, 224, 224).to(device)  # Batch of 2 images
    # Word ids start at 1: simple_tokenizer uses 0 for both padding and
    # unknown words, so giving a real word id 0 would make it
    # indistinguishable from padding.
    vocab = {
        word: idx
        for idx, word in enumerate(
            ["this", "is", "a", "sample", "text", "another", "example"], start=1
        )
    }
    text = ["this is a sample text", "another example text"]
    text_inputs = torch.tensor(simple_tokenizer(text, vocab)).to(device)

    output = model(image, text_inputs)
    print(output.shape)  # Should print torch.Size([2, 10])

torch.Size([2, 10])


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import math

# Define Vision Transformer from scratch
class VisionTransformer(nn.Module):
    """Image encoder built from a patch embedding and a transformer stack.

    Args:
        image_size: Side length used to compute the number of patches.
        patch_size: Side length of each square patch.
        num_classes: Accepted but unused by this encoder.
        embed_dim: Token width; also the size of the returned feature vector.
        num_heads: Attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, image_size=224, patch_size=16, num_classes=10, embed_dim=768, num_heads=8, num_layers=12):
        super().__init__()
        self.patch_size = patch_size
        grid_side = image_size // patch_size
        self.num_patches = grid_side * grid_side
        self.embed_dim = embed_dim

        # Kernel size == stride, so each output location sees exactly one patch.
        self.patch_embedding = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        # The extra position belongs to the class token.
        sequence_length = self.num_patches + 1
        self.pos_embedding = nn.Parameter(torch.randn(1, sequence_length, embed_dim))

        layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

        self.mlp_head = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Return the projected class-token representation for each image."""
        # Patch grid flattened to a sequence of shape (batch, patches, embed_dim).
        sequence = self.patch_embedding(x).flatten(2).transpose(1, 2)
        n_images = sequence.shape[0]
        class_tokens = self.cls_token.expand(n_images, -1, -1)
        sequence = torch.cat((class_tokens, sequence), dim=1) + self.pos_embedding
        sequence = self.transformer(sequence)
        class_state = sequence[:, 0]
        return self.mlp_head(class_state)

# Define Transformer Encoder for text from scratch
class TransformerEncoder(nn.Module):
    """Encode token-id sequences into one fixed-size vector each.

    Args:
        vocab_size: Size of the token embedding table.
        embed_dim: Embedding width and output feature size.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        max_length: Number of learnable positions available.
    """

    def __init__(self, vocab_size=30522, embed_dim=256, num_heads=8, num_layers=6, max_length=10):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_length, embed_dim))

        layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, input_ids):
        """Embed, contextualise, mean-pool and project ``input_ids``."""
        n_tokens = input_ids.shape[1]
        # Slice the positional table down to the length of this batch.
        embedded = self.embedding(input_ids) + self.pos_embedding[:, :n_tokens, :]
        contextual = self.transformer(embedded)
        # Averaging over the sequence axis gives a fixed-size output.
        sentence_vector = contextual.mean(dim=1)
        return self.fc(sentence_vector)

# Define Multimodal Fusion Model
class MultiModalClassifier(nn.Module):
    """Late-fusion classifier over an image encoder and a text encoder.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super(MultiModalClassifier, self).__init__()
        self.image_encoder = VisionTransformer()
        self.text_encoder = TransformerEncoder()
        # Size the fusion layer from what the encoders actually emit rather
        # than hard-coding 768 + 256, which would silently go stale if
        # either encoder's output width changed.
        image_dim = self.image_encoder.mlp_head.out_features
        text_dim = self.text_encoder.fc.out_features
        self.fusion = nn.Linear(image_dim + text_dim, 512)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, image, input_ids):
        """Return class logits for paired image and token-id batches."""
        img_features = self.image_encoder(image)
        txt_features = self.text_encoder(input_ids)
        # Concatenate per-sample features from both modalities.
        fused = torch.cat((img_features, txt_features), dim=1)
        fused = torch.relu(self.fusion(fused))
        output = self.classifier(fused)
        return output

# Training and Testing Functions
def train(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over every batch in ``dataloader``.

    Args:
        model: Module called as ``model(images, texts)``.
        dataloader: Iterable yielding ``(images, texts, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        None. The model is left in training mode.
    """
    model.train()
    for batch in dataloader:
        images, texts, labels = batch
        images = images.to(device)
        texts = texts.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        predictions = model(images, texts)
        batch_loss = criterion(predictions, labels)
        batch_loss.backward()
        optimizer.step()

def test(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for images, texts, labels in dataloader:
            images, texts, labels = images.to(device), texts.to(device), labels.to(device)
            outputs = model(images, texts)
            total_loss += criterion(outputs, labels).item()
            correct += (outputs.argmax(dim=1) == labels).sum().item()
    return total_loss / len(dataloader), correct / len(dataloader.dataset)

# Example Usage
if __name__ == "__main__":
    # End-to-end check of the training loop on synthetic data.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultiModalClassifier(num_classes=10).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    # Placeholder dataloader (replace with actual data)
    # Each item is a random 3x224x224 image, 10 random token ids drawn from
    # [0, 30522), and a label equal to idx % 10. The inputs are fresh noise
    # on every access, so accuracy is expected to stay near chance.
    class DummyDataset(torch.utils.data.Dataset):
        def __len__(self):
            return 100
        def __getitem__(self, idx):
            return torch.randn(3, 224, 224), torch.randint(0, 30522, (10,)), torch.tensor(idx % 10)

    # Separate dataset instances; only the training loader is shuffled.
    train_loader = torch.utils.data.DataLoader(DummyDataset(), batch_size=8, shuffle=True)
    test_loader = torch.utils.data.DataLoader(DummyDataset(), batch_size=8, shuffle=False)

    # Train for 5 epochs, evaluating and reporting after each one.
    for epoch in range(5):
        train(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = test(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}: Test Loss = {test_loss:.4f}, Test Accuracy = {test_acc:.4f}")

Epoch 1: Test Loss = 2.3340, Test Accuracy = 0.1000
Epoch 2: Test Loss = 2.3296, Test Accuracy = 0.1000
Epoch 3: Test Loss = 2.3083, Test Accuracy = 0.1000
Epoch 4: Test Loss = 2.3094, Test Accuracy = 0.1000
Epoch 5: Test Loss = 2.3214, Test Accuracy = 0.1000


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import math

# Define Vision Transformer from scratch
class VisionTransformer(nn.Module):
    """Patch-based image encoder built on a stack of ``TransformerBlock``.

    Args:
        image_size: Side length used to compute the number of patches.
        patch_size: Side length of each square patch.
        num_classes: Accepted but unused by this encoder.
        embed_dim: Token width; also the size of the returned feature vector.
        num_heads: Passed to every ``TransformerBlock``.
        num_layers: Number of stacked blocks.
    """

    def __init__(self, image_size=224, patch_size=16, num_classes=10, embed_dim=768, num_heads=8, num_layers=12):
        super().__init__()
        self.patch_size = patch_size
        per_side = image_size // patch_size
        self.num_patches = per_side ** 2
        self.embed_dim = embed_dim

        # Stride equal to the kernel size embeds each patch on its own.
        self.patch_embedding = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        # One position per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))

        blocks = [TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)]
        self.transformer = nn.Sequential(*blocks)

        self.mlp_head = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Return the projected class-token representation for each image."""
        embedded = self.patch_embedding(x)
        # Flatten the patch grid into a sequence: (batch, patches, embed_dim).
        tokens = embedded.flatten(2).transpose(1, 2)
        cls_tokens = self.cls_token.expand(tokens.shape[0], -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens = tokens + self.pos_embedding
        tokens = self.transformer(tokens)
        # The class token sits at sequence index 0.
        return self.mlp_head(tokens[:, 0])

# Define Transformer Block from scratch
class TransformerBlock(nn.Module):
    """Post-norm transformer block with multi-head self-attention.

    The block applies self-attention and a two-layer feed-forward network,
    each followed by a residual connection and ``LayerNorm``.

    Queries, keys and values are split into ``num_heads`` heads of size
    ``embed_dim // num_heads`` and attention is computed per head, so the
    ``sqrt(head_dim)`` scaling matches the dot-product width. Previously
    ``num_heads`` only affected the scale while attention ran as a single
    head over the full ``embed_dim``. No output projection is added, so the
    parameter set (and state_dict layout) is unchanged.

    Args:
        embed_dim: Width of the input and output token vectors.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim`` evenly.
    """

    def __init__(self, embed_dim, num_heads):
        super(TransformerBlock, self).__init__()
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = math.sqrt(self.head_dim)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim)
        )

    def _split_heads(self, t):
        """Reshape (..., seq, embed_dim) into (..., heads, seq, head_dim)."""
        per_head = t.reshape(*t.shape[:-1], self.num_heads, self.head_dim)
        return per_head.transpose(-3, -2)

    def forward(self, x):
        """Apply self-attention and the feed-forward network to ``x``.

        Args:
            x: Tensor whose last two dimensions are (sequence, embed_dim).

        Returns:
            A tensor with the same shape as ``x``.
        """
        q = self._split_heads(self.query(x))
        k = self._split_heads(self.key(x))
        v = self._split_heads(self.value(x))

        # Scaled dot-product attention, computed independently per head.
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        attn_weights = torch.nn.functional.softmax(attn_scores, dim=-1)
        attn_output = torch.matmul(attn_weights, v)
        # Merge the heads back into a single embed_dim-wide vector per token.
        attn_output = attn_output.transpose(-3, -2).reshape(x.shape)

        x = self.norm1(x + attn_output)
        ff_output = self.feed_forward(x)
        x = self.norm2(x + ff_output)
        return x

# Define Transformer Encoder for text from scratch
class TransformerEncoder(nn.Module):
    """Text encoder built on a stack of ``TransformerBlock`` modules.

    Args:
        vocab_size: Size of the token embedding table.
        embed_dim: Embedding width and output feature size.
        num_heads: Passed to every ``TransformerBlock``.
        num_layers: Number of stacked blocks.
        max_length: Number of learnable positions available.
    """

    def __init__(self, vocab_size=30522, embed_dim=256, num_heads=8, num_layers=6, max_length=10):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_length, embed_dim))

        blocks = [TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)]
        self.transformer = nn.Sequential(*blocks)

        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, input_ids):
        """Return one ``embed_dim`` vector per sequence in ``input_ids``."""
        length = input_ids.shape[1]
        tokens = self.embedding(input_ids)
        # Use only as many positions as this batch has tokens.
        tokens = tokens + self.pos_embedding[:, :length, :]
        tokens = self.transformer(tokens)
        # Mean over the sequence axis gives a fixed-size output.
        pooled = tokens.mean(dim=1)
        return self.fc(pooled)

# Define Multimodal Fusion Model
class MultiModalClassifier(nn.Module):
    """Concatenation-based fusion classifier over image and text encoders.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super(MultiModalClassifier, self).__init__()
        self.image_encoder = VisionTransformer()
        self.text_encoder = TransformerEncoder()
        # Take the fusion input width from the encoders' final layers so it
        # cannot drift from their real output sizes (768 and 256 by default).
        fusion_in = (
            self.image_encoder.mlp_head.out_features
            + self.text_encoder.fc.out_features
        )
        self.fusion = nn.Linear(fusion_in, 512)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, image, input_ids):
        """Return class logits for paired image and token-id batches."""
        img_features = self.image_encoder(image)
        txt_features = self.text_encoder(input_ids)
        # Stack both modalities side by side along the feature axis.
        fused = torch.cat((img_features, txt_features), dim=1)
        fused = torch.relu(self.fusion(fused))
        output = self.classifier(fused)
        return output

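# Training and Testing Functions
# (Same as before: train() and test() are not redefined in this cell. It reuses
# the definitions from cell In [4], which must be run first.)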

# Example Usage
if __name__ == "__main__":
    # End-to-end check of the training loop on synthetic data, using the
    # model built from the custom TransformerBlock.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultiModalClassifier(num_classes=10).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    # Placeholder dataloader (replace with actual data)
    # Each item is a random 3x224x224 image, 10 random token ids drawn from
    # [0, 30522), and a label equal to idx % 10. The inputs are fresh noise
    # on every access, so accuracy is expected to stay near chance.
    class DummyDataset(torch.utils.data.Dataset):
        def __len__(self):
            return 100
        def __getitem__(self, idx):
            return torch.randn(3, 224, 224), torch.randint(0, 30522, (10,)), torch.tensor(idx % 10)

    # Separate dataset instances; only the training loader is shuffled.
    train_loader = torch.utils.data.DataLoader(DummyDataset(), batch_size=8, shuffle=True)
    test_loader = torch.utils.data.DataLoader(DummyDataset(), batch_size=8, shuffle=False)

    # train() and test() are not defined in this cell; they must already
    # exist in the session (they are defined in an earlier cell).
    for epoch in range(5):
        train(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = test(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}: Test Loss = {test_loss:.4f}, Test Accuracy = {test_acc:.4f}")

Epoch 1: Test Loss = 2.3572, Test Accuracy = 0.1000
Epoch 2: Test Loss = 2.3175, Test Accuracy = 0.1000
Epoch 3: Test Loss = 2.3106, Test Accuracy = 0.1000
Epoch 4: Test Loss = 2.3088, Test Accuracy = 0.1000
Epoch 5: Test Loss = 2.3125, Test Accuracy = 0.1000
