Name:- Amarpreet kaur lotte

E-mail:- amarpreetkaurlotte@gmai.com

Task:- Image-to-HTML Model Using WebSight Dataset on Google Colab


In [5]:
# ✅ **Step 1: Install Dependencies**
!pip install transformers torch torchvision datasets sentencepiece evaluate bs4 accelerate

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from PIL import Image
import evaluate
from bs4 import BeautifulSoup
import os

# ✅ **Step 2: Load WebSight Dataset**
dataset = load_dataset("HuggingFaceM4/WebSight", "v0.2", split="train", streaming=True)

# ✅ **Step 3: Define Preprocessing**
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize image
    transforms.ToTensor(),  # Convert to tensor
])

# ✅ **Step 4: Tokenizer**
token_to_index = {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2}
index_to_token = {0: "<SOS>", 1: "<EOS>", 2: "<PAD>"}

def tokenize_html(html):
    """Tokenize HTML into indexed tokens."""
    tokens = html.split()
    indexed_tokens = [token_to_index.get(t, len(token_to_index)) for t in tokens]
    for t in tokens:
        if t not in token_to_index:
            token_to_index[t] = len(token_to_index)
            index_to_token[len(token_to_index) - 1] = t
    return [0] + indexed_tokens + [1]  # <SOS> and <EOS>

# ✅ **Step 5: Custom Dataset**
class WebSightDataset(Dataset):
    def __init__(self, dataset, max_samples=5000):
        self.samples = []
        for i, sample in enumerate(dataset):
            if i >= max_samples:
                break
            try:
                image = transform(sample["image"])  # Apply transformations
                html_tokens = tokenize_html(sample["text"])  # Tokenize HTML
                self.samples.append((image, torch.tensor(html_tokens)))
            except Exception as e:
                print(f"Skipping sample due to error: {e}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

dataset = WebSightDataset(dataset)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: x)

# ✅ **Step 6: Define Model**
class ImageToHTML(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super(ImageToHTML, self).__init__()
        self.cnn = models.resnet18(weights="IMAGENET1K_V1")

        # ✅ Ensure CNN output matches LSTM hidden_dim
        self.cnn.fc = nn.Linear(512, hidden_dim)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, images, html_tokens):
        img_features = self.cnn(images)
        embedded = self.embedding(html_tokens)
        output, _ = self.lstm(embedded)
        return self.fc(output)

# ✅ **Step 7: Initialize Model, Loss, Optimizer**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(token_to_index)
model = ImageToHTML(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ✅ **Step 8: Training Loop**
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        images, html_tokens = zip(*batch)
        images = torch.stack(images).to(device)
        html_tokens = torch.nn.utils.rnn.pad_sequence(html_tokens, batch_first=True, padding_value=2).to(device)

        optimizer.zero_grad()
        outputs = model(images, html_tokens[:, :-1])

        loss = criterion(outputs[:, :html_tokens.shape[1]-1, :].reshape(-1, vocab_size), html_tokens[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

# ✅ **Step 9: Save Model**
torch.save(model.state_dict(), "image_to_html_cnn_lstm.pth")
print("✅ Model saved!")




Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Epoch 1, Loss: 2.9964825994491577
Epoch 2, Loss: 1.8020394264936448
Epoch 3, Loss: 1.4852245891809464
Epoch 4, Loss: 1.2822781770467757
Epoch 5, Loss: 1.1299167957425118
✅ Model saved!


In [1]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from bs4 import BeautifulSoup
from google.colab import files
import evaluate

# ✅ **Step 2: Define Preprocessing**
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize image
    transforms.ToTensor(),  # Convert to tensor
])

# ✅ **Step 3: Define Model (EXACTLY like Training)**
class ImageToHTML(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super(ImageToHTML, self).__init__()
        self.cnn = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', weights="IMAGENET1K_V1")

        # ✅ Keep the same CNN structure as training
        self.cnn.fc = torch.nn.Linear(512, hidden_dim)

        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, images, html_tokens):
        img_features = self.cnn(images).unsqueeze(1)  # (batch, 256) → (batch, 1, 256)

        embedded = self.embedding(html_tokens)  # (batch, seq_len, 128)

        # ✅ Fix Shape Issue: Resize CNN output to match LSTM input (convert 256 → 128)
        img_features = torch.nn.functional.adaptive_avg_pool1d(img_features.permute(0, 2, 1), 128).permute(0, 2, 1)

        lstm_input = embedded + img_features.expand(-1, embedded.shape[1], -1)  # (batch, seq_len, 128)
        output, _ = self.lstm(lstm_input)
        return self.fc(output)

# ✅ **Step 4: Load Model & Extract Correct Vocabulary Size**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 🔹 Load the trained model's weights **safely** to extract vocab size
state_dict = torch.load("image_to_html_cnn_lstm.pth", map_location=device)

vocab_size = state_dict["embedding.weight"].shape[0]  # Auto-detect vocab size

# 🔹 Initialize the model with correct vocab size
model = ImageToHTML(vocab_size).to(device)

# 🔹 Load state_dict **strictly** to avoid mismatches
model.load_state_dict(state_dict, strict=True)
model.eval()

# ✅ **Step 5: BLEU Score Calculation**
bleu = evaluate.load("bleu")

def generate_html(model, image_path, max_length=512):
    """Generate HTML using CNN and LSTM."""

    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        img_features = model.cnn(image).unsqueeze(1)  # (batch, 256) → (batch, 1, 256)

        # ✅ Fix Shape Issue: Resize CNN output to match LSTM input (convert 256 → 128)
        img_features = torch.nn.functional.adaptive_avg_pool1d(img_features.permute(0, 2, 1), 128).permute(0, 2, 1)

        sequences = [[0]]  # Start with <SOS>
        for _ in range(max_length):
            token_tensor = torch.tensor(sequences[0]).unsqueeze(0).to(device)
            embedded = model.embedding(token_tensor)

            # ✅ Expand image features to match sequence length
            lstm_input = embedded + img_features.expand(-1, embedded.shape[1], -1)

            output, _ = model.lstm(lstm_input)
            logits = model.fc(output[:, -1, :])

            predicted_token = torch.argmax(logits, dim=1).item()
            if predicted_token == 1:  # Stop if <EOS>
                break
            sequences[0].append(predicted_token)

    return " ".join(str(t) for t in sequences[0] if t > 2)

# ✅ **Step 6: Upload and Process Image**
uploaded = files.upload()
image_path = list(uploaded.keys())[0]

generated_html = generate_html(model, image_path)

# ✅ **Step 7: Save and Download HTML**
html_filename = "generated_page.html"
with open(html_filename, "w", encoding="utf-8") as f:
    f.write(generated_html)

files.download(html_filename)
print("✅ HTML page saved & ready for download!")

ModuleNotFoundError: No module named 'evaluate'