In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/dekana1/DLProject.git

Cloning into 'DLProject'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 4 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (4/4), done.


In [None]:
# prompt: change directory to data folder in drive
# NOTE(review): the target lives under /content/drive, but no Drive mount call is
# visible in this notebook — presumably Drive is mounted beforehand; confirm.

%cd /content/drive/MyDrive/DeepLearningProject/data


/content/drive/MyDrive/DeepLearningProject/data


In [None]:
!wget http://images.cocodataset.org/zips/train2017.zip
!unzip train2017.zip

In [None]:
!wget http://images.cocodataset.org/zips/val2017.zip
!unzip val2017.zip

In [None]:
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip annotations_trainval2017.zip

In [None]:
!pip install Pillow



## Dataset Prep

In [None]:
# Root of the COCO data (relative to the current working directory) and the
# three sub-folders the rest of the notebook reads from.
coco_path = '../data/data'
train_image_dir, val_image_dir, annotations_dir = (
    f'{coco_path}/{folder}' for folder in ('train2017', 'val2017', 'annotations')
)

## COCO DATASET CLASS

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import json
from torchvision import transforms
import numpy as np

class COCODataset(Dataset):
    """Image/caption pairs read from a COCO captions annotation file.

    Each entry of ``annotations['annotations']`` whose image file
    (``<image_id zero-padded to 12 digits>.jpg``) is present in ``root_dir``
    becomes one sample. ``__getitem__`` returns the (optionally transformed)
    RGB image and a tensor of ``max_len`` token ids:
    ``<SOS>``, caption tokens, ``<EOS>``, then ``<PAD>`` up to ``max_len``.

    Args:
        root_dir: Directory holding the image files.
        annotation_file: Path to the captions JSON file.
        transform: Optional callable applied to the PIL image.
        max_len: Fixed length of every returned caption tensor.
        vocab: Optional pre-built vocabulary (any object exposing ``stoi`` and
            ``numericalize``). When omitted, a new ``Vocabulary`` is built from
            this dataset's captions, as before. Pass the training vocabulary
            when creating a validation/test split so token ids agree.
    """

    def __init__(self, root_dir, annotation_file, transform=None, max_len=50, vocab=None):
        self.root_dir = root_dir
        self.transform = transform
        self.max_len = max_len

        # Load annotations
        with open(annotation_file, 'r', encoding='utf-8') as f:
            self.annotations = json.load(f)

        # List the image directory once instead of issuing one os.path.exists()
        # call per annotation; a missing directory yields an empty dataset,
        # matching the previous behaviour.
        if os.path.isdir(root_dir):
            available_files = set(os.listdir(root_dir))
        else:
            available_files = set()

        # Parallel lists: entry i of each describes sample i
        self.img_ids = []
        self.captions = []
        self.image_paths = []

        for ann in self.annotations['annotations']:
            image_id = ann['image_id']
            file_name = f'{image_id:012d}.jpg'

            # Skip captions whose image is not on disk
            if file_name in available_files:
                self.img_ids.append(image_id)
                self.captions.append(ann['caption'])
                self.image_paths.append(os.path.join(self.root_dir, file_name))

        if vocab is None:
            # Vocabulary is defined in a later cell; it only has to exist by
            # the time a dataset is instantiated.
            self.vocab = Vocabulary()
            self.build_vocabulary()
        else:
            self.vocab = vocab

    def build_vocabulary(self):
        """Build ``self.vocab`` from every caption kept by this dataset."""
        self.vocab.build_vocabulary(list(self.captions))

    def __len__(self):
        """Number of (image, caption) samples."""
        return len(self.image_paths)

    def _encode_caption(self, caption):
        """Return ``caption`` as a list of exactly ``max_len`` token ids."""
        stoi = self.vocab.stoi
        token_ids = [stoi["<SOS>"]]
        token_ids += self.vocab.numericalize(caption)
        token_ids.append(stoi["<EOS>"])

        if len(token_ids) > self.max_len:
            token_ids = token_ids[:self.max_len]
            if token_ids:
                # Truncation would otherwise cut off the terminator
                token_ids[-1] = stoi["<EOS>"]
        else:
            token_ids += [stoi["<PAD>"]] * (self.max_len - len(token_ids))

        return token_ids

    def __getitem__(self, idx):
        """Return ``(image, caption_ids)`` for sample ``idx``."""
        # convert() produces an independent image, so the file can be closed
        # as soon as the pixels have been read.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image, torch.tensor(self._encode_caption(self.captions[idx]))


In [None]:
import nltk


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Vocabulary and Caption Processing

In [None]:
import nltk
import re
from collections import Counter

class Vocabulary:
    """Word <-> index mapping with four reserved special tokens.

    Indices 0-3 are ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``. Other words
    are added by ``build_vocabulary`` once they have been seen
    ``freq_threshold`` times.

    Args:
        freq_threshold: Occurrences required (within one ``build_vocabulary``
            call) before a word receives an index.
        tokenizer: Optional callable mapping a lower-cased string to a list of
            tokens. Defaults to ``nltk.tokenize.word_tokenize``.
    """

    def __init__(self, freq_threshold=5, tokenizer=None):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.itos)

    def _tokenize(self, text):
        """Lower-case ``text`` and split it with the configured tokenizer."""
        # getattr keeps instances created before `tokenizer` existed usable
        tokenize = getattr(self, "tokenizer", None)
        if tokenize is None:
            tokenize = nltk.tokenize.word_tokenize
        return tokenize(text.lower())

    def build_vocabulary(self, caption_list):
        """Add every word that reaches ``freq_threshold`` in ``caption_list``.

        Frequencies are counted per call. New words are appended after the
        existing entries, so calling this again never reassigns an index that
        is already in use.
        """
        if getattr(self, "tokenizer", None) is None:
            # Tokenizer data needed by nltk's word_tokenize
            nltk.download('punkt', quiet=True)
            nltk.download('punkt_tab', quiet=True)

        frequencies = Counter()
        next_idx = len(self.itos)

        for caption in caption_list:
            for word in self._tokenize(caption):
                frequencies[word] += 1

                # `>=` plus the membership test admits each word exactly once,
                # including when freq_threshold is 0 or 1.
                if frequencies[word] >= self.freq_threshold and word not in self.stoi:
                    self.stoi[word] = next_idx
                    self.itos[next_idx] = word
                    next_idx += 1

    def numericalize(self, text):
        """Return the index of every token in ``text``; unknown tokens map to ``<UNK>``."""
        unk_idx = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_idx) for token in self._tokenize(text)]


## ViT Feature Extractor

In [None]:
import torch.nn as nn
from transformers import ViTModel, ViTFeatureExtractor as TransformerEncoder

class ViTFeatureExtractor(nn.Module):
    """Frozen ViT encoder returning one feature vector per image.

    Note: ``TransformerEncoder`` below is the alias this cell's import gives to
    the Hugging Face ``ViTFeatureExtractor`` preprocessor; it is not an encoder
    network. The network itself is ``self.vit``.

    Args:
        model_name: Hugging Face checkpoint name loaded for both the model and
            its preprocessor.
    """

    def __init__(self, model_name="google/vit-base-patch16-224"):
        super(ViTFeatureExtractor, self).__init__()
        self.vit = ViTModel.from_pretrained(model_name)
        self.feature_extractor = TransformerEncoder.from_pretrained(model_name)

        # Freeze the ViT parameters
        for param in self.vit.parameters():
            param.requires_grad = False

    def forward(self, images):
        """Encode a batch of image tensors.

        Args:
            images: Batch of images as a tensor. Floating-point input is
                assumed to be already scaled to [0, 1] (e.g. the output of
                ``transforms.ToTensor()``); integer input is rescaled by the
                preprocessor as before.

        Returns:
            The hidden state at sequence position 0 (the [CLS] token), shape
            ``[batch_size, hidden_size]``.
        """
        # Rescaling float images that are already in [0, 1] a second time is
        # what triggers the "rescale already rescaled images" warning, so only
        # let the preprocessor rescale non-float input.
        inputs = self.feature_extractor(
            images=images,
            return_tensors="pt",
            do_rescale=not images.is_floating_point(),
        )
        # The preprocessor returns new tensors; put them on the input's device
        inputs = {k: v.to(images.device) for k, v in inputs.items()}

        outputs = self.vit(**inputs)
        return outputs.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_size]


## LSTM Decoder for Caption Generation

In [None]:
class LSTMDecoder(nn.Module):
    """LSTM language model conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first time step, followed by
    the embedded caption tokens, so ``embed_size`` must equal the feature
    dimension.

    Args:
        embed_size: Size of the token embeddings (and of the image features).
        hidden_size: LSTM hidden size.
        vocab_size: Number of output classes / embedding rows.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=2, dropout=0.5):
        super(LSTMDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Inter-layer dropout has no effect on a single-layer LSTM and makes
        # PyTorch emit a warning, so it is only passed for stacked LSTMs.
        self.lstm = nn.LSTM(
            embed_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, captions):
        """Teacher-forced pass.

        Args:
            features: Image features, ``[batch, embed_size]``.
            captions: Token ids, ``[batch, seq_len]``.

        Returns:
            Vocabulary scores of shape ``[batch, seq_len + 1, vocab_size]``;
            the extra leading step is the one driven by the image feature.
        """
        embeddings = self.dropout(self.embed(captions))

        # Prepend the image feature as time step 0: [batch, 1, embed_size]
        features = features.unsqueeze(1)
        embeddings = torch.cat((features, embeddings), dim=1)

        hiddens, _ = self.lstm(embeddings)
        return self.linear(hiddens)

    def sample(self, features, states=None, max_len=20, eos_idx=2):
        """Generate captions for given image features using greedy search.

        Works for any batch size. Decoding stops once every sequence in the
        batch has produced ``eos_idx`` or after ``max_len`` steps; sequences
        that finish early are filled with ``eos_idx`` for the remaining steps.

        Args:
            features: Image features, ``[batch, embed_size]``.
            states: Optional initial LSTM state.
            max_len: Maximum number of tokens to generate.
            eos_idx: Id of the end-of-sequence token (2 in this notebook's
                ``Vocabulary``).

        Returns:
            Tensor of token ids, ``[batch, generated_len]``.
        """
        sampled_ids = []
        inputs = features.unsqueeze(1)
        finished = torch.zeros(features.size(0), dtype=torch.bool, device=features.device)

        for _ in range(max_len):
            hiddens, states = self.lstm(inputs, states)
            outputs = self.linear(hiddens.squeeze(1))

            # Greedy choice; rows that already ended keep emitting EOS
            predicted = outputs.argmax(1).masked_fill(finished, eos_idx)
            sampled_ids.append(predicted)

            finished = finished | (predicted == eos_idx)
            if bool(finished.all()):
                break

            # Feed the chosen token back in as the next input
            inputs = self.embed(predicted).unsqueeze(1)

        return torch.stack(sampled_ids, dim=1)


## Complete Image Captioning Model

In [None]:
class ImageCaptioningModel(nn.Module):
    """ViT encoder + LSTM decoder image captioning model.

    Args:
        embed_size: Embedding size passed to the decoder.
        hidden_size: LSTM hidden size passed to the decoder.
        vocab_size: Vocabulary size passed to the decoder.
        vit_model: Checkpoint name passed to the encoder.
        num_layers: Number of LSTM layers passed to the decoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, vit_model="google/vit-base-patch16-224", num_layers=2):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = ViTFeatureExtractor(vit_model)
        self.decoder = LSTMDecoder(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        """Encode ``images`` and return the decoder's scores for ``captions``."""
        return self.decoder(self.encoder(images), captions)

    def caption_image(self, image, vocabulary, max_length=20):
        """Generate a caption for an image.

        Args:
            image: A single image tensor, either unbatched ``(C, H, W)`` or
                already batched ``(1, C, H, W)``.
            vocabulary: Object whose ``itos`` maps token ids to words.
            max_length: Maximum number of tokens to generate.

        Returns:
            The generated words joined by spaces, stopping at ``<EOS>`` and
            omitting ``<SOS>``/``<PAD>``.
        """
        # Decode in evaluation mode, then put the model back in whatever mode
        # the caller had it in (so calling this mid-training is safe).
        was_training = self.training
        self.eval()
        try:
            # Only add the batch dimension when it is missing
            if image.dim() == 3:
                image = image.unsqueeze(0)

            with torch.no_grad():
                features = self.encoder(image)
                sampled_ids = self.decoder.sample(features, max_len=max_length)
        finally:
            self.train(was_training)

        # Convert word indices to words
        sampled_caption = []
        for word_id in sampled_ids[0].tolist():
            word = vocabulary.itos[word_id]
            if word == "<EOS>":
                break
            if word not in ("<SOS>", "<PAD>"):
                sampled_caption.append(word)

        return ' '.join(sampled_caption)


## Training Loop

In [None]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    """Train ``model`` and print the mean batch loss after every epoch.

    Args:
        model: Callable as ``model(images, captions)`` returning scores of
            shape ``[batch, steps, vocab_size]``.
        train_loader: Iterable of ``(images, captions)`` batches. It does not
            need to support ``len()``.
        criterion: Loss taking ``[N, vocab_size]`` scores and ``[N]`` targets.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
    """
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0

        for images, captions in train_loader:
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # The last caption position is dropped from the input. With the
            # LSTMDecoder in this notebook the image feature is prepended as
            # an extra step, so the output has as many steps as `captions`.
            outputs = model(images, captions[:, :-1])

            # Targets are the full caption (including <SOS>): step t of the
            # output is scored against token t.
            loss = criterion(
                outputs.reshape(-1, outputs.shape[2]),
                captions.reshape(-1)
            )

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches ourselves: avoids a ZeroDivisionError on an empty
        # loader and works for iterables that have no len().
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


## Evaluation and Metrics

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate_model(model, data_loader, vocabulary, device):
    """Caption every image in ``data_loader`` and report corpus BLEU-4.

    Each (image, caption) pair is scored as one hypothesis against a single
    reference: the pair's own caption decoded through ``vocabulary`` with the
    ``<PAD>``, ``<SOS>`` and ``<EOS>`` tokens removed.

    Args:
        model: Object providing ``eval()`` and
            ``caption_image(image, vocabulary)``.
        data_loader: Iterable of ``(images, captions)`` batches.
        vocabulary: Object providing ``itos`` and ``stoi``; it must be the
            vocabulary the caption ids in ``data_loader`` were encoded with.
        device: Device the image batches are moved to.

    Returns:
        The BLEU-4 score (also printed).
    """
    model.eval()
    all_predictions = []
    all_references = []

    # Look the special-token ids up instead of hard-coding 0/1/2
    special_ids = {vocabulary.stoi[token] for token in ("<PAD>", "<SOS>", "<EOS>")}

    with torch.no_grad():
        for images, captions in data_loader:
            images = images.to(device)

            for image, caption_ids in zip(images, captions.tolist()):
                # caption_image adds the batch dimension itself, so hand it the
                # unbatched (C, H, W) image; unsqueezing here as well would
                # produce a 5-D input.
                predicted_caption = model.caption_image(image, vocabulary)
                reference_tokens = [
                    vocabulary.itos[token_id]
                    for token_id in caption_ids
                    if token_id not in special_ids
                ]

                all_predictions.append(predicted_caption.split())
                all_references.append([reference_tokens])

    # Calculate BLEU-4 score
    bleu4 = corpus_bleu(all_references, all_predictions, weights=(0.25, 0.25, 0.25, 0.25))
    print(f'BLEU-4 Score: {bleu4:.4f}')

    return bleu4


## Putting It All Together

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Image preprocessing: resize to 224x224, then convert to a tensor
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])

# Training split: the dataset (which also builds the vocabulary) ...
train_dataset = COCODataset(
    train_image_dir,
    f'{annotations_dir}/captions_train2017.json',
    transform=transform,
)

# ... and a shuffled loader over it, 32 samples per batch, 4 worker processes
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-61fd0f4e8097>", line 11, in <cell line: 0>
    train_dataset = COCODataset(
                    ^^^^^^^^^^^^
  File "<ipython-input-5-b8cf571c29c1>", line None, in __init__
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
          ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fi

In [None]:
# Model sizes. The decoder concatenates the image feature with the token
# embeddings, so embed_size has to equal the encoder's feature size (the
# encoder cell notes its output as [32, 768]).
embed_size, hidden_size = 768, 512
vocab_size = len(train_dataset.vocab)

model = ImageCaptioningModel(embed_size, hidden_size, vocab_size)
model = model.to(device)

# Loss ignores <PAD> positions; Adam over the model's parameters
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocab.stoi["<PAD>"])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# NOTE(review): the training call below is commented out, so `model` leaves this
# cell with an untrained decoder.
# train_model(model, train_loader, criterion, optimizer, device, num_epochs=10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


In [None]:
# Evaluate model
val_dataset = COCODataset(
    root_dir=val_image_dir,
    annotation_file=f'{annotations_dir}/captions_val2017.json',
    transform=transform
)

# COCODataset builds a vocabulary of its own from the validation captions, but
# the model's token ids — and the vocabulary passed to evaluate_model below —
# are the training ones. Share the training vocabulary so the reference
# captions are encoded and decoded with the same mapping.
val_dataset.vocab = train_dataset.vocab

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4
)

evaluate_model(model, val_loader, train_dataset.vocab, device)