# __Final Project: Adding captions to your photos__

__Install__

```sh
pip install kaggle
```

__Download__

```sh
kaggle datasets download -d adityajn105/flickr8k
```

__Extract__

```sh
tar -xf flickr8k.zip
```
```

### __Libraries__

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import sentencepiece as spm
from torch.nn.utils.rnn import pack_padded_sequence

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


### __Dataframe Creation__

In [2]:
# Read the captions CSV file into a DataFrame
captions_file = 'captions.txt'
df = pd.read_csv(captions_file)

# Shuffle if desired
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,image,caption
0,2973269132_252bfd0160.jpg,A large wild cat is pursuing a horse across a ...
1,270263570_3160f360d3.jpg,Two brown dogs fight on the leafy ground .
2,2053006423_6adf69ca67.jpg,A man in shorts is standing on a rock looking ...
3,512101751_05a6d93e19.jpg,a muzzled white dog is running on the grass .
4,3156406419_38fbd52007.jpg,A person skiing downhill .
...,...,...
40450,2220175999_081aa9cce8.jpg,Two big dogs wade in the ocean
40451,2555622234_3e531e4014.jpg,A wet German Shepherd runs along the waves on ...
40452,525887861_4cc7a1beca.jpg,Little girl in pink skateboard .
40453,1204996216_71d7519d9a.jpg,A boy lays on a picnic table bench .


### __Using Google's sentencepiece library to create (essentially) vocab.itos and vocab.stoi (torchtext had incompatibility issues with torch CUDA installations)__

In [3]:
spm.SentencePieceTrainer.train(input='captions.txt', model_prefix='m', vocab_size=10000)

captions_only = df['caption'].tolist()

# Write them to a text file (one caption per line)
with open('captions_only.txt', 'w', encoding='utf-8') as f:
    for c in captions_only:
        f.write(c.strip() + '\n')

# Now you have vocab.itos and vocab.stoi

In [4]:
spm.SentencePieceTrainer.train(
    input='captions_only.txt',
    model_prefix='m',         # will generate m.model and m.vocab
    vocab_size=6311,
    character_coverage=1.0,   # use full character coverage
    model_type='unigram',     # can also try bpe or others
    user_defined_symbols=['<pad>', '<start>', '<end>'] # optional
)

sp = spm.SentencePieceProcessor()
sp.load('m.model')

True

In [5]:
text = "A child in a pink dress is climbing up a set of stairs in an entry way."
token_ids = sp.encode_as_ids(text.lower())
pieces = sp.encode_as_pieces(text.lower())
print("Token IDs:", token_ids)
print("Subword Pieces:", pieces)

Token IDs: [6, 58, 9, 6, 115, 196, 15, 150, 67, 6, 427, 19, 479, 9, 43, 2687, 37, 954, 918]
Subword Pieces: ['▁a', '▁child', '▁in', '▁a', '▁pink', '▁dress', '▁is', '▁climbing', '▁up', '▁a', '▁set', '▁of', '▁stairs', '▁in', '▁an', '▁entr', 'y', '▁way', '.']


In [6]:
# stoi
stoi = {}
# itos will be a list of pieces in order
itos = [sp.id_to_piece(i) for i in range(sp.get_piece_size())]

for i, piece in enumerate(itos):
    stoi[piece] = i

In [7]:
stoi

{'<unk>': 0,
 '<s>': 1,
 '</s>': 2,
 '<pad>': 3,
 '<start>': 4,
 '<end>': 5,
 '▁a': 6,
 '▁.': 7,
 '▁A': 8,
 '▁in': 9,
 '▁the': 10,
 's': 11,
 'ing': 12,
 '▁on': 13,
 '▁': 14,
 '▁is': 15,
 '▁dog': 16,
 '▁with': 17,
 '▁man': 18,
 '▁of': 19,
 'and': 20,
 '▁Two': 21,
 '▁girl': 22,
 '▁white': 23,
 '▁black': 24,
 '▁boy': 25,
 '▁are': 26,
 '▁The': 27,
 '▁to': 28,
 '▁,': 29,
 '▁woman': 30,
 '▁wear': 31,
 'ed': 32,
 'nd': 33,
 '▁water': 34,
 '▁at': 35,
 '▁red': 36,
 'y': 37,
 '▁brown': 38,
 '▁young': 39,
 '▁people': 40,
 '▁his': 41,
 '▁blue': 42,
 '▁an': 43,
 '▁grass': 44,
 '▁running': 45,
 '▁through': 46,
 '▁shirt': 47,
 '▁snow': 48,
 'e': 49,
 '▁playing': 50,
 '▁while': 51,
 '▁dogs': 52,
 '▁hold': 53,
 '▁down': 54,
 '▁ball': 55,
 '▁standing': 56,
 'le': 57,
 '▁child': 58,
 '▁jumping': 59,
 'd': 60,
 '▁over': 61,
 '▁person': 62,
 '▁litt': 63,
 'front': 64,
 '▁sitting': 65,
 '▁field': 66,
 '▁up': 67,
 '▁two': 68,
 '▁small': 69,
 '▁by': 70,
 '▁green': 71,
 '▁large': 72,
 '▁her': 73,
 '▁yellow': 

In [8]:
itos

['<unk>',
 '<s>',
 '</s>',
 '<pad>',
 '<start>',
 '<end>',
 '▁a',
 '▁.',
 '▁A',
 '▁in',
 '▁the',
 's',
 'ing',
 '▁on',
 '▁',
 '▁is',
 '▁dog',
 '▁with',
 '▁man',
 '▁of',
 'and',
 '▁Two',
 '▁girl',
 '▁white',
 '▁black',
 '▁boy',
 '▁are',
 '▁The',
 '▁to',
 '▁,',
 '▁woman',
 '▁wear',
 'ed',
 'nd',
 '▁water',
 '▁at',
 '▁red',
 'y',
 '▁brown',
 '▁young',
 '▁people',
 '▁his',
 '▁blue',
 '▁an',
 '▁grass',
 '▁running',
 '▁through',
 '▁shirt',
 '▁snow',
 'e',
 '▁playing',
 '▁while',
 '▁dogs',
 '▁hold',
 '▁down',
 '▁ball',
 '▁standing',
 'le',
 '▁child',
 '▁jumping',
 'd',
 '▁over',
 '▁person',
 '▁litt',
 'front',
 '▁sitting',
 '▁field',
 '▁up',
 '▁two',
 '▁small',
 '▁by',
 '▁green',
 '▁large',
 '▁her',
 '▁yellow',
 '▁group',
 '▁walking',
 '-',
 '▁into',
 '▁beach',
 '▁men',
 '▁Thre',
 '▁rock',
 '▁near',
 '▁mouth',
 '▁air',
 '▁children',
 '▁jumps',
 '▁for',
 '▁one',
 '▁street',
 '▁another',
 '▁other',
 '▁sit',
 '▁its',
 '▁runs',
 '▁riding',
 'n',
 '▁walk',
 '▁bike',
 'm',
 '▁stands',
 '▁play',
 '▁

In [9]:
pad_id = sp.piece_to_id('<pad>')
start_id = sp.piece_to_id('<start>')
end_id = sp.piece_to_id('<end>')
unk_id = sp.unk_id()  # usually 0 by default

print(f'pad_id: {pad_id}, start_id: {start_id}, end_id: {end_id}, unk_id: {unk_id}')

pad_id: 3, start_id: 4, end_id: 5, unk_id: 0


### __Creating the loaded dataset from the Flickr8k Dataset__

In [10]:
class FlickrDataset(Dataset):
    def __init__(self, df, img_folder, transform=None, max_len=80):
        self.df = df
        self.img_folder = img_folder
        self.transform = transform
        self.max_len = max_len

        # If you want a train/val split, you can filter df here.
        # Here we use entire df for demonstration.
        self.images = self.df['image'].values
        self.captions = self.df['caption'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_name = self.images[idx]
        caption_text = self.captions[idx]

        # Load image
        img_path = os.path.join(self.img_folder, img_name)
        image = Image.open(img_path).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Convert caption text to indices
        pieces = sp.encode_as_pieces(caption_text.lower().strip())
        caption_ids = [start_id] + [stoi.get(p, unk_id) for p in pieces] + [end_id]

        # Pad or truncate to max_len
        if len(caption_ids) < self.max_len:
            caption_ids += [pad_id] * (self.max_len - len(caption_ids))
        else:
            caption_ids = caption_ids[:self.max_len]

        length = sum([1 for token in caption_ids if token != pad_id])

        return image, torch.tensor(caption_ids), length
    
# Define a collate_fn to handle variable lengths
def collate_fn(batch):
    # batch: list of (image, caption, length)
    images = [item[0] for item in batch]
    captions = [item[1] for item in batch]
    lengths = [item[2] for item in batch]

    images = torch.stack(images, 0)
    captions = torch.stack(captions, 0)
    lengths = torch.tensor(lengths, dtype=torch.int64)

    return images, captions, lengths

In [11]:
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225))
])

dataset = FlickrDataset(df, img_folder='Images', transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

### __CNN + LSTM Architecture__

In [12]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        # remove the last fc layer
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1, max_seq_length=80, itos=None):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length
        self.itos = itos

    def forward(self, features, captions, lengths):
        # captions include <start>, ..., <end>
        # lengths is count of all non-pad tokens in captions
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        
        # Use lengths as is
        packed = pack_padded_sequence(embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, _ = self.lstm(packed)
        outputs = self.linear(outputs[0])
        return outputs
    
    def predict(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)

        for i in range(self.max_seq_length):
            hiddens, states = self.lstm(inputs, states)
            outputs = self.linear(hiddens.squeeze(1))
            _, predicted = outputs.max(1)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted).unsqueeze(1)

        sampled_ids = torch.stack(sampled_ids, 1)

        # Convert word_ids to words using itos
        sentences = []
        for sample_id in sampled_ids:
            sample_id = sample_id.cpu().numpy()
            sampled_caption = []
            for word_id in sample_id:
                word = self.itos[word_id]
                if word == '<end>':
                    break
                if word not in ['<start>', '<pad>', '<end>']:
                    sampled_caption.append(word)
            # Join the subwords or decode with SentencePiece if desired
            sentence = ''.join(sampled_caption).replace('▁', ' ').strip()
            sentences.append(sentence)
        return sentences

### __Training__

In [13]:
embed_size = 256
hidden_size = 512
num_layers = 1
learning_rate = 0.001
num_epochs = 1  # just as an example

encoder = EncoderCNN(embed_size=embed_size).to(device)
decoder = DecoderRNN(embed_size=embed_size, hidden_size=hidden_size, vocab_size=len(itos), num_layers=num_layers, itos=itos).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)

for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(dataloader):
        images = images.to(device)
        captions = captions.to(device)
        lengths = lengths.to(device)

        # Forward pass
        features = encoder(images)
        outputs = decoder(features, captions, lengths)

        # Because we used pack_padded_sequence, outputs is already the right shape
        # But we need to adjust the target as well (exclude the feature timestep)
        targets = pack_padded_sequence(captions[:, 1:], lengths.cpu(), batch_first=True, enforce_sorted=False)[0]

        loss = criterion(outputs, targets)

        # Backprop and optimize
        decoder.zero_grad()
        encoder.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f"Epoch [{epoch}/{num_epochs}], Step [{i}], Loss: {loss.item():.4f}")

print("Training completed.")



Epoch [0/1], Step [0], Loss: 8.7566
Epoch [0/1], Step [100], Loss: 4.6292
Epoch [0/1], Step [200], Loss: 4.5793
Epoch [0/1], Step [300], Loss: 4.0145
Epoch [0/1], Step [400], Loss: 4.2624
Epoch [0/1], Step [500], Loss: 4.1454
Epoch [0/1], Step [600], Loss: 4.2229
Epoch [0/1], Step [700], Loss: 4.0770
Epoch [0/1], Step [800], Loss: 4.1591
Epoch [0/1], Step [900], Loss: 3.7216
Epoch [0/1], Step [1000], Loss: 3.6038
Epoch [0/1], Step [1100], Loss: 3.7738
Epoch [0/1], Step [1200], Loss: 3.9284
Training completed.


### __Example Caption!__

In [14]:
encoder.eval()
decoder.eval()

# Define the same transformations you used during training
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225))
])

# Load and preprocess a new image
image_path = 'Images/10815824_2997e03d76.jpg'
image = Image.open(image_path).convert('RGB')
image = transform(image).unsqueeze(0).to(device)  # (1, 3, 224, 224)

# Extract features from the encoder
features = encoder(image)

# Generate caption using the decoder's predict method
caption = decoder.predict(features)  # returns a list of sentences (strings)

print("Generated Caption:", caption[0])

Generated Caption: a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a dogs a a dogs


__TODO__
* Changing the model architecture, e.g. include an attention module.
* Doing more hyper parameter tuning (learning rate, batch size, number of layers, number of units, dropout rate, batch normalization etc.).
* Use the cross validation set to understand overfitting.
* Using BLEU Score to evaluate and measure the performance of the model.

__Maybe:__
* Compare with transformer-based image captioning results (if resources permitted).