<a href="https://colab.research.google.com/github/andreac941/tutorials/blob/main/ProyectoFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Preprocess the Data
First of all, we'll need to preprocess both the images and the text captions.

We use a pre-trained model such as ResNet for feature extraction, so the images must be preprocessed to match the input format this model expects.

For the captions preprocessing, we need to tokenize them, create a vocabulary and convert the captions to sequences of integers.

In [None]:
# Importing the libraries:
import torch
import torchvision.transforms as transforms # for data transformations.
from PIL import Image # For loading images
from torch.nn.utils.rnn import pad_sequence # Used to pad a list of variable-length sequences to the same length (for RNN)
from torch.utils.data import DataLoader, Dataset # For data processing
from torchvision.models import resnet50 # For transfer learning
import spacy  # For tokenization
import os # To interact with operating system.
import pandas as pd # To manage data manipulation

class Vocabulary:
    """Word-level vocabulary mapping caption tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Learned words receive the following ids in the
    order in which they reach ``freq_threshold`` occurrences.
    """

    def __init__(self, freq_threshold):
        """Create a vocabulary that contains only the special tokens.

        Args:
            freq_threshold: Number of occurrences a word must reach inside one
                ``build_vocabulary`` call before it is added.
        """
        # Index -> token, seeded with the special tokens at fixed positions.
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        # Token -> index, the exact inverse of ``itos``.
        self.stoi = {token: index for index, token in self.itos.items()}
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of tokens, special tokens included."""
        return len(self.itos)

    def tokenizer_eng(self, text):
        """Split ``text`` into lower-cased tokens.

        Relies on the module-level ``spacy_eng`` pipeline, which must be
        loaded before this method is first called.
        """
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        """Add every word that occurs ``freq_threshold`` times to the vocabulary.

        The method may be called more than once: new words are appended after
        the existing entries and words that are already known keep their id.
        Counts are not carried over between calls.

        Args:
            sentence_list: Iterable of caption strings.
        """
        frequencies = {}
        # Continue after the last assigned id instead of restarting at 4, so
        # a repeated call cannot overwrite existing ``itos`` entries.
        next_index = len(self.itos)

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                count = frequencies.get(word, 0) + 1
                frequencies[word] = count

                # ``==`` (not ``>=``) registers the word exactly once, at the
                # moment it reaches the threshold.
                if count == self.freq_threshold and word not in self.stoi:
                    self.stoi[word] = next_index
                    self.itos[next_index] = word
                    next_index += 1

    def numericalize(self, text):
        """Convert ``text`` into a list of token ids.

        Tokens that are not in the vocabulary map to the ``<UNK>`` id.
        """
        unk_index = self.stoi["<UNK>"]
        return [
            self.stoi.get(token, unk_index)
            for token in self.tokenizer_eng(text)
        ]

class FlickrDataset(Dataset):
    """Dataset of (image, caption) pairs described by a captions CSV file.

    The CSV must provide an ``image`` column (file name relative to
    ``root_dir``) and a ``caption`` column. A ``Vocabulary`` is built from
    all captions when the dataset is created.
    """

    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
        """Read the captions file and build the vocabulary.

        Args:
            root_dir: Directory that contains the image files.
            captions_file: Path of the CSV file passed to ``pandas.read_csv``.
            transform: Optional callable applied to each loaded image.
            freq_threshold: Frequency threshold forwarded to ``Vocabulary``.
        """
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file)

        # Missing or non-string captions become (possibly empty) strings so
        # the tokenizer always receives text.
        self.df["caption"] = self.df["caption"].fillna("").astype(str)

        self.transform = transform

        self.imgs = self.df["image"]
        self.captions = self.df["caption"]

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        """Return the number of rows in the captions file."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the transformed image and its numericalized caption.

        Returns:
            A tuple ``(img, caption_tensor)`` where ``caption_tensor`` holds
            the ``<SOS>`` id, the caption token ids and the ``<EOS>`` id.
        """
        # Positional access: ``index`` is a row position, not an index label.
        caption = self.captions.iloc[index]
        img_id = self.imgs.iloc[index]

        # ``convert`` returns a fully loaded copy, so the file handle can be
        # released immediately instead of waiting for garbage collection.
        with Image.open(os.path.join(self.root_dir, img_id)) as img_file:
            img = img_file.convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        return img, torch.tensor(numericalized_caption)

class MyCollate:
    """Collate callable that batches images and pads captions to equal length."""

    def __init__(self, pad_idx):
        # Value used to fill captions that are shorter than the longest one.
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Merge a list of ``(image, caption)`` samples into one batch.

        Returns:
            ``(imgs, targets)``: the images stacked along a new leading
            dimension, and the captions padded with ``pad_idx``
            (``batch_first=False``, so the batch is the second dimension).
        """
        images = torch.stack([sample[0] for sample in batch], dim=0)
        captions = [sample[1] for sample in batch]
        padded = pad_sequence(
            captions,
            batch_first=False,
            padding_value=self.pad_idx,
        )
        return images, padded

# Image preprocessing: resize every image to 224x224, then convert it to a tensor.
# NOTE(review): no mean/std normalization is applied here - confirm whether the
# pre-trained ResNet encoder should receive ImageNet-normalized inputs.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Factory for the training DataLoader
def get_loader(
    root_folder,
    annotation_file,
    transform,
    batch_size=32,
    num_workers=8,
    shuffle=True,
    pin_memory=True,
):
    """Create the caption dataset and a ``DataLoader`` over it.

    Args:
        root_folder: Directory that contains the images.
        annotation_file: Path of the captions CSV file.
        transform: Callable applied to every loaded image.
        batch_size, num_workers, shuffle, pin_memory: Forwarded unchanged to
            ``DataLoader``.

    Returns:
        ``(loader, vocab)``: the data loader and the vocabulary built by the
        dataset.
    """
    dataset = FlickrDataset(root_folder, annotation_file, transform=transform)

    # Captions in a batch are padded with the id of the <PAD> token.
    collate = MyCollate(pad_idx=dataset.vocab.stoi["<PAD>"])

    loader_options = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "shuffle": shuffle,
        "pin_memory": pin_memory,
        "collate_fn": collate,
    }
    return DataLoader(dataset=dataset, **loader_options), dataset.vocab


In [None]:


# spaCy English pipeline; Vocabulary.tokenizer_eng reads this module-level name.
spacy_eng = spacy.load("en_core_web_sm")

# Location of the image directory and of the captions CSV file
root_folder = 'flickr30k_images/flickr30k_images'
annotation_file = 'flickr30k_images/results.csv'

# Build the data loader together with the vocabulary derived from the captions
data_loader, vocab = get_loader(
    root_folder=root_folder,
    annotation_file=annotation_file,
    transform=transform,
)



## 2. Build the Model
We will create a CNN-RNN model. The CNN part will be a pre-trained ResNet model (without the classification head) for feature extraction, and the RNN part will be an LSTM network for generating captions.

In this code:

EncoderCNN uses a pre-trained ResNet50 model for image feature extraction.

DecoderRNN is an LSTM network for generating captions.

CNNtoRNN combines both the encoder and decoder.

In [None]:
import torch
import torch.nn as nn

class EncoderCNN(nn.Module):
    """Image encoder: pre-trained ResNet-50 whose classifier is replaced by a
    linear projection to ``embed_size``, followed by ReLU and dropout."""

    def __init__(self, embed_size, train_CNN=False):
        """Build the encoder.

        Args:
            embed_size: Size of the feature vector produced per image.
            train_CNN: When False, the pre-trained backbone weights are frozen
                and only the new projection layer is trained.
        """
        super().__init__()
        self.train_CNN = train_CNN
        # The attribute is called ``inception`` although the backbone is
        # ResNet-50; the name is kept so attribute access and state_dict keys
        # stay unchanged.
        # NOTE(review): newer torchvision releases prefer ``weights=`` over
        # ``pretrained=True`` - confirm against the installed version.
        self.inception = resnet50(pretrained=True)

        # Freeze once, at construction time, and BEFORE the head is replaced.
        # Doing it inside ``forward`` froze the new ``fc`` layer as well, so
        # the projection stopped being trainable after the first forward pass.
        if not train_CNN:
            for param in self.inception.parameters():
                param.requires_grad = False

        # Freshly created layer: its parameters require gradients by default.
        self.inception.fc = nn.Linear(self.inception.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, images):
        """Return ``dropout(relu(backbone(images)))`` for a batch of images."""
        features = self.inception(images)
        return self.dropout(self.relu(features))

class DecoderRNN(nn.Module):
    """Caption decoder: token embedding -> LSTM -> linear layer over the vocabulary."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Create the embedding, LSTM, output projection and dropout layers."""
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, features, captions):
        """Score every vocabulary entry at every step.

        The image features are inserted as the first step (along dim 0) in
        front of the embedded caption tokens, so the output has one more step
        than ``captions``.
        # assumes captions is laid out as (seq_len, batch) - TODO confirm
        """
        token_embeddings = self.dropout(self.embed(captions))
        image_step = features.unsqueeze(0)
        lstm_input = torch.cat((image_step, token_embeddings), dim=0)
        hidden_states, _ = self.lstm(lstm_input)
        return self.linear(hidden_states)

class CNNtoRNN(nn.Module):
    """Image captioning model that chains ``EncoderCNN`` and ``DecoderRNN``."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Instantiate the encoder and the decoder with the given sizes."""
        super().__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        """Encode ``images`` and decode them conditioned on ``captions``."""
        return self.decoderRNN(self.encoderCNN(images), captions)

    def caption_image(self, image, vocabulary, max_length=50):
        """Greedily generate a caption for one image.

        Args:
            image: Image tensor with a leading batch dimension; ``.item()`` is
                called on the prediction, so the batch must hold one image.
            vocabulary: Object whose ``itos`` maps token ids to words.
            max_length: Maximum number of tokens to generate.

        Returns:
            The generated words, including ``<EOS>`` when it was produced.
        """
        token_ids = []
        decoder = self.decoderRNN

        with torch.no_grad():
            # The image features act as the first input step of the LSTM.
            step_input = self.encoderCNN(image).unsqueeze(0)
            states = None

            for _ in range(max_length):
                hiddens, states = decoder.lstm(step_input, states)
                scores = decoder.linear(hiddens.squeeze(0))
                predicted = scores.argmax(1)

                token_id = predicted.item()
                token_ids.append(token_id)

                # Feed the predicted token back in as the next input step.
                step_input = decoder.embed(predicted).unsqueeze(0)

                if vocabulary.itos[token_id] == "<EOS>":
                    break

        return [vocabulary.itos[token_id] for token_id in token_ids]


## 3. Training
Now, let's set up the training loop. In this training function, we calculate the loss for each batch and update the model's weights:

In [None]:
def train(model, data_loader, optimizer, criterion, vocab_size, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(images, captions[:-1])``.
        data_loader: Iterable of ``(images, captions)`` batches supporting ``len``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to the flattened outputs and captions.
        vocab_size: Size of the last output dimension, used for flattening.
        device: Device the batches are moved to.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for images, captions in data_loader:
        images = images.to(device)
        captions = captions.to(device)

        optimizer.zero_grad()

        # The model is fed every caption step except the last one.
        predictions = model(images, captions[:-1])
        loss = criterion(
            predictions.reshape(-1, vocab_size),
            captions.reshape(-1),
        )

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(data_loader)


## 4. Evaluation with BLEU
For evaluating an image captioning model, you can use metrics like BLEU (Bilingual Evaluation Understudy Score). However, please note that BLEU is not perfect and might not always align with human judgment. It's typically used to get a quantitative estimate of the model's performance. Here's an example of how you can implement a simple evaluation function:

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate(model, data_loader, device, vocabulary=None):
    """Compute the corpus-level BLEU-4 score of ``model`` on ``data_loader``.

    Each image is captioned on its own with ``model.caption_image`` and the
    result is compared with the caption paired with it in the batch, so every
    hypothesis is scored against exactly one reference.

    Args:
        model: Model exposing ``caption_image(image, vocabulary)``, which
            returns a list of words for a single image.
        data_loader: Iterable of ``(images, captions)`` batches. ``captions``
            is expected time-major, as produced by a ``pad_sequence`` call
            with ``batch_first=False``.
        device: Device the images are moved to.
        vocabulary: Object whose ``itos`` maps token ids to words. Defaults
            to the module-level ``vocab``.

    Returns:
        The BLEU-4 score, or ``0.0`` when the loader yields no samples.
    """
    if vocabulary is None:
        # Backward compatible with callers that rely on the global vocabulary.
        vocabulary = vocab

    # Bookkeeping tokens carry no caption content and would distort the
    # n-gram statistics.
    special_tokens = {"<PAD>", "<SOS>", "<EOS>"}

    model.eval()
    references = []
    hypotheses = []

    with torch.no_grad():
        for images, captions in data_loader:
            images = images.to(device)

            # Transpose so that each row is one complete caption instead of
            # one time step across the batch.
            caption_rows = captions.transpose(0, 1).tolist()

            for image, target_ids in zip(images, caption_rows):
                # caption_image handles a single image, so restore the batch
                # dimension for each sample; it already returns words.
                predicted_words = model.caption_image(image.unsqueeze(0), vocabulary)
                hypotheses.append(
                    [word for word in predicted_words if word not in special_tokens]
                )

                target_words = [vocabulary.itos[idx] for idx in target_ids]
                # corpus_bleu expects a LIST of references per hypothesis.
                references.append(
                    [[word for word in target_words if word not in special_tokens]]
                )

    if not hypotheses:
        return 0.0

    return corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))


## 5. Train the Model

Finally, we initialize the model, optimizer, and loss function, and start the training process.

In [None]:


# Build the data loader first, so that the vocabulary (and therefore
# vocab_size) comes from the same dataset the model is trained on.
data_loader, vocab = get_loader(root_folder, annotation_file, transform)

# Hyperparameters
embed_size = 256
hidden_size = 512
vocab_size = len(vocab)
num_layers = 1
learning_rate = 3e-4
num_epochs = 10

# Initialize model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])

# Training and evaluation loop
for epoch in range(num_epochs):
    # One pass over the data; train() switches the model to training mode
    # and returns the mean batch loss.
    avg_loss = train(model, data_loader, optimizer, criterion, vocab_size, device)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Evaluation (runs on the same loader that was used for training)
    bleu4 = evaluate(model, data_loader, device)
    print(f"Epoch [{epoch+1}/{num_epochs}], BLEU-4: {bleu4:.4f}")


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\AHERNANDEZ/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth
100%|█████████████████████████████████████████████████████████████████████████████| 97.8M/97.8M [00:08<00:00, 12.8MB/s]


## 6. Generate a Prediction
You need a function to transform a new input image into the same format as the training images. You'll use the caption_image method from the CNNtoRNN class. Let's write a function to take an image path and generate a caption:

In this code, generate_caption takes the path of the image you want to caption, uses the trained model directly, and processes the image using the same transform as during training. The model variable is assumed to be the same one you trained earlier in the script.

In [None]:
from PIL import Image

def load_image(image_path, transform=None):
    """Load an image from disk as RGB and optionally prepare it for the model.

    Args:
        image_path: Path of the image file.
        transform: Optional callable applied to the loaded image. Its result
            gets a leading batch dimension via ``unsqueeze(0)``.

    Returns:
        The transformed image with a leading batch dimension, or the RGB
        ``PIL.Image`` itself when ``transform`` is None.
    """
    # ``convert`` returns a fully loaded copy, so the underlying file can be
    # closed right away instead of staying open until garbage collection.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image


## Make sure to replace 'path_to_your_image.jpg' with the path to your actual image.

In [None]:
def generate_caption(image_path, model, vocabulary, transform, device):
    """Caption the image stored at ``image_path`` and return it as one string.

    The model is switched to evaluation mode, the image is loaded with
    ``load_image`` and moved to ``device``, and the special tokens are
    removed from the generated words before they are joined with spaces.
    """
    model.eval()
    image = load_image(image_path, transform).to(device)

    with torch.no_grad():
        tokens = model.caption_image(image, vocabulary, max_length=50)

    # Bookkeeping tokens are not part of the readable caption.
    special_tokens = {"<SOS>", "<EOS>", "<PAD>", "<UNK>"}
    return ' '.join(token for token in tokens if token not in special_tokens)

# `model` is expected to be already trained and available at this point.

# Preprocessing for inference: the same steps as used for the training images
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])

# Prefer the GPU when one is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Function for generating captions
# NOTE: this re-declares generate_caption with the same behavior as the
# definition earlier in this cell; the later definition is the one in effect.
def generate_caption(image_path, model, vocabulary, transform, device):
    """Return the caption generated for the image at ``image_path``.

    Puts the model in evaluation mode, loads the image with ``load_image``,
    moves it to ``device`` and joins the generated words with spaces after
    dropping the special tokens.
    """
    model.eval()
    batch = load_image(image_path, transform)
    batch = batch.to(device)

    with torch.no_grad():
        words = model.caption_image(batch, vocabulary, max_length=50)

    # Keep only real words; bookkeeping tokens are filtered out.
    hidden = {"<SOS>", "<EOS>", "<PAD>", "<UNK>"}
    visible_words = [word for word in words if word not in hidden]
    return ' '.join(visible_words)

# Image to caption (placeholder: replace with the path of a real image file)
image_path = 'path_to_your_image.jpg'

# Run the trained model on the image and show the resulting caption
caption = generate_caption(
    image_path=image_path,
    model=model,
    vocabulary=vocab,
    transform=transform,
    device=device,
)
print("Generated Caption:", caption)

