In [2]:
import torch
from PIL import Image
import os
import pandas as pd
from torchvision import transforms as tf
import torchvision.models as models
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import matplotlib.image as mpimg


'''
Inspiration guides:
https://medium.com/@raman.shinde15/image-captioning-with-flickr8k-dataset-bleu-4bcba0b52926
https://thepythoncode.com/article/image-captioning-with-pytorch-and-transformers-in-python

Overall comments:
Image captioning uses one-to-many RNNs.

About Flickr 8k dataset
Images: Contains a total of 8092 images in JPG format, with different shapes and sizes.
        6000 for train, 1000 for test, 1000 for development
Captions.txt: Contains 5 captions for each image, total of 40460 captions.

Size of training vocabulary: 7371

Architecture
We will use CNN + LSTM with attention.
CNN: To extract features from the image.
LSTM: To generate a description from the extracted information of the image

Note:
All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel
RGB images of shape (3 x H x W), where H and W are expected to be at least 299. The images have to be loaded
into a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
'''

In [None]:
# Pick the compute device name: use the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Step 1: Prepare the Dataset
1.1 Dataset Organization: Ensure your dataset is properly organized. The Flickr8k dataset should include:
An images folder containing all JPEG images.
A captions.txt file where each line corresponds to an image and its caption in the format image_filename,caption.


In [17]:
# Load the Flickr8k images and captions, then group the captions by image.
image_folder = "flickr 8k/Images"

# Decode every regular file in the image folder into a NumPy array.
# Sub-directories are skipped because imread cannot decode them. imread raises
# on unreadable files instead of returning None, so no None check is needed.
# NOTE(review): decoding the whole folder up front is memory-hungry; `images`
# is only used below for its length — confirm whether later cells need the arrays.
images = []
for filename in sorted(os.listdir(image_folder)):
    file_path = os.path.join(image_folder, filename)
    if os.path.isfile(file_path):
        images.append(mpimg.imread(file_path))

print(f'There are {len(images)} images in this folder')

captions = pd.read_csv('flickr 8k/captions.txt')
print(f'There are {len(captions)} image to captions')
print(f'How the data is structured:\n {captions.head(7)}')

# Map each image path to {'image': <PIL image>, 'captions': [str, ...]}.
dataset = {}

for image_name, caption in zip(captions['image'], captions['caption']):
    img_path = os.path.join(image_folder, image_name)  # Include the filename
    entry = dataset.get(img_path)
    if entry is None:
        # First caption seen for this image: open the file once only.
        if not os.path.exists(img_path):
            continue  # caption refers to an image that is not on disk
        with Image.open(img_path) as img:
            # Image.open is lazy: pixel data is only read on demand. copy()
            # forces the read and returns an image that does not depend on the
            # file handle, so it stays usable after the `with` block closes it.
            entry = {'image': img.copy(), 'captions': []}
        dataset[img_path] = entry
    entry['captions'].append(caption)

# Report how many images were loaded (the dict's size, not its type).
print(f'Loaded {len(dataset)} images with captions')

38009
Batch Images shape torch.Size([32, 3, 224, 224])
Batch captions shape: 32


1.2 Data Loading and Preprocessing:
Load the images and corresponding captions into your program.
Normalize the images (resize if necessary and scale pixel values).
Preprocess the captions by tokenizing (splitting text into words), converting to lowercase, and possibly 
removing punctuation.

In [None]:
# Image preprocessing pipeline, applied in order:
#   1. resize so the shorter side is 299 pixels,
#   2. crop the central 299 x 299 region,
#   3. convert the image to a tensor,
#   4. normalize each channel with the mean / std values given below.
transform = tf.Compose(
    [
        tf.Resize(size=299),
        tf.CenterCrop(size=299),
        tf.ToTensor(),
        tf.Normalize(
            [0.485, 0.456, 0.406],  # per-channel mean
            [0.229, 0.224, 0.225],  # per-channel std
        ),
    ]
)



Step 2: Feature Extraction with CNN
2.1 Choose a Pre-trained CNN: 
Select a pre-trained model like VGG16, ResNet, or InceptionV3, 
which are commonly used for feature extraction in image captioning tasks.
2.2 Modify the CNN: 
Since you need to extract features rather than perform classification, 
modify the last layer of the CNN to output features directly, rather than class probabilities.


# Step 3: Sequence Model with LSTM
'''
3.1 Prepare Text Data:
Tokenize captions to convert each caption into a sequence of tokens.
Pad the sequences to ensure they have the same length for batch processing.
Create a vocabulary index (word-to-index and index-to-word mappings).
3.2 Design the LSTM Model:
The LSTM should take the sequence of tokens as input and learn to predict the next token in the sequence.
Integrate embedding layers to convert tokens into dense vectors.
'''

# Step 4: Combine Image and Text Data with Attention
'''
4.1 Implement Attention Mechanism:
Use an attention mechanism to allow the LSTM to focus on different parts of the image at different points in the
sequence generation.
This typically involves calculating alignment scores between the LSTM hidden state and the image features.
4.2 Integrate the Models:
The final model will concatenate the features from the CNN with the output from the attention mechanism.
This concatenated output is then fed into the LSTM to generate the next word in the caption.
'''

# Step 5: Train the Network
'''
5.1 Setup Training:
Define the loss function, typically categorical crossentropy, since this is a multi-class classification problem 
(predicting the next word).
Choose an optimizer like Adam for efficient training.
Prepare your data batches and split the data into training and validation sets.
5.2 Training Loop:
For each epoch, train the combined model on the training set and evaluate its performance on the validation set.
Save checkpoints and possibly adjust learning rates based on performance improvements.
'''

# Step 6: Evaluate with BLEU Score
'''
6.1 Generate Captions:
Use the trained model to generate captions for images in the test set by predicting one word at a time and feeding
it back into the model as input for the next word.
6.2 Calculate BLEU Score:
Use the BLEU score to evaluate the quality of the generated captions compared to the reference captions.
BLEU scores provide a quantitative measure of how the generated captions match the reference captions in terms
of precision of word use at various n-gram levels.
'''

In [None]:
#Decoder, Recurrent Neural Network, LSTM with Attention

In [None]:
# Beam search: at each decoding step, keep the k highest-scoring partial captions, ranked using the decoder's score for each word in the vocabulary