In [None]:
!pip install kaggle


In [None]:
# Instructions for MacOS/Linux Machine
!mkdir ~/.kaggle
!copy kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the sakshighadigaonkar/flickr-8k dataset archive with the kaggle CLI.
# Relies on the API token that was copied to ~/.kaggle/kaggle.json.
!kaggle datasets download sakshighadigaonkar/flickr-8k


In [None]:
# Extract the downloaded archive.
# NOTE(review): later cells read from ./flickr-8k/... — confirm the archive
# actually unpacks into a top-level `flickr-8k` folder.
!unzip flickr-8k.zip

In [None]:
from sentence_transformers import SentenceTransformer
# AutoTokenizer is provided by `transformers` (a dependency of
# sentence-transformers); it is not exported by `sentence_transformers` itself,
# so importing it from there raises ImportError.
from transformers import AutoTokenizer

# Text-embedding model pulled from the Hugging Face Hub. trust_remote_code=True
# lets the custom modelling code shipped in the model repository run locally.
model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-7B-instruct", trust_remote_code=True)
# NOTE(review): this tokenizer belongs to a different checkpoint than `model` —
# confirm that mixing the two is intended.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


In [None]:
# libraries
import glob
import os
import string
from time import time

import matplotlib.pyplot as plt
import numpy as np
from numpy import array
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms

%matplotlib inline


In [None]:
# Locations of the caption (token) file, the train/test split lists and the image folder.
token_path = "./flickr-8k/Flickr8k_text/Flickr8k.token.txt"
train_images_path = './flickr-8k/Flickr8k_text/Flickr_8k.trainImages.txt'
test_images_path = './flickr-8k/Flickr8k_text/Flickr_8k.testImages.txt'
images_path = './flickr-8k/Flickr8k_Dataset/Flicker8k_Dataset'

# Read the whole token file into memory; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open(token_path, 'r') as token_file:
    doc = token_file.read()

In [None]:

# Map every image id (first token of a line, cut at the first '.') to the list
# of captions found for it in the token file.
descriptions = dict()
for line in doc.split('\n'):
    tokens = line.split()
    # Skip near-empty lines, and whitespace-only lines that yield no tokens
    # (indexing tokens[0] on those would raise IndexError).
    if len(line) <= 2 or not tokens:
        continue
    image_id = tokens[0].split('.')[0]
    image_desc = ' '.join(tokens[1:])
    descriptions.setdefault(image_id, []).append(image_desc)

# Vocabulary = distinct whitespace-separated words over all captions.
vocabulary = set()
for desc_list in descriptions.values():
    for d in desc_list:
        vocabulary.update(d.split())
print('Original Vocabulary Size: %d words' % len(vocabulary))

## Flatten the caption dictionary into one "<image_id> <caption>" line per caption,
## so each caption can later be matched to its image by the leading id.
lines = [
    key + ' ' + desc
    for key, desc_list in descriptions.items()
    for desc in desc_list
]
new_descriptions = '\n'.join(lines)

## Collect in `train` the ids (file name cut at the first '.') of the images
## listed in the training split file.
with open(train_images_path, 'r') as split_file:  # closed automatically
    doc = split_file.read()
dataset = list()
for line in doc.split('\n'):
    # Ignore blank lines such as the one after a trailing newline.
    if len(line) > 1:
        identifier = line.split('.')[0]
        dataset.append(identifier)

train = set(dataset)

# Collect the paths of the image files that belong to the train and test splits.
img = glob.glob(images_path + '/*.jpg')

# os.path.basename uses the separator rules of the current platform, so the
# same code works on Windows and on MacOS/Linux without manual splitting.
with open(train_images_path, 'r') as split_file:
    train_images = set(split_file.read().strip().split('\n'))
train_img = [i for i in img if os.path.basename(i) in train_images]

with open(test_images_path, 'r') as split_file:
    test_images = set(split_file.read().strip().split('\n'))
test_img = [i for i in img if os.path.basename(i) in test_images]

# Keep only the captions of training images and wrap each one in the
# start/end sequence tokens needed for the later transformation.
train_descriptions = dict()
# Iterate over the flattened "<image_id> <caption>" text: `descriptions` is a
# dict and has no .split(), so splitting it raised AttributeError.
for line in new_descriptions.split('\n'):
    tokens = line.split()
    if not tokens:
        # Happens when there are no captions at all ('' splits into ['']).
        continue
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in train:
        desc = '<sos> ' + ' '.join(image_desc) + ' <eos>'
        train_descriptions.setdefault(image_id, []).append(desc)

# Image Encoding


In [None]:
# ViT-B/32 with pretrained weights, used as the image feature extractor.
vit_model = models.vit_b_32(pretrained=True)
# Replace the classification head with an identity so that calling the model
# returns the features that would otherwise be fed to the head.
# Wrapping `children()[:-1]` in nn.Sequential does not work for this
# architecture: the patch reshaping and class-token handling live in the
# model's forward(), not in its child modules, so they would be skipped.
# Note that `vit` and `vit_model` are the same (now head-less) object.
vit_model.heads = nn.Identity()
vit = vit_model


In [None]:
class FlickrCaptionDataset(torch.utils.data.Dataset):
    """(image, caption) samples built from image paths and a caption dictionary.

    Every caption of an image becomes its own sample, so an image with several
    captions appears several times.
    """

    def __init__(self, image_paths, captions_by_id, transform=None):
        """
        Args:
            image_paths: iterable of image file paths.
            captions_by_id: dict mapping an image id (file name cut at the
                first '.') to a list of caption strings.
            transform: optional callable applied to the loaded PIL image.
        """
        self.transform = transform
        self.samples = []
        for path in image_paths:
            image_id = os.path.basename(path).split('.')[0]
            # Images without captions contribute no samples.
            for caption in captions_by_id.get(image_id, []):
                self.samples.append((path, caption))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        path, caption = self.samples[index]
        # convert('RGB') loads the pixel data, so the file can be closed right after.
        with Image.open(path) as handle:
            image = handle.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, caption


# NOTE(review): 224x224 inputs and these mean/std values are the usual ImageNet
# preprocessing — confirm they match the weights of the image model in use.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# DataLoader requires a dataset; calling it with no arguments raises TypeError.
# batch_size=32 is only a starting value — tune it to the available memory.
train_dataset = FlickrCaptionDataset(train_img, train_descriptions, transform=image_transform)
train_DL = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used further down to encode caption text.
# trust_remote_code=True lets the code shipped in the model repository run locally.
embedding_model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)
def add_eos(input_examples):
    """Return a new list in which every string has the embedding tokenizer's EOS token appended."""
    with_eos = []
    for example in input_examples:
        with_eos.append(example + embedding_model.tokenizer.eos_token)
    return with_eos


# Image Captioning

In [None]:
class Image_Caption(nn.Module):
    """Predicts a distribution over the vocabulary from image features and caption text.

    The image features are projected to ``hidden_dim``; the text is embedded
    with the module-level ``embedding_model``, passed through an LSTM, and the
    two representations are added before being decoded.

    Args:
        drop_p: dropout probability used on both the image and the text branch.
        image_dim: size of the incoming image feature vector.
        vocab_size: number of output classes of the decoder.
        embedding_dim: size of the vectors produced by ``embedding_model``.
        hidden_dim: LSTM hidden size; also the size of the fused representation.
    """

    def __init__(self, drop_p=0.5, image_dim=256, vocab_size=32000, embedding_dim=4096, hidden_dim=1024):
        super(Image_Caption, self).__init__()
        # Project to hidden_dim (not a fixed 256) so the result can be added
        # element-wise to the LSTM output in forward().
        self.image_encoder = nn.Sequential(
            nn.Dropout(drop_p),
            nn.Linear(image_dim, hidden_dim),
            nn.ReLU(),
        )

        # NOTE(review): if embedding_model is an nn.Module, this assignment
        # registers it as a submodule (its weights then show up in
        # parameters()/state_dict()) — confirm that is wanted.
        self.embedding = embedding_model
        self.text_dropout = nn.Dropout(drop_p)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, vocab_size),
            nn.ReLU(),  # was `nn.ReLu`, which does not exist
            nn.Linear(vocab_size, vocab_size),
            nn.Softmax(dim=1)
        )

    def forward(self, image_features, text):
        """Compute vocabulary probabilities for a batch.

        Args:
            image_features: float tensor whose last dimension is ``image_dim``;
                assumed to be (batch, image_dim) — the softmax runs over dim 1.
            text: list of caption strings, one per batch element.

        Returns:
            Tensor of shape (batch, vocab_size) holding softmax probabilities.
        """
        image_features = self.image_encoder(image_features)
        with torch.no_grad():
            # convert_to_tensor=True: the default numpy output cannot be fed to
            # Dropout/LSTM. No `prompt` is passed: it must be a string or None.
            text_features = self.embedding.encode(
                add_eos(text),
                batch_size=1,
                normalize_embeddings=True,
                convert_to_tensor=True,
            )
        # Align device and dtype with the image branch before mixing the two.
        text_features = text_features.to(device=image_features.device, dtype=image_features.dtype)
        text_features = self.text_dropout(text_features)
        # encode() yields one vector per sentence, i.e. (batch, embedding_dim);
        # add a sequence axis of length 1 so the batch_first LSTM sees
        # (batch, seq, embedding_dim).
        lstm_out, _ = self.lstm(text_features.unsqueeze(1))
        text_features = lstm_out[:, -1, :]
        fused_features = image_features + text_features

        output = self.decoder(fused_features)
        return output
    
# Instantiate the captioning network with its default hyper-parameters.
# NOTE(review): this rebinds `model`, which an earlier cell assigned to a
# SentenceTransformer — confirm the earlier object is no longer needed.
model = Image_Caption()

# NOTE(review): `vacab_size` looks like a typo for `vocab_size`, and it is bound
# to the embedding model object itself rather than to a number — confirm the
# intended value.
vacab_size = embedding_model

