## Import Libraries


In [None]:
import torch
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import warnings

# NOTE: this installs a blanket "ignore" filter, so every warning category
# (including deprecation/future warnings from the libraries imported above)
# is hidden for all cells that run after this one.
warnings.filterwarnings("ignore")

## Load Data


In [None]:
BASE_DIRECTORY = "dataset/"
IMAGE_DIRECTORY = "dataset/Images/"

# captions.txt is parsed as CSV; the cells below rely on it providing an
# "image" column (file name) and a "caption" column.
df = pd.read_csv(BASE_DIRECTORY + "captions.txt")

# Full path to each image file, built with vectorised string concatenation.
df["path"] = IMAGE_DIRECTORY + df["image"]

# The image file name without its extension serves as the image id.
df = df.rename(columns={"image": "id"})
# regex=False: treat ".jpg" as literal text. Under the regex default of older
# pandas releases the "." would match any character.
df["id"] = df["id"].str.replace(".jpg", "", regex=False)
df.head()

In [None]:
# Size of the caption table and the number of distinct image ids in it.
unique_id_count = df["id"].nunique()
print(f"Dataframe shape: {df.shape}")
print(f"Number of samples {unique_id_count}")

### Build a dictionary mapping each image id to its captions and image path


In [None]:
# Map every image id to its captions and the path of its image file:
#     data[image_id] = {"captions": [caption, ...], "path": "<image path>"}
#
# Rows are grouped by id instead of being read in fixed strides of five, so
# an image with more or fewer captions cannot shift the captions of the
# images that follow it. sort=False keeps the ids in the order in which they
# first appear in df, which the positional train/val/test split relies on.
data = {
    image_id: {
        "captions": rows["caption"].tolist(),
        # Every row of a group refers to the same image, so take the first path.
        "path": rows["path"].iloc[0],
    }
    for image_id, rows in df.groupby("id", sort=False)
}

### Split the dictionary into train, test, and validation sets


In [None]:
# All image ids, in the insertion order of `data`.
keys = list(data)

# Positional split: first 6000 ids -> train, next 1000 -> validation,
# everything after that -> test.
train_keys, val_keys, test_keys = keys[:6000], keys[6000:7000], keys[7000:]

# Materialise one sub-dictionary per split.
train_data = dict((k, data[k]) for k in train_keys)
val_data = dict((k, data[k]) for k in val_keys)
test_data = dict((k, data[k]) for k in test_keys)

# Re-derive the key lists from the split dictionaries themselves.
train_keys = [*train_data]
val_keys = [*val_data]
test_keys = [*test_data]

print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Testing set size:", len(test_data))

## Read & Show Image Data


In [None]:
def read_image(path):
    """Load the image stored at ``path`` as an RGB PIL image.

    Args:
        path: Location of the image file; anything ``Image.open`` accepts.

    Returns:
        A fully loaded ``PIL.Image.Image`` in "RGB" mode.

    Raises:
        Whatever ``Image.open`` raises for a missing or unreadable file.
    """
    # Image.open is lazy and keeps the file handle open until the pixel data
    # is read; the context manager guarantees the handle is released.
    with Image.open(path) as img:
        # convert() reads the pixels and returns an independent image.
        # Forcing RGB keeps grayscale / RGBA / palette files compatible with
        # consumers that expect exactly three channels.
        return img.convert("RGB")


def show_image(image):
    """Render ``image`` on the current matplotlib axes without axis decorations.

    Args:
        image: Anything ``imshow`` can draw (e.g. a PIL image or an array).
    """
    axes = plt.gca()
    drawn = axes.imshow(image)
    # Register the result as pyplot's "current image", as plt.imshow would.
    plt.sci(drawn)
    axes.axis("off")
    plt.show()


def read_from_tensor(tensor):
    """Display a 3-D image tensor with matplotlib.

    The first axis of ``tensor`` is moved to the end before plotting, and
    values are clipped to the [0, 1] range, so data outside that range (for
    example a normalized tensor) is shown with saturated colors.

    Args:
        tensor: A 3-D ``torch.Tensor``; may live on any device and may
            require grad.
    """
    # .numpy() only works on CPU tensors that are not tracked by autograd, so
    # detach and copy to host memory first (both are no-ops when not needed).
    host_tensor = tensor.detach().cpu()
    # Move the leading axis last, the layout plt.imshow expects for color images.
    img_numpy = host_tensor.permute(1, 2, 0).numpy()
    img_numpy = np.clip(img_numpy, 0, 1)
    plt.imshow(img_numpy)
    plt.axis("off")
    plt.show()

### Example of an image with captions


In [None]:
show_image(read_image(train_data[train_keys[1]]["path"]))
for i in range(5):
    print(train_data[train_keys[1]]["captions"][i])

## Preprocess the Data


In [141]:
# Preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalize each of the three channels with the given mean / std.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Use the GPU when one is present; otherwise stay on the CPU so the notebook
# still runs end-to-end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocess every training image and stack the results into one tensor.
# NOTE(review): the whole training split is held in memory at once (several
# GB for thousands of 3x224x224 float images) — confirm this fits the target
# device, or switch to batched loading.
train_tensors = torch.stack(
    [transform(read_image(train_data[key]["path"])) for key in train_keys]
)
train_tensors = train_tensors.to(device)

train_tensors.shape

In [None]:
# Visual sanity check: draw the preprocessed training image at index 1.
# The tensor has been normalized by `transform`, and read_from_tensor clips
# values to [0, 1], so the colors differ from the original photo.
read_from_tensor(train_tensors[1])

In [None]:
# # Assuming train_data is your data and train_keys are the keys
# for key in train_keys:
#     train_data[key]['tensor'] = transform(read_image(train_data[key]["path"]))

# # Now each dictionary in train_data also contains the tensor representation of the image
# train_data[train_keys[1]]["tensor"]

In [None]:
# Inspect the values of the preprocessed training tensor at index 1
# (displayed as this cell's output).
train_tensors[1]