# Prepare image caption dataset for HuggingFace

Rename `image_caption.jsonl` to `metadata.jsonl` and place it in the raw `images/train` folder together with the images.

Only keep features `file_name` and `caption`.

In [None]:
import json

In [None]:
def read_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so non-ASCII captions are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. an extra newline at the end of the file) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]


def write_jsonl(path, data):
    """Write ``data`` to ``path`` in JSON Lines format.

    Args:
        path: Destination file; it is opened in write mode, so existing
            content is replaced.
        data: Iterable of JSON-serialisable records.

    Each record is serialised with ``json.dumps`` and terminated by a newline,
    giving one JSON document per line.
    """
    with open(path, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

In [None]:
# Both image folders sit under the same data directory; only the
# raw/processed component of the path differs.
_DATA_ROOT = "../../../../mnt/data_6tb/oliver/NordjyllandNews/data"
RAW_PATH = f"{_DATA_ROOT}/raw/images/train"
PROCESSED_PATH = f"{_DATA_ROOT}/processed/images/train"

In [None]:
# Load every record from the metadata file located in the raw training folder.
raw_meta_data_path = RAW_PATH + "/metadata.jsonl"
data = read_jsonl(raw_meta_data_path)

In [None]:
# Reduce each record to the two features that are kept; a record missing
# either key raises KeyError.
keys_to_keep = ["file_name", "caption"]
processed_data = [
    {key: record[key] for key in keys_to_keep}
    for record in data
]

In [None]:
# Store the trimmed records as the metadata file of the processed training folder.
processed_meta_data_path = PROCESSED_PATH + "/metadata.jsonl"
write_jsonl(processed_meta_data_path, processed_data)

## Dataset statistics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set_style("whitegrid")

In [None]:
# Re-read the processed metadata from disk (rebinding `data`), so the
# statistics below are computed on the file that was actually written.
data = read_jsonl(processed_meta_data_path)

## Caption Length Distribution

In [None]:
# Length of every caption as given by len() on the caption value.
# The (misspelt) variable name is kept because later cells refer to it.
caption_lenghts = [len(record["caption"]) for record in data]

In [None]:
# Histogram of the caption lengths, drawn on an explicit figure/axes pair and
# written to the figures directory.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(caption_lenghts, bins=100, ax=ax)
ax.set_title("Caption Length Distribution")
ax.set_xlabel("Number of characters in caption")
ax.set_ylabel("Frequency")
fig.savefig("../figures/caption_length_distribution.png")


## Number of samples

In [None]:
# Bare expression: its value (the number of entries in caption_lenghts) is
# what the notebook shows as this cell's output.
len(caption_lenghts)

## Image resolutions

In [None]:
import cv2

In [None]:
# Scan every image listed in the metadata once, tracking the smallest and
# largest height, width and channel count seen, and collecting the number of
# array elements (height * width * channels) of each image.
#
# NOTE(review): cv2.imread is called with its default flag, which is
# documented to decode to a 3-channel colour image; if so, both channel bounds
# will always be 3 regardless of how the file is stored — confirm this is the
# intended statistic.
smallest_height = float("inf")
smallest_width = float("inf")
smallest_channels = float("inf")

largest_height = 0
largest_width = 0
largest_channels = 0
pixel_counts = []
unreadable_files = []  # file names that cv2.imread could not load
n_samples = len(data)
for i, d in enumerate(data):
    # Progress report every 200 samples.
    if i % 200 == 0:
        print(f"{i}/{n_samples}")

    img = cv2.imread(f"{PROCESSED_PATH}/{d['file_name']}")
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising. Record the file and continue instead of
        # failing on `img.shape` part-way through the scan.
        unreadable_files.append(d["file_name"])
        continue

    height, width, channels = img.shape
    pixel_counts.append(height * width * channels)

    smallest_height = min(smallest_height, height)
    smallest_width = min(smallest_width, width)
    smallest_channels = min(smallest_channels, channels)

    largest_height = max(largest_height, height)
    largest_width = max(largest_width, width)
    largest_channels = max(largest_channels, channels)

# Make skipped files visible so the statistics are not silently incomplete.
if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable image(s): {unreadable_files}")

In [None]:
# Report the observed [min, max] range of each image dimension, one per line.
print(
    f"Height: [{smallest_height}, {largest_height}]",
    f"Width: [{smallest_width}, {largest_width}]",
    f"Channels: [{smallest_channels}, {largest_channels}]",
    sep="\n",
)

In [None]:
import numpy as np

In [None]:
# Convert each element count into the side length of a square image with the
# same number of pixels: divide by 3 because of 3 channels, take the square
# root and truncate to an integer.
pixel_counts_sqrt = [int(np.sqrt(count / 3)) for count in pixel_counts]

# Histogram of those side lengths on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(pixel_counts_sqrt, bins=50, ax=ax)
ax.set_title("Image size distribution")
ax.set_xlabel("x")
ax.set_ylabel("Frequency")

# Plain (non-scientific) tick labels, one tick every 1000 from 0 to 10000,
# rotated so they do not overlap. These pyplot calls act on the current axes,
# which is the one created above.
plt.ticklabel_format(style='plain', axis='x')
plt.xticks(range(0, 10000 + 100, 1000), rotation=45)

fig.savefig("../figures/image_size_distribution.png", bbox_inches='tight')