# NLP Experiments Notebook

In [2]:
from pathlib import Path
import json
import os
# Project root: the directory one level above the current working directory
# (parents[0] is the immediate parent of the cwd path).
ROOT = Path(os.getcwd()).parents[0]

## JSON Parser to Python Dict

In [4]:
def load_json(path):
    """
    Read a UTF-8 encoded JSON file and hand back the decoded Python object.

    Parameters:
        path (str): Location of the JSON file on disk.

    Returns:
        dict: The decoded content (a dictionary when the file's top-level
              value is a JSON object).
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()

    # Decode the whole document in one go
    return json.loads(raw_text)

def build_id_to_filename(coco_json):
    """
    Create a lookup table from image ID to image file name.

    Parameters:
        coco_json (dict): Full COCO annotation JSON; its "images" entry is
                          read, and each record must hold "id" and "file_name".

    Returns:
        dict: Mapping {image_id: file_name}
    """
    lookup = {}

    for image_record in coco_json["images"]:
        # Read the id first, then the name; a repeated id keeps the last name
        image_id = image_record["id"]
        lookup[image_id] = image_record["file_name"]

    return lookup

def group_by_filename(items, id_to_filename, key="image_id"):
    """
    Bucket annotation items under the file name of the image they refer to.

    Parameters:
        items (list): Annotation items (dicts) to be grouped.
        id_to_filename (dict): Mapping {image_id: file_name}.
        key (str): Name of the field in each item that stores the image ID.

    Returns:
        dict: Mapping {file_name: list_of_items}, with items kept in their
              original order inside each list.
    """
    buckets = {}

    for entry in items:
        # Resolve the item's image ID to the matching file name
        name = id_to_filename[entry[key]]

        if name in buckets:
            buckets[name].append(entry)
        else:
            # First item seen for this file: start a new list
            buckets[name] = [entry]

    return buckets

def parse_captions(json_path):
    """
    Read a COCO captions annotation file and collect, for every image
    file name, the captions written for that image.

    Parameters:
        json_path (str): Path to COCO captions JSON file.

    Returns:
        dict: {filename: [caption1, caption2, ...]}
    """
    # Read the annotation file and prepare the id -> filename lookup
    coco = load_json(json_path)
    names_by_id = build_id_to_filename(coco)

    # Strip each annotation down to the two fields captioning needs
    slim_annotations = []
    for ann in coco["annotations"]:
        slim_annotations.append({"image_id": ann["image_id"], "caption": ann["caption"]})

    # Bucket the slimmed annotations under their image file name
    per_file = group_by_filename(slim_annotations, names_by_id)

    # Keep only the caption text for each file
    return {
        filename: [entry["caption"] for entry in entries]
        for filename, entries in per_file.items()
    }

In [6]:
# Join the path components with pathlib so the separator matches the host OS;
# hard-coded backslashes only resolve on Windows. The result is converted to
# str so path_captions keeps the same type as before.
path_captions = str(ROOT / "raw_data" / "annotations" / "captions_train2017.json")

# Mapping {image file name: [captions]} for the train2017 captions file
captions_raw = parse_captions(path_captions)