# In [None]:
import os
from ast import literal_eval

import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import json
import ast
from pathlib import Path

# Load the annotation table from disk
df = pd.read_csv("./final_setup/top30_annotations.csv")

# Split at the image level (80/20, fixed seed) so that every row belonging to
# one image ends up in the same subset
image_names = df['img_name'].unique()
train_imgs, test_imgs = train_test_split(
    image_names,
    test_size=0.2,
    random_state=42,
)
train_df = df.loc[df['img_name'].isin(train_imgs)].copy()
test_df = df.loc[df['img_name'].isin(test_imgs)].copy()

# Category table shared by both splits: labels are sorted and numbered from 0
# in that order, so train and test always agree on the ids.
category_id_map = {
    label: idx
    for idx, label in enumerate(sorted(df['mzl_label'].unique()))
}
coco_categories = [
    {"id": idx, "name": str(label)}
    for label, idx in category_id_map.items()
]

# Helper to build COCO dict from a filtered DataFrame
def build_coco_dict(dataframe, categories=None, category_ids=None):
    """Build a COCO-style detection dict from an annotation DataFrame.

    Args:
        dataframe: Annotation rows with the columns ``img_name``,
            ``mzl_label`` and ``relative_bbox``. Each ``relative_bbox`` is
            ``[x_min, y_min, x_max, y_max]``, given either as a
            Python-literal string or as an already-parsed sequence.
        categories: COCO ``categories`` list to embed in the result.
            Defaults to the module-level ``coco_categories``.
        category_ids: Mapping from ``mzl_label`` value to category id.
            Defaults to the module-level ``category_id_map``.

    Returns:
        A dict with ``images``, ``annotations`` and ``categories`` keys.
        Image ids are assigned per call, starting at 0, in order of first
        appearance in ``dataframe``; annotation ids number the rows from 0.
        Boxes are stored as ``[x_min, y_min, width, height]``.

    Raises:
        KeyError: If a row's label is not present in ``category_ids``.
    """
    # Fall back to the shared module-level tables so existing one-argument
    # calls keep working unchanged.
    if categories is None:
        categories = coco_categories
    if category_ids is None:
        category_ids = category_id_map

    image_id_map = {
        img_name: idx
        for idx, img_name in enumerate(dataframe['img_name'].unique())
    }
    images = [
        {"id": idx, "file_name": f"{img_name}.jpg"}
        for img_name, idx in image_id_map.items()
    ]

    # Zip only the three columns that are read; this avoids building a
    # Series per row the way iterrows() does.
    rows = zip(
        dataframe['img_name'],
        dataframe['mzl_label'],
        dataframe['relative_bbox'],
    )

    annotations = []
    for annotation_id, (img_name, label, bbox) in enumerate(rows):
        # The CSV stores boxes as text; tolerate boxes that were parsed
        # upstream instead of failing inside literal_eval.
        if isinstance(bbox, str):
            bbox = ast.literal_eval(bbox)
        x_min, y_min, x_max, y_max = bbox
        width = x_max - x_min
        height = y_max - y_min

        # NOTE(review): the column name suggests the coordinates may be
        # relative/normalised; they are written out unchanged -- confirm the
        # consumer expects that.
        annotations.append({
            "id": annotation_id,
            "image_id": image_id_map[img_name],
            "category_id": category_ids[label],
            "bbox": [x_min, y_min, width, height],
            "area": width * height,
            "iscrowd": 0,
            "segmentation": []
        })

    return {
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

# One COCO dict per split (train first, then test)
coco_train, coco_test = build_coco_dict(train_df), build_coco_dict(test_df)

# Folders the images of each split are assumed to live in
train_img_dir, test_img_dir = (
    Path("./final_setup/train"),
    Path("./final_setup/test"),
)

# Add image sizes
def add_image_sizes(coco_dict, image_dir, fallback_size=(1024, 1024)):
    """Attach ``width`` and ``height`` to every image entry, in place.

    Args:
        coco_dict: Dict whose ``images`` list holds entries with a
            ``file_name`` key.
        image_dir: Directory (``str`` or ``Path``) that the file names are
            resolved against.
        fallback_size: ``(width, height)`` recorded for entries whose file
            does not exist under ``image_dir``. Defaults to 1024x1024.

    Returns:
        The same ``coco_dict`` object that was passed in.

    Warns:
        UserWarning: Once per call when at least one file was missing, so a
            wrong ``image_dir`` cannot silently fill the output with
            fallback sizes.
    """
    import warnings  # imported locally to keep this function self-contained

    image_dir = Path(image_dir)
    fallback_width, fallback_height = fallback_size
    missing = []

    for img in coco_dict['images']:
        img_path = image_dir / img['file_name']
        if img_path.exists():
            with Image.open(img_path) as im:
                img['width'], img['height'] = im.size
        else:
            img['width'], img['height'] = fallback_width, fallback_height
            missing.append(img['file_name'])

    if missing:
        # A single summary instead of one warning per file keeps the output
        # readable when a whole directory is absent.
        warnings.warn(
            f"{len(missing)} of {len(coco_dict['images'])} image(s) not found "
            f"in {image_dir}; recorded fallback size "
            f"{fallback_width}x{fallback_height} "
            f"(first missing: {missing[0]})",
            stacklevel=2,
        )
    return coco_dict

# Fill in width/height for every image entry of both splits
coco_train = add_image_sizes(coco_train, train_img_dir)
coco_test = add_image_sizes(coco_test, test_img_dir)

# Write the finished annotation files as indented JSON
train_json_path = Path("./final_setup/train_with_size.json")
test_json_path = Path("./final_setup/test_with_size.json")

train_json_path.write_text(json.dumps(coco_train, indent=2))
test_json_path.write_text(json.dumps(coco_test, indent=2))