# Preprocess each dataset's metadata into a CSV file whose columns are the concepts plus the original image file path

In [9]:
import os
import json
import csv

## CLEVR

In [17]:
def extract_clevr_metadata(base_dir='../Data/CLEVR'):
    """Extract per-scene concept metadata from CLEVR and save it to a CSV.

    Every ``*.json`` file in ``<base_dir>/scenes`` is read, and the
    ``color``, ``shape``, ``size`` and ``material`` of the FIRST entry in
    its ``objects`` list are written, together with the image path, as one
    row of ``<base_dir>/metadata.csv``.

    Scene files are processed in sorted filename order so the CSV is
    reproducible across runs. Scenes whose ``objects`` list is empty are
    skipped (with a printed notice) instead of aborting the export.

    Args:
        base_dir: Directory that contains the ``scenes`` subdirectory; the
            resulting ``metadata.csv`` is written directly into it.

    Raises:
        FileNotFoundError: If ``<base_dir>/scenes`` does not exist.
        KeyError: If a scene file lacks ``objects`` / ``image_filename`` or
            its first object lacks one of the concept attributes.
    """
    scene_dir = os.path.join(base_dir, 'scenes')

    # Concept columns, in the order they appear in the CSV header.
    concept_keys = ('color', 'shape', 'size', 'material')
    header = [*concept_keys, 'image_path']
    rows = []

    print("Extracting CLEVR metadata...")
    # os.listdir() order is arbitrary; sort for a deterministic row order.
    for json_file in sorted(os.listdir(scene_dir)):
        if not json_file.endswith('.json'):
            continue

        json_path = os.path.join(scene_dir, json_file)
        with open(json_path, 'r', encoding='utf-8') as f:
            scene_data = json.load(f)

        objects = scene_data['objects']
        if not objects:
            # No object means no concepts to record for this scene.
            print(f"Skipping {json_file}: scene has no objects")
            continue

        # Only the first object of the scene is described in the CSV.
        first_object = objects[0]
        row = [first_object[key] for key in concept_keys]

        # NOTE(review): the image path prefix is fixed and does not follow
        # base_dir — presumably relative to the project root; confirm.
        row.append(
            os.path.join('Data/CLEVR/images', scene_data['image_filename'])
        )
        rows.append(row)

    # Save metadata to CSV (newline='' as required by the csv module).
    csv_file = os.path.join(base_dir, "metadata.csv")
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)

    print(f"Metadata saved to {csv_file} :)")
# Run the extraction with the default base_dir ('../Data/CLEVR').
extract_clevr_metadata()

Extracting CLEVR metadata...
Metadata saved to ../Data/CLEVR/metadata.csv :)
