In [1]:
# Seed passed to the random subset draw; it is also embedded in the names of the
# generated asset files (e.g. matched_image_ids_subset_{SEED}.json).
SEED = 19
# Number of matched image IDs to sample into the subset.
N = 211

In [2]:
import json
import ijson

# Load the target relations into a set so each predicate lookup is O(1).
with open('./assets/final_cleaned_relations.json', 'r') as f:
    target_relations = set(json.load(f))

matched_image_ids = set()

# Stream the Visual Genome relationships instead of loading the whole file.
# ijson parses bytes, so the file is opened in binary mode: a text-mode handle
# makes ijson emit a DeprecationWarning and re-encode every chunk it reads.
with open('./VisualGenome/relationships.json', 'rb') as f:
    # 'item' iterates over the elements of the top-level array (one per image).
    for image_data in ijson.items(f, 'item'):
        # any() stops at the first relationship whose normalised predicate is a
        # target relation, so the rest of the image's relationships are skipped.
        if any(
            rel['predicate'].lower().strip() in target_relations
            for rel in image_data['relationships']
        ):
            matched_image_ids.add(image_data['image_id'])

# Save the matched image IDs; sorting keeps the file (and any seeded sampling
# done on it later) independent of set iteration order.
with open('./assets/matched_image_ids.json', 'w') as f:
    json.dump(sorted(matched_image_ids), f, indent=2)

print("Saved matched image IDs to 'matched_image_ids.json'")

Saved matched image IDs to 'matched_image_ids.json'


In [3]:
import json
import random
import sys
from pathlib import Path

def create_random_subset(input_file, output_dir, subset_size=5, seed=SEED):
    """Write a seeded random sample of a JSON array to a new JSON file.

    Args:
        input_file: Path to a JSON file whose top-level value is a list.
        output_dir: Directory that receives the subset file; created if missing.
        subset_size: Number of items to draw. Capped at the length of the list.
        seed: Seed for the sampler; also embedded in the output file name.

    Returns:
        pathlib.Path of the written file, named
        ``<input stem>_subset_<seed>.json`` inside ``output_dir``.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    with open(input_file, 'r') as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError("Input JSON must contain an array/top-level list")

    # A private generator draws the same sample as random.seed(seed) followed by
    # random.sample(...), but leaves the process-wide RNG state untouched for
    # any other code that relies on it.
    rng = random.Random(seed)
    subset = rng.sample(data, min(subset_size, len(data)))

    # Make sure the target directory exists before writing into it.
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    output_file = target_dir / f"{Path(input_file).stem}_subset_{seed}.json"

    with open(output_file, 'w') as f:
        json.dump(subset, f, indent=2)

    print(f"Created subset with {len(subset)} items at: {output_file}")
    return output_file

# Draw N of the matched image IDs, seeded with SEED, and store them under ./assets/.
input_file = "./assets/matched_image_ids.json"
output_dir = "./assets/"
subset_size = N

create_random_subset(
    input_file=input_file,
    output_dir=output_dir,
    subset_size=subset_size,
    seed=SEED,
)

Created subset with 211 items at: assets/matched_image_ids_subset_19.json


PosixPath('assets/matched_image_ids_subset_19.json')

In [4]:
import json

def _match_base_relation(predicate, ordered_relations):
    """Return the first relation in ``ordered_relations`` contained in ``predicate``, or None."""
    for relation in ordered_relations:
        if relation in predicate:
            return relation
    return None


def convert_json(input_data, relations):
    """Convert one scene graph into the flat objects/attributes/relationships format.

    Args:
        input_data: Scene-graph dict with ``image_id``, ``objects`` (each with
            ``object_id``, ``names`` and optionally ``attributes``) and
            ``relationships`` (each with ``subject_id``, ``object_id`` and
            ``predicate``).
        relations: Iterable of base relation strings. A relationship is kept
            only if one of them occurs as a substring of its lower-cased,
            stripped predicate.

    Returns:
        Dict with keys ``url`` (the image id), ``objects`` (one ``{"name": ...}``
        entry per object name), ``attributes`` and ``relationships``. Attributes
        and relationships refer to objects by index into ``objects``; an object
        with several names is referenced through the index of its first name.
    """
    output = {
        "url": input_data["image_id"],
        "objects": [],
        "attributes": [],
        "relationships": []
    }

    # Longest relation first, ties broken alphabetically. Several relations can be
    # substrings of the same predicate ("in" and "riding" in "riding on"); iterating
    # a set of strings would pick one in hash order, which can differ between
    # interpreter runs. Empty strings are dropped: they are contained in everything.
    ordered_relations = sorted(
        {relation for relation in relations if relation},
        key=lambda relation: (-len(relation), relation),
    )

    # Map each original object_id to the index of its first name in output["objects"].
    object_id_to_index = {}
    for obj in input_data["objects"]:
        names = obj["names"]
        if not names:
            # No name means no entry in output["objects"]; mapping the id anyway
            # would point at whichever object is appended next.
            continue

        current_index = len(output["objects"])
        object_id_to_index[obj["object_id"]] = current_index
        output["objects"].extend({"name": name} for name in names)

        for attr in obj.get("attributes") or ():
            output["attributes"].append({
                "attribute": attr,
                "object": current_index
            })

    for rel in input_data["relationships"]:
        subject_index = object_id_to_index.get(rel["subject_id"])
        object_index = object_id_to_index.get(rel["object_id"])

        if subject_index is None or object_index is None:
            print(f"Warning: Missing object/subject for relationship {rel.get('relationship_id')}")
            continue

        # Same normalisation as the image-matching step: lower-case and strip.
        predicate = rel["predicate"].lower().strip()
        base_predicate = _match_base_relation(predicate, ordered_relations)

        # Relationships whose predicate contains no base relation are dropped.
        if base_predicate is not None:
            output["relationships"].append({
                "predicate": base_predicate,
                "object": object_index,
                "subject": subject_index
            })

    return output

# Load the three inputs: all scene graphs, the base relations, and the sampled image IDs.
with open('./VisualGenome/scene_graphs.json', 'r') as scene_graphs_file:
    input_data = json.load(scene_graphs_file)

with open('./assets/final_cleaned_relations.json', 'r') as relations_file:
    relations = json.load(relations_file)

with open(f'./assets/matched_image_ids_subset_{SEED}.json', 'r') as subset_file:
    image_ids = json.load(subset_file)

# Set membership keeps the per-graph filter cheap.
image_ids_set = set(image_ids)

# Convert only the non-empty graphs whose image_id was sampled; convert_json
# reduces each predicate to a base relation and drops the ones that match none.
converted_data = [
    convert_json(graph, relations)
    for graph in input_data
    if graph and graph.get('image_id') in image_ids_set
]

# Persist the converted subset.
with open(f'./assets/final_subset_{SEED}.json', 'w') as out_file:
    json.dump(converted_data, out_file, indent=2)

print(f"Conversion complete. Output saved to ./assets/final_subset_{SEED}.json")

Conversion complete. Output saved to ./assets/final_subset_19.json


: 

In [5]:
import json

def simplify_scene_graphs(scene_graphs):
    """Flatten converted scene graphs into name-based triples and save them.

    Every relationship's ``subject`` and ``object`` indices are resolved against
    the graph's ``objects`` list and replaced by the object names. Relationships
    whose index is out of range are skipped, and graphs left without any
    relationship are omitted. The result is written to
    ``./assets/simplified_scene_graphs_{SEED}.json`` and a summary is printed.
    """
    simplified_graphs = []

    for graph in scene_graphs:
        triples = []

        for edge in graph.get("relationships", []):
            try:
                head = graph["objects"][edge["subject"]]
                tail = graph["objects"][edge["object"]]
            except IndexError:
                # Index does not point at an existing object; drop this relationship.
                continue
            else:
                triples.append({
                    "subject": head["name"],
                    "object": tail["name"],
                    "relationship": edge["predicate"]
                })

        if not triples:
            continue

        simplified_graphs.append({
            "image_id": graph["url"],
            "relationships": triples
        })

    with open(f'./assets/simplified_scene_graphs_{SEED}.json', "w") as out_file:
        json.dump(simplified_graphs, out_file, indent=2)

    print(f"Processed {len(simplified_graphs)} images into simplified format")

# Load the converted scene graphs written to final_subset_{SEED}.json by the conversion cell.
with open(f'./assets/final_subset_{SEED}.json', "r") as f:
    data = json.load(f)

# NOTE(review): the call below is disabled, so a fresh Restart & Run All does not
# write simplified_scene_graphs_{SEED}.json. The recorded output of this cell
# ("Processed 1200 images ...") cannot come from the current N = 211 subset, so it
# looks stale — confirm whether this step should be re-enabled and re-run.
# simplify_scene_graphs(data)

Processed 1200 images into simplified format
