In [15]:
import json
import os

def build_coco_images(image_captions, split="test"):
    """Convert caption records into COCO-caption style image entries.

    Args:
        image_captions: sequence of dicts, each providing "filepath",
            "filename" and "captions" (a list of caption strings).
        split: value written to the "split" field of every entry.

    Returns:
        A list with one dict per input record, in input order. Image ids
        ("imgid" / "cocoid") are 1-based positions in the input. Sentence
        ids run consecutively from 0 across the whole list, so no two
        sentences in the result share a "sentid".
    """
    images = []
    next_sentid = 0
    for image_id, image_caption in enumerate(image_captions, start=1):
        sentences = []
        for caption in image_caption["captions"]:
            sentences.append({
                "tokens": caption.split(),
                "raw": caption,
                "imgid": image_id,
                "sentid": next_sentid,
            })
            # Advance the running counter; using enumerate() here would
            # restart the ids at 0 for every image.
            next_sentid += 1

        images.append({
            "filepath": image_caption["filepath"],
            "filename": os.path.basename(image_caption["filename"]),
            "imgid": image_id,
            "split": split,
            "cocoid": image_id,
            "sentids": [sentence["sentid"] for sentence in sentences],
            "sentences": sentences,
        })
    return images


results = {}  # never filled in this cell; kept in case another cell reads it
for fn in sorted(os.listdir('.')):
    if not (fn.endswith('.json') and fn.startswith('GPT-4o-')):
        continue
    dataset_name = fn[7:-5]  # drop the 'GPT-4o-' prefix and the '.json' suffix
    print('-'*20, dataset_name, '-'*20)

    # Load the source JSON file
    with open(fn, 'r') as file:
        data = json.load(file)

    # Every entry is labelled "test", whatever the source file is called.
    # NOTE(review): TRAIN files are converted here too - confirm that the
    # fixed "test" label is what the consumer of these files expects.
    output_structure = {"images": build_coco_images(data['image_captions'])}

    # Save the COCO formatted JSON to a file
    with open(f'coco_format_{dataset_name}.json', 'w') as output_file:
        json.dump(output_structure, output_file, indent=4)


-------------------- IMERIT_2023_11_28_TEST_v6 --------------------
-------------------- IMERIT_2023_11_28_TRAIN_v6 --------------------
-------------------- IMERIT_2024_03_02_TEST_v5 --------------------
-------------------- IMERIT_2024_03_02_TRAIN_v5 --------------------
-------------------- IMERIT_2024_03_05_TEST_v5 --------------------
-------------------- IMERIT_2024_03_05_TRAIN_v5 --------------------
-------------------- IMERIT_2024_03_25_TEST_v5 --------------------
-------------------- IMERIT_2024_03_25_TRAIN_v5 --------------------
-------------------- IMERIT_CALIBRATION_TEST_v8 --------------------
-------------------- IMERIT_CALIBRATION_TRAIN_v5 --------------------


In [13]:
import json
import os

def split_for_filename(fn):
    """Classify a source file name as "train" or "test".

    The check is case-insensitive. "TRAIN" is tested first, so a name that
    contains both markers counts as "train".

    Args:
        fn: file name to inspect.

    Returns:
        "train", "test", or None when the name contains neither marker.
    """
    upper_name = fn.upper()
    if 'TRAIN' in upper_name:
        return "train"
    if 'TEST' in upper_name:
        return "test"
    return None


train_output_data = []
test_output_data = []
global_sentid = 0  # single running sentence id shared by both splits

# Iterate through files in the current directory
for fn in sorted(os.listdir('.')):
    if not (fn.endswith('.json') and fn.startswith('GPT-4o-')):
        continue

    # Decide the split once, so the list an entry is stored in and the label
    # written into the entry can never disagree.
    split = split_for_filename(fn)
    if split is None:
        continue
    target_list = train_output_data if split == "train" else test_output_data

    print('-' * 20, fn[7:-5], '-' * 20)

    # Load the source JSON file
    with open(fn, 'r') as file:
        data = json.load(file)

    # Process each image and its captions
    for image_caption in data['image_captions']:
        # 1-based position in the split's combined list, so image ids keep
        # increasing across source files of the same split.
        image_id = len(target_list) + 1

        entry = {
            "filepath": image_caption["filepath"],
            "filename": os.path.basename(image_caption["filename"]),
            "imgid": image_id,
            "split": split,
            "cocoid": image_id,
            "sentids": [],
            "sentences": []
        }

        for caption in image_caption["captions"]:
            sentence_entry = {
                "tokens": caption.split(),
                "raw": caption,
                "imgid": image_id,
                "sentid": global_sentid
            }
            entry["sentences"].append(sentence_entry)
            entry["sentids"].append(global_sentid)
            global_sentid += 1

        target_list.append(entry)

# Save the COCO formatted JSON files
with open('coco_vocab_coco_format_train.json', 'w') as train_output_file:
    json.dump({"images": train_output_data}, train_output_file, indent=4)

with open('coco_vocab_coco_format_test.json', 'w') as test_output_file:
    json.dump({"images": test_output_data}, test_output_file, indent=4)



-------------------- IMERIT_2023_11_28_TEST_v6 --------------------
-------------------- IMERIT_2023_11_28_TRAIN_v6 --------------------
-------------------- IMERIT_2024_03_02_TEST_v5 --------------------
-------------------- IMERIT_2024_03_02_TRAIN_v5 --------------------
-------------------- IMERIT_2024_03_05_TEST_v5 --------------------
-------------------- IMERIT_2024_03_05_TRAIN_v5 --------------------
-------------------- IMERIT_2024_03_25_TEST_v5 --------------------
-------------------- IMERIT_2024_03_25_TRAIN_v5 --------------------
-------------------- IMERIT_CALIBRATION_TEST_v8 --------------------
-------------------- IMERIT_CALIBRATION_TRAIN_v5 --------------------


In [16]:
# Words that disqualify a training caption: the filtering loop in this cell
# drops every sentence for which contains_prohibited_word(raw, TRAIN_VOCAB)
# is true.
# NOTE(review): entries look lowercased with punctuation removed (e.g.
# 'culdesac', 'youve') - presumably produced by a tokenizer not shown here;
# confirm how this list was generated.
TRAIN_VOCAB = {'activate', 'chipper', 'worktop', 'excavator', 'hoodies', 'extracting', 'halted', 'stationing', 'notifications', 'manouvering', 'squirming', 'mowing', 'generate', 'vaulting', 'strumming', 'wiggling', 'snowcovered', 'lightcolored', 'roughhousing', 'unlock', 'clears', 'entered', 'progressing', 'mansions', 'embarking', 'sauntering', 'romping', 'pruning', 'spotty', 'rearranging', 'therefore', 'conditioning', 'embraces', 'selects', 'bedridden', 'repeatedly', 'relocation', 'unmoved', 'brightlylit', 'bedsheets', 'caf', 'ac', 'vanbr', 'rideon', 'bedtime', 'inching', 'ringing', 'mending', 'outbuilding', 'rvs', 'curbs', 'chaos', 'extinguishing', 'opossum', 'doorbell', 'offroading', 'accelerating', 'mopping', 'feline', 'backdoor', 'renothmove', 'lakefront', 'assumptions', 'pergola', 'blackspotted', 'uturn', 'blueshirted', 'latching', 'snowblower', 'caregiver', 'floods', 'firewood', 'scurrying', 'doorsteps', 'kibble', 'illustrating', 'manwoman', 'dogish', 'sidewalklights', 'drivewaybr', 'windshields', 'aiding', 'dashing', 'overall', 'rustling', 'shoveled', 'visibility', 'conditioners', 'decking', 'chaotic', 'headlamps', 'trailor', 'answer', 'caretaker', 'restless', 'accumulating', 'management', 'tidied', 'woodpaneled', 'uploaded', 'halting', 'gardener', 'tripping', 'documenting', 'hurriedly', 'readjusting', 'refilling', 'tanktop', 'vigilant', 'stationarily', 'passsing', 'chewy', 'overtaking', 'accessing', 'securely', 'negotiating', 'worded', 'shuts', 'fumes', 'snugly', 'sweep', 'cozying', 'rainfall', 'closure', 'driveways', 'doorstop', 'chainlink', 'nipping', 'gmc', 'treehouse', 'garages', 'carpentry', 'avian', 'cashier', 'switched', 'sirens', 'causes', 'pooch', 'hummer', 'soundly', 'igniting', 'threshold', 'orb', 'pacing', 'redhat', 'fencedoff', 'doormat', 'cradled', 'adventuring', 'shifted', 'lineup', 'occupant', 'lingering', 'workstations', 'fade', 'bounding', 'leafstrewn', 'culdesac', 'russell', 'bollards', 'treelined', 'awakens', 'reverses', 'verandah', 
'relocating', 'sketching', 'awakening', 'frontage', 'railaptop', 'greenlit', 'timelapse', 'closeby', 'playpen', 'midflight', 'cozied', 'downtime', 'accidentally', 'glassdoctor', 'vacuuming', 'scrutinizing', 'neighbourhood', 'sanitation', 'bedrooms', 'sudden', 'vehicled', 'alighted', 'bonding', 'stayed', 'fourwheeler', 'hue', 'continuing', 'schoolwork', 'hydrating', 'offloading', 'snowplow', 'racoon', 'explosion', 'lapping', 'maui', 'skipping', 'nurturing', 'shrouded', 'shifting', 'entranceway', 'comforted', 'oscillating', 'orbs', 'accurate', 'footwear', 'unwinding', 'vehiclebr', 'walkerassisted', 'longhaired', 'possum', 'veranda', 'deliverer', 'fastmoving', 'stoping', 'leaflets', 'flaring', 'balustrade', 'mailman', 'placement', 'tidies', 'capturing', 'reorganizing', 'manoeuvring', 'rsidential', 'strutting', 'briskly', 'songbird', 'semitruck', 'shimmering', 'bluelit', 'framework', 'mobility', 'verifying', 'organizes', 'housebr', 'tumbling', 'tussling', 'midnight', 'verbs', 'cleanup', 'scaling', 'buzzing', 'sideyard', 'lawnmower', 'static', 'multitasking', 'redshirted', 'soothing', 'wakes', 'vhehicle', 'ambulating', 'neighbors', 'cornerroom', 'darkcolored', 'inquiring', 'grabbed', 'sideway', 'gradually', 'mulches', 'tuning', 'bursting', 'thirst', 'sneaking', 'channels', 'excavation', 'reversing', 'hvac', 'activating', 'reviewing', 'driverless', 'guitarist', 'retreating', 'upholstery', 'flowerbed', 'affordable', 'trimmer', 'spherical', 'deputies', 'yards', 'walkup', 'roadblocks', 'however', 'rousing', 'settling', 'energetically', 'sandbox', 'silently', 'occurring', 'formula', 'parcels', 'sunroom', 'snowflakes', 'yardbr', 'irrigation', 'lawnpath', 'vehicl', 'creamcolored', 'rea', 'converging', 'alighting', 'catsitting', 'offwhite', 'slinking', 'swingset', 'activated', 'brightening', 'fireball', 'authorities', 'collects', 'kennels', 'snapping', 'atr', 'maneuvered', 'sunbath', 'nearer', 'purplelit', 'tarping', 'enteringaxing', 'described', 'movng', 'closures', 
'stationery', 'offroad', 'premises', 'merrily', 'noticing', 'unmoving', 'thirsty', 'bolting', 'collapsing', 'canine', 'burst', 'brighten', 'minivanbr', 'midjump', 'permit', 'creeping', 'slurping', 'aligning', 'dismounting', 'irrigating', 'context', 'interiors', 'homeowner', 'benchmark', 'usps', 'kia', 'captions', 'relevant', 'decluttering', 'removal', 'glancing', 'emanating', 'halts', 'burrowing', 'backrest', 'lakeshore', 'comforting', 'terrarium', 'roommates', 'uptown', 'allterrain', 'barbecuing', 'leds', 'manuevering', 'reunite', 'agent', 'jogger', 'playtime', 'obtaining', 'felines', 'toolboxes', 'fluttering', 'silvery', 'relaxation', 'hurled', 'dropoff', 'snowfilled', 'flickering', 'repositioning', 'luminous', 'blinking', 'housework', 'lawns', 'postman', 'vehcile', 'unlocking', 'rinsing', 'moth', 'immobile', 'quenching', 'propertys', 'streetlamps', 'escaping', 'depositing', 'shifts', 'aboveground', 'scooting', 'approching', 'yardwork', 'messaging', 'hens', 'bungalow', 'briefly', 'calming', 'steadily', 'derive', 'retrievers', 'apply', 'transferring', 'darting', 'glimpsed', 'locating', 'retrieved', 'pouncing', 'playhouse', 'squeezes', 'obscuring', 'compy', 'leisure', 'woodpile', 'canines', 'implies', 'engrossed', 'siren', 'webbing', 'anomaly', 'lounged', 'cleanliness', 'surveillance', 'alights', 'parkers', 'responders', 'slumbering', 'stumbling', 'passerby', 'righthand', 'hosepipe', 'unboxing', 'toppling', 'springing', 'neonlit', 'upritte', 'obstructing', 'ambling', 'verge', 'prowling', 'idled', 'disposing', 'illumination', 'drone', 'pronto', 'overseeing', 'ensuring', 'wriggling', 'sedans', 'shutting', 'storybook', 'supervising', 'momentarily', 'disordered', 'sprinting', 'koi', 'youve', 'swaying', 'analyzing', 'remained', 'transitioning', 'inventory', 'raptor', 'intruder', 'dealing', 'postperson', 'tinkering', 'draping', 'kiddo', 'caressing', 'determine', 'striding', 'brewing', 'tidying', 'fidgeting', 'semitrailer', 'chervrolet', 'absence', 'furball', 'blueclad', 
'dozing', 'prime', 'barista', 'bluecoated', 'continuous', 'engulfed', 'doghouse', 'threepoint', 'fastens', 'apparition', 'scampering', 'shuffling', 'dougs', 'responding', 'purcor', 'discarding', 'courier', 'scarlet', 'fourdoor', 'kenneled', 'hurrying', 'terraced', 'embellishing', 'shoelaces', 'raking', 'entertains', 'aramark', 'critter', 'dumpsters', 'chores', 'halt', 'positioning', 'stoneart'}
# Word set for the test split, in the same format as TRAIN_VOCAB.
# It is not referenced by any code visible in this cell.
# NOTE(review): presumably applied by re-running the filter with the test
# file paths and this set swapped in - confirm.
TEST_VOCAB = {'disposes', 'horsebox', 'settles', 'bedcovers', 'snowfilled', 'grabbed', 'gradually', 'orb', 'planting', 'repositioning', 'pacing', 'housework', 'lawns', 'leashing', 'selfgrooming', 'doormat', 'postman', 'extracting', 'reversing', 'halted', 'unlocking', 'activating', 'mowing', 'nestling', 'immobile', 'deduce', 'generate', 'entrancepath', 'settle', 'retreating', 'bounding', 'snowcovered', 'flowerbed', 'lightcolored', 'scrapheap', 'mow', 'culdesac', 'townhouse', 'trimmer', 'disarrayed', 'reverses', 'progressing', 'reviews', 'wakeup', 'relocating', 'however', 'mazda', 'sauntering', 'briefly', 'romping', 'settling', 'awakening', 'energetically', 'steadily', 'departuring', 'xfinity', 'rearranging', 'parcels', 'transferring', 'neighborhod', 'darting', 'shoeboxes', 'minivans', 'bedridden', 'retrieved', 'pouncing', 'playhouse', 'unmoved', 'alighting', 'playpen', 'investigation', 'discovered', 'bedsheets', 'tarpcovered', 'reversed', 'obscuring', 'bedtime', 'canines', 'brightening', 'collects', 'dismounts', 'smoothly', 'snowmobiles', 'downtime', 'slipping', 'hopps', 'engrossed', 'vacuuming', 'accelerating', 'kennels', 'tweaking', 'feline', 'developing', 'mentioned', 'perspectives', 'bonding', 'continuing', 'pergola', 'stumbling', 'unboxing', 'darkclad', 'offloading', 'hypothetical', 'transaction', 'ambling', 'lapping', 'prowling', 'disposing', 'illumination', 'dogdozing', 'caregiver', 'duplex', 'premises', 'ensuring', 'shifting', 'scurrying', 'kibble', 'shutting', 'flowerdecorated', 'sedans', 'unmoving', 'polishing', 'sprinting', 'rake', 'unwinding', 'starry', 'canine', 'dashing', 'transitioning', 'assume', 'veranda', 'hoops', 'deliverer', 'pinpoint', 'caressing', 'determine', 'scratcher', 'brewing', 'sierra', 'tidying', 'fidgeting', 'collectors', 'kroger', 'context', 'mailman', 'homeowner', 'neatening', 'mailperson', 'usps', 'accurately', 'dozing', 'prime', 'captions', 'decluttering', 'toolshed', 'halting', 'housecat', 'gardener', 'roku', 'halts', 'tripping', 
'hisense', 'hurriedly', 'unboxed', 'verbs', 'comforting', 'refilling', 'roommates', 'scaling', 'retrieval', 'scootering', 'enable', 'chewy', 'garages', 'overtaking', 'plausible', 'lawnmower', 'accessing', 'courier', 'fourdoor', 'entertained', 'static', 'brightens', 'hurrying', 'midsized', 'succinct', 'driveways', 'playtime', 'fluttering', 'circulating', 'reusable', 'analyze', 'gmc', 'darkcolored', 'dishware', 'chores', 'halt', 'positioning', 'dropoff'}

def contains_prohibited_word(sentence, prohibited_words):
    """Return True if any whitespace-separated word of `sentence` is prohibited.

    Each word is looked up twice: verbatim, and after lowercasing and
    deleting ASCII punctuation. The second lookup lets "Mowing", "halts."
    or "cul-de-sac" match vocabulary entries such as 'mowing', 'halts' or
    'culdesac', which a verbatim comparison alone would miss.

    Args:
        sentence: caption text.
        prohibited_words: container supporting `in` (a set of strings here).

    Returns:
        True if at least one word matches in either form, else False. An
        empty or whitespace-only sentence yields False.
    """
    import string  # function-scope import keeps this helper self-contained

    strip_punctuation = str.maketrans('', '', string.punctuation)
    for word in sentence.split():
        # Verbatim lookup first, so anything that matched before still matches.
        if word in prohibited_words:
            return True
        if word.lower().translate(strip_punctuation) in prohibited_words:
            return True
    return False

# Input produced by the conversion cell, and where the filtered copy goes
input_file_path = 'coco_vocab_coco_format_train.json'
output_file_path = 'coco_vocab_coco_format_train_filtered.json'

with open(input_file_path, 'r') as file:
    data = json.load(file)

# For each image keep only the sentences that contains_prohibited_word()
# rejects for TRAIN_VOCAB, and rebuild "sentids" from the survivors so the
# two lists stay aligned. Images are kept even if no sentence survives.
for image_data in data['images']:
    kept_sentences = [
        candidate
        for candidate in image_data['sentences']
        if not contains_prohibited_word(candidate['raw'], TRAIN_VOCAB)
    ]
    image_data['sentences'] = kept_sentences
    image_data['sentids'] = [kept['sentid'] for kept in kept_sentences]

# Write the filtered structure to the new file
with open(output_file_path, 'w') as output_file:
    json.dump(data, output_file, indent=4)

print(f"Filtered data saved to {output_file_path}")

Filtered data saved to coco_vocab_coco_format_test_filtered.json
