In [None]:
### Notebook for creating the RDV test set.
### Creates the folder structure in accordance with the RDV codebase.
### To evaluate on synthetic data, simply copy the images generated from
### the same prompts (taken from the same metadata file) into the "data/" folder.

In [None]:
import os
import pandas as pd
import shutil
import json

In [None]:
### For StableDiffusion: the test prompts come from a metadata CSV file.
### Run either this cell or the ControlNet cell; both assign `df`.
df = pd.read_csv(filepath_or_buffer="....metadata.csv")
df.head()

In [None]:
### For ControlNet: the test split is a JSON-lines file. The "image" column is
### renamed to "file_name", the conditioning image is dropped, and the frame is
### reduced to the two columns (file_name, text), in that order.
columns_titles = ["file_name","text"]
df = (
    pd.read_json("....test.jsonl", lines=True)
    .rename(columns={"text": "text", "image": "file_name"})
    .drop(columns="conditioning_image")
    .reindex(columns=columns_titles)
)
df.head()

In [None]:
# Input locations -- fill these in before running the cell that builds the test set.
testset_path = "" # path to test images; joined with each row's file_name when the images are copied
cholect50_path = "" # path to original CholecT50 dataset; labels are read from <cholect50_path>/labels/<video id>.json

# do not change, this is the RDV folder structure
# (all five paths are relative, so they resolve against the current working directory)
data_path = "data/"  # images, one sub-folder per video id
instrument_path = "instrument/"  # instrument label files, <video id>.txt
verb_path = "verb/"  # verb label files, <video id>.txt
target_path = "target/"  # target label files, <video id>.txt
triplet_path = "triplet/"  # triplet label files, <video id>.txt

In [None]:
### Build the RDV test set from the rows of `df`:
###   * copy every test image to data/<output_vid>/<row position, zero-padded to 6 digits>.png
###   * collect one label line per image for instrument / verb / target / triplet
###   * write the collected lines to <task folder>/<output_vid>.txt
###
### Label lines come from one of two sources, decided per row:
###   * data/<vid_id>/ does not exist -> built from the CholecT50 JSON labels
###   * data/<vid_id>/ exists         -> copied from the existing RDV <task>/<vid_id>.txt files
### In both cases the first field of the line is replaced by the new image index.

# RDV video id under which the test set is stored. "VID997" was marked as the
# ControlNet set; an earlier variant of this cell wrote "VID999" instead.
output_vid = "VID997"

# Length of the multi-hot vector written per task.
triplet_n_categories = 100
instrument_n_categories = 6
verb_n_categories = 10
target_n_categories = 15

instrument_lines = []
verb_lines = []
target_lines = []
triplet_lines = []

# One entry per task:
# (RDV label folder, position of the class id inside a CholecT50 label, vector length, collected lines)
label_tasks = [
    (triplet_path, 0, triplet_n_categories, triplet_lines),
    (instrument_path, 1, instrument_n_categories, instrument_lines),
    (verb_path, 7, verb_n_categories, verb_lines),
    (target_path, 8, target_n_categories, target_lines),
]

# Every source file is parsed once and reused for all frames of the same video,
# instead of being re-opened and re-read for each row.
annotation_cache = {}   # vid_id -> "annotations" mapping of the CholecT50 label JSON
label_file_cache = {}   # path of an RDV label .txt file -> list of its lines


def _load_annotations(vid_id):
    """Return the "annotations" mapping stored in <cholect50_path>/labels/<vid_id>.json."""
    if vid_id not in annotation_cache:
        with open(f"{cholect50_path}/labels/{vid_id}.json") as f:
            annotation_cache[vid_id] = json.load(f)["annotations"]
    return annotation_cache[vid_id]


def _load_label_lines(path):
    """Return all lines of the RDV label file at `path` (line endings kept)."""
    if path not in label_file_cache:
        with open(path, "r") as f:
            label_file_cache[path] = f.readlines()
    return label_file_cache[path]


def _multi_hot_line(frame_idx, class_ids, n_categories):
    """Return "<frame_idx>,<n_categories comma-separated 0/1 flags>" followed by a newline.

    The flag at position c is "1" for every c in `class_ids`; ids equal to -1 are
    skipped (presumably the dataset's marker for "no class" -- confirm against
    the CholecT50 documentation).
    """
    flags = ["0"] * n_categories
    for class_id in class_ids:
        if class_id == -1:
            continue
        flags[class_id] = "1"
    return ",".join([str(frame_idx)] + flags) + "\n"


def _reindexed_line(line, frame_idx):
    """Return `line` with its first comma-separated field replaced by `frame_idx`.

    The result always ends with exactly one newline, so a source line taken from
    the end of a file without a trailing newline cannot merge with the next
    record in the output file.
    """
    entries = line.rstrip("\n").split(",")
    entries[0] = str(frame_idx)
    return ",".join(entries) + "\n"


# Create the output folders up front; the label folders may not exist yet when
# every video is read from the CholecT50 JSON files.
save_path = os.path.join(data_path, output_vid)
os.makedirs(save_path, exist_ok=True)
for label_dir, _, _, _ in label_tasks:
    os.makedirs(label_dir, exist_ok=True)

for idx, (_, row) in enumerate(df.iterrows()):
    # file_name has the form "<vid_id>/<frame number>.<extension>"
    file_name = row['file_name']
    name_parts = file_name.split("/")
    vid_id = name_parts[0]
    line_index = int(name_parts[1].split(".")[0])

    # idx is the position of the row in df and becomes the new frame number
    shutil.copyfile(os.path.join(testset_path, file_name),
                    os.path.join(save_path, str(idx).zfill(6) + '.png'))

    if not os.path.exists(os.path.join(data_path, vid_id)):
        # Video is not part of the local RDV data: derive the label lines from
        # the CholecT50 annotations of this frame.
        labels = _load_annotations(vid_id)[str(line_index)]
        for _, column, n_categories, collected in label_tasks:
            class_ids = [label[column] for label in labels]
            collected.append(_multi_hot_line(idx, class_ids, n_categories))
    else:
        # Video already exists in RDV format: reuse its label lines.
        for label_dir, _, _, collected in label_tasks:
            source_lines = _load_label_lines(os.path.join(label_dir, vid_id + '.txt'))
            collected.append(_reindexed_line(source_lines[line_index], idx))

for label_dir, _, _, collected in label_tasks:
    with open(os.path.join(label_dir, output_vid + ".txt"), "w", encoding='UTF8') as f:
        f.writelines(collected)