In [6]:
import os
import csv
import json

def create_dict_from_csv_files(folder_path, *, skip_header=False):
    """Map each battery name to the sorted, de-duplicated datasets listed for it.

    Every entry of ``folder_path`` whose name ends with ``.csv`` is read with
    ``csv.reader``. Column 0 of a row is used as the battery and column 1 as
    the dataset (rows are expected to look like
    ``[battery, dataset, n, d, labels, k, noise, g]``); both values are
    stripped of surrounding whitespace. Any further columns are ignored.

    Args:
        folder_path: Directory to scan. Only its direct entries are examined.
        skip_header: When true, the first row of every CSV file is discarded.
            Defaults to ``False``, in which case a header row is treated like
            data (e.g. it produces a ``'battery': ['dataset']`` entry).

    Returns:
        A dict mapping each battery to a sorted list of its unique datasets.
        Empty if the folder contains no usable CSV rows.
    """
    datasets_by_battery = {}

    for filename in os.listdir(folder_path):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', newline='') as file:
            reader = csv.reader(file)

            if skip_header:
                # The default of None keeps an empty file from raising StopIteration.
                next(reader, None)

            for row in reader:
                # csv.reader yields [] for blank lines, and a row with a single
                # column cannot supply both fields; indexing either would raise
                # IndexError, so such rows are skipped.
                if len(row) < 2:
                    continue

                battery = row[0].strip()
                dataset = row[1].strip()
                datasets_by_battery.setdefault(battery, set()).add(dataset)

    # Sets are not JSON-serializable; sorted lists also make the output stable.
    return {
        battery: sorted(datasets)
        for battery, datasets in datasets_by_battery.items()
    }

def save_dict_to_json(eval_collections, json_filename):
    """Write ``eval_collections`` to ``json_filename`` as JSON indented by 4 spaces.

    The target file is opened in text write mode, so any existing content is
    replaced. Returns ``None``.
    """
    with open(json_filename, mode='w') as out_handle:
        json.dump(obj=eval_collections, fp=out_handle, indent=4)

# Directory that holds the collection CSV files; the JSON summary is written
# into this same directory.
folder_path = '/Users/cajoshuapark/Dev/research/embedding_based_clustering_research/framework/collection_information'
eval_collections = create_dict_from_csv_files(folder_path)
print(eval_collections)

# Build the output path from folder_path instead of repeating the literal, so
# the input and output locations cannot drift apart.
json_filename = os.path.join(folder_path, 'eval_collections.json')
save_dict_to_json(eval_collections, json_filename)
print(f"Data saved to {json_filename}")


{'battery': ['dataset'], 'g2mg': ['g2mg_128_10', 'g2mg_128_20', 'g2mg_128_30', 'g2mg_128_40', 'g2mg_128_50', 'g2mg_128_60', 'g2mg_128_70', 'g2mg_128_80', 'g2mg_128_90', 'g2mg_16_10', 'g2mg_16_20', 'g2mg_16_30', 'g2mg_16_40', 'g2mg_16_50', 'g2mg_16_60', 'g2mg_16_70', 'g2mg_16_80', 'g2mg_16_90', 'g2mg_1_10', 'g2mg_1_20', 'g2mg_1_30', 'g2mg_1_40', 'g2mg_1_50', 'g2mg_1_60', 'g2mg_1_70', 'g2mg_1_80', 'g2mg_1_90', 'g2mg_2_10', 'g2mg_2_20', 'g2mg_2_30', 'g2mg_2_40', 'g2mg_2_50', 'g2mg_2_60', 'g2mg_2_70', 'g2mg_2_80', 'g2mg_2_90', 'g2mg_32_10', 'g2mg_32_20', 'g2mg_32_30', 'g2mg_32_40', 'g2mg_32_50', 'g2mg_32_60', 'g2mg_32_70', 'g2mg_32_80', 'g2mg_32_90', 'g2mg_4_10', 'g2mg_4_20', 'g2mg_4_30', 'g2mg_4_40', 'g2mg_4_50', 'g2mg_4_60', 'g2mg_4_70', 'g2mg_4_80', 'g2mg_4_90', 'g2mg_64_10', 'g2mg_64_20', 'g2mg_64_30', 'g2mg_64_40', 'g2mg_64_50', 'g2mg_64_60', 'g2mg_64_70', 'g2mg_64_80', 'g2mg_64_90', 'g2mg_8_10', 'g2mg_8_20', 'g2mg_8_30', 'g2mg_8_40', 'g2mg_8_50', 'g2mg_8_60', 'g2mg_8_70', 'g2mg_8_80'