## Set up

In [1]:
from os import listdir
import pandas as pd

## Combine performance from multiple models

In [2]:
def combine_performance_files(performance_directory='../data/derived/performance'):
    """
    Combine performance files into a single file

    Every CSV file in the performance directory (other than the summary
    itself) is expected to be named
    ``<model>_vector<vector>_<hyperparameter><value>_<dataset>.csv`` and to
    contain a ``metric`` and a ``value`` column. One row per metric per file
    is written to ``summary.csv`` in the same directory, replacing any
    existing summary. Entries that do not end in ``.csv`` (for example the
    ``.ipynb_checkpoints`` directory) are ignored.

    Parameters
    ----------
    performance_directory : str
        Directory that holds the performance files; the summary is written
        to this directory as well.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a CSV file name does not consist of four underscore-separated
        parts, or if its model name has no known hyperparameter.
    """

    summary_filename = 'summary.csv'
    csv_suffix = '.csv'

    # hyperparameter that is encoded in the file name for each model
    hyperparameter_names = {'decisiontree': 'maxdepth',
                            'svc': 'c',
                            'mlp': 'alpha'}

    summary_columns = ['model_name',
                       'dataset_name',
                       'vector_name',
                       'hyperparameter_name',
                       'hyperparameter_value',
                       'metric_name',
                       'metric_value']

    # keep only performance CSVs: this drops notebook checkpoints and a
    # previously written summary without failing when they are absent;
    # sort because listdir returns entries in arbitrary order
    performance_files = sorted(
        file for file in listdir(performance_directory)
        if file.endswith(csv_suffix) and file != summary_filename
    )

    # one tuple per (file, metric), in summary_columns order
    summary_rows = []

    # iterate over files
    for file in performance_files:

        # get model, vector, hyperparameter, and dataset name from file name
        name_parts = file[:-len(csv_suffix)].split('_')
        if len(name_parts) != 4:
            raise ValueError(
                'unexpected performance file name (expected 4 '
                'underscore-separated parts): ' + file)
        model_name, vector_string, hyperparameter_string, dataset_name = name_parts

        # get hyperparameter name based on model name; fail loudly instead
        # of reusing the value left over from a previous file
        if model_name not in hyperparameter_names:
            raise ValueError(
                'unknown model name ' + repr(model_name) + ' in file: ' + file)
        hyperparameter_name = hyperparameter_names[model_name]

        # get vector name by dropping the leading 'vector' characters
        vector_name = vector_string[len('vector'):]

        # get hyperparameter value by dropping the leading hyperparameter name
        hyperparameter_value = hyperparameter_string[len(hyperparameter_name):]

        # get metrics dataframe
        metrics_df = pd.read_csv(performance_directory + '/' + file)

        # pair each metric with the value on its own row (a lookup by name
        # would give repeated metric names the first row's value)
        for metric_name, metric_value in zip(metrics_df['metric'],
                                             metrics_df['value']):
            summary_rows.append((model_name,
                                 dataset_name,
                                 vector_name,
                                 hyperparameter_name,
                                 hyperparameter_value,
                                 metric_name,
                                 metric_value))

    # create dataframe from summary rows; explicit columns keep the header
    # even when no performance files were found
    summary_df = pd.DataFrame(summary_rows, columns=summary_columns)

    # write summary to CSV
    filepath_out = performance_directory + '/' + summary_filename
    summary_df.to_csv(filepath_out, index=False)

    return None

In [3]:
# combine the per-model performance files and write summary.csv
combine_performance_files()