In [1]:
import ast
import glob
import warnings
from collections import defaultdict
from datetime import date

import numpy as np
import pandas as pd
import wandb

today = date.today()
api = wandb.Api()

project_name = "TopoBenchmarkX_Graph"
user = "telyatnikov_sap"
# The run table is cached in the current directory as "<user>_<project_name>.csv".
cache_name = f"{user}_{project_name}"
cache_path = f"{cache_name}.csv"

# Find all csv files in the current directory (the same place the cache is
# written to and read from below).
csv_files = glob.glob("*.csv")
# Collect all the names of the csv files without the extension
csv_names = [csv_file[:-4] for csv_file in csv_files]

# Compare against the full cache name: the file on disk is prefixed with the
# user name, so checking the bare project name would never match.
if cache_name not in csv_names:
    runs = api.runs(f"{user}/{project_name}")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame(
        {"summary": summary_list, "config": config_list, "name": name_list}
    )

    runs_df.to_csv(cache_path)
else:
    runs_df = pd.read_csv(cache_path, index_col=0)

    # The CSV stores each dict as its repr string; parse it back column-wise.
    # Assigning the whole column writes the parsed dicts into runs_df itself,
    # whereas assigning into a row taken from .iloc may only change a copy.
    # NOTE(review): ast.literal_eval cannot parse bare `nan`/`inf` tokens, so a
    # cached summary containing them will raise here.
    for dict_column in ("summary", "config"):
        runs_df[dict_column] = runs_df[dict_column].apply(ast.literal_eval)


# Merge each run's hyperparameters into its summary dict (in place), so that one
# flat record per run holds both metrics and config.
for summary, config in zip(runs_df["summary"], runs_df["config"]):
    summary.update(config)

lst = list(runs_df["summary"])
df = pd.DataFrame.from_dict(lst)

df_init = df.copy()

# Get average epoch run time
df["epoch_run_time"] = df["_runtime"] / df["epoch"]

In [2]:
def normalize_column(df, column_to_normalize):
    """Expand a column of nested dicts into flat, prefixed columns.

    Each nested key becomes a column named
    ``"<column_to_normalize>.<dotted.key.path>"`` and the original nested
    column is removed from the result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``column_to_normalize``. It is not modified.
    column_to_normalize : str
        Name of the column holding the nested dictionaries.

    Returns
    -------
    result_df : pd.DataFrame
        ``df`` without the nested column, followed by the flattened columns.
        The row index of ``df`` is preserved.
    new_columns : pd.Index
        Names of the flattened columns that were added.
    """
    # Flatten from a plain list so the output is strictly positional, then
    # re-attach the caller's index. Without this, a frame whose index is not
    # 0..n-1 would be misaligned by the index-based concat below.
    flattened_df = pd.json_normalize(df[column_to_normalize].tolist())
    flattened_df.index = df.index
    # Prefix every flattened column with the name of the nested column
    flattened_df.columns = [
        f"{column_to_normalize}.{col}" for col in flattened_df.columns
    ]
    new_columns = flattened_df.columns
    # Replace the nested column by its flattened counterpart
    result_df = pd.concat(
        [df.drop(columns=column_to_normalize), flattened_df], axis=1
    )
    return result_df, new_columns


# Nested config groups that are expanded into flat, dot-separated columns
columns_to_normalize = ["model", "dataset", "callbacks", "paths"]

# Names of every flattened config column created by the expansion
config_columns = []
for nested_name in columns_to_normalize:
    df, flattened_names = normalize_column(df, nested_name)
    config_columns += list(flattened_names)

## Work out per-task names for the US-county-demos dataset

In [3]:
# Rows whose data_name is 'US-county-demos' cover several task variables.
# Rename them to 'US-county-demos-<task_variable>' so that each task variable
# is handled as a dataset of its own.
is_us_county = df['dataset.parameters.data_name'] == 'US-county-demos'
df.loc[is_us_county, 'dataset.parameters.data_name'] = (
    df.loc[is_us_county, 'dataset.parameters.data_name']
    + '-'
    + df.loc[is_us_county, 'dataset.parameters.task_variable']
)

In [4]:
# Print all column names, 5 per line
column_names = list(df.columns)
for start in range(0, len(column_names), 5):
    print(column_names[start:start + 5])


['train/loss', 'train/recall', 'test/accuracy', 'val/precision', 'trainer/global_step']
['_wandb', 'lr-Adam', '_runtime', 'val/loss', 'test/auroc']
['train/accuracy', 'test/loss', 'val/auroc', 'val/accuracy', 'test/precision']
['_step', 'epoch', '_timestamp', 'val/recall', 'test/recall']
['train/auroc', 'train/precision', 'seed', 'tags', 'extras']
['trainer', 'ckpt_path', 'task_name', 'model/params/total', 'model/params/trainable']
['model/params/non_trainable', 'val/mse', 'val/mae', 'train/mae', 'train/mse']
['test/mae', 'test/mse', 'epoch_run_time', 'model.compile', 'model._target_']
['model.model_name', 'model.model_domain', 'model.loss.task', 'model.loss._target_', 'model.loss.loss_type']
['model.readout._target_', 'model.readout.hidden_dim', 'model.readout.readout_name', 'model.readout.num_cell_dimensions', 'model.backbone.act']
['model.backbone.dropout', 'model.backbone._target_', 'model.backbone.num_layers', 'model.backbone.in_channels', 'model.backbone.hidden_channels']
['model

### See unique datasets

In [5]:
# Distinct dataset names among the collected runs, and how many there are
dataset_names = df['dataset.parameters.data_name'].unique()
print(dataset_names)
print("Num unique datasets:", len(dataset_names))

['minesweeper' 'questions' 'tolokers' 'amazon_ratings' 'roman_empire'
 'IMDB-MULTI' 'IMDB-BINARY' 'REDDIT-BINARY' 'NCI109' 'NCI1' 'PROTEINS'
 'MUTAG' 'ZINC' 'PubMed' 'citeseer' 'Cora'
 'US-county-demos-UnemploymentRate' 'US-county-demos-BachelorRate'
 'US-county-demos-DeathRate' 'US-county-demos-BirthRate'
 'US-county-demos-MigraRate' 'US-county-demos-MedianIncome'
 'US-county-demos-Election']
Num unique datasets: 23


## See unique models

In [6]:
# Distinct model names among the collected runs
model_names = df['model.model_name'].unique()
print(model_names)

['gcn' 'gin' 'gat']


## Solve batch problems

In [7]:
datasets = ['minesweeper', 'questions', 'tolokers', 'amazon_ratings', 'roman_empire']
models = ['gcn', 'gin']
# For these models and datasets the batch size was logged by mistake as 128 or
# 256 although it should be 1. The runs logged with 128 are kept and relabelled
# to batch size 1; the runs logged with 256 are discarded.
affected = (
    df['model.model_name'].isin(models)
    & df['dataset.parameters.data_name'].isin(datasets)
)
# Relabel the 128 runs
df.loc[
    affected & (df['dataset.parameters.batch_size'] == 128),
    'dataset.parameters.batch_size',
] = 1
# Drop the 256 runs
rows_to_drop = df[affected & (df['dataset.parameters.batch_size'] == 256)].index
df.drop(rows_to_drop, inplace=True)
        

## Solve issue with projection dropout

In [8]:
# Distinct projection-dropout values present in the runs
proj_dropout_values = df['model.feature_encoder.proj_dropout'].unique()
print(proj_dropout_values)

[0.5  0.25 0.  ]


In [9]:
# Keep only the runs whose projection dropout is 0.5 or 0.25
kept_proj_dropout = [0.5, 0.25]
has_kept_dropout = df['model.feature_encoder.proj_dropout'].isin(kept_proj_dropout)
df = df.loc[has_kept_dropout]


In [10]:
# Renumber the remaining rows 0..n-1 after the filtering above
df = df.reset_index(drop=True)

In [11]:
# Swept parameters:
sweeped_columns = [
    'model.optimizer.lr',
    'model.feature_encoder.out_channels',
    'model.backbone.num_layers',
    'model.feature_encoder.proj_dropout',
    'dataset.parameters.batch_size',
    'dataset.parameters.data_seed',
    'seed',
]

# For every (model, dataset) pair, print the distinct values seen for each
# swept parameter.
model_names_col = df['model.model_name']
dataset_names_col = df['dataset.parameters.data_name']
for model in model_names_col.unique():
    print(f"Model: {model}")
    is_model = model_names_col == model
    for dataset in dataset_names_col.unique():
        print(f"Dataset: {dataset}")
        pair_runs = df.loc[is_model & (dataset_names_col == dataset)]
        for column in sweeped_columns:
            print(f"Column: {column}")
            print(pair_runs[column].unique())

        print('---------------NEW DATASET------------------')
    print('---------------NEW MODEL------------------')


Model: gcn
Dataset: minesweeper
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128  64  32]
Column: model.backbone.num_layers
[4 3 2 1]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch_size
[1]
Column: dataset.parameters.data_seed
[9 7 5 3 0]
Column: seed
[42]
---------------NEW DATASET------------------
Dataset: questions
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128  64  32]
Column: model.backbone.num_layers
[4 3 2 1]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch_size
[1]
Column: dataset.parameters.data_seed
[9 7 5 3 0]
Column: seed
[42]
---------------NEW DATASET------------------
Dataset: tolokers
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128  64  32]
Column: model.backbone.num_layers
[4 3 2 1]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch

[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128  64  32]
Column: model.backbone.num_layers
[2 1]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch_size
[1]
Column: dataset.parameters.data_seed
[9 7 5 3 0]
Column: seed
[42]
---------------NEW DATASET------------------
Dataset: citeseer
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128  64  32]
Column: model.backbone.num_layers
[2 1]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch_size
[1]
Column: dataset.parameters.data_seed
[9 7 5 3 0]
Column: seed
[42]
---------------NEW DATASET------------------
Dataset: Cora
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128  64  32]
Column: model.backbone.num_layers
[2 1]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch_size
[1]
Column: dataset.parameters.data_seed
[9 7 5 3 0]
Column: seed
[42]

### Get the best results

In [12]:
# Extract best results for each model and dataset.
# Step 1: restrict the table to the columns needed for the comparison.

# Columns identifying the model and the dataset
dataset_model_columns = ['model.model_name', 'dataset.parameters.data_name']

# Hyperparameters that define one configuration
sweeped_columns = [
    'model.optimizer.lr',
    'model.feature_encoder.out_channels',
    'model.backbone.num_layers',
    'model.feature_encoder.proj_dropout',
    'dataset.parameters.batch_size',
]
# Seeds are not part of a configuration; they distinguish repeated runs of it
run_columns = ['dataset.parameters.data_seed', 'seed']

# Validation/test value of every tracked metric, in val-then-test order
performance_columns = [
    f"{split}/{metric}"
    for metric in ('loss', 'mae', 'mse', 'accuracy', 'auroc', 'recall', 'precision')
    for split in ('val', 'test')
]

keep_columns = [
    *dataset_model_columns,
    *sweeped_columns,
    *performance_columns,
    *run_columns,
]
df = df[keep_columns]

In [13]:
# Metric columns reported for classification and for regression datasets
performance_classification = [
    f"{split}/{metric}"
    for metric in ('accuracy', 'auroc', 'recall', 'precision')
    for split in ('val', 'test')
]
performance_regression = [
    f"{split}/{metric}" for metric in ('mae', 'mse') for split in ('val', 'test')
]


def _metric_spec(optim_metric, eval_metric, direction, performance_columns):
    """Build the per-dataset settings used to select and report the best run.

    ``optim_metric`` is the column configurations are ranked by,
    ``eval_metric`` the column that is reported, ``direction`` is ``'max'`` or
    ``'min'`` and ``performance_columns`` lists the metric columns to aggregate.
    """
    return {
        'optim_metric': optim_metric,
        'eval_metric': eval_metric,
        'direction': direction,
        'performance_columns': performance_columns,
    }


# Datasets grouped by the metric pair they are optimised and evaluated on
_accuracy_datasets = [
    'IMDB-MULTI', 'IMDB-BINARY', 'REDDIT-BINARY', 'NCI109', 'NCI1', 'PROTEINS',
    'MUTAG', 'Cora', 'citeseer', 'PubMed', 'roman_empire', 'amazon_ratings',
]
_auroc_datasets = ['tolokers', 'questions', 'minesweeper']
_us_county_datasets = [
    f"US-county-demos-{task_variable}"
    for task_variable in (
        'UnemploymentRate', 'BachelorRate', 'DeathRate', 'BirthRate',
        'MigraRate', 'MedianIncome', 'Election',
    )
]

# For each dataset: the metric to optimise, the metric to report, the
# optimisation direction and the metric columns that apply to it
optimization_metrics = {
    **{
        name: _metric_spec('val/accuracy', 'test/accuracy', 'max', performance_classification)
        for name in _accuracy_datasets
    },
    **{
        name: _metric_spec('val/auroc', 'test/auroc', 'max', performance_classification)
        for name in _auroc_datasets
    },
    'ZINC': _metric_spec('val/mse', 'test/mae', 'min', performance_regression),
    **{
        name: _metric_spec('val/mse', 'test/mse', 'min', performance_regression)
        for name in _us_county_datasets
    },
}

len(optimization_metrics)

23

### Generate the best results

In [14]:
# Get unique datasets
datasets = list(df['dataset.parameters.data_name'].unique())
# Get unique models
models = list(df['model.model_name'].unique())

# best_results[dataset][model] -> {"mean": ..., "std": ...} of the evaluation
# metric for the best hyperparameter configuration.
best_results = defaultdict(dict)
# Declared alongside the other containers; nothing in this cell fills it.
best_results_all_metrics = defaultdict(dict)
# best_runs[dataset][model] -> the individual runs of the best configuration.
best_runs = defaultdict(dict)

# Go over each dataset and model and find the best result
for dataset in datasets:
    # The metric settings depend only on the dataset, so read them once here
    optim_metric = optimization_metrics[dataset]['optim_metric']
    eval_metric = optimization_metrics[dataset]['eval_metric']
    direction = optimization_metrics[dataset]['direction']
    # Keep only the metrics that matter for this dataset
    performance_columns = optimization_metrics[dataset]['performance_columns']

    # Classification metrics are reported in percent with 2 decimals;
    # all other metrics keep their scale and are rounded to 4 decimals.
    if 'test/accuracy' in performance_columns:
        scale, decimals = 100, 2
    else:
        scale, decimals = 1, 4

    for model in models:
        # Get the subset of the DataFrame for the current dataset and model
        subset = df[
            (df['dataset.parameters.data_name'] == dataset)
            & (df['model.model_name'] == model)
        ]
        subset = subset[
            dataset_model_columns + sweeped_columns + performance_columns + run_columns
        ]

        # Not every model has necessarily been run on every dataset; without
        # this guard the `.values[0]` lookups below fail on an empty table.
        if subset.empty:
            warnings.warn(
                f"No runs found for model '{model}' on dataset '{dataset}'; skipping."
            )
            continue

        # Mean and std of every metric over the runs (seeds) of each
        # hyperparameter configuration. dropna=False keeps configurations
        # in which a hyperparameter is missing (NaN).
        aggregated = subset.groupby(sweeped_columns, dropna=False).agg(
            {col: ["mean", "std"] for col in performance_columns},
        )

        # Turn the group keys back into regular columns
        aggregated = aggregated.reset_index()
        # Best configuration first, judged by the mean of the optimisation metric
        aggregated = aggregated.sort_values(
            by=(optim_metric, "mean"), ascending=(direction == 'min')
        )

        # Convert to percent (classification only) and round for reporting
        for col in performance_columns:
            for stat in ("mean", "std"):
                aggregated[(col, stat)] = (aggregated[(col, stat)] * scale).round(decimals)

        # Get the best result
        final_best = aggregated.head(1)

        best_results[dataset][model] = {
            "mean": final_best[(eval_metric, "mean")].values[0],
            "std": final_best[(eval_metric, "std")].values[0],
        }

        # Extract the hyperparameter values of the best configuration.
        # After reset_index the group keys live under the (name, '') columns.
        best_params = {
            col: final_best[(col, '')].item() for col in sweeped_columns
        }

        # Select the individual runs that used exactly these hyperparameters.
        # NaN never compares equal to itself, so a missing hyperparameter has
        # to be matched with isna() instead of ==.
        matches_best = pd.Series(True, index=subset.index)
        for param, value in best_params.items():
            if pd.isna(value):
                matches_best &= subset[param].isna()
            else:
                matches_best &= subset[param] == value
        best_runs[dataset][model] = subset[matches_best].copy()
        
            
        


## Save obtained best results and best runs

In [15]:
# Flatten the nested {dataset: {model: {"mean", "std"}}} mapping into one
# (dataset, model, "mean ± std") record per pair
nested_dict = dict(best_results)
performance_rows = [
    (dataset_name, model_name, f"{stats['mean']} ± {stats['std']}")
    for dataset_name, per_model in nested_dict.items()
    for model_name, stats in per_model.items()
]
result_dict = pd.DataFrame(
    performance_rows, columns=["Dataset", "Model", "Performance"]
)

# Reshape to one row per model and one column per dataset
result_dict = result_dict.pivot_table(
    index="Model", columns="Dataset", values="Performance", aggfunc="first"
)

In [16]:
# Raise pandas' display limits (rows, columns, line width) to 1000 each
for option_name in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(option_name, 1000)
# Store the model-by-dataset table of best results
result_dict.to_csv("best_results_graph.csv")