In [12]:
import pandas as pd
import numpy as np

# Path to the results CSV file
csv_file_path = 'concrete.csv'  # Update with your actual path

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Expected columns (assumption about the CSV schema -- confirm against the file):
# 'add_remove_every_n_epoch', 'proto_split_density_threshold', 'proto_remove_density_threshold',
# 'repulsion_loss_margin', 'seed', 'device', 'cov_value', 'PINAW_value'

# Columns that identify one hyperparameter setting
hyperparameter_cols = ['add_remove_every_n_epoch', 'proto_split_density_threshold',
                       'proto_remove_density_threshold', 'repulsion_loss_margin']

# Convert the metric columns to numeric; non-numeric entries become NaN
df['cov_value'] = pd.to_numeric(df['cov_value'], errors='coerce')
df['PINAW_value'] = pd.to_numeric(df['PINAW_value'], errors='coerce')

# The CSV carries no coverage-level identifier, so the level is inferred from the
# ORDER of the rows within each (hyperparameters, seed) run: 1st -> 90, 2nd -> 50,
# 3rd -> 10.  This positional index must be computed BEFORE rows with invalid
# metrics are dropped; otherwise a dropped row shifts every later row of that run
# onto the wrong coverage level.
df['result_index'] = df.groupby(hyperparameter_cols + ['seed']).cumcount()

# Map result_index to coverage levels.
# NOTE(review): a run with more than three rows gets NaN here for the extra rows,
# and the groupby below then silently excludes them -- confirm that each
# (hyperparameters, seed) combination appears exactly three times in the CSV.
coverage_level_mapping = {0: 90, 1: 50, 2: 10}
df['coverage_level'] = df['result_index'].map(coverage_level_mapping)

# Drop rows with NaN values in 'cov_value' or 'PINAW_value'
df = df.dropna(subset=['cov_value', 'PINAW_value'])

# Group by hyperparameters and coverage level
group_cols = hyperparameter_cols + ['coverage_level']

# Mean and standard deviation of coverage and PINAW for each hyperparameter
# setting across seeds.  Named aggregation is required here: the output columns
# 'cov_std' / 'PINAW_std' are derived from 'cov_value' / 'PINAW_value' and do not
# exist as input columns, so a dict-style agg keyed on them would fail.
grouped = df.groupby(group_cols).agg(
    cov_value=('cov_value', 'mean'),
    PINAW_value=('PINAW_value', 'mean'),
    cov_std=('cov_value', 'std'),
    PINAW_std=('PINAW_value', 'std'),
).reset_index()


# Now, for each coverage level, find the best hyperparameter settings
# Criteria:
# - Mean coverage must lie within +/- `tolerance` percentage points of the target
#   (e.g., with tolerance=5 and a 90% target, acceptable coverage is 85%-95%)
# - Within acceptable coverage, we want the hyperparameters that minimize PINAW (smaller is better)

def find_best_hyperparameters(grouped_df, coverage_target, tolerance=3):
    """Pick the narrowest-interval setting(s) whose coverage is close to the target.

    Args:
        grouped_df: DataFrame with at least the columns 'coverage_level',
            'cov_value' and 'PINAW_value'.
        coverage_target: Value matched against 'coverage_level' (e.g. 90); also
            the centre of the acceptance band, in percent.
        tolerance: Half-width of the acceptance band, in percentage points.

    Returns:
        The rows of ``grouped_df`` at ``coverage_target`` whose 'cov_value' lies
        in ``[(target - tolerance) / 100, (target + tolerance) / 100]`` and whose
        'PINAW_value' equals the minimum among those rows (ties are all kept).
        Returns ``None``, after printing a message, when no row is in the band.
    """
    # The bounds are divided by 100, so 'cov_value' is compared as a fraction.
    lower_bound = (coverage_target - tolerance) / 100
    upper_bound = (coverage_target + tolerance) / 100

    # Keep only rows of the requested coverage level ...
    at_target_level = grouped_df[grouped_df['coverage_level'] == coverage_target]

    # ... and, of those, only rows inside the (inclusive) acceptance band.
    in_band = at_target_level['cov_value'].between(lower_bound, upper_bound)
    candidates = at_target_level[in_band]

    if candidates.empty:
        print(f"No hyperparameter settings found within {tolerance}% tolerance for {coverage_target}% coverage.")
        return None

    # Smallest PINAW wins; every row that ties on the minimum is returned.
    narrowest = candidates['PINAW_value'].min()
    return candidates[candidates['PINAW_value'].eq(narrowest)]

# Find best hyperparameters for each coverage level
best_settings_90 = find_best_hyperparameters(grouped, 90, tolerance=5)
best_settings_50 = find_best_hyperparameters(grouped, 50, tolerance=5)
best_settings_10 = find_best_hyperparameters(grouped, 10, tolerance=5)

# Print the best hyperparameter settings.
# find_best_hyperparameters returns None when no setting falls inside the
# tolerance band, so each result is checked before it is indexed; indexing None
# would abort the cell with a TypeError and hide the remaining levels.
report_cols = hyperparameter_cols + ['cov_value', 'PINAW_value']
for position, (level, best) in enumerate(
    [(90, best_settings_90), (50, best_settings_50), (10, best_settings_10)]
):
    header = f"Best hyperparameters for {level}% coverage level:"
    # Every header after the first is preceded by a blank line.
    print(header if position == 0 else "\n" + header)
    if best is None:
        print("(no hyperparameter setting within tolerance)")
    else:
        print(best[report_cols])

# Optionally, analyze the influence of each hyperparameter on the results
# For example, compute correlations between hyperparameters and PINAW_value / cov_value
def analyze_hyperparameter_influence(grouped_df, param_cols=None):
    """Print correlations of each hyperparameter with PINAW_value and cov_value.

    Args:
        grouped_df: DataFrame containing the hyperparameter columns plus
            'cov_value' and 'PINAW_value'. It is copied, not modified.
        param_cols: Names of the hyperparameter columns to analyse. When omitted,
            the module-level ``hyperparameter_cols`` list is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        None. The two correlation columns are printed to stdout.
    """
    if param_cols is None:
        # Fall back to the notebook-level list for backward compatibility.
        param_cols = hyperparameter_cols
    param_cols = list(param_cols)

    # Work on a copy so the caller's frame keeps its original dtypes.
    analysis_df = grouped_df.copy()

    # Ensure hyperparameters are numeric; unparseable entries become NaN
    for col in param_cols:
        analysis_df[col] = pd.to_numeric(analysis_df[col], errors='coerce')

    # Drop any rows with NaN hyperparameters
    analysis_df = analysis_df.dropna(subset=param_cols)

    # Compute the correlation matrix.
    # NOTE(review): all rows of grouped_df are pooled here; if the frame holds
    # several coverage levels, consider filtering to one level before calling.
    corr_matrix = analysis_df[param_cols + ['cov_value', 'PINAW_value']].corr()

    print("\nCorrelation matrix between hyperparameters and PINAW_value:")
    print(corr_matrix['PINAW_value'])

    print("\nCorrelation matrix between hyperparameters and cov_value:")
    print(corr_matrix['cov_value'])

    # Additionally, we can plot the relationships if needed
    # For example, using seaborn pairplot or scatter plots

# Call the analysis function on the aggregated frame; it prints the correlation
# of each hyperparameter with PINAW_value and with cov_value.
analyze_hyperparameter_influence(grouped)

Best hyperparameters for 90% coverage level:
     add_remove_every_n_epoch  proto_split_density_threshold  \
128                       100                          0.001   

     proto_remove_density_threshold  repulsion_loss_margin  cov_value  \
128                          0.0002                  0.002   0.900971   

     PINAW_value  
128     1.166166  

Best hyperparameters for 50% coverage level:
     add_remove_every_n_epoch  proto_split_density_threshold  \
115                       100                          0.001   

     proto_remove_density_threshold  repulsion_loss_margin  cov_value  \
115                          0.0001                  0.002   0.519419   

     PINAW_value  
115     0.382514  

Best hyperparameters for 10% coverage level:
    add_remove_every_n_epoch  proto_split_density_threshold  \
48                        50                          0.002   

    proto_remove_density_threshold  repulsion_loss_margin  cov_value  \
48                          0.0002  

In [10]:
# Display the cov_value column of the aggregated frame.
grouped.loc[:, "cov_value"]

0      0.116505
1      0.544661
2      0.913592
3      0.137865
4      0.524274
         ...   
211    0.526213
212    0.916505
213    0.108736
214    0.545630
215    0.919419
Name: cov_value, Length: 216, dtype: float64