In [1]:
import pandas as pd
import numpy as np

# Path to the CSV file holding the experiment results
csv_file_path = 'bike_sharing.csv'  # Update with your actual path

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Coerce the metric columns to numeric; unparseable entries become NaN
df['cov_value'] = pd.to_numeric(df['cov_value'], errors='coerce')
df['PINAW_value'] = pd.to_numeric(df['PINAW_value'], errors='coerce')

# Define hyperparameter columns
hyperparameter_cols = ['add_remove_every_n_epoch', 'proto_split_density_threshold',
                       'proto_remove_density_threshold', 'repulsion_loss_margin']

# Number the rows of every (hyperparameter setting, seed) run in file order.
# This must happen BEFORE NaN rows are dropped: the coverage level is inferred
# purely from a row's position inside its run, so removing a row first would
# shift every later row of that run onto the wrong coverage level.
# NOTE(review): assumes each run writes its rows in the order 90%, 50%, 10% -
# confirm against the code that produces the CSV.
df['result_index'] = df.groupby(hyperparameter_cols + ['seed']).cumcount()

# Map result_index to coverage levels (positions other than 0/1/2 become NaN
# and are therefore left out of the grouped statistics below)
coverage_level_mapping = {0: 90, 1: 50, 2: 10}
df['coverage_level'] = df['result_index'].map(coverage_level_mapping)

# Drop rows with NaN values in 'cov_value' or 'PINAW_value' now that every
# surviving row already carries its positional coverage label
df = df.dropna(subset=['cov_value', 'PINAW_value'])

group_cols = hyperparameter_cols + ['coverage_level']

# Compute the average and standard deviation for each hyperparameter setting across seeds
grouped = df.groupby(group_cols).agg({
    'cov_value': ['mean', 'std'],
    'PINAW_value': ['mean', 'std'],
}).reset_index()

# Flatten the MultiIndex columns, e.g. ('cov_value', 'mean') -> 'cov_value_mean';
# strip('_') removes the trailing underscore left on the grouping columns
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

# Now, define the function to find the best hyperparameters
def find_best_hyperparameters(grouped_df, coverage_target, tolerance=5):
    """Pick the narrowest-interval setting whose coverage is close to the target.

    Args:
        grouped_df: DataFrame with at least the columns 'coverage_level',
            'cov_value_mean' and 'PINAW_value_mean'.
        coverage_target: Coverage level to select, in percent (compared
            against 'coverage_level' as-is).
        tolerance: Allowed deviation from the target, in percentage points.
            The band is divided by 100 before being compared with
            'cov_value_mean', i.e. that column is treated as a fraction.

    Returns:
        The row(s) of ``grouped_df`` at the requested coverage level whose
        'cov_value_mean' lies inside the inclusive band and whose
        'PINAW_value_mean' is the minimum among them (ties are all kept),
        or ``None`` (after printing a message) when no row qualifies.
    """
    # Inclusive acceptance band, expressed as fractions
    lower_bound = (coverage_target - tolerance) / 100
    upper_bound = (coverage_target + tolerance) / 100

    # Rows belonging to the requested coverage level
    at_level = grouped_df.loc[grouped_df['coverage_level'] == coverage_target]

    # Keep only the settings whose mean coverage falls inside the band
    inside_band = at_level['cov_value_mean'].between(lower_bound, upper_bound)
    candidates = at_level.loc[inside_band]

    if candidates.empty:
        print(f"No hyperparameter settings found within {tolerance}% tolerance for {coverage_target}% coverage.")
        return None

    # Among the candidates, the smallest mean PINAW wins
    pinaw_means = candidates['PINAW_value_mean']
    return candidates.loc[pinaw_means == pinaw_means.min()]

# Find best hyperparameters for each coverage level
best_settings_90 = find_best_hyperparameters(grouped, 90, tolerance=5)
best_settings_50 = find_best_hyperparameters(grouped, 50, tolerance=5)
best_settings_10 = find_best_hyperparameters(grouped, 10, tolerance=5)

# Print the best hyperparameter settings, one labelled table per coverage level.
# find_best_hyperparameters returns None when no setting falls inside the
# tolerance band, so each result is checked before it is indexed (indexing
# None would raise TypeError and abort the cell).
report_cols = hyperparameter_cols + ['cov_value_mean', 'PINAW_value_mean']
results_by_level = [(90, best_settings_90), (50, best_settings_50), (10, best_settings_10)]

for position, (level, best_settings) in enumerate(results_by_level):
    # Blank line between consecutive reports, none before the first
    separator = "\n" if position else ""
    print(f"{separator}Best hyperparameters for {level}% coverage level:")
    if best_settings is None:
        print(f"No best settings found for {level}% coverage level.")
    else:
        print(best_settings[report_cols])

# Analyze the influence of each hyperparameter on the results
# def analyze_hyperparameter_influence(grouped_df):
#     # Ensure hyperparameters are numeric
#     analysis_df = grouped_df.copy()
#     for col in hyperparameter_cols:
#         analysis_df[col] = pd.to_numeric(analysis_df[col], errors='coerce')

#     # Drop any rows with NaN hyperparameters
#     analysis_df = analysis_df.dropna(subset=hyperparameter_cols)

#     # Compute the correlation matrix
#     corr_matrix = analysis_df[hyperparameter_cols + ['cov_value_mean', 'PINAW_value_mean']].corr()

#     print("\nCorrelation matrix between hyperparameters and PINAW_value_mean:")
#     print(corr_matrix['PINAW_value_mean'])

#     print("\nCorrelation matrix between hyperparameters and cov_value_mean:")
#     print(corr_matrix['cov_value_mean'])

# Call the analysis function
# analyze_hyperparameter_influence(grouped)

Best hyperparameters for 90% coverage level:
    add_remove_every_n_epoch  proto_split_density_threshold  \
26                        50                          0.001   

    proto_remove_density_threshold  repulsion_loss_margin  cov_value_mean  \
26                           0.001                  0.001        0.904084   

    PINAW_value_mean  
26           2.00789  

Best hyperparameters for 50% coverage level:
    add_remove_every_n_epoch  proto_split_density_threshold  \
31                        50                          0.001   

    proto_remove_density_threshold  repulsion_loss_margin  cov_value_mean  \
31                           0.001                   0.01        0.495916   

    PINAW_value_mean  
31          0.546909  

Best hyperparameters for 10% coverage level:
   add_remove_every_n_epoch  proto_split_density_threshold  \
9                        50                          0.001   

   proto_remove_density_threshold  repulsion_loss_margin  cov_value_mean  \
9     

In [7]:
import pandas as pd
import numpy as np

# Path to the CSV file holding the experiment results
csv_file_path = 'concrete.csv'  # Update with your actual path

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Coerce the metric columns to numeric; unparseable entries become NaN
df['cov_value'] = pd.to_numeric(df['cov_value'], errors='coerce')
df['PINAW_value'] = pd.to_numeric(df['PINAW_value'], errors='coerce')

# Define hyperparameter columns
hyperparameter_cols = ['add_remove_every_n_epoch', 'proto_split_density_threshold',
                       'proto_remove_density_threshold', 'repulsion_loss_margin']

# Number the rows of every (hyperparameter setting, seed) run in file order.
# This must happen BEFORE NaN rows are dropped: the coverage level is inferred
# purely from a row's position inside its run, so removing a row first would
# shift every later row of that run onto the wrong coverage level.
# NOTE(review): assumes each run writes its rows in the order 90%, 50%, 10% -
# confirm against the code that produces the CSV.
df['result_index'] = df.groupby(hyperparameter_cols + ['seed']).cumcount()

# Map result_index to coverage levels (positions other than 0/1/2 become NaN
# and are therefore left out of the grouped statistics below)
coverage_level_mapping = {0: 90, 1: 50, 2: 10}
df['coverage_level'] = df['result_index'].map(coverage_level_mapping)

# Drop rows with NaN values in 'cov_value' or 'PINAW_value' now that every
# surviving row already carries its positional coverage label
df = df.dropna(subset=['cov_value', 'PINAW_value'])

group_cols = hyperparameter_cols + ['coverage_level']

# Compute the average and standard deviation for each hyperparameter setting across seeds
grouped = df.groupby(group_cols).agg({
    'cov_value': ['mean', 'std'],
    'PINAW_value': ['mean', 'std'],
}).reset_index()

# Flatten the MultiIndex columns, e.g. ('cov_value', 'mean') -> 'cov_value_mean';
# strip('_') removes the trailing underscore left on the grouping columns
grouped.columns = ['_'.join(col).strip('_') for col in grouped.columns.values]

# Now, define the function to find the best hyperparameters
def find_best_hyperparameters(grouped_df, coverage_target, tolerance=5):
    """Pick the narrowest-interval setting whose coverage is close to the target.

    Args:
        grouped_df: DataFrame with at least the columns 'coverage_level',
            'cov_value_mean' and 'PINAW_value_mean'.
        coverage_target: Coverage level to select, in percent (compared
            against 'coverage_level' as-is).
        tolerance: Allowed deviation from the target, in percentage points.
            The band is divided by 100 before being compared with
            'cov_value_mean', i.e. that column is treated as a fraction.

    Returns:
        The row(s) of ``grouped_df`` at the requested coverage level whose
        'cov_value_mean' lies inside the inclusive band and whose
        'PINAW_value_mean' is the minimum among them (ties are all kept),
        or ``None`` (after printing a message) when no row qualifies.
    """
    # Inclusive acceptance band, expressed as fractions
    lower_bound = (coverage_target - tolerance) / 100
    upper_bound = (coverage_target + tolerance) / 100

    # Rows belonging to the requested coverage level
    at_level = grouped_df.loc[grouped_df['coverage_level'] == coverage_target]

    # Keep only the settings whose mean coverage falls inside the band
    inside_band = at_level['cov_value_mean'].between(lower_bound, upper_bound)
    candidates = at_level.loc[inside_band]

    if candidates.empty:
        print(f"No hyperparameter settings found within {tolerance}% tolerance for {coverage_target}% coverage.")
        return None

    # Among the candidates, the smallest mean PINAW wins
    pinaw_means = candidates['PINAW_value_mean']
    return candidates.loc[pinaw_means == pinaw_means.min()]

# Find best hyperparameters for each coverage level
best_settings_90 = find_best_hyperparameters(grouped, 90, tolerance=5)
best_settings_50 = find_best_hyperparameters(grouped, 50, tolerance=5)
best_settings_10 = find_best_hyperparameters(grouped, 10, tolerance=5)

# For every coverage level print two lines in "mean,std" format: first the
# coverage statistics, then the PINAW statistics, both taken from the first
# best row. A level with no acceptable setting gets a one-line notice instead.
for level, best_settings in ((90, best_settings_90),
                             (50, best_settings_50),
                             (10, best_settings_10)):
    if best_settings is None:
        print(f"No best settings found for {level}% coverage level.")
        continue

    cov_value_mean = best_settings['cov_value_mean'].iloc[0]
    cov_value_std = best_settings['cov_value_std'].iloc[0]

    pinaw_value_mean = best_settings['PINAW_value_mean'].iloc[0]
    pinaw_value_std = best_settings['PINAW_value_std'].iloc[0]

    # Printing the values in the required format
    print(f"{cov_value_mean},{cov_value_std}")
    print(f"{pinaw_value_mean},{pinaw_value_std}")

# # Analyze the influence of each hyperparameter on the results
# def analyze_hyperparameter_influence(grouped_df):
#     # Ensure hyperparameters are numeric
#     analysis_df = grouped_df.copy()
#     for col in hyperparameter_cols:
#         analysis_df[col] = pd.to_numeric(analysis_df[col], errors='coerce')

#     # Drop any rows with NaN hyperparameters
#     analysis_df = analysis_df.dropna(subset=hyperparameter_cols)

#     # Compute the correlation matrix
#     corr_matrix = analysis_df[hyperparameter_cols + ['cov_value_mean', 'PINAW_value_mean']].corr()

#     print("\nCorrelation matrix between hyperparameters and PINAW_value_mean:")
#     print(corr_matrix['PINAW_value_mean'])

#     print("\nCorrelation matrix between hyperparameters and cov_value_mean:")
#     print(corr_matrix['cov_value_mean'])

# # Call the analysis function
# analyze_hyperparameter_influence(grouped)

KeyError: 0