In [16]:
import pandas as pd
import numpy as np

# Results file to analyse -- update with your actual path.
csv_file_path = 'bike_sharing.csv'

# Columns that together identify one hyperparameter configuration.
hyperparameter_cols = ['add_remove_every_n_epoch', 'proto_split_density_threshold',
                       'proto_remove_density_threshold', 'repulsion_loss_margin']

# Position of a result row within its (configuration, seed) group -> nominal coverage in percent.
coverage_level_mapping = {0: 90, 1: 50, 2: 10}

df = pd.read_csv(csv_file_path)

# Coerce both metric columns to numbers (unparseable entries become NaN),
# then discard every row where either metric is missing.
for metric_col in ('cov_value', 'PINAW_value'):
    df[metric_col] = pd.to_numeric(df[metric_col], errors='coerce')
df = df.dropna(subset=['cov_value', 'PINAW_value'])

# Number the rows inside each (configuration, seed) group in file order and
# translate that position into a coverage level.
# NOTE(review): the numbering happens after NaN rows were dropped, so a dropped
# row moves the remaining rows of its group to an earlier position -- confirm
# that this is the intended alignment.
df['result_index'] = df.groupby(hyperparameter_cols + ['seed']).cumcount()
df['coverage_level'] = df['result_index'].map(coverage_level_mapping)

group_cols = hyperparameter_cols + ['coverage_level']

# Mean and standard deviation of both metrics per (configuration, coverage level);
# 'seed' is not a grouping key, so the statistics are taken across seeds.
# Named aggregation yields the flat '<metric>_<stat>' column names directly.
grouped = df.groupby(group_cols).agg(
    cov_value_mean=('cov_value', 'mean'),
    cov_value_std=('cov_value', 'std'),
    PINAW_value_mean=('PINAW_value', 'mean'),
    PINAW_value_std=('PINAW_value', 'std'),
).reset_index()

# Now, define the function to find the best hyperparameters
def find_best_hyperparameters(grouped_df, coverage_target, tolerance=5):
    """Select the setting(s) with the smallest mean PINAW among those close to the target coverage.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        Must contain the columns 'coverage_level', 'cov_value_mean' and
        'PINAW_value_mean'.
    coverage_target : number
        Nominal coverage in percent (e.g. 90); matched against 'coverage_level'.
    tolerance : number, default 5
        Allowed deviation in percentage points. 'cov_value_mean' is compared
        against the fractions (coverage_target -/+ tolerance) / 100, bounds included.

    Returns
    -------
    pandas.DataFrame or None
        All rows that tie for the lowest 'PINAW_value_mean', or None (after
        printing a message) when no row lies inside the coverage band.
    """
    # Coverage band expressed as fractions, because cov_value_mean is compared on that scale.
    lower_bound = (coverage_target - tolerance) / 100
    upper_bound = (coverage_target + tolerance) / 100

    # Rows that belong to the requested nominal coverage level.
    at_level = grouped_df.loc[grouped_df['coverage_level'].eq(coverage_target)]

    # Of those, keep the rows whose mean coverage falls inside the band (inclusive).
    inside_band = at_level['cov_value_mean'].between(lower_bound, upper_bound)
    candidates = at_level.loc[inside_band]

    if candidates.empty:
        print(f"No hyperparameter settings found within {tolerance}% tolerance for {coverage_target}% coverage.")
        return None

    # Return every row that attains the smallest mean PINAW (ties are all kept).
    lowest_pinaw = candidates['PINAW_value_mean'].min()
    return candidates.loc[candidates['PINAW_value_mean'].eq(lowest_pinaw)]

# Find best hyperparameters for each coverage level
best_settings_90 = find_best_hyperparameters(grouped, 90, tolerance=5)
best_settings_50 = find_best_hyperparameters(grouped, 50, tolerance=5)
best_settings_10 = find_best_hyperparameters(grouped, 10, tolerance=5)

# Print the best hyperparameter settings for each level, in the order 90, 50, 10.
# find_best_hyperparameters returns None (and has already printed a message)
# when nothing falls inside the tolerance band, so those levels are skipped
# instead of failing on None[...].
report_cols = hyperparameter_cols + ['cov_value_mean', 'PINAW_value_mean']
for level, best_settings in ((90, best_settings_90),
                             (50, best_settings_50),
                             (10, best_settings_10)):
    if best_settings is None:
        continue
    # print(f"Best hyperparameters for {level}% coverage level:")
    print(best_settings[report_cols])

# Analyze the influence of each hyperparameter on the results
# def analyze_hyperparameter_influence(grouped_df):
#     # Ensure hyperparameters are numeric
#     analysis_df = grouped_df.copy()
#     for col in hyperparameter_cols:
#         analysis_df[col] = pd.to_numeric(analysis_df[col], errors='coerce')

#     # Drop any rows with NaN hyperparameters
#     analysis_df = analysis_df.dropna(subset=hyperparameter_cols)

#     # Compute the correlation matrix
#     corr_matrix = analysis_df[hyperparameter_cols + ['cov_value_mean', 'PINAW_value_mean']].corr()

#     print("\nCorrelation matrix between hyperparameters and PINAW_value_mean:")
#     print(corr_matrix['PINAW_value_mean'])

#     print("\nCorrelation matrix between hyperparameters and cov_value_mean:")
#     print(corr_matrix['cov_value_mean'])

# Call the analysis function
# analyze_hyperparameter_influence(grouped)

   add_remove_every_n_epoch  proto_split_density_threshold  \
2                     99999                          0.001   
5                     99999                          0.001   

   proto_remove_density_threshold  repulsion_loss_margin  cov_value_mean  \
2                          0.0001                  0.001        0.905581   
5                          0.0001                  0.010        0.905581   

   PINAW_value_mean  
2          2.222284  
5          2.222284  
   add_remove_every_n_epoch  proto_split_density_threshold  \
1                     99999                          0.001   
4                     99999                          0.001   

   proto_remove_density_threshold  repulsion_loss_margin  cov_value_mean  \
1                          0.0001                  0.001        0.497986   
4                          0.0001                  0.010        0.497986   

   PINAW_value_mean  
1          0.597663  
4          0.597663  
   add_remove_every_n_epoch  proto_s

In [46]:
import pandas as pd
import numpy as np

# Results file to analyse -- update with your actual path.
csv_file_path = 'meps19.csv'

# Columns that together identify one hyperparameter configuration.
hyperparameter_cols = ['add_remove_every_n_epoch', 'proto_split_density_threshold',
                       'proto_remove_density_threshold', 'repulsion_loss_margin']

# Position of a result row within its (configuration, seed) group -> nominal coverage in percent.
coverage_level_mapping = {0: 90, 1: 50, 2: 10}

df = pd.read_csv(csv_file_path)

# Coerce both metric columns to numbers (unparseable entries become NaN),
# then discard every row where either metric is missing.
for metric_col in ('cov_value', 'PINAW_value'):
    df[metric_col] = pd.to_numeric(df[metric_col], errors='coerce')
df = df.dropna(subset=['cov_value', 'PINAW_value'])

# Number the rows inside each (configuration, seed) group in file order and
# translate that position into a coverage level.
# NOTE(review): the numbering happens after NaN rows were dropped, so a dropped
# row moves the remaining rows of its group to an earlier position -- confirm
# that this is the intended alignment.
df['result_index'] = df.groupby(hyperparameter_cols + ['seed']).cumcount()
df['coverage_level'] = df['result_index'].map(coverage_level_mapping)

group_cols = hyperparameter_cols + ['coverage_level']

# Mean and standard deviation of both metrics per (configuration, coverage level);
# 'seed' is not a grouping key, so the statistics are taken across seeds.
# Named aggregation yields the flat '<metric>_<stat>' column names directly.
grouped = df.groupby(group_cols).agg(
    cov_value_mean=('cov_value', 'mean'),
    cov_value_std=('cov_value', 'std'),
    PINAW_value_mean=('PINAW_value', 'mean'),
    PINAW_value_std=('PINAW_value', 'std'),
).reset_index()

# Now, define the function to find the best hyperparameters
def find_best_hyperparameters(grouped_df, coverage_target, tolerance=5):
    """Select the setting(s) with the smallest mean PINAW among those close to the target coverage.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        Must contain the columns 'coverage_level', 'cov_value_mean' and
        'PINAW_value_mean'.
    coverage_target : number
        Nominal coverage in percent (e.g. 90); matched against 'coverage_level'.
    tolerance : number, default 5
        Allowed deviation in percentage points. 'cov_value_mean' is compared
        against the fractions (coverage_target -/+ tolerance) / 100, bounds included.

    Returns
    -------
    pandas.DataFrame or None
        All rows that tie for the lowest 'PINAW_value_mean', or None (after
        printing a message) when no row lies inside the coverage band.
    """
    # Coverage band expressed as fractions, because cov_value_mean is compared on that scale.
    lower_bound = (coverage_target - tolerance) / 100
    upper_bound = (coverage_target + tolerance) / 100

    # Rows that belong to the requested nominal coverage level.
    at_level = grouped_df.loc[grouped_df['coverage_level'].eq(coverage_target)]

    # Of those, keep the rows whose mean coverage falls inside the band (inclusive).
    inside_band = at_level['cov_value_mean'].between(lower_bound, upper_bound)
    candidates = at_level.loc[inside_band]

    if candidates.empty:
        print(f"No hyperparameter settings found within {tolerance}% tolerance for {coverage_target}% coverage.")
        return None

    # Return every row that attains the smallest mean PINAW (ties are all kept).
    lowest_pinaw = candidates['PINAW_value_mean'].min()
    return candidates.loc[candidates['PINAW_value_mean'].eq(lowest_pinaw)]

# Find best hyperparameters for each coverage level
best_settings_90 = find_best_hyperparameters(grouped, 90, tolerance=5)
best_settings_50 = find_best_hyperparameters(grouped, 50, tolerance=5)
best_settings_10 = find_best_hyperparameters(grouped, 10, tolerance=5)

# For each coverage level (90, 50, 10) print two lines taken from the FIRST
# row of the best settings:
#   <coverage mean>,<coverage std>
#   <PINAW mean>,<PINAW std>
# or a notice when no setting qualified for that level.
for level, best_settings in ((90, best_settings_90),
                             (50, best_settings_50),
                             (10, best_settings_10)):
    if best_settings is None:
        print(f"No best settings found for {level}% coverage level.")
        continue

    # print(best_settings[hyperparameter_cols + ['cov_value_mean', 'cov_value_std', 'PINAW_value_mean', 'PINAW_value_std']])
    cov_value_mean = best_settings['cov_value_mean'].values[0]
    cov_value_std = best_settings['cov_value_std'].values[0]

    pinaw_value_mean = best_settings['PINAW_value_mean'].values[0]
    pinaw_value_std = best_settings['PINAW_value_std'].values[0]

    # Printing the values in the required format.
    # NOTE(review): '.2' without a presentation type keeps two significant
    # digits (e.g. 0.0098), not two decimals; the concrete.csv cell uses
    # '.2f' instead -- confirm which one is wanted.
    print(f"{cov_value_mean:.2},{cov_value_std:.2}")
    print(f"{pinaw_value_mean:.2},{pinaw_value_std:.2}")

# # Analyze the influence of each hyperparameter on the results
# def analyze_hyperparameter_influence(grouped_df):
#     # Ensure hyperparameters are numeric
#     analysis_df = grouped_df.copy()
#     for col in hyperparameter_cols:
#         analysis_df[col] = pd.to_numeric(analysis_df[col], errors='coerce')

#     # Drop any rows with NaN hyperparameters
#     analysis_df = analysis_df.dropna(subset=hyperparameter_cols)

#     # Compute the correlation matrix
#     corr_matrix = analysis_df[hyperparameter_cols + ['cov_value_mean', 'PINAW_value_mean']].corr()

#     print("\nCorrelation matrix between hyperparameters and PINAW_value_mean:")
#     print(corr_matrix['PINAW_value_mean'])

#     print("\nCorrelation matrix between hyperparameters and cov_value_mean:")
#     print(corr_matrix['cov_value_mean'])

# # Call the analysis function
# analyze_hyperparameter_influence(grouped)

0.9,0.014
1.0,0.061
0.51,0.015
0.51,0.034
0.095,0.0098
0.12,0.011


In [6]:
import pandas as pd
import numpy as np

# Results file to analyse -- update with your actual path.
csv_file_path = 'concrete.csv'

# Columns that together identify one hyperparameter configuration.
hyperparameter_cols = ['add_remove_every_n_epoch', 'proto_split_density_threshold',
                       'proto_remove_density_threshold', 'repulsion_loss_margin']

# Position of a result row within its (configuration, seed) group -> nominal coverage in percent.
coverage_level_mapping = {0: 90, 1: 50, 2: 10}

df = pd.read_csv(csv_file_path)

# Coerce both metric columns to numbers (unparseable entries become NaN),
# then discard every row where either metric is missing.
for metric_col in ('cov_value', 'PINAW_value'):
    df[metric_col] = pd.to_numeric(df[metric_col], errors='coerce')
df = df.dropna(subset=['cov_value', 'PINAW_value'])

# Number the rows inside each (configuration, seed) group in file order and
# translate that position into a coverage level.
# NOTE(review): the numbering happens after NaN rows were dropped, so a dropped
# row moves the remaining rows of its group to an earlier position -- confirm
# that this is the intended alignment.
df['result_index'] = df.groupby(hyperparameter_cols + ['seed']).cumcount()
df['coverage_level'] = df['result_index'].map(coverage_level_mapping)

group_cols = hyperparameter_cols + ['coverage_level']

# Mean and standard deviation of both metrics per (configuration, coverage level);
# 'seed' is not a grouping key, so the statistics are taken across seeds.
# Named aggregation yields the flat '<metric>_<stat>' column names directly.
grouped = df.groupby(group_cols).agg(
    cov_value_mean=('cov_value', 'mean'),
    cov_value_std=('cov_value', 'std'),
    PINAW_value_mean=('PINAW_value', 'mean'),
    PINAW_value_std=('PINAW_value', 'std'),
).reset_index()

# Reshape to one row per hyperparameter setting with one column per
# (statistic, coverage level) pair.
pivoted = grouped.pivot(
    index=hyperparameter_cols,
    columns='coverage_level',
    values=['cov_value_mean', 'cov_value_std', 'PINAW_value_mean', 'PINAW_value_std'],
)

# Collapse each (statistic, level) column pair into a single name such as
# 'cov_value_mean_0.9' (the percent level is converted to a fraction).
pivoted.columns = [f"{stat}_{int(level) / 100}" for stat, level in pivoted.columns]

# Turn the hyperparameter index levels back into ordinary columns.
pivoted = pivoted.reset_index()

# Short display names: '<statistic>_<level>' -> '<short>(<level>)',
# e.g. 'PINAW_value_std_0.5' -> 'pinaw_std(0.5)'.
short_names = {
    'cov_value_mean': 'cov',
    'cov_value_std': 'cov_std',
    'PINAW_value_mean': 'pinaw',
    'PINAW_value_std': 'pinaw_std',
}
col_rename = {
    f"{long_name}_{level}": f"{short_name}({level})"
    for long_name, short_name in short_names.items()
    for level in ('0.9', '0.5', '0.1')
}
pivoted = pivoted.rename(columns=col_rename)

# Final layout: hyperparameters first, then the four statistics for each
# level in the order 0.9, 0.5, 0.1. reindex also creates (as NaN) any of
# these columns that is absent and drops any column not listed.
desired_order = hyperparameter_cols + [
    f"{short_name}({level})"
    for level in ('0.9', '0.5', '0.1')
    for short_name in ('cov', 'cov_std', 'pinaw', 'pinaw_std')
]
pivoted = pivoted.reindex(columns=desired_order)

# Now, 'pivoted' DataFrame has the desired format
# coverage_target: the level whose PINAW is minimised when picking the best row.
# coverage_threshold: allowed absolute deviation of the mean coverage from
# each nominal level.
coverage_target = 0.9
coverage_threshold = 0.03
coverage_levels = (0.9, 0.5, 0.1)

# Keep only the settings whose mean coverage is within the threshold of the
# nominal value at EVERY level (bounds included).
within_threshold = pd.concat(
    [pivoted[f'cov({level})'].between(level - coverage_threshold, level + coverage_threshold)
     for level in coverage_levels],
    axis=1,
).all(axis=1)
filtered = pivoted[within_threshold]

# argmin on an empty selection raises an unhelpful ValueError; fail with a
# message that says what actually went wrong (same exception type).
if filtered.empty:
    raise ValueError(
        f"No hyperparameter setting has coverage within {coverage_threshold} "
        f"of every nominal level {coverage_levels}."
    )

# Find the row with the minimum PINAW at the target level within the filtered DataFrame
best_row = filtered.iloc[filtered[f'pinaw({coverage_target})'].argmin()]

# Two lines per level: "<cov mean>, <cov std> (<level>)" then
# "<pinaw mean>, <pinaw std> (<level>)", each value with two decimals.
for level in coverage_levels:
    cov_mean, cov_std = best_row[f'cov({level})'], best_row[f'cov_std({level})']
    pinaw_mean, pinaw_std = best_row[f'pinaw({level})'], best_row[f'pinaw_std({level})']
    print(f"{cov_mean:.2f}, {cov_std:.2f} ({level})")
    print(f"{pinaw_mean:.2f}, {pinaw_std:.2f} ({level})")


0.91, 0.04 (0.9)
1.19, 0.13 (0.9)
0.53, 0.05 (0.5)
0.41, 0.06 (0.5)
0.13, 0.04 (0.1)
0.06, 0.01 (0.1)


In [13]:

# Rows of the raw results whose hyperparameters all equal those of best_row.
best_hyperparameters = best_row[hyperparameter_cols].values
matches_best = (df[hyperparameter_cols] == best_hyperparameters).all(axis=1)
experiment_path = df.loc[matches_best, "experiment_path"]

# Print the log directory of the SECOND matching row (position 1), re-rooted
# under a fixed local directory. [19:] drops the first 19 characters of the
# stored path -- presumably a machine-specific prefix; verify against the CSV.
print('/home/halil/max_quantile/' + experiment_path.iloc[1][19:])

/home/halil/max_quantile/logs/White_Wine/seed_0/voronoi/softlabelbased/20240927-140859


/home/halil/max_quantile/logs/bike_sharing/seed_1/voronoi/softlabelbased/20241028-110606


In [33]:
# Show the first matching experiment path with its first 19 characters removed
# (presumably a machine-specific prefix -- verify against the CSV).
experiment_path.iloc[0][19:]

'logs/Unconditional_1d_data/seed_0/voronoi/softlabelbased/20240928-073009'