In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

### Influence of Random Seeds 

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro, ttest_ind

def analyze_seed_impact(df, metric_column, seed_1, seed_2, alpha=0.05, seed_column='seed'):
    """
    Compare one metric between two random seeds with a normality check and a t-test.

    The rows of each seed are selected via ``seed_column``; missing metric values
    are dropped. Each group is tested for normality with Shapiro-Wilk. Only when
    neither test rejects normality is a two-sided, two-sample t-test run
    (SciPy's default, which assumes equal variances).

    Parameters:
        df (pd.DataFrame): DataFrame containing the data.
        metric_column (str): The name of the column with the metric values to compare.
        seed_1 (int): The first random seed to compare.
        seed_2 (int): The second random seed to compare.
        alpha (float): Significance level for hypothesis testing (default is 0.05).
        seed_column (str): Name of the column holding the seed of each row
            (default is 'seed').

    Returns:
        None: Prints the analysis results.
    """
    separator = "=" * 90
    # Shapiro-Wilk is undefined for fewer than three observations.
    min_samples = 3

    def _metric_values(seed):
        """Return the non-missing metric values recorded for ``seed``."""
        raw = df.loc[df[seed_column] == seed, metric_column]
        clean = raw.dropna()
        dropped = len(raw) - len(clean)
        if dropped:
            print(f"Dropped {dropped} missing value(s) for Seed {seed}.")
        return clean.to_numpy()

    print(separator)
    print(f"Analyzing impact of random seeds {seed_1} and {seed_2} on metric: {metric_column}")
    print(separator)

    # Extract metric values for the specified seeds
    seed_1_metrics = _metric_values(seed_1)
    seed_2_metrics = _metric_values(seed_2)

    print(f"Number of samples for Seed {seed_1}: {len(seed_1_metrics)}")
    print(f"Number of samples for Seed {seed_2}: {len(seed_2_metrics)}")

    # Stop early instead of letting the tests fail (or silently return NaN,
    # which would compare as "not < alpha" and be reported as normal).
    if min(len(seed_1_metrics), len(seed_2_metrics)) < min_samples:
        print(f"Not enough samples: the Shapiro-Wilk test needs at least {min_samples} values per seed.")
        print(separator)
        return

    # Normality testing with Shapiro-Wilk test
    print("\nStep 1: Checking Normality (Shapiro-Wilk Test)")
    shapiro_seed_1 = shapiro(seed_1_metrics)
    shapiro_seed_2 = shapiro(seed_2_metrics)

    print(f"Shapiro-Wilk test for Seed {seed_1}: W={shapiro_seed_1.statistic:.4f}, p={shapiro_seed_1.pvalue:.4f}")
    print(f"Shapiro-Wilk test for Seed {seed_2}: W={shapiro_seed_2.statistic:.4f}, p={shapiro_seed_2.pvalue:.4f}")

    if shapiro_seed_1.pvalue < alpha or shapiro_seed_2.pvalue < alpha:
        print("At least one group is not normally distributed. Consider using a non-parametric test.")
        print(separator)
        return

    print("Both groups appear to be normally distributed. Proceeding with the t-test.")

    # Perform a two-sided t-test
    print("\nStep 2: Performing Two-Sample T-Test")
    t_stat, p_value = ttest_ind(seed_1_metrics, seed_2_metrics)

    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    # Interpret the t-test result
    print("\nStep 3: Interpreting Results")
    if p_value < alpha:
        print("Reject the null hypothesis: The random seeds significantly influence performance.")
    else:
        print("Fail to reject the null hypothesis: No significant difference due to random seeds.")

    print(separator)

# Load the CSV of the random-seed experiment; analyze_seed_impact reads its
# 'seed' column plus the metric column passed in by the calls below.
# NOTE: the correlation section later in this notebook rebinds `df` to another
# CSV, so re-run this cell before re-running the seed analysis.
df = pd.read_csv('../random_seed_based_experiment/a.csv')


In [None]:
# Compare seeds 42 and 88 on the eval_Txt2Audio_mAP column.
analyze_seed_impact(df, metric_column='eval_Txt2Audio_mAP', seed_1=42, seed_2=88)

In [None]:
# Compare seeds 42 and 88 on the eval_Audio2Txt_mAP column.
analyze_seed_impact(df, metric_column='eval_Audio2Txt_mAP', seed_1=42, seed_2=88)

### Correlating Objective Loss Values and Metrics


In [None]:
# Base name (without extension) of the results CSV; also reused in the names of
# the figures written by the cells below.
file_name = "HPT_Nov_09_Evaluated"
# Folder, one level above the working directory, that holds the CSV and receives the figures.
folder_name = "z_results"
# Columns that are each correlated with (and scatter-plotted against) `to_correlate_with`.
columns_to_correlate = [
    # "after_train_eval_obj",  # enable this (and disable "val_obj") when `to_correlate_with` is "val_obj"
    "val_obj",
    "eval_Txt2Audio_mAP",
    "eval_Txt2Audio_R10",
    "eval_Txt2Audio_R5",
    # "eval_Txt2Audio_R1",
    "eval_Audio2Txt_mAP",
    "eval_Audio2Txt_R10",
    "eval_Audio2Txt_R5",
    # "eval_Audio2Txt_R1",
    "val_Txt2Audio_mAP",
    "val_Txt2Audio_R10",
    "val_Txt2Audio_R5",
    # "val_Txt2Audio_R1",
    "val_Audio2Txt_mAP",
    "val_Audio2Txt_R10",
    "val_Audio2Txt_R5",
    # "val_Audio2Txt_R1",
]
# Reference column: every entry of `columns_to_correlate` is compared against this one.
to_correlate_with = "after_train_eval_obj"

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import os
from matplotlib_inline.backend_inline import set_matplotlib_formats

set_matplotlib_formats("svg")

# Load the results table named by `folder_name` / `file_name`.
csv_path = f"../{folder_name}/{file_name}.csv"  # adjust path if needed
df = pd.read_csv(csv_path)


def _pairwise_correlations(data, reference, columns, corr_func):
    """Correlate ``data[reference]`` with every column in ``columns``.

    Rows where either value is missing are dropped per pair so that one NaN does
    not turn the whole statistic into NaN.

    Returns:
        dict: column name -> (coefficient, p_value); (nan, nan) when fewer than
        two complete pairs remain.
    """
    reference_values = data[reference]
    results = {}
    for col in columns:
        values = data[col]
        complete = reference_values.notna() & values.notna()
        if complete.sum() < 2:
            results[col] = (np.nan, np.nan)
            continue
        coefficient, p_value = corr_func(reference_values[complete], values[complete])
        results[col] = (coefficient, p_value)
    return results


def _log10_pvalue(p_value):
    """Return log10 of a p-value without losing extreme or undefined results.

    A p-value that underflowed to exactly 0 is floored at the smallest positive
    normal float, so it plots as the most significant bar rather than as
    log10(1) = 0. NaN stays NaN.
    """
    if np.isnan(p_value):
        return np.nan
    return float(np.log10(max(p_value, np.finfo(float).tiny)))


def _correlation_frame(corrs, columns, method):
    """Build the (Metric, correlation, log10 p-value) table for one method."""
    return pd.DataFrame(
        [(col, corrs[col][0], _log10_pvalue(corrs[col][1])) for col in columns],
        columns=["Metric", f"{method}_Correlation", f"Log10_{method}_P-value"],
    )


# Pearson and Spearman correlations (with p-values) against `to_correlate_with`.
pearson_corrs = _pairwise_correlations(
    df, to_correlate_with, columns_to_correlate, stats.pearsonr
)
spearman_corrs = _pairwise_correlations(
    df, to_correlate_with, columns_to_correlate, stats.spearmanr
)

pearson_df = _correlation_frame(pearson_corrs, columns_to_correlate, "Pearson")
spearman_df = _correlation_frame(spearman_corrs, columns_to_correlate, "Spearman")

# 2x2 figure: correlations on the top row, log10 p-values on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(16, 12), dpi=150)

panels = [
    (axes[0, 0], pearson_df, "Pearson_Correlation", "viridis",
     f"Pearson Correlation with {to_correlate_with}"),
    (axes[0, 1], spearman_df, "Spearman_Correlation", "viridis",
     f"Spearman Correlation with {to_correlate_with}"),
    (axes[1, 0], pearson_df, "Log10_Pearson_P-value", "rainbow",
     "Log10 Pearson P-values"),
    (axes[1, 1], spearman_df, "Log10_Spearman_P-value", "rainbow",
     "Log10 Spearman P-values"),
]

for ax, table, value_column, palette, title in panels:
    # `palette` without `hue` is deprecated in seaborn 0.13; colouring by the
    # category itself with the legend off is the documented equivalent.
    sns.barplot(
        data=table.sort_values(by=value_column, ascending=False),
        x=value_column,
        y="Metric",
        hue="Metric",
        palette=palette,
        legend=False,
        ax=ax,
    )
    ax.set_title(title)

output_path = f"../{folder_name}/{to_correlate_with}_{file_name}.svg"
# Adjust layout for better presentation
fig.tight_layout()
fig.savefig(output_path, bbox_inches="tight", format="svg")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Lay the scatter plots out on a grid four panels wide. Keep the original
# four rows as a minimum so the figure is unchanged for up to 16 columns, and
# add rows (5 inches each) instead of failing when more columns are listed.
n_cols = 4
n_rows = max(4, -(-len(columns_to_correlate) // n_cols))  # ceiling division

fig = plt.figure(figsize=(15, 5 * n_rows), dpi=150)

# One panel per column: reference column on x, the compared column on y.
for i, col in enumerate(columns_to_correlate, 1):
    ax = fig.add_subplot(n_rows, n_cols, i)

    sns.scatterplot(x=df[to_correlate_with], y=df[col], ax=ax)

    ax.set_title(f"{to_correlate_with} vs {col}")
    ax.set_xlabel(f"{to_correlate_with}")
    ax.set_ylabel(col)

output_path = f"../{folder_name}/{to_correlate_with}_{file_name}_scatter.svg"
fig.tight_layout()
fig.savefig(output_path, bbox_inches="tight", format="svg")
plt.show()