In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
from scipy.stats import shapiro, bartlett, f_oneway
import matplotlib.pyplot as plt
import seaborn as sns

[INFO] No VAE checkpoint found.
[INFO] No generator checkpoint found.
[INFO] No discriminator checkpoint found.


# Wczytanie danych

In [3]:
# Metric columns that the plotting and testing sections below read from each frame.
metrics = ['f2_score', 'balanced_accuracy', 'recall', 'specificity']

# Each CSV location is declared right next to the read that consumes it.
# NOTE(review): judging by the file names these are per-model cross-validation
# results (GAN / VAE oversampling, their synthetic-only variants, and a
# no-oversampling baseline) — confirm against the CSV contents.
gan_path = 'crossvalidation_results/CNNGAN_cross_validation_results.csv'
gan_df = pd.read_csv(gan_path)

synth_gan_path = 'crossvalidation_results/CNNGAN_synthetic_cross_validation_results.csv'
synth_gan_df = pd.read_csv(synth_gan_path)

vae_path = 'crossvalidation_results/CNNVAE_cross_validation_results.csv'
vae_df = pd.read_csv(vae_path)

synth_vae_path = 'crossvalidation_results/CNNVAE_synthetic_cross_validation_results.csv'
synth_vae_df = pd.read_csv(synth_vae_path)

orig_path = 'crossvalidation_results/without_oversampling_cross_validation_results.csv'
orig_df = pd.read_csv(orig_path)

# Statystyki opisowe

In [21]:
# Write one long-format CSV of describe() statistics per results frame.
out_dir = 'general_statistics/'
# exist_ok keeps the cell re-runnable without a separate existence check.
os.makedirs(out_dir, exist_ok=True)

dataframes = {
    'CNNGAN_df':  gan_df,
    'CNNVAE_df':  vae_df,
    'original_df': orig_df,
    'CNNGAN_synthetic_df': synth_gan_df,
    'CNNVAE_synthetic_df': synth_vae_df
}

# Exclusive stop index of the column slice `df.iloc[:, 1:stop]` for each frame.
# A negative stop drops that many trailing columns; frames not listed here keep
# every column after the first one.
# NOTE(review): presumably the skipped leading/trailing columns are non-metric
# fields — confirm against the CSV headers.
column_stop = {
    'CNNGAN_df': -2,
    'CNNVAE_synthetic_df': -2,
    'CNNGAN_synthetic_df': -1,
    'CNNVAE_df': -3,
}

for name, df in dataframes.items():
    stop = column_stop.get(name, df.shape[1])
    desc = df.iloc[:, 1:stop].describe()

    # describe() has statistics as rows and columns as columns; transpose so
    # each row is one column of the frame, then melt to long format with one
    # (stat, metric, value) triple per row.
    result_df = (
        desc.transpose()
        .reset_index()
        .rename(columns={'index': 'metric'})
        .melt(id_vars='metric', var_name='stat', value_name='value')
        [['stat', 'metric', 'value']]
    )

    # Double-quoted f-string: reusing the outer quote character inside the
    # replacement field is a SyntaxError on Python versions before 3.12.
    filename = os.path.join(out_dir, f"{name.split('_df')[0]}_general_statistics.csv")
    result_df.to_csv(filename, index=False)

# Histogramy

In [25]:
# Save a 2x2 grid of histograms (one panel per metric) for each results frame.
out_dir = 'histograms/'
# exist_ok keeps the cell re-runnable without a separate existence check.
os.makedirs(out_dir, exist_ok=True)

# From here on `dataframes` holds only these three frames; the cells below
# that iterate `dataframes` pick up this mapping.
dataframes = {
    'CNNGAN_df':  gan_df,
    'CNNVAE_df':  vae_df,
    'original_df': orig_df
}

for name, df in dataframes.items():
    # One figure per frame; the 2x2 axes grid is flattened so it can be
    # zipped with the four metric names.
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    axes = axes.flatten()

    for ax, metric in zip(axes, metrics):
        ax.hist(df[metric], bins=10)
        ax.set_title(metric.replace('_', ' ').title())
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')
        ax.grid(True, axis='y', linestyle='--', linewidth=0.5)

    # Figure-level calls act on this figure explicitly instead of relying on
    # pyplot's notion of the "current" figure.
    fig.tight_layout()

    filename = os.path.join(out_dir, f"{name.split('_df')[0]}_histograms.png")
    fig.savefig(filename)
    print(f"Saved {filename}")

    # Close the figure to free memory
    plt.close(fig)

Saved histograms/CNNGAN_histograms.png
Saved histograms/CNNVAE_histograms.png
Saved histograms/original_histograms.png


# Wykresy ramka-wąsy

In [26]:
# Save a 2x2 grid of box plots (one panel per metric) for each results frame.
out_dir = 'boxplots/'
# exist_ok keeps the cell re-runnable without a separate existence check.
os.makedirs(out_dir, exist_ok=True)

# Iterates `dataframes` as last assigned; when the notebook is run top to
# bottom that is the three-frame mapping defined in the histogram section.
for name, df in dataframes.items():
    # One figure per frame; the 2x2 axes grid is flattened so it can be
    # zipped with the four metric names.
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    axes = axes.flatten()

    for ax, metric in zip(axes, metrics):
        sns.boxplot(y=df[metric], ax=ax)
        ax.set_title(metric.replace('_', ' ').title())
        ax.set_ylabel('Value')
        ax.grid(True, axis='y', linestyle='--', linewidth=0.5)

    # Figure-level calls act on this figure explicitly instead of relying on
    # pyplot's notion of the "current" figure.
    fig.tight_layout()

    filename = os.path.join(out_dir, f"{name.split('_df')[0]}_boxplots.png")
    fig.savefig(filename)
    print(f"Saved {filename}")

    # Close the figure to free memory
    plt.close(fig)

Saved boxplots/CNNGAN_boxplots.png
Saved boxplots/CNNVAE_boxplots.png
Saved boxplots/original_boxplots.png


# Testy na normalność (Shapiro-Wilk)

In [24]:
# Shapiro-Wilk normality test for every (frame, metric) pair.
# The frames are listed explicitly instead of being read from `dataframes`:
# that name is assigned in two sections with different contents (five frames
# vs. three), so relying on it would make this cell's result depend on which
# section was executed last.
normality_frames = {
    'CNNGAN_df': gan_df,
    'CNNVAE_df': vae_df,
    'original_df': orig_df,
}
alpha = 0.05  # significance level for the normality decision

for name, df in normality_frames.items():
    for metric in metrics:
        stat, p_value = shapiro(df[metric])
        # p > alpha: the test does not reject normality for this sample.
        print("Dataframe:", name, "### Metric:", metric, "### Stat:", round(stat, 3), "### P value:", round(p_value, 3), "### Normality?:", "YES" if p_value > alpha else "NO")

Dataframe: CNNGAN_df ### Metric: f2_score ### Stat: 0.913 ### P value: 0.301 ### Normality?: YES
Dataframe: CNNGAN_df ### Metric: balanced_accuracy ### Stat: 0.936 ### P value: 0.509 ### Normality?: YES
Dataframe: CNNGAN_df ### Metric: recall ### Stat: 0.938 ### P value: 0.528 ### Normality?: YES
Dataframe: CNNGAN_df ### Metric: specificity ### Stat: 0.944 ### P value: 0.603 ### Normality?: YES
Dataframe: CNNVAE_df ### Metric: f2_score ### Stat: 0.97 ### P value: 0.888 ### Normality?: YES
Dataframe: CNNVAE_df ### Metric: balanced_accuracy ### Stat: 0.947 ### P value: 0.637 ### Normality?: YES
Dataframe: CNNVAE_df ### Metric: recall ### Stat: 0.946 ### P value: 0.619 ### Normality?: YES
Dataframe: CNNVAE_df ### Metric: specificity ### Stat: 0.905 ### P value: 0.246 ### Normality?: YES
Dataframe: original_df ### Metric: f2_score ### Stat: 0.946 ### P value: 0.626 ### Normality?: YES
Dataframe: original_df ### Metric: balanced_accuracy ### Stat: 0.92 ### P value: 0.354 ### Normality?: YES

# Testy na jednorodność wariancji (test Bartletta)

In [18]:
# Bartlett's test for equal variances across the three result frames, run
# separately for each metric.
for metric in metrics:
    samples = [frame[metric] for frame in (gan_df, vae_df, orig_df)]
    stat, p_value = bartlett(*samples)
    # p > 0.05: equal variances are not rejected.
    verdict = "YES" if p_value > 0.05 else "NO"
    print(
        "Metric:", metric,
        "### Stat:", round(stat, 3),
        "### P value:", round(p_value, 3),
        "### Homoscedasticity?:", verdict,
    )

Metric: f2_score ### Stat: 4.341 ### P value: 0.114 ### Homoscedasticity?: YES
Metric: balanced_accuracy ### Stat: 5.135 ### P value: 0.077 ### Homoscedasticity?: YES
Metric: recall ### Stat: 3.518 ### P value: 0.172 ### Homoscedasticity?: YES
Metric: specificity ### Stat: 0.574 ### P value: 0.751 ### Homoscedasticity?: YES


# Testy na równość średnich (test ANOVA)

In [19]:
# One-way ANOVA comparing the three result frames, run separately for each
# metric.
for metric in metrics:
    samples = [frame[metric] for frame in (gan_df, vae_df, orig_df)]
    stat, p_value = f_oneway(*samples)
    # p > 0.05: no significant difference between the group means is reported.
    verdict = "NO" if p_value > 0.05 else "YES"
    print(
        "Metric:", metric,
        "### Stat:", round(stat, 3),
        "### P value:", round(p_value, 3),
        "### Significant difference?:", verdict,
    )

Metric: f2_score ### Stat: 0.683 ### P value: 0.513 ### Significant difference?: NO
Metric: balanced_accuracy ### Stat: 1.188 ### P value: 0.32 ### Significant difference?: NO
Metric: recall ### Stat: 0.496 ### P value: 0.614 ### Significant difference?: NO
Metric: specificity ### Stat: 1.051 ### P value: 0.364 ### Significant difference?: NO
