In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import kruskal, f_oneway
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn grid theme first, then Matplotlib overrides on top
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (16, 10),  # default figure size in inches
    'font.size': 11,             # base font size
})

## 1. Load All Cleaned Datasets

In [2]:
# Define dataset paths
import os

# Project root. Defaults to the original workstation location; set the
# MUSIC_GENRE_BASE_PATH environment variable to run the notebook elsewhere
# without editing this cell.
base_path = os.environ.get(
    'MUSIC_GENRE_BASE_PATH',
    '/home/anirudh-sharma/Desktop/music-genere-presentation'
)
datasets = {
    'FMA Small': os.path.join(base_path, 'data/feature-extraction-cleaned/fma_small_features_labeled.csv'),
    'FMA Medium': os.path.join(base_path, 'data/feature-extraction-cleaned/fma_medium_features_labeled.csv'),
    'GTZAN': os.path.join(base_path, 'data/feature-extraction-cleaned/gtzan_features.csv'),
    'Indian Music': os.path.join(base_path, 'data/feature-extraction-cleaned/indian_features.csv'),
    'Ludwig': os.path.join(base_path, 'data/feature-extraction-cleaned/ludwig_features.csv')
}

# Fail fast with a single readable message listing every missing CSV,
# rather than a pandas traceback part-way through the loading loop.
missing_files = [f"{name}: {path}" for name, path in datasets.items()
                 if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError("Missing dataset file(s):\n  " + "\n  ".join(missing_files))

# Load all datasets with dataset identifier
dfs = []
for name, path in datasets.items():
    df = pd.read_csv(path)
    df['dataset'] = name  # Add dataset identifier
    dfs.append(df)
    # shape[1]-1 excludes the 'dataset' column that was just added
    print(f"‚úì Loaded {name}: {df.shape[0]:,} tracks, {df.shape[1]-1} features")

# Combine all datasets (columns absent from a dataset are NaN-filled by concat)
combined_df = pd.concat(dfs, ignore_index=True)
print(f"\nüìä Combined dataset: {combined_df.shape[0]:,} tracks, {combined_df.shape[1]} total columns")
print(f"\nüìã Dataset Distribution:")
print(combined_df['dataset'].value_counts().sort_index())

# The source files do not all have the same columns; list the ones that are
# only partially populated so the NaN-filling above is visible to the reader.
shared_columns = set.intersection(*(set(frame.columns) for frame in dfs))
partial_columns = sorted(set(combined_df.columns) - shared_columns)
if partial_columns:
    print(f"\nColumns missing from at least one dataset (NaN-filled after concat): {partial_columns}")

‚úì Loaded FMA Small: 7,996 tracks, 75 features
‚úì Loaded FMA Medium: 16,986 tracks, 75 features
‚úì Loaded GTZAN: 999 tracks, 73 features
‚úì Loaded Indian Music: 500 tracks, 74 features
‚úì Loaded Ludwig: 11,293 tracks, 74 features

üìä Combined dataset: 37,774 tracks, 76 total columns

üìã Dataset Distribution:
dataset
FMA Medium      16986
FMA Small        7996
GTZAN             999
Indian Music      500
Ludwig          11293
Name: count, dtype: int64
‚úì Loaded Ludwig: 11,293 tracks, 74 features

üìä Combined dataset: 37,774 tracks, 76 total columns

üìã Dataset Distribution:
dataset
FMA Medium      16986
FMA Small        7996
GTZAN             999
Indian Music      500
Ludwig          11293
Name: count, dtype: int64


## 2. Create Output Directory

In [3]:
# Resolve the results folder for this step and make sure it exists
results_subdir = 'results/step1.6-dataset-bias-check'
output_dir = os.path.join(base_path, results_subdir)
os.makedirs(output_dir, exist_ok=True)  # no error if it is already there
print(f"üìÅ Results will be saved to: {output_dir}")

üìÅ Results will be saved to: /home/anirudh-sharma/Desktop/music-genere-presentation/results/step1.6-dataset-bias-check


## 3. Key Features for Bias Detection

We'll analyze features that are most likely to show recording/technical bias:
- **rms_mean**: Loudness (recording level bias)
- **spec_centroid_mean**: Brightness (equipment/encoding bias)
- **spec_rolloff_mean**: High-frequency content
- **tempo**: Rhythmic extraction bias
- **zcr_mean**: Noisiness (bit depth/compression bias)
- **mfcc1_mean**: Overall energy (normalization bias)

In [4]:
# Features compared across datasets in the bias analysis
bias_features = [
    'rms_mean',            # loudness
    'spec_centroid_mean',  # brightness
    'spec_rolloff_mean',   # high-frequency content
    'tempo',               # rhythmic extraction
    'zcr_mean',            # zero-crossing rate (noisiness)
    'mfcc1_mean',          # overall energy
]

# Echo the selection as a numbered list
print("üìä Analyzing bias in the following features:")
for position, feature_name in enumerate(bias_features, start=1):
    print(f"   {position}. {feature_name}")

üìä Analyzing bias in the following features:
   1. rms_mean
   2. spec_centroid_mean
   3. spec_rolloff_mean
   4. tempo
   5. zcr_mean
   6. mfcc1_mean


## 4. Statistical Summary by Dataset

In [5]:
# Per-dataset descriptive statistics for every bias feature
banner = "=" * 100
print("\n" + banner)
print("üìà DESCRIPTIVE STATISTICS BY DATASET")
print(banner)

# (output column name, aggregation) pairs applied to each dataset group
aggregations = [
    ('Mean', 'mean'),
    ('Median', 'median'),
    ('Std', 'std'),
    ('Min', 'min'),
    ('Max', 'max'),
    ('Q25', lambda x: x.quantile(0.25)),
    ('Q75', lambda x: x.quantile(0.75)),
]

summary_stats = []

for feature in bias_features:
    print(f"\n{banner}")
    print(f"Feature: {feature}")
    print(banner)

    # One row per dataset, one column per statistic, rounded for display
    stats_df = combined_df.groupby('dataset')[feature].agg(aggregations).round(4)
    print(stats_df)

    # Flatten into one record per (feature, dataset) for the summary table
    summary_stats.extend(
        {
            'Feature': feature,
            'Dataset': dataset_name,
            'Mean': row['Mean'],
            'Std': row['Std'],
            'Range': f"{row['Min']:.2f} - {row['Max']:.2f}",
        }
        for dataset_name, row in stats_df.iterrows()
    )

# Assemble and show the compact summary table
summary_df = pd.DataFrame(summary_stats)
print("\n" + banner)
print("üìä SUMMARY TABLE")
print(banner)
print(summary_df.to_string(index=False))


üìà DESCRIPTIVE STATISTICS BY DATASET

Feature: rms_mean
                Mean  Median     Std     Min     Max     Q25     Q75
dataset                                                             
FMA Medium    0.1831  0.1750  0.0959  0.0000  0.9115  0.1124  0.2429
FMA Small     0.1802  0.1699  0.0970  0.0000  0.8724  0.1074  0.2405
GTZAN         0.1309  0.1222  0.0657  0.0053  0.3977  0.0866  0.1756
Indian Music  0.1564  0.1534  0.0689  0.0228  0.3888  0.0966  0.2142
Ludwig        0.1962  0.2013  0.0885  0.0000  0.5673  0.1272  0.2649

Feature: spec_centroid_mean
                   Mean     Median       Std       Min        Max        Q25  \
dataset                                                                        
FMA Medium    2005.6462  2012.2467  703.0080   33.7095  7961.7338  1526.5577   
FMA Small     1900.5601  1885.4190  710.2004  194.1829  6765.7466  1392.5559   
GTZAN         2202.5984  2215.2672  716.1103  570.3499  4435.7321  1626.5270   
Indian Music  2059.1919  2019

## 5. Box Plot Comparison Across Datasets

Visual comparison of feature distributions to identify systematic differences between datasets.

In [None]:
def plot_dataset_comparison_boxplot(df, feature, output_path):
    """
    Create box plot comparing feature distributions across datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'dataset' column and the column named by `feature`.
    feature : str
        Name of the column whose per-dataset distribution is drawn.
    output_path : str
        File path the figure is written to (300 dpi).

    One notched box is drawn per dataset, in a fixed display order; datasets
    that do not occur in `df` are skipped and NaN values are dropped. The
    figure is saved, shown, and then closed so that repeated calls in a loop
    do not accumulate open figures.
    """
    from matplotlib.patches import Patch

    fig, ax = plt.subplots(figsize=(14, 8))

    # Keep only the datasets that are actually present, in display order.
    # The set of present names is computed once instead of once per entry.
    datasets_order = ['GTZAN', 'FMA Small', 'FMA Medium', 'Indian Music', 'Ludwig']
    present = set(df['dataset'].unique())
    labels = [ds for ds in datasets_order if ds in present]
    box_data = [df.loc[df['dataset'] == ds, feature].dropna() for ds in labels]

    # Tick labels are set separately below: the `labels=` keyword of boxplot
    # was renamed to `tick_labels` in Matplotlib 3.9, and set_xticklabels
    # works on both sides of that change.
    bp = ax.boxplot(box_data, patch_artist=True,
                    notch=True, showmeans=True,
                    boxprops=dict(facecolor='lightblue', alpha=0.7),
                    medianprops=dict(color='red', linewidth=2),
                    meanprops=dict(marker='D', markerfacecolor='green', markersize=8),
                    whiskerprops=dict(linewidth=1.5),
                    capprops=dict(linewidth=1.5))
    ax.set_xticklabels(labels, rotation=15, ha='right')

    # Add color coding (one fill colour per box, overriding the default above)
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
    for patch, color in zip(bp['boxes'], colors[:len(labels)]):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)

    # Styling
    ax.set_title(f'Dataset Comparison: {feature}\nBox Plot Analysis',
                 fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel('Dataset', fontsize=14, fontweight='bold')
    ax.set_ylabel(f'{feature} Value', fontsize=14, fontweight='bold')
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)

    # Add legend explaining the box, median line and mean marker
    legend_elements = [
        Patch(facecolor='lightblue', alpha=0.5, label='IQR (Q1-Q3)'),
        plt.Line2D([0], [0], color='red', linewidth=2, label='Median'),
        plt.Line2D([0], [0], marker='D', color='w', markerfacecolor='green',
                   markersize=8, label='Mean')
    ]
    ax.legend(handles=legend_elements, loc='upper right', fontsize=10)

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: {os.path.basename(output_path)}")
    plt.show()
    plt.close(fig)  # release the figure; this function is called once per feature

# Render and save one comparison box plot per bias feature
print("\nüìä Generating Dataset Comparison Box Plots...\n")

for feature in bias_features:
    png_name = f'boxplot_comparison_{feature}.png'
    output_path = os.path.join(output_dir, png_name)
    plot_dataset_comparison_boxplot(combined_df, feature, output_path)

## 6. Statistical Significance Testing

### Kruskal-Wallis H-Test
Non-parametric test to determine if datasets have significantly different distributions.

**Null Hypothesis (H₀):** All datasets have the same distribution for the feature.

**Interpretation:**
- p < 0.001: Strong evidence of bias (datasets are significantly different)
- p < 0.05: Moderate evidence of bias
- p ≥ 0.05: No significant bias detected

In [6]:
# Kruskal-Wallis H-test per feature: do the datasets share one distribution?
banner = "=" * 100
print("\n" + banner)
print("üî¨ STATISTICAL SIGNIFICANCE TESTING: Kruskal-Wallis H-Test")
print(banner)

# (exclusive upper p-value bound, verdict, marker), checked in order;
# anything that matches no bound is reported as "NO BIAS".
bias_levels = [
    (0.001, "STRONG BIAS", "üî¥"),
    (0.05, "MODERATE BIAS", "üü°"),
]

bias_test_results = []

for feature in bias_features:
    # One NaN-free sample per dataset, in order of first appearance
    dataset_names = combined_df['dataset'].unique()
    groups = [
        combined_df.loc[combined_df['dataset'] == ds, feature].dropna()
        for ds in dataset_names
    ]

    h_stat, p_value = kruskal(*groups)

    # Map the p-value onto a verdict
    significance, color = "NO BIAS", "üü¢"
    for bound, verdict, marker in bias_levels:
        if p_value < bound:
            significance, color = verdict, marker
            break

    bias_test_results.append({
        'Feature': feature,
        'H-Statistic': h_stat,
        'P-Value': p_value,
        'Significance': significance
    })

    print(f"\n{color} {feature}:")
    print(f"   H-statistic: {h_stat:.4f}")
    print(f"   P-value: {p_value:.4e}")
    print(f"   Result: {significance}")

# Tabulate the per-feature results
bias_results_df = pd.DataFrame(bias_test_results)
print("\n" + banner)
print("üìä BIAS TEST SUMMARY")
print(banner)
print(bias_results_df.to_string(index=False))


üî¨ STATISTICAL SIGNIFICANCE TESTING: Kruskal-Wallis H-Test

üî¥ rms_mean:
   H-statistic: 748.8197
   P-value: 9.3404e-161
   Result: STRONG BIAS

üî¥ spec_centroid_mean:
   H-statistic: 1522.1591
   P-value: 0.0000e+00
   Result: STRONG BIAS

üî¥ spec_rolloff_mean:
   H-statistic: 1433.4361
   P-value: 3.8839e-309
   Result: STRONG BIAS

üî¥ tempo:
   H-statistic: 21.8289
   P-value: 2.1677e-04
   Result: STRONG BIAS

üî¥ zcr_mean:
   H-statistic: 1829.5166
   P-value: 0.0000e+00
   Result: STRONG BIAS

üî¥ mfcc1_mean:
   H-statistic: 1863.4316
   P-value: 0.0000e+00
   Result: STRONG BIAS

üìä BIAS TEST SUMMARY
           Feature  H-Statistic       P-Value Significance
          rms_mean   748.819749 9.340352e-161  STRONG BIAS
spec_centroid_mean  1522.159057  0.000000e+00  STRONG BIAS
 spec_rolloff_mean  1433.436074 3.883876e-309  STRONG BIAS
             tempo    21.828861  2.167686e-04  STRONG BIAS
          zcr_mean  1829.516633  0.000000e+00  STRONG BIAS
        mfcc1_m

## 7. Effect Size Analysis (Cohen's d)

Quantify the magnitude of differences between datasets using Cohen's d effect size.

**Interpretation:**
- |d| < 0.2: Negligible difference
- 0.2 ≤ |d| < 0.5: Small difference
- 0.5 ≤ |d| < 0.8: Medium difference
- |d| ≥ 0.8: Large difference

In [8]:
def cohens_d(group1, group2):
    """
    Calculate Cohen's d effect size between two groups.

    d = (mean(group1) - mean(group2)) / pooled_std, where pooled_std is built
    from the sample (ddof=1) variances of both groups.

    Parameters
    ----------
    group1, group2 : array-like of numbers
        pandas Series, NumPy arrays or plain sequences. NaN entries are
        ignored, so sample sizes and variances cover the same observations.

    Returns
    -------
    float
        The effect size. NaN when either group has fewer than two valid
        values. When both groups have zero variance the result is 0.0 for
        equal means and a signed infinity otherwise.
    """
    # Work on plain float arrays so the result does not depend on whether the
    # caller passed pandas (var defaults to ddof=1) or NumPy (ddof=0) objects.
    x1 = np.asarray(group1, dtype=float).ravel()
    x2 = np.asarray(group2, dtype=float).ravel()
    x1 = x1[~np.isnan(x1)]
    x2 = x2[~np.isnan(x2)]

    n1, n2 = x1.size, x2.size
    if n1 < 2 or n2 < 2:
        # A sample variance needs at least two observations
        return float('nan')

    var1, var2 = x1.var(ddof=1), x2.var(ddof=1)
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    mean_diff = x1.mean() - x2.mean()

    if pooled_std == 0:
        # Both groups are constant: avoid 0/0 (identical groups differ by
        # nothing) and keep the sign when the constants differ.
        return 0.0 if mean_diff == 0 else float(np.copysign(np.inf, mean_diff))

    return mean_diff / pooled_std

# Pairwise Cohen's d between every pair of datasets, for every bias feature
banner = "=" * 100
print("\n" + banner)
print("üìè EFFECT SIZE ANALYSIS: Cohen's d (Pairwise Comparisons)")
print(banner)

# (exclusive upper bound on |d|, label, marker), checked in order;
# anything that matches no bound is reported as "Large".
effect_levels = [
    (0.2, "Negligible", "üü¢"),
    (0.5, "Small", "üü°"),
    (0.8, "Medium", "üü†"),
]

effect_size_results = []
datasets_list = combined_df['dataset'].unique()

# Every unordered pair, in the same order as a nested i < j loop
dataset_pairs = [
    (first, second)
    for idx, first in enumerate(datasets_list)
    for second in datasets_list[idx + 1:]
]

for feature in bias_features:
    print(f"\n{banner}")
    print(f"Feature: {feature}")
    print(banner)

    for ds1, ds2 in dataset_pairs:
        group1 = combined_df.loc[combined_df['dataset'] == ds1, feature].dropna()
        group2 = combined_df.loc[combined_df['dataset'] == ds2, feature].dropna()

        d = cohens_d(group1, group2)
        abs_d = abs(d)

        # Translate |d| into a qualitative label
        interpretation, emoji = "Large", "üî¥"
        for bound, level_name, marker in effect_levels:
            if abs_d < bound:
                interpretation, emoji = level_name, marker
                break

        effect_size_results.append({
            'Feature': feature,
            'Dataset_1': ds1,
            'Dataset_2': ds2,
            'Cohen_d': d,
            'Abs_Cohen_d': abs_d,
            'Effect_Size': interpretation
        })

        print(f"{emoji} {ds1} vs {ds2}: d = {d:.4f} ({interpretation})")

# Tabulate, strongest effects first
effect_size_df = pd.DataFrame(effect_size_results).sort_values('Abs_Cohen_d', ascending=False)

print("\n" + banner)
print("üìä TOP 20 LARGEST EFFECT SIZES (Strongest Dataset Biases)")
print(banner)
print(effect_size_df.head(20).to_string(index=False))


üìè EFFECT SIZE ANALYSIS: Cohen's d (Pairwise Comparisons)

Feature: rms_mean
üü¢ FMA Small vs FMA Medium: d = -0.0300 (Negligible)
üü† FMA Small vs GTZAN: d = 0.5250 (Medium)
üü° FMA Small vs Indian Music: d = 0.2498 (Small)
üü¢ FMA Small vs Ludwig: d = -0.1737 (Negligible)
üü† FMA Medium vs GTZAN: d = 0.5533 (Medium)
üü° FMA Medium vs Indian Music: d = 0.2811 (Small)
üü¢ FMA Medium vs Ludwig: d = -0.1410 (Negligible)
üü° GTZAN vs Indian Music: d = -0.3818 (Small)
üü† GTZAN vs Ludwig: d = -0.7523 (Medium)
üü° Indian Music vs Ludwig: d = -0.4542 (Small)

Feature: spec_centroid_mean
üü¢ FMA Small vs FMA Medium: d = -0.1490 (Negligible)
üü° FMA Small vs GTZAN: d = -0.4249 (Small)
üü° FMA Small vs Indian Music: d = -0.2273 (Small)
üü† FMA Small vs Ludwig: d = -0.5018 (Medium)
üü° FMA Medium vs GTZAN: d = -0.2799 (Small)
üü¢ FMA Medium vs Indian Music: d = -0.0768 (Negligible)
üü° FMA Medium vs Ludwig: d = -0.3382 (Small)
üü° GTZAN vs Indian Music: d = 0.2237 (Small)
ü

## 8. Violin Plot Comparison

Show distribution shapes and probability density for key features across datasets.

In [None]:
def plot_violin_comparison(df, feature, output_path):
    """
    Draw one violin per dataset for `feature`, with quartile lines inside and
    the individual observations overlaid as faint points, then save the
    figure to `output_path` and display it.
    """
    datasets_order = ['GTZAN', 'FMA Small', 'FMA Medium', 'Indian Music', 'Ludwig']
    plot_df = df[df['dataset'].isin(datasets_order)]

    fig, ax = plt.subplots(figsize=(14, 8))

    # Arguments common to both seaborn layers
    layer_kwargs = dict(data=plot_df, x='dataset', y=feature,
                        order=datasets_order, ax=ax)

    # Density shape with quartile markers
    sns.violinplot(palette='Set2', inner='quartile', **layer_kwargs)

    # Raw points on top, kept very transparent
    sns.stripplot(color='black', alpha=0.1, size=2, **layer_kwargs)

    # Titles, axis labels and grid
    ax.set_title(f'Distribution Shape Comparison: {feature}\nViolin Plot with Quartiles',
                 fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel('Dataset', fontsize=14, fontweight='bold')
    ax.set_ylabel(f'{feature} Value', fontsize=14, fontweight='bold')
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    plt.xticks(rotation=15, ha='right')
    plt.tight_layout()

    # Write to disk, report, display
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: {os.path.basename(output_path)}")
    plt.show()

# Render and save a violin plot for each of the selected key features
print("\nüìä Generating Violin Plots for Distribution Shape Analysis...\n")

key_features_violin = ['rms_mean', 'spec_centroid_mean', 'tempo']
for feature in key_features_violin:
    png_name = f'violin_comparison_{feature}.png'
    output_path = os.path.join(output_dir, png_name)
    plot_violin_comparison(combined_df, feature, output_path)

## 9. 2D Feature Space Visualization

Visualize dataset separation in 2D feature space to assess clustering by source.

In [None]:
def plot_2d_feature_space(df, feature_x, feature_y, output_path):
    """
    Create 2D scatter plot showing dataset separation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'dataset' column plus the two feature columns.
    feature_x, feature_y : str
        Columns plotted on the x and y axes.
    output_path : str
        File path the figure is written to (300 dpi).

    Each dataset found in `df` is drawn in its own colour. The palette is
    cycled, so any number of datasets can be plotted. The figure is saved,
    shown, and then closed so repeated calls do not accumulate open figures.
    """
    fig, ax = plt.subplots(figsize=(14, 10))

    # Create scatter plot with different colors for each dataset
    dataset_names = df['dataset'].unique()
    palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']

    for i, dataset in enumerate(dataset_names):
        subset = df[df['dataset'] == dataset]
        # Wrap around the palette instead of raising IndexError when there
        # are more datasets than colours.
        ax.scatter(subset[feature_x], subset[feature_y],
                   color=palette[i % len(palette)], label=dataset,
                   alpha=0.5, s=20, edgecolors='none')

    # Styling
    ax.set_title(f'2D Feature Space: Dataset Separation\n{feature_x} vs {feature_y}',
                 fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel(feature_x, fontsize=14, fontweight='bold')
    ax.set_ylabel(feature_y, fontsize=14, fontweight='bold')
    ax.legend(title='Dataset', fontsize=11, title_fontsize=12, loc='best')
    ax.grid(alpha=0.3, linestyle='--')
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: {os.path.basename(output_path)}")
    plt.show()
    plt.close(fig)  # release the figure; this function is called once per feature pair

# Render and save one scatter plot per feature pair
print("\nüìä Generating 2D Feature Space Visualizations...\n")

# (x-axis feature, y-axis feature)
feature_pairs = [
    ('rms_mean', 'spec_centroid_mean'),
    ('tempo', 'zcr_mean'),
    ('spec_centroid_mean', 'spec_rolloff_mean'),
]

for feat_x, feat_y in feature_pairs:
    png_name = f'2d_scatter_{feat_x}_vs_{feat_y}.png'
    output_path = os.path.join(output_dir, png_name)
    plot_2d_feature_space(combined_df, feat_x, feat_y, output_path)

## 10. Coefficient of Variation (CV) Analysis

Measure relative variability within each dataset to assess consistency.

**Formula:** CV = (σ / μ) × 100%

**Interpretation:**
- CV < 20%: Low variability (homogeneous)
- 20% ≤ CV < 50%: Moderate variability
- CV ≥ 50%: High variability (heterogeneous)

In [9]:
# Coefficient of variation (std relative to the mean's magnitude) per
# feature and dataset.
print("\n" + "="*100)
print("üìä COEFFICIENT OF VARIATION (CV) ANALYSIS")
print("="*100)

cv_results = []

for feature in bias_features:
    print(f"\n{'='*100}")
    print(f"Feature: {feature}")
    print(f"{'='*100}")

    for dataset in combined_df['dataset'].unique():
        subset = combined_df[combined_df['dataset'] == dataset][feature].dropna()
        mean_val = subset.mean()
        std_val = subset.std()

        # Divide by |mean|: with a signed mean, a feature centred below zero
        # would get a negative CV and always be classified as "Low".
        # A zero or missing mean leaves the ratio undefined, so report NaN
        # rather than a misleading 0%.
        if pd.isna(mean_val) or pd.isna(std_val) or mean_val == 0:
            cv = np.nan
        else:
            cv = (std_val / abs(mean_val)) * 100

        # Interpret CV
        if pd.isna(cv):
            variability = "Undefined"
            emoji = "n/a"
        elif cv < 20:
            variability = "Low"
            emoji = "üü¢"
        elif cv < 50:
            variability = "Moderate"
            emoji = "üü°"
        else:
            variability = "High"
            emoji = "üî¥"

        cv_results.append({
            'Feature': feature,
            'Dataset': dataset,
            'Mean': mean_val,
            'Std': std_val,
            'CV_%': cv,
            'Variability': variability
        })

        print(f"{emoji} {dataset}: CV = {cv:.2f}% ({variability})")

# Create CV DataFrame
cv_df = pd.DataFrame(cv_results)
print("\n" + "="*100)
print("üìä COEFFICIENT OF VARIATION SUMMARY")
print("="*100)
print(cv_df.to_string(index=False))


üìä COEFFICIENT OF VARIATION (CV) ANALYSIS

Feature: rms_mean
üî¥ FMA Small: CV = 53.83% (High)
üî¥ FMA Medium: CV = 52.37% (High)
üî¥ GTZAN: CV = 50.23% (High)
üü° Indian Music: CV = 44.09% (Moderate)
üü° Ludwig: CV = 45.12% (Moderate)

Feature: spec_centroid_mean
üü° FMA Small: CV = 37.37% (Moderate)
üü° FMA Medium: CV = 35.05% (Moderate)
üü° GTZAN: CV = 32.51% (Moderate)
üü° Indian Music: CV = 22.13% (Moderate)
üü° Ludwig: CV = 28.18% (Moderate)

Feature: spec_rolloff_mean
üü° FMA Small: CV = 40.02% (Moderate)
üü° FMA Medium: CV = 37.46% (Moderate)
üü° GTZAN: CV = 34.44% (Moderate)
üü° Indian Music: CV = 24.41% (Moderate)
üü° Ludwig: CV = 29.99% (Moderate)

Feature: tempo
üü° FMA Small: CV = 24.39% (Moderate)
üü° FMA Medium: CV = 24.02% (Moderate)
üü° GTZAN: CV = 23.54% (Moderate)
üü° Indian Music: CV = 23.76% (Moderate)
üü° Ludwig: CV = 23.10% (Moderate)

Feature: zcr_mean
üî¥ FMA Small: CV = 52.40% (High)
üü° FMA Medium: CV = 49.68% (Moderate)
üü° GTZAN: C

## 11. Save Results

In [10]:
# Write every results table to the output directory as CSV (no index column)
csv_exports = [
    (bias_results_df, 'bias_test_results.csv'),                # statistical test results
    (effect_size_df, 'effect_size_analysis.csv'),              # pairwise effect sizes
    (cv_df, 'coefficient_of_variation.csv'),                   # CV per feature/dataset
    (summary_df, 'descriptive_statistics_by_dataset.csv'),     # summary statistics
]

for table, csv_name in csv_exports:
    table.to_csv(os.path.join(output_dir, csv_name), index=False)
    print(f"‚úì Saved: {csv_name}")

‚úì Saved: bias_test_results.csv
‚úì Saved: effect_size_analysis.csv
‚úì Saved: coefficient_of_variation.csv
‚úì Saved: descriptive_statistics_by_dataset.csv


## 12. Bias Assessment Summary & Recommendations

In [11]:
# Final roll-up: verdict counts, biased features, effect sizes, recommendations
banner = "=" * 100
print("\n" + banner)
print("üìã STEP 1.6: DATASET BIAS CHECK - FINAL SUMMARY")
print(banner)

# How many features received each verdict
verdicts = bias_results_df['Significance']
strong_bias_count = int((verdicts == 'STRONG BIAS').sum())
moderate_bias_count = int((verdicts == 'MODERATE BIAS').sum())
no_bias_count = int((verdicts == 'NO BIAS').sum())
n_features = len(bias_features)

print("\nüîç BIAS DETECTION SUMMARY:")
print(f"   ‚Ä¢ Features with STRONG bias (p < 0.001): {strong_bias_count}/{n_features}")
print(f"   ‚Ä¢ Features with MODERATE bias (p < 0.05): {moderate_bias_count}/{n_features}")
print(f"   ‚Ä¢ Features with NO bias (p ‚â• 0.05): {no_bias_count}/{n_features}")

# Every feature whose test was significant at the 5% level
biased_features = bias_results_df.loc[bias_results_df['P-Value'] < 0.05, 'Feature'].tolist()
if not biased_features:
    print("\n‚úÖ NO SIGNIFICANT BIAS DETECTED")
else:
    print("\n‚ö†Ô∏è  BIASED FEATURES DETECTED:")
    for biased_name in biased_features:
        print(f"   ‚Ä¢ {biased_name}")

# Pairwise comparisons by effect-size class
effect_labels = effect_size_df['Effect_Size']
large_effects = int((effect_labels == 'Large').sum())
medium_effects = int((effect_labels == 'Medium').sum())

print("\nüìè EFFECT SIZE SUMMARY:")
print(f"   ‚Ä¢ Pairwise comparisons with LARGE effect (|d| ‚â• 0.8): {large_effects}")
print(f"   ‚Ä¢ Pairwise comparisons with MEDIUM effect (0.5 ‚â§ |d| < 0.8): {medium_effects}")

# Recommendations, chosen by the most severe verdict that occurred
print("\nüí° RECOMMENDATIONS:")

if strong_bias_count > 0:
    recommendation_lines = [
        "   ‚ö†Ô∏è  STRONG DATASET BIAS DETECTED:",
        "   1. Consider dataset-specific normalization (Z-score per dataset)",
        "   2. Apply domain adaptation techniques before clustering",
        "   3. Evaluate clustering results with stratified validation by dataset",
        "   4. Report dataset as a potential confounding variable",
        "   5. Consider training separate models per dataset or using dataset as a feature",
    ]
elif moderate_bias_count > 0:
    recommendation_lines = [
        "   ‚ö° MODERATE DATASET BIAS DETECTED:",
        "   1. StandardScaler normalization should suffice",
        "   2. Monitor cluster compositions for dataset imbalance",
        "   3. Validate that clusters represent genres, not datasets",
        "   4. Document bias levels in your report",
    ]
else:
    recommendation_lines = [
        "   ‚úÖ NO SIGNIFICANT BIAS:",
        "   1. Datasets are sufficiently homogeneous for combined analysis",
        "   2. Proceed with standard normalization and clustering",
        "   3. Dataset origin unlikely to be primary clustering factor",
    ]

for line in recommendation_lines:
    print(line)

print("\nüìä NEXT STEPS:")
for line in [
    "   1. Apply appropriate normalization based on bias findings",
    "   2. Proceed to PCA dimensionality reduction (Step 3)",
    "   3. During clustering, verify that dataset != cluster",
    "   4. Use cross-tabulation to check cluster-dataset relationships",
]:
    print(line)

print("\n" + banner)
print("‚úÖ STEP 1.6 COMPLETE: Dataset Bias Assessment Finished")
print(banner)


üìã STEP 1.6: DATASET BIAS CHECK - FINAL SUMMARY

üîç BIAS DETECTION SUMMARY:
   ‚Ä¢ Features with STRONG bias (p < 0.001): 6/6
   ‚Ä¢ Features with MODERATE bias (p < 0.05): 0/6
   ‚Ä¢ Features with NO bias (p ‚â• 0.05): 0/6

‚ö†Ô∏è  BIASED FEATURES DETECTED:
   ‚Ä¢ rms_mean
   ‚Ä¢ spec_centroid_mean
   ‚Ä¢ spec_rolloff_mean
   ‚Ä¢ tempo
   ‚Ä¢ zcr_mean
   ‚Ä¢ mfcc1_mean

üìè EFFECT SIZE SUMMARY:
   ‚Ä¢ Pairwise comparisons with LARGE effect (|d| ‚â• 0.8): 0
   ‚Ä¢ Pairwise comparisons with MEDIUM effect (0.5 ‚â§ |d| < 0.8): 6

üí° RECOMMENDATIONS:
   ‚ö†Ô∏è  STRONG DATASET BIAS DETECTED:
   1. Consider dataset-specific normalization (Z-score per dataset)
   2. Apply domain adaptation techniques before clustering
   3. Evaluate clustering results with stratified validation by dataset
   4. Report dataset as a potential confounding variable
   5. Consider training separate models per dataset or using dataset as a feature

üìä NEXT STEPS:
   1. Apply appropriate normalization base

## 13. Generate Bias Report Heatmap

Create a comprehensive heatmap showing all statistical test results.

In [None]:
# Create pivot table for heatmap: one row per feature holding its
# Kruskal-Wallis p-value
pivot_df = bias_results_df.pivot_table(
    index='Feature',
    values='P-Value',
    aggfunc='first'
)

# Transform p-values for better visualization (log scale).
# Very small p-values underflow to exactly 0.0, and -log10(0) is +inf, which
# breaks the colour scale and the cell annotations. Raise zeros to the
# smallest positive float first; every non-zero p-value is left untouched.
smallest_positive_p = np.nextafter(0.0, 1.0)
pivot_df_log = -np.log10(pivot_df.clip(lower=smallest_positive_p))

# Create heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pivot_df_log, annot=True, fmt='.2f', cmap='RdYlGn_r',
            cbar_kws={'label': '-log10(p-value)'},
            linewidths=1, linecolor='white', ax=ax)

# Add significance thresholds (3.0 and 1.3 are -log10 of 0.001 and 0.05)
ax.axhline(y=0, color='blue', linewidth=2, linestyle='--', alpha=0.5)
ax.text(0.5, -0.5, 'p < 0.001: Strong Bias (> 3.0)', fontsize=10, ha='center')
ax.text(0.5, -0.8, 'p < 0.05: Moderate Bias (> 1.3)', fontsize=10, ha='center')

plt.title('Dataset Bias Detection: Statistical Significance Heatmap\n-log10(p-value) from Kruskal-Wallis Test',
          fontsize=14, fontweight='bold', pad=20)
plt.ylabel('Feature', fontsize=12, fontweight='bold')
plt.xlabel('')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'bias_heatmap.png'), dpi=300, bbox_inches='tight')
print("‚úì Saved: bias_heatmap.png")
plt.show()