# Music Genre Discovery - Dataset Analysis

**Project:** Unsupervised Music Genre Discovery Using Audio Feature Learning  
**Author:** Anirudh Sharma  
**Date:** November 2025

---

## Overview

This notebook performs comprehensive data analysis on the GTZAN music genre dataset, including:
- Data adequacy and quality checks
- Class balance analysis
- Descriptive statistical analysis
- Outlier detection and removal
- Missing value handling
- Distribution pattern analysis
- Percentile and quartile analysis
- Trimmed statistics
- Correlation analysis

## 1. Import Required Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import trim_mean
import warnings
import os

# Silence library warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# Global look-and-feel for every figure in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Output folder for the figures and CSV files written by later cells
os.makedirs('results', exist_ok=True)

# Report the versions of the core libraries in use
print("✓ Libraries imported successfully!")
library_versions = (
    ("pandas", pd.__version__),
    ("numpy", np.__version__),
    ("matplotlib", plt.matplotlib.__version__),
)
for library_name, library_version in library_versions:
    print(f"  - {library_name} version: {library_version}")

## 2. Load the Dataset

In [None]:
# Read the feature table from gtzan/features_30_sec.csv
df = pd.read_csv('gtzan/features_30_sec.csv')

# Headline numbers, framed by a banner
rule = "=" * 80
n_rows, n_cols = df.shape

print(rule)
print("DATASET LOADED SUCCESSFULLY")
print(rule)
print(f"\nDataset Shape: {df.shape}")
print(f"  - Total Samples: {n_rows}")
print(f"  - Total Columns: {n_cols}")
print(f"  - Total Features: {n_cols - 2}  (excluding 'filename' and 'label')")
print(f"\nGenres: {df['label'].nunique()}")
genre_names = sorted(df['label'].unique())
print(f"Genre List: {genre_names}")

# Preview of the data; left as the last expression so the notebook renders it
print("\n" + rule)
print("First 5 Rows of Dataset:")
print(rule)
df.head()

In [None]:
# Heading and rule in one call, then pandas' own summary of the frame
print("Dataset Information:", "=" * 80, sep="\n")
df.info()

## 3. Data Adequacy Check

In [None]:
# Column roles: every column except the file name and the genre label is a feature
label_col = 'label'
excluded_columns = ('filename', 'label')
features = [name for name in df.columns if name not in excluded_columns]

heading_rule = "=" * 80
print(heading_rule)
print("DATA ADEQUACY CHECK")
print(heading_rule)

# Headline sizes
total_samples = len(df)
total_features = len(features)
genres = df[label_col].unique()
num_genres = len(genres)

print("\n1. Dataset Size:")
print(f"   - Total samples: {total_samples}")
print(f"   - Total features: {total_features}")
print(f"   - Number of genres: {num_genres}")

# Check 1: more than 10 samples per feature
ratio = total_samples / total_features
ratio_is_adequate = ratio > 10
print(f"\n2. Sample-to-Feature Ratio: {ratio:.2f}")
print("   ✓ ADEQUATE: Good ratio for machine learning (>10)" if ratio_is_adequate
      else "   ⚠ WARNING: Low ratio, consider dimensionality reduction")

# Check 2: at least 50 samples per genre.
# NOTE: despite its name, this variable holds the AVERAGE samples per genre
# (total samples divided by the number of genres), not the true minimum.
min_samples_per_genre = total_samples / num_genres
genre_size_is_adequate = min_samples_per_genre >= 50
print(f"\n3. Average samples per genre: {min_samples_per_genre:.0f}")
print("   ✓ ADEQUATE: Sufficient samples per genre (≥50)" if genre_size_is_adequate
      else "   ⚠ WARNING: Low samples per genre")

# Both checks must pass for the dataset to count as adequate
adequacy_report = {
    'total_samples': total_samples,
    'total_features': total_features,
    'num_genres': num_genres,
    'sample_feature_ratio': ratio,
    'avg_samples_per_genre': min_samples_per_genre,
    'is_adequate': ratio_is_adequate and genre_size_is_adequate,
}

overall_verdict = '✓ ADEQUATE' if adequacy_report['is_adequate'] else '⚠ NEEDS ATTENTION'
print(f"\n4. Overall Assessment: {overall_verdict}")

## 4. Class Balance Analysis

In [None]:
banner = "=" * 80
print(banner)
print("CLASS BALANCE ANALYSIS")
print(banner)

# Per-genre sample counts (alphabetical) and their share of the dataset
class_counts = df[label_col].value_counts().sort_index()
class_percentages = (class_counts / len(df) * 100).round(2)

# One tidy table with a row per genre
distribution = pd.DataFrame({
    'Genre': class_counts.index,
    'Count': class_counts.values,
    'Percentage': class_percentages.values,
})

print("\nClass Distribution:")
print(distribution.to_string(index=False))

# Imbalance ratio = largest class size / smallest class size
max_count, min_count = class_counts.max(), class_counts.min()
imbalance_ratio = max_count / min_count

print("\nBalance Metrics:")
print(f"  - Maximum samples: {max_count}")
print(f"  - Minimum samples: {min_count}")
print(f"  - Imbalance ratio: {imbalance_ratio:.2f}")

# Thresholds: up to 1.5 is balanced, up to 3 is moderate, anything else is high
if imbalance_ratio <= 1.5:
    balance_verdict = "  ✓ BALANCED: Dataset is well balanced (ratio ≤ 1.5)"
elif imbalance_ratio <= 3:
    balance_verdict = "  ⚠ MODERATELY IMBALANCED: Consider balancing techniques"
else:
    balance_verdict = "  ✗ HIGHLY IMBALANCED: Balancing required"
print(balance_verdict)

In [None]:
# Side-by-side figure: absolute counts on the left, percentage shares on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
count_ax, share_ax = axes

axis_label_style = dict(fontsize=12, fontweight='bold')
title_style = dict(fontsize=14, fontweight='bold')

# Left panel: bar chart of samples per genre
count_ax.bar(distribution['Genre'], distribution['Count'],
             color='skyblue', edgecolor='navy')
count_ax.set_xlabel('Genre', **axis_label_style)
count_ax.set_ylabel('Number of Samples', **axis_label_style)
count_ax.set_title('Class Distribution - Counts', **title_style)
count_ax.tick_params(axis='x', rotation=45)
count_ax.grid(axis='y', alpha=0.3)

# Right panel: pie chart with one Set3 colour per genre
colors = plt.cm.Set3(range(len(distribution)))
share_ax.pie(distribution['Percentage'], labels=distribution['Genre'],
             autopct='%1.1f%%', colors=colors, startangle=90)
share_ax.set_title('Class Distribution - Percentages', **title_style)

# Save before showing so the file is written from the fully drawn figure
plt.tight_layout()
plt.savefig('results/class_balance.png', dpi=300, bbox_inches='tight')
print("\n✓ Visualization saved: results/class_balance.png")
plt.show()

## 5. Descriptive Statistical Analysis

In [None]:
section_rule = "=" * 80
print(section_rule)
print("DESCRIPTIVE STATISTICAL ANALYSIS")
print(section_rule)

# All feature columns, selected once and reused below
feature_frame = df[features]

# describe() output for every feature
desc_stats = feature_frame.describe()

# Extra measures that describe() does not report; one row per statistic after .T
first_quartile = feature_frame.quantile(0.25)
third_quartile = feature_frame.quantile(0.75)
additional_stats = pd.DataFrame({
    'variance': feature_frame.var(),
    'skewness': feature_frame.skew(),
    'kurtosis': feature_frame.kurtosis(),
    'range': feature_frame.max() - feature_frame.min(),
    'iqr': third_quartile - first_quartile,
}).T

# Stack both tables: statistics as rows, features as columns
full_stats = pd.concat([desc_stats, additional_stats])

print("\nKey Statistics (first 5 features):")
print(full_stats.iloc[:, :5].to_string())
print("\n... (showing first 5 features only)")

# The complete table goes to disk
full_stats.to_csv('results/descriptive_statistics.csv')
print("\n✓ Full statistics saved: results/descriptive_statistics.csv")

In [None]:
# Summary of the most extreme distribution shapes among the features.
# Features are ranked by the MAGNITUDE of each statistic, but the signed value
# is printed so the direction (left/right skew, heavy/light tails) stays visible.
print("\nSummary of Statistical Properties:")
print("=" * 80)

# Five features with the largest absolute skewness
feature_skew = df[features].skew()
top_skew_names = feature_skew.abs().sort_values(ascending=False).head(5).index
high_skew = feature_skew.loc[top_skew_names]
print("\nTop 5 Features with Highest Skewness:")
for feat, skew_val in high_skew.items():
    print(f"  {feat}: {skew_val:.2f}")

# Five features with the largest absolute kurtosis (as returned by DataFrame.kurtosis)
feature_kurt = df[features].kurtosis()
top_kurt_names = feature_kurt.abs().sort_values(ascending=False).head(5).index
high_kurt = feature_kurt.loc[top_kurt_names]
print("\nTop 5 Features with Highest Kurtosis:")
for feat, kurt_val in high_kurt.items():
    print(f"  {feat}: {kurt_val:.2f}")

## 6. Missing Value Analysis

In [None]:
print("=" * 80)
print("MISSING VALUE ANALYSIS")
print("=" * 80)

# Count missing entries per column and express them as a share of all rows
missing_count = df.isnull().sum()
missing_percentage = (missing_count / len(df)) * 100

missing_df = pd.DataFrame({
    'Feature': missing_count.index,
    'Missing_Count': missing_count.values,
    'Percentage': missing_percentage.values
})

# Keep only the columns that actually have gaps, worst first
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values(
    'Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(f"\nFound {len(missing_df)} features with missing values:")
    print(missing_df.to_string(index=False))

    # Fill missing values with mean
    print("\n→ Filling missing values with column means...")
    for feature in missing_df['Feature']:
        # A mean is only defined for numeric columns; the label (and any other
        # text column such as the file name) is left untouched.
        if feature == 'label' or not pd.api.types.is_numeric_dtype(df[feature]):
            continue
        # Assign the result back to the frame: calling fillna(inplace=True) on
        # df[feature] is chained assignment, which is deprecated and does not
        # update df when pandas Copy-on-Write is active.
        df[feature] = df[feature].fillna(df[feature].mean())

    print("✓ Missing values handled")
else:
    print("\n✓ No missing values found in the dataset!")

## 7. Outlier Detection and Analysis

In [None]:
def detect_outliers_iqr(data, feature):
    """Flag rows whose ``feature`` value lies outside the 1.5 * IQR fences.

    Args:
        data: DataFrame holding the column to inspect.
        feature: Name of the column to test.

    Returns:
        Tuple ``(outliers, Q1, Q3, IQR, lower_bound, upper_bound)`` where
        ``outliers`` is the index of the flagged rows and the remaining
        entries are the two quartiles, their difference and the two fences.
    """
    column = data[feature]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    # Strict comparisons: a value sitting exactly on a fence is not an outlier
    below = column < low_fence
    above = column > high_fence
    flagged_index = data[below | above].index

    return flagged_index, first_quartile, third_quartile, spread, low_fence, high_fence

print("=" * 80)
print("OUTLIER DETECTION (IQR Method)")
print("=" * 80)

# Per-feature outlier summary: count, share of rows, quartiles and fences
outlier_info = {}
n_rows = len(df)

for feature in features:
    outliers, Q1, Q3, IQR, lower, upper = detect_outliers_iqr(df, feature)
    n_outliers = len(outliers)
    outlier_info[feature] = {
        'count': n_outliers,
        # Guard against an empty frame instead of dividing by zero
        'percentage': (n_outliers / n_rows) * 100 if n_rows else 0.0,
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower,
        'upper_bound': upper
    }

# Sort by outlier percentage
sorted_features = sorted(outlier_info.items(),
                         key=lambda x: x[1]['percentage'],
                         reverse=True)

print("\nTop 10 features with most outliers:")
print(f"{'Feature':<35} {'Count':<10} {'Percentage':<12}")
print("-" * 60)
for feat, info in sorted_features[:10]:
    # Attach the percent sign before padding; padding the number first pushed
    # the sign to the far end of the column (e.g. "5.20        %").
    percentage_text = f"{info['percentage']:.2f}%"
    print(f"{feat:<35} {info['count']:<10} {percentage_text:<12}")

In [None]:
# The ten features with the largest outlier share, one boxplot each
features_to_plot = [name for name, _ in sorted_features[:10]]

# 5 x 2 grid, flattened so panels can be paired with features directly
fig, axes = plt.subplots(5, 2, figsize=(15, 20))
axes = axes.ravel()

for ax, feature in zip(axes, features_to_plot):
    summary = outlier_info[feature]
    df.boxplot(column=feature, ax=ax)
    # Title carries the outlier count and share for the feature
    ax.set_title(
        f'{feature}\n({summary["count"]} outliers, {summary["percentage"]:.1f}%)',
        fontsize=10,
        fontweight='bold',
    )
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('results/outlier_boxplots.png', dpi=300, bbox_inches='tight')
print("\n✓ Boxplots saved: results/outlier_boxplots.png")
plt.show()

## 8. Distribution Pattern Analysis

In [None]:
print("=" * 80)
print("DISTRIBUTION PATTERN ANALYSIS")
print("=" * 80)

# Per-feature location, spread and shape statistics plus a normality verdict
distribution_info = {}

for feature in features:
    # Normality test (Shapiro-Wilk)
    try:
        sample_data = df[feature].dropna()
        # Test at most 5000 values; the fixed seed keeps the subsample reproducible
        if len(sample_data) > 5000:
            sample_data = sample_data.sample(5000, random_state=42)

        statistic, p_value = stats.shapiro(sample_data)
        is_normal = p_value > 0.05
    except Exception as exc:
        # Best effort: a feature whose test fails is reported and treated as
        # non-normal. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop the notebook.
        print(f"  ⚠ Normality test failed for {feature}: {exc}")
        is_normal = False
        p_value = 0

    distribution_info[feature] = {
        'mean': df[feature].mean(),
        'median': df[feature].median(),
        'std': df[feature].std(),
        'skewness': df[feature].skew(),
        'kurtosis': df[feature].kurtosis(),
        'is_normal': is_normal,
        'normality_p_value': p_value
    }

# Count normal distributions
normal_count = sum(1 for info in distribution_info.values() if info['is_normal'])

print(f"\nDistribution Summary:")
print(f"  - Total features: {len(features)}")
print(f"  - Normal distributions: {normal_count}")
print(f"  - Non-normal distributions: {len(features) - normal_count}")

In [None]:
# Histogram plus density curve for the first six features
sample_features = features[:6]

fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.ravel()

for ax, feature in zip(axes, sample_features):
    column = df[feature]
    shape_info = distribution_info[feature]

    # Counts on the left axis, KDE on a twin right axis of the same panel
    column.hist(bins=50, ax=ax, alpha=0.7, color='skyblue', edgecolor='black')
    ax2 = ax.twinx()
    column.plot(kind='kde', ax=ax2, color='red', linewidth=2)

    ax.set_xlabel('Value', fontweight='bold')
    ax.set_ylabel('Frequency', fontweight='bold')
    ax2.set_ylabel('Density', fontweight='bold', color='red')

    # Title shows the skewness and kurtosis computed in the previous cell
    ax.set_title(
        f"{feature}\n"
        f"Skew: {shape_info['skewness']:.2f}, "
        f"Kurt: {shape_info['kurtosis']:.2f}",
        fontsize=10,
        fontweight='bold',
    )
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('results/distribution_analysis.png', dpi=300, bbox_inches='tight')
print("\n✓ Distribution plots saved: results/distribution_analysis.png")
plt.show()

## 9. Percentile and Quartile Analysis

In [None]:
banner = "=" * 80
print(banner)
print("PERCENTILE AND QUARTILE ANALYSIS")
print(banner)


def _percentile_summary(values):
    """Return the mean, median, quartiles/percentiles and IQR of one column."""
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    return {
        'Mean': values.mean(),
        'Median (M)': values.median(),
        'Q1 (25th percentile)': lower_quartile,
        'Q3 (75th percentile)': upper_quartile,
        # P75 / P25 repeat Q3 / Q1 under their percentile names
        'P75 (75th percentile)': upper_quartile,
        'P25 (25th percentile)': lower_quartile,
        'IQR': upper_quartile - lower_quartile,
    }


# One summary per feature; transposed so features become rows
stats_dict = {feature: _percentile_summary(df[feature]) for feature in features}
stats_df = pd.DataFrame(stats_dict).T

print("\nPercentile Statistics (first 5 features):")
print(stats_df.head().to_string())
print("\n... (showing first 5 features)")

stats_df.to_csv('results/percentile_quartile_stats.csv')
print("\n✓ Full statistics saved: results/percentile_quartile_stats.csv")

## 10. Trimmed Statistics

In [None]:
section_rule = "=" * 80
print(section_rule)
print("TRIMMED STATISTICS (trim fraction: 0.1)")
print(section_rule)

# Fraction cut from each tail, and the matching percentile cut-offs
trim_fraction = 0.1
lower_p = trim_fraction * 100
upper_p = 100 - (trim_fraction * 100)

trimmed_stats = {}

for feature in features:
    data = df[feature].dropna().values

    # Values between the lower and upper percentile (inclusive) form the
    # trimmed sample used for the median and standard deviation below
    low_cut = np.percentile(data, lower_p)
    high_cut = np.percentile(data, upper_p)
    inside_cuts = (data >= low_cut) & (data <= high_cut)
    trimmed_data = data[inside_cuts]

    # The trimmed mean comes from scipy's trim_mean; the other trimmed
    # statistics are computed on the percentile-filtered sample
    trimmed_stats[feature] = {
        'Original_Mean': np.mean(data),
        'Trimmed_Mean': trim_mean(data, trim_fraction),
        'Original_Median': np.median(data),
        'Trimmed_Median': np.median(trimmed_data),
        'Original_Std': np.std(data),
        'Trimmed_Std': np.std(trimmed_data),
    }

# Features as rows, statistics as columns
trimmed_df = pd.DataFrame(trimmed_stats).T

print("\nTrimmed Statistics (first 5 features):")
print(trimmed_df.head().to_string())
print("\n... (showing first 5 features)")

trimmed_df.to_csv('results/trimmed_statistics.csv')
print("\n✓ Full statistics saved: results/trimmed_statistics.csv")

## 11. Correlation Analysis

In [None]:
banner = "=" * 80
print(banner)
print("CORRELATION ANALYSIS (Pearson method)")
print(banner)

# Pairwise Pearson correlations between all features
corr_matrix = df[features].corr(method='pearson')

# Walk the upper triangle (j > i) so each unordered pair appears exactly once
column_names = list(corr_matrix.columns)
corr_pairs = [
    {
        'Feature_1': first_name,
        'Feature_2': second_name,
        'Correlation': corr_matrix.iloc[i, j],
    }
    for i, first_name in enumerate(column_names)
    for j, second_name in enumerate(column_names[i + 1:], start=i + 1)
]

# Rank the pairs by correlation strength regardless of sign
corr_df = pd.DataFrame(corr_pairs)
corr_df['Abs_Correlation'] = corr_df['Correlation'].abs()
corr_df = corr_df.sort_values('Abs_Correlation', ascending=False)

print("\nTop 20 Feature Correlations:")
top_pairs = corr_df.head(20)
print(top_pairs[['Feature_1', 'Feature_2', 'Correlation']].to_string(index=False))

# The full matrix goes to disk
corr_matrix.to_csv('results/correlation_matrix.csv')
print("\n✓ Full correlation matrix saved: results/correlation_matrix.csv")

In [None]:
# Heatmap of the Pearson correlations, limited to the first 30 features at most
plt.figure(figsize=(20, 16))

n_features = min(30, len(features))
selected_features = features[:n_features]
corr_subset = df[selected_features].corr(method='pearson')

# Diverging palette centred on zero
heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(corr_subset, **heatmap_options)

heatmap_title = f'Feature Correlation Heatmap (Pearson)\n(First {n_features} features)'
plt.title(heatmap_title, fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('results/correlation_heatmap.png', dpi=300, bbox_inches='tight')
print("✓ Correlation heatmap saved: results/correlation_heatmap.png")
plt.show()

## 12. Outlier Removal and Clean Dataset Creation

In [None]:
print("=" * 80)
print("OUTLIER REMOVAL (IQR method)")
print("=" * 80)

initial_size = len(df)

# Row-level mask: True when the row is an outlier in at least one feature.
# It is built on df's own index so the element-wise OR below and the final
# selection stay aligned with the rows even when the index is not 0..n-1.
outlier_mask = pd.Series(False, index=df.index)

for feature in features:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Strictly outside the 1.5 * IQR fences for this feature
    feature_outliers = (df[feature] < lower) | (df[feature] > upper)
    outlier_mask = outlier_mask | feature_outliers

# Keep only the rows that were never flagged
cleaned_df = df[~outlier_mask].copy()

final_size = len(cleaned_df)
removed = initial_size - final_size
# Avoid dividing by zero when the input frame is empty
removed_percentage = (removed / initial_size) * 100 if initial_size else 0.0

print(f"\nOutlier Removal Summary:")
print(f"  - Initial samples: {initial_size}")
print(f"  - Samples removed: {removed} ({removed_percentage:.2f}%)")
print(f"  - Final samples: {final_size}")
print(f"  ✓ Cleaned dataset created")

# Save cleaned dataset
cleaned_df.to_csv('gtzan/features_30_sec_cleaned.csv', index=False)
print("\n✓ Cleaned dataset saved: gtzan/features_30_sec_cleaned.csv")

## 13. Summary and Conclusions

In [None]:
rule = "=" * 80
print(rule)
print("DATA ANALYSIS SUMMARY")
print(rule)

print("\n✓ Dataset Analysis Complete!")

# Headline results collected from the earlier cells
print("\nKey Findings:")
adequacy_text = '✓ ADEQUATE' if adequacy_report['is_adequate'] else '⚠ NEEDS ATTENTION'
print(f"  1. Dataset Adequacy: {adequacy_text}")
balance_text = '✓ BALANCED' if imbalance_ratio <= 1.5 else '⚠ IMBALANCED'
print(f"  2. Class Balance: {imbalance_ratio:.2f} ratio ({balance_text})")
print(f"  3. Missing Values: {len(missing_df)} features with missing values")
normal_share = normal_count / len(features) * 100
print(f"  4. Normal Distributions: {normal_count}/{len(features)} features ({normal_share:.1f}%)")
removed_share = (removed / initial_size) * 100
print(f"  5. Outliers Removed: {removed} samples ({removed_share:.2f}%)")

# Every artefact written by this notebook, in the order it was produced
generated_files = (
    "results/class_balance.png",
    "results/descriptive_statistics.csv",
    "results/outlier_boxplots.png",
    "results/distribution_analysis.png",
    "results/percentile_quartile_stats.csv",
    "results/trimmed_statistics.csv",
    "results/correlation_matrix.csv",
    "results/correlation_heatmap.png",
    "gtzan/features_30_sec_cleaned.csv",
)
print("\nGenerated Files:")
for generated_path in generated_files:
    print(f"  - {generated_path}")

print("\n" + rule)
print("Ready for Clustering Implementation!")
print("Proceed to: Code_Implementation.ipynb")
print(rule)