# Feature Selection

Picking the best features from the 32 features that are extracted in mfcc+envelope_extraction.ipynb


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import os
import librosa
from pathlib import Path
from tqdm import tqdm

def analyze_feature_importance(X, y):
    """
    Analyze feature importance using multiple methods
    """
    # Create feature names for better interpretation
    feature_names = (
        [f'mfcc_{i+1}' for i in range(14)] +  # 14 MFCC features
        [f'delta_{i+1}' for i in range(14)] +  # 14 delta features
        ['pre_max_deriv', 'post_max_deriv', 'flatness', 'temporal_centroid']  # 4 envelope features
    )
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # 1. PCA Analysis
    pca = PCA()
    pca.fit(X_scaled)
    
    # Calculate feature importance based on PCA components
    feature_importance_pca = np.abs(pca.components_[0])  # Using first principal component
    
    # 2. Random Forest Feature Importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_scaled, y)
    feature_importance_rf = rf.feature_importances_
    
    # 3. Correlation Analysis
    correlation_matrix = np.corrcoef(X_scaled.T)
    
    # Create results DataFrame
    results_df = pd.DataFrame({
        'Feature': feature_names,
        'PCA_Importance': feature_importance_pca,
        'RF_Importance': feature_importance_rf
    })
    
    # Sort by Random Forest importance
    results_df = results_df.sort_values('RF_Importance', ascending=False)
    
    # Plotting
    plt.figure(figsize=(15, 10))
    
    # Plot 1: Feature Importance
    plt.subplot(2, 1, 1)
    plt.bar(range(len(feature_importance_rf)), results_df['RF_Importance'])
    plt.xticks(range(len(feature_importance_rf)), results_df['Feature'], rotation=45, ha='right')
    plt.title('Feature Importance (Random Forest)')
    plt.tight_layout()
    
    # Plot 2: Correlation Heatmap
    plt.subplot(2, 1, 2)
    sns.heatmap(correlation_matrix, xticklabels=feature_names, yticklabels=feature_names, 
                cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    
    # Print top features
    print("\nTop 10 Most Important Features (Random Forest):")
    print(results_df[['Feature', 'RF_Importance']].head(10))
    
    print("\nPCA Explained Variance Ratio:")
    print(f"First 5 components explain: {pca.explained_variance_ratio_[:5].cumsum()[-1]:.2%} of variance")
    
    # Identify highly correlated features
    print("\nHighly Correlated Feature Pairs:")
    for i in range(len(feature_names)):
        for j in range(i+1, len(feature_names)):
            if abs(correlation_matrix[i,j]) > 0.8:  # Threshold for high correlation
                print(f"{feature_names[i]} & {feature_names[j]}: {correlation_matrix[i,j]:.3f}")
    
    return results_df, correlation_matrix, pca

# Load the data
X = np.load('../../extracted_features/features/mfcc_env_features.npy')
y = np.load('../../extracted_features/labels/mfcc_env_labels.npy')

# Run the analysis
results_df, correlation_matrix, pca = analyze_feature_importance(X, y)

os.makedirs('../../visualization', exist_ok=True)

# Save the plot
plt.savefig('../../visualization/feature_extraction_analysis.png')
plt.close()


Top 10 Most Important Features (Random Forest):
     Feature  RF_Importance
1     mfcc_2       0.086723
15   delta_2       0.085599
2     mfcc_3       0.068139
3     mfcc_4       0.044547
30  flatness       0.041299
4     mfcc_5       0.036626
6     mfcc_7       0.034778
16   delta_3       0.034503
9    mfcc_10       0.033933
0     mfcc_1       0.030908

PCA Explained Variance Ratio:
First 5 components explain: 39.10% of variance

Highly Correlated Feature Pairs:
pre_max_deriv & post_max_deriv: -0.896


In [2]:
def extract_mfcc_features_expanded(segment_info_path, segments_dir='segments', n_mfcc=14, features_output_dir='../../extracted_features/features/mfcc_features.npy', labels_output_dir='../../extracted_features/labels/mfcc_labels.npy'):
    """
    Extract expanded feature set including MFCCs, their deltas, and envelope descriptors.
    Added robustness checks for empty or corrupted audio segments.
    """
    # Load segment info
    metadata = pd.read_csv(segment_info_path)
    
    # Update paths to use the specified segments directory
    segments_path = Path(segments_dir)
    metadata['segment_path'] = metadata['segment_path'].apply(
        lambda x: str(segments_path / Path(x).name))
    
    # Initialize arrays to store features and labels
    features = []
    labels = []
    
    print(f"Extracting expanded feature set from {segments_dir}...")
    for idx, row in tqdm(metadata.iterrows(), total=len(metadata)):
        try:
            # Load audio segment
            y, sr = librosa.load(row['segment_path'])
            
            # Check if the audio segment is valid
            if len(y) == 0:
                print(f"Skipping empty audio file: {row['segment_path']}")
                metadata = metadata.drop(idx)
                continue
                
            # 1. Extract MFCCs and their statistics
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            mfcc_mean = np.mean(mfcc, axis=1)
            
            # 2. Compute MFCC deltas (first derivatives)
            mfcc_delta = librosa.feature.delta(mfcc)
            mfcc_delta_mean = np.mean(mfcc_delta, axis=1)
            
            # 3. Extract envelope-based descriptors
            # Find the amplitude envelope
            envelope = np.abs(y)
            
            # Safety check for empty envelope
            if len(envelope) == 0:
                print(f"Skipping file with empty envelope: {row['segment_path']}")
                metadata = metadata.drop(idx)
                continue
            
            # Find the maximum amplitude and its position
            max_amp_pos = np.argmax(envelope)
            max_amp = envelope[max_amp_pos]
            
            # 3.1 Maximum derivative before the maximum amplitude
            pre_max_deriv = 0
            if max_amp_pos > 0:
                pre_envelope = envelope[:max_amp_pos]
                if len(pre_envelope) > 1:  # Need at least 2 points for diff
                    pre_max_deriv = np.max(np.diff(pre_envelope))
            
            # 3.2 Derivative after the maximum amplitude
            post_max_deriv = 0
            if max_amp_pos < len(envelope)-1:
                post_envelope = envelope[max_amp_pos:]
                if len(post_envelope) > 1:  # Need at least 2 points for diff
                    post_max_deriv = np.min(np.diff(post_envelope))
            
            # 3.3 Temporal centroid
            times = np.arange(len(y))
            # Avoid division by zero
            env_sum = np.sum(envelope)
            if env_sum > 0:
                temporal_centroid = np.sum(times * envelope) / env_sum
                temporal_centroid_ratio = temporal_centroid / len(y)
            else:
                temporal_centroid_ratio = 0.5  # Default to middle if envelope is all zeros
            
            # 3.4 Flatness coefficient (spectral flatness as a proxy)
            # Handle potential warnings from librosa
            with np.errstate(divide='ignore', invalid='ignore'):
                flatness = librosa.feature.spectral_flatness(y=y)[0].mean()
                flatness = 0.0 if np.isnan(flatness) else flatness
            
            # Combine all features
            feature_vector = np.concatenate([
                mfcc_mean,                    # 14 features
                mfcc_delta_mean,              # 14 features
                [pre_max_deriv,               # 1 feature
                 post_max_deriv,              # 1 feature
                 flatness,                    # 1 feature
                 temporal_centroid_ratio]      # 1 feature
            ])
            
            features.append(feature_vector)
            labels.append(row['instrument_label'])
            
        except Exception as e:
            print(f"Error processing {row['segment_path']}: {str(e)}")
            metadata = metadata.drop(idx)
            continue
    
    # Convert to numpy arrays
    X = np.array(features)
    y = np.array(labels)
    
    # Print summary of processing
    print(f"\nProcessing complete:")
    print(f"Successfully processed: {len(features)} segments")
    print(f"Failed/Skipped: {len(metadata.index) - len(features)} segments")
    
    # Create features directory if it doesn't exist
    # output_dir = Path('features')
    # output_dir.mkdir(exist_ok=True, parents=True)
    
    # Save features and labels with directory-specific names
    # dir_suffix = '_augmented' if segments_dir == 'augmentedSegments' else ''
    # np.save(output_dir / f'mfcc_features_expanded{dir_suffix}.npy', X)
    # np.save(output_dir / f'labels_expanded{dir_suffix}.npy', y)
    
    np.save(features_output_dir, X)
    np.save(labels_output_dir, y)
    
    return X, y, metadata

In [3]:
def extract_mfcc_features_optimized(segment_info_path, segments_dir='segments', n_mfcc=14, features_output_dir='../../extracted_features/features/mfcc_features.npy', labels_output_dir='../../extracted_features/labels/mfcc_labels.npy'):
    """
    Extract only the most important features based on our analysis.
    """
    # First, get all features
    X, y, metadata = extract_mfcc_features_expanded(
        segment_info_path, 
        segments_dir, 
        n_mfcc,
        features_output_dir,
        labels_output_dir
    )
    
    # Debug prints
    print("Shape of X:", X.shape)
    print("Type of X:", type(X))
    print("First few elements of X:", X[:5])
    
    # Load the saved features to verify they were saved correctly
    X = np.load(features_output_dir)
    print("Shape of loaded X:", X.shape)
    
    # Make sure X is 2D
    if len(X.shape) == 1:
        print("Reshaping X...")
        X = X.reshape(-1, 34)  # 34 features: 14 MFCCs + 14 deltas + 4 envelope features
        print("New shape of X:", X.shape)
    
    # Define indices of important features based on analysis
    important_feature_indices = [
        1,  # mfcc_2
        15, # delta_2
        2,  # mfcc_3
        3,  # mfcc_4
        30, # flatness
        4,  # mfcc_5
        6,  # mfcc_7
        16, # delta_3
        9,  # mfcc_10
        0,  # mfcc_1
        29, # temporal_centroid
        28  # pre_max_deriv
    ]
    
    # Select only the important features
    X_selected = X[:, important_feature_indices]
    
    # Save the selected features
    np.save(features_output_dir, X_selected)
    np.save(labels_output_dir, y)
    
    print(f"Final shape of X_selected: {X_selected.shape}")
    return X_selected, y, metadata

In [4]:
segment_info_path = '../../segment_info/segment_info.csv'
augmented_segment_info_path = '../../segment_info/augmented_segment_info.csv'

In [5]:
X_mfcc_extracted, y_mfcc_extracted, metadata_mfcc_extracted = extract_mfcc_features_optimized(
    segment_info_path,
    segments_dir='../../segments',
    features_output_dir='../../extracted_features/features/mfcc_extracted_features.npy',
    labels_output_dir='../../extracted_features/labels/mfcc_extracted_labels.npy'
)

X_mfcc_extracted_aug, y_mfcc_extracted_aug, metadata_mfcc_extracted_aug = extract_mfcc_features_optimized(
    augmented_segment_info_path,
    segments_dir='../../augmentedSegments',
    features_output_dir='../../extracted_features/features/mfcc_extracted_aug_features.npy',
    labels_output_dir='../../extracted_features/labels/mfcc_extracted_aug_labels.npy'
)

Extracting expanded feature set from ../../segments...


100%|██████████| 5714/5714 [00:11<00:00, 482.97it/s]



Processing complete:
Successfully processed: 5714 segments
Failed/Skipped: 0 segments
Shape of X: (5714, 32)
Type of X: <class 'numpy.ndarray'>
First few elements of X: [[-3.68122345e+02  1.36077242e+01  5.19156027e+00  6.92084074e+00
  -2.02500820e+01 -9.85776424e+00 -2.29979496e+01  6.71332121e+00
   4.09922504e+00 -1.35043945e+01 -1.25964928e+01 -4.04153204e+00
  -7.45831156e+00 -6.52279091e+00 -1.65836639e+01  8.35736942e+00
   9.07826841e-01  1.26167294e-02  1.16370177e+00 -3.48212808e-01
   3.92646998e-01  8.85657873e-03  1.27106011e-01  3.27873826e-01
  -5.89539558e-02 -1.78510681e-01 -5.40644050e-01 -4.96054739e-01
   1.26419216e-01 -1.32621095e-01  9.62158144e-02  2.86268803e-01]
 [-4.62650421e+02  6.26236954e+01  6.74965048e+00  6.19364166e+00
  -1.31714792e+01 -1.31292267e+01 -2.00550766e+01  5.50945950e+00
   6.36978912e+00 -8.63569736e+00 -1.24239960e+01 -5.82191420e+00
  -1.06630611e+01 -7.98821878e+00 -2.25514851e+01  7.56658888e+00
   9.36554670e-01  4.88572001e-01  1.

100%|██████████| 34284/34284 [01:06<00:00, 517.71it/s]


Processing complete:
Successfully processed: 34284 segments
Failed/Skipped: 0 segments
Shape of X: (34284, 32)
Type of X: <class 'numpy.ndarray'>
First few elements of X: [[-3.68122345e+02  1.36077242e+01  5.19156027e+00  6.92084074e+00
  -2.02500820e+01 -9.85776424e+00 -2.29979496e+01  6.71332121e+00
   4.09922504e+00 -1.35043945e+01 -1.25964928e+01 -4.04153204e+00
  -7.45831156e+00 -6.52279091e+00 -1.65836639e+01  8.35736942e+00
   9.07826841e-01  1.26167294e-02  1.16370177e+00 -3.48212808e-01
   3.92646998e-01  8.85657873e-03  1.27106011e-01  3.27873826e-01
  -5.89539558e-02 -1.78510681e-01 -5.40644050e-01 -4.96054739e-01
   1.26419216e-01 -1.32621095e-01  9.62158144e-02  2.86268803e-01]
 [-4.62650421e+02  6.26236954e+01  6.74965048e+00  6.19364166e+00
  -1.31714792e+01 -1.31292267e+01 -2.00550766e+01  5.50945950e+00
   6.36978912e+00 -8.63569736e+00 -1.24239960e+01 -5.82191420e+00
  -1.06630611e+01 -7.98821878e+00 -2.25514851e+01  7.56658888e+00
   9.36554670e-01  4.88572001e-01  


