# Railway Crack Detection - Feature Analysis

Extract and analyze acoustic features for crack detection.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from tqdm import tqdm

sys.path.append('../')

from src.preprocessing.audio_loader import AudioLoader
from src.preprocessing.noise_filter import NoiseFilter
from src.feature_extraction.mfcc_extractor import MFCCExtractor
from src.feature_extraction.spectral_features import SpectralFeatureExtractor
from src.feature_extraction.fractal_analysis import FractalAnalyzer
from src.utils.audio_utils import load_audio_files_from_directory

# Apply one matplotlib style sheet globally so every figure drawn later in
# the notebook shares the same look.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably exist only in
# newer matplotlib releases (older ones used 'seaborn-darkgrid') — confirm
# against the pinned matplotlib version.
plt.style.use('seaborn-v0_8-darkgrid')
# Reaching this line means all imports above resolved without error.
print('✅ Imports successful')

## 1. Load Dataset

In [None]:
# Dataset locations (relative paths)
HEALTHY_DIR = '../data/raw/healthy'
DEFECTIVE_DIR = '../data/raw/defective'

# Audio / analysis parameters reused by the later cells
SAMPLE_RATE = 22050
N_MFCC = 20
N_FFT = 2048
HOP_LENGTH = 512

# Load both classes; the loader's second return value is not needed here
healthy_audio, _ = load_audio_files_from_directory(HEALTHY_DIR, SAMPLE_RATE)
defective_audio, _ = load_audio_files_from_directory(DEFECTIVE_DIR, SAMPLE_RATE)

# Report how many recordings were found for each class
n_healthy = len(healthy_audio)
n_defective = len(defective_audio)
print(f'Loaded {n_healthy} healthy and {n_defective} defective samples')

## 2. Preprocessing

In [None]:
# One filter instance is shared by both classes
noise_filter = NoiseFilter(sample_rate=SAMPLE_RATE)

# Run every clip through the filter, with a progress bar per class
print('Preprocessing audio...')
healthy_clean = list(map(noise_filter.preprocess, tqdm(healthy_audio)))
defective_clean = list(map(noise_filter.preprocess, tqdm(defective_audio)))

print('✅ Preprocessing complete')

## 3. Feature Extraction

In [None]:
# Initialize extractors
# All three are built from the configuration constants defined in the
# dataset-loading cell and are reused for every clip by extract_features.
# NOTE(review): arguments are passed positionally — presumably
# (sample_rate, n_mfcc, n_fft, hop_length) and (sample_rate, n_fft, hop_length);
# confirm against the constructors in src/feature_extraction.
mfcc_extractor = MFCCExtractor(SAMPLE_RATE, N_MFCC, N_FFT, HOP_LENGTH)
spectral_extractor = SpectralFeatureExtractor(SAMPLE_RATE, N_FFT, HOP_LENGTH)
fractal_analyzer = FractalAnalyzer(SAMPLE_RATE)

def extract_features(audio_list, label):
    """Build one combined feature vector per clip in ``audio_list``.

    Each clip is passed to the module-level MFCC, spectral and fractal
    extractors (in that order) and the three results are concatenated.

    Args:
        audio_list: Iterable of audio signals to process.
        label: Class name, used only in the progress-bar description.

    Returns:
        ``np.array`` built from the per-clip vectors, in input order
        (one row per clip when all vectors have the same length).
    """
    def _combined_vector(signal):
        # Order matters: downstream feature names assume MFCC, spectral, fractal
        parts = (
            mfcc_extractor.extract_full_features(signal),
            spectral_extractor.extract_all_features(signal),
            fractal_analyzer.extract_all_fractal_features(signal),
        )
        return np.concatenate(parts)

    progress = tqdm(audio_list, desc=f'Extracting {label} features')
    return np.array([_combined_vector(signal) for signal in progress])

# Run the extraction once per class
print('Extracting features...')
healthy_features = extract_features(healthy_clean, 'Healthy')
defective_features = extract_features(defective_clean, 'Defective')

# Report the healthy matrix dimensions: (samples, features per sample)
feature_shape = healthy_features.shape
print(f'\nFeature shape: {feature_shape}')
print(f'Total features per sample: {feature_shape[1]}')

## 4. Feature Statistics

In [None]:
# Create feature DataFrame
# Column layout mirrors the concatenation order used in extract_features:
# MFCC statistics first, then spectral features, then fractal measures.
n_mfcc_stats = 6  # statistics computed per MFCC coefficient
n_mfcc_features = N_MFCC * n_mfcc_stats  # 120 with the default N_MFCC = 20
n_spectral_features = 13
fractal_feature_names = ['Higuchi_FD', 'Petrosian_FD', 'Katz_FD', 'Hurst_Exp']
n_fractal_features = len(fractal_feature_names)

feature_names = (
    [f'MFCC_{i}' for i in range(n_mfcc_features)] +
    [f'Spectral_{i}' for i in range(n_spectral_features)] +
    fractal_feature_names
)

# Combine all features: healthy rows first (label 0), then defective (label 1)
all_features = np.vstack([healthy_features, defective_features])
labels = np.array([0]*len(healthy_features) + [1]*len(defective_features))

# Fail early with a readable message if the extractors produce a different
# number of features than the names above describe (pandas would raise a
# less specific ValueError at the DataFrame constructor otherwise).
if all_features.shape[1] != len(feature_names):
    raise ValueError(
        f'Feature matrix has {all_features.shape[1]} columns but '
        f'{len(feature_names)} feature names were generated '
        f'({n_mfcc_features} MFCC + {n_spectral_features} spectral + '
        f'{n_fractal_features} fractal)'
    )

# Create DataFrame with a numeric label and a readable class name per row
df_features = pd.DataFrame(all_features, columns=feature_names)
df_features['label'] = labels
df_features['label_name'] = df_features['label'].map({0: 'Healthy', 1: 'Defective'})

# Per-class summary statistics, transposed so each feature/statistic is a row
print('Feature statistics:')
print(df_features.groupby('label_name').describe().T)

## 5. Feature Distribution Analysis

In [None]:
# Overlay per-class histograms for a handful of representative features
key_features = ['MFCC_0', 'MFCC_20', 'Spectral_0', 'Higuchi_FD', 'Hurst_Exp']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# (legend label, numeric class label, bar colour), drawn in this order
class_specs = (('Healthy', 0, 'green'), ('Defective', 1, 'red'))

for ax, feature in zip(axes, key_features):
    # Skip names that are not present in the table; that panel stays empty
    if feature not in df_features.columns:
        continue
    for class_name, class_id, colour in class_specs:
        class_values = df_features.loc[df_features['label'] == class_id, feature]
        class_values.hist(ax=ax, bins=30, alpha=0.6, label=class_name, color=colour)
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(alpha=0.3)

# The 2x3 grid has six panels but only five features: drop the last one
fig.delaxes(axes[-1])

plt.suptitle('Feature Distributions: Healthy vs Defective', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 6. Feature Correlation

In [None]:
# Correlate the key features with each other and with the numeric class label
key_features_with_label = [*key_features, 'label']
corr_matrix = df_features[key_features_with_label].corr()

# Annotated heatmap, colour scale centred on zero correlation
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'label': 'Correlation'},
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 7. Save Processed Features

In [None]:
# Save features to processed directory
# Path is imported here because the notebook's import cell does not provide
# it; without this import the cell fails with NameError.
from pathlib import Path

output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Save the feature matrix and the label vector as numpy arrays
np.save(output_dir / 'features.npy', all_features)
np.save(output_dir / 'labels.npy', labels)

# Save feature names, one per line, in the same order as the matrix columns.
# An explicit encoding keeps the file identical across platforms.
with open(output_dir / 'feature_names.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(feature_names))

print(f'✅ Features saved to {output_dir}')
print(f'   - features.npy: {all_features.shape}')
print(f'   - labels.npy: {labels.shape}')

## Summary

This notebook:
- Preprocessed audio with noise filtering
- Extracted MFCC, spectral, and fractal features
- Analyzed feature distributions and correlations
- Saved processed features for model training

**Next:** Model training (Notebook 03)