# Railway Crack Detection - Data Exploration

This notebook explores the railway acoustic emission dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
from pathlib import Path
import sys

# Add the parent directory to the import path so the `src` package can be imported
sys.path.append('../')

from src.utils.audio_utils import load_audio_files_from_directory, get_audio_stats
from src.utils.visualization import plot_waveform, plot_spectrogram, plot_mel_spectrogram

# Set style
# The matplotlib style sheet is applied first and the seaborn palette second.
# NOTE(review): presumably the order keeps the 'husl' colors from being
# overridden by the style sheet - confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Reached only if every import above (including the project-local src modules) succeeded
print('✅ Imports successful')

## 1. Load Dataset

In [None]:
# Loader settings: sample rate handed to the loader, plus one directory per class
SAMPLE_RATE = 22050
HEALTHY_DIR = '../data/raw/healthy'
DEFECTIVE_DIR = '../data/raw/defective'

# Healthy class: the loader returns the audio clips and the matching file list
print('Loading healthy rail samples...')
healthy_audio, healthy_files = load_audio_files_from_directory(HEALTHY_DIR, sample_rate=SAMPLE_RATE)

# Defective class: same loader, same sample rate
print('Loading defective rail samples...')
defective_audio, defective_files = load_audio_files_from_directory(DEFECTIVE_DIR, sample_rate=SAMPLE_RATE)

# Report how many clips were loaded per class and in total
n_healthy, n_defective = len(healthy_audio), len(defective_audio)
print(f'\nHealthy samples: {n_healthy}')
print(f'Defective samples: {n_defective}')
print(f'Total samples: {n_healthy + n_defective}')

## 2. Dataset Statistics

In [None]:
# Calculate statistics for all samples
def analyze_dataset(audio_list, label):
    """Build a per-clip statistics table for one class.

    Args:
        audio_list: Iterable of audio clips; each clip is passed to
            ``get_audio_stats`` together with the notebook-level ``SAMPLE_RATE``.
        label: Value stored under the ``'label'`` key of every row.

    Returns:
        A ``pandas.DataFrame`` with one row per clip. When ``audio_list`` is
        empty the frame has no rows and no columns.
    """
    def _labelled_stats(clip):
        # Assumes get_audio_stats returns a mutable mapping (e.g. a dict);
        # the label is written into that same object.
        record = get_audio_stats(clip, SAMPLE_RATE)
        record['label'] = label
        return record

    return pd.DataFrame([_labelled_stats(clip) for clip in audio_list])

# Analyze both classes: one statistics table per class
healthy_stats = analyze_dataset(healthy_audio, 'Healthy')
defective_stats = analyze_dataset(defective_audio, 'Defective')

# Combine into a single table with a fresh 0..n-1 index
all_stats = pd.concat([healthy_stats, defective_stats], ignore_index=True)

# Display summary
print('Dataset Statistics Summary:\n')
if all_stats.empty:
    # analyze_dataset returns a frame without a 'label' column when a class has
    # no clips, so grouping would raise KeyError if nothing was loaded at all.
    print('No audio samples were loaded; nothing to summarize.')
else:
    # Per-class descriptive statistics, transposed so the classes become columns
    print(all_stats.groupby('label').describe().T)

## 3. Visualize Sample Audio

In [None]:
# Plot the first healthy clip; skipped entirely when no healthy clips were loaded
if len(healthy_audio) > 0:
    sample_healthy = healthy_audio[0]

    # Waveform view
    fig = plot_waveform(
        sample_healthy,
        SAMPLE_RATE,
        'Healthy Rail - Waveform',
    )
    plt.show()

    # Spectrogram view
    fig = plot_spectrogram(
        sample_healthy,
        SAMPLE_RATE,
        title='Healthy Rail - Spectrogram',
    )
    plt.show()

In [None]:
# Plot the first defective clip; skipped entirely when no defective clips were loaded
if len(defective_audio) > 0:
    sample_defective = defective_audio[0]

    # Waveform view
    fig = plot_waveform(
        sample_defective,
        SAMPLE_RATE,
        'Defective Rail - Waveform',
    )
    plt.show()

    # Spectrogram view
    fig = plot_spectrogram(
        sample_defective,
        SAMPLE_RATE,
        title='Defective Rail - Spectrogram',
    )
    plt.show()

## 4. Class Distribution

In [None]:
# Plot class distribution: one bar per class, annotated with its sample count
fig, ax = plt.subplots(figsize=(8, 6))

classes = ['Healthy', 'Defective']
counts = [len(healthy_audio), len(defective_audio)]

bars = ax.bar(classes, counts, color=['green', 'red'], alpha=0.7)
ax.set_ylabel('Number of Samples')
ax.set_title('Class Distribution')
ax.grid(axis='y', alpha=0.3)

# Label each bar with its count. bar_label places the text a fixed number of
# points above the bar, so the gap looks the same for small and large datasets
# (a fixed offset in data units floats far above short bars).
ax.bar_label(
    bars,
    labels=[str(count) for count in counts],
    padding=3,
    fontsize=12,
    fontweight='bold',
)
# Extra headroom so the labels do not run into the title
ax.margins(y=0.1)

plt.tight_layout()
plt.show()

# Check for class imbalance (the ratio is undefined when a class is empty)
if min(counts) > 0:
    imbalance_ratio = max(counts) / min(counts)
    print(f'\nClass imbalance ratio: {imbalance_ratio:.2f}')
    if imbalance_ratio > 1.5:
        print('⚠️  Dataset is imbalanced. Consider using data augmentation.')
else:
    # Make the missing class visible instead of silently skipping the check
    print('\n⚠️  At least one class has no samples; imbalance ratio is undefined.')

## 5. Audio Duration Analysis

In [None]:
# Plot duration distribution: one histogram panel per class, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (class name used in the title, per-class statistics table, bar color)
duration_panels = [
    ('Healthy', healthy_stats, 'green'),
    ('Defective', defective_stats, 'red'),
]

for panel_ax, (class_name, class_stats, bar_color) in zip(axes, duration_panels):
    if 'duration' in class_stats.columns:
        panel_ax.hist(class_stats['duration'], bins=20, color=bar_color,
                      alpha=0.7, edgecolor='black')
    else:
        # A class with no clips yields a column-less frame, so indexing
        # 'duration' would raise KeyError; show a placeholder instead.
        panel_ax.text(0.5, 0.5, 'No samples', ha='center', va='center',
                      transform=panel_ax.transAxes)
    panel_ax.set_xlabel('Duration (s)')
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(f'{class_name} Rail - Duration Distribution')
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

This notebook explored the railway acoustic emission dataset:
- Loaded and analyzed audio samples
- Visualized waveforms and spectrograms
- Examined class distribution
- Analyzed audio duration statistics

**Next Steps:**
- Feature extraction (Notebook 02)
- Model training (Notebook 03)