# Railway Crack Detection - Model Training

Train and evaluate machine learning models for crack detection.

In [None]:
# Standard-library and third-party imports used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Put the parent directory on the import path so the `src` package imported
# below can be found; the path is relative to the current working directory.
sys.path.append('../')

# Project-local modules (must come after the sys.path tweak above)
from src.models.classifier import RailwayDefectClassifier
from src.models.ensemble import EnsembleClassifier
from src.evaluation.metrics import ModelEvaluator
from src.augmentation.diffusion_generator import DiffusionAudioGenerator
from src.utils.visualization import plot_confusion_matrix

# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8-darkgrid')
print('✅ Imports successful')

## 1. Load Processed Features

In [None]:
# Read the feature matrix, the label vector, and the feature names from the
# processed-data directory, then print a short summary of what was loaded.
data_dir = Path('../data/processed')

X, y = (np.load(data_dir / array_file) for array_file in ('features.npy', 'labels.npy'))

# One feature name per line of the text file
feature_names = (data_dir / 'feature_names.txt').read_text().splitlines()

print(f'Features shape: {X.shape}')
print(f'Labels shape: {y.shape}')
print(f'Number of features: {len(feature_names)}')
print('\nClass distribution:')
# Count the samples carrying each label value (0 and 1)
for class_name, class_id in (('Healthy', 0), ('Defective', 1)):
    print(f'  {class_name} ({class_id}): {np.sum(y == class_id)}')

## 2. Train-Test Split

In [None]:
# Hold-out fraction and random seed for the split
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Stratify on the labels so both partitions keep the class proportions of y
split_options = {'test_size': TEST_SIZE, 'random_state': RANDOM_STATE, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f'Training samples: {len(X_train)}')
print(f'Test samples: {len(X_test)}')
print('\nTraining class distribution:')
# Per-class sample counts within the training partition
for class_name, class_id in (('Healthy', 0), ('Defective', 1)):
    print(f'  {class_name}: {np.sum(y_train == class_id)}')

## 3. Train Random Forest Classifier

In [None]:
# Build the Random Forest wrapper and fit it on the training split
print('Training Random Forest...')
rf_params = {'n_estimators': 100, 'max_depth': 20, 'random_state': RANDOM_STATE}
rf_classifier = RailwayDefectClassifier(model_type='random_forest', **rf_params)
rf_classifier.train(X_train, y_train)
print('✅ Training complete')

# Score on the held-out split and print each metric to four decimals.
# Labels are left-padded to a common width so the values line up.
rf_metrics = rf_classifier.evaluate(X_test, y_test)
print('\nRandom Forest Performance:')
for metric_label, metric_key in (
    ('Accuracy:', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:', 'recall'),
    ('F1-Score:', 'f1_score'),
):
    print(f"  {metric_label:<11}{rf_metrics[metric_key]:.4f}")

## 4. Train XGBoost Classifier

In [None]:
# Build the XGBoost wrapper and fit it on the training split
print('Training XGBoost...')
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'random_state': RANDOM_STATE,
}
xgb_classifier = RailwayDefectClassifier(model_type='xgboost', **xgb_params)
xgb_classifier.train(X_train, y_train)
print('✅ Training complete')

# Score on the held-out split and print each metric to four decimals.
# Labels are left-padded to a common width so the values line up.
xgb_metrics = xgb_classifier.evaluate(X_test, y_test)
print('\nXGBoost Performance:')
for metric_label, metric_key in (
    ('Accuracy:', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:', 'recall'),
    ('F1-Score:', 'f1_score'),
):
    print(f"  {metric_label:<11}{xgb_metrics[metric_key]:.4f}")

## 5. Train SVM Classifier

In [None]:
# Build the SVM wrapper (RBF kernel, C=1.0) and fit it on the training split
print('Training SVM...')
svm_params = {'kernel': 'rbf', 'C': 1.0, 'random_state': RANDOM_STATE}
svm_classifier = RailwayDefectClassifier(model_type='svm', **svm_params)
svm_classifier.train(X_train, y_train)
print('✅ Training complete')

# Score on the held-out split and print each metric to four decimals.
# Labels are left-padded to a common width so the values line up.
svm_metrics = svm_classifier.evaluate(X_test, y_test)
print('\nSVM Performance:')
for metric_label, metric_key in (
    ('Accuracy:', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:', 'recall'),
    ('F1-Score:', 'f1_score'),
):
    print(f"  {metric_label:<11}{svm_metrics[metric_key]:.4f}")

## 6. Train Ensemble Model

In [None]:
# Build the ensemble with its default configuration and fit it on the
# training split
print('Training Ensemble Model...')
ensemble = EnsembleClassifier()
ensemble.train(X_train, y_train)
print('✅ Ensemble training complete')

# Score on the held-out split and print each metric to four decimals.
# Labels are left-padded to a common width so the values line up.
ensemble_metrics = ensemble.evaluate(X_test, y_test)
print('\nEnsemble Performance:')
for metric_label, metric_key in (
    ('Accuracy:', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:', 'recall'),
    ('F1-Score:', 'f1_score'),
):
    print(f"  {metric_label:<11}{ensemble_metrics[metric_key]:.4f}")

## 7. Model Comparison

In [None]:
# Gather the metric dictionaries of all four models, in display order
model_results = {
    'Random Forest': rf_metrics,
    'XGBoost': xgb_metrics,
    'SVM': svm_metrics,
    'Ensemble': ensemble_metrics,
}

# Table column title -> key inside each metrics dictionary
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
}

# One row per model, one column per metric
comparison_df = pd.DataFrame({
    'Model': list(model_results),
    **{
        column: [metrics[key] for metrics in model_results.values()]
        for column, key in metric_columns.items()
    },
})

print('\nModel Comparison:')
print(comparison_df.to_string(index=False))

# Grouped bar chart: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(12, 6))
comparison_df.set_index('Model').plot.bar(ax=ax, rot=0)
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_ylabel('Score')
ax.set_ylim(0, 1.0)
ax.grid(axis='y', alpha=0.3)
ax.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 8. Confusion Matrix

In [None]:
# Evaluate the ensemble on the held-out split with a confusion matrix and a
# per-class classification report.

# Label values and their display names, in matching order (0 = Healthy,
# 1 = Defective). Passing the label values explicitly keeps the matrix
# rows/columns and the report rows aligned with the display names even if a
# class happens to be absent from both y_test and the predictions.
class_ids = [0, 1]
class_names = ['Healthy', 'Defective']

# Get ensemble predictions
y_pred = ensemble.predict(X_test)

# Calculate confusion matrix with a fixed 2x2 layout
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot
fig = plot_confusion_matrix(
    cm,
    class_labels=class_names,
    title='Ensemble Model - Confusion Matrix'
)
plt.show()

# Print classification report
print('\nDetailed Classification Report:')
print(classification_report(
    y_test, y_pred, labels=class_ids, target_names=class_names
))

## 9. Feature Importance

In [None]:
# Plot the highest-ranked feature importances reported by the Random Forest
# wrapper. Nothing is drawn when the wrapper returns None.
importance = rf_classifier.get_feature_importance()

if importance is not None:
    # Coerce to an array so index-array selection below works even if the
    # wrapper hands back a plain sequence.
    scores = np.asarray(importance)

    # Show at most 20 features; clamp to the number available so the bar
    # positions and bar widths always have the same length.
    top_n = min(20, scores.size)
    indices = np.argsort(scores)[::-1][:top_n]

    # Plot
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(range(top_n), scores[indices])
    ax.set_yticks(range(top_n))
    ax.set_yticklabels([feature_names[i] for i in indices])
    ax.set_xlabel('Importance')
    ax.set_title(f'Top {top_n} Most Important Features', fontsize=14, fontweight='bold')
    # Largest importance at the top of the chart
    ax.invert_yaxis()
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

## 10. Save Trained Models

In [None]:
# Persist the three individually trained classifiers under ../models/trained.
# Note: the ensemble object itself is not written to disk in this cell.
model_dir = Path('../models/trained')
model_dir.mkdir(parents=True, exist_ok=True)

# (display name, fitted classifier, destination file) for each saved model
saved_models = [
    ('Random Forest', rf_classifier, model_dir / 'random_forest_model.pkl'),
    ('XGBoost', xgb_classifier, model_dir / 'xgboost_model.pkl'),
    ('SVM', svm_classifier, model_dir / 'svm_model.pkl'),
]

for _, fitted_model, target_path in saved_models:
    fitted_model.save(target_path)

# Report where each file was written
print('✅ Models saved successfully!')
for display_name, _, target_path in saved_models:
    print(f'   - {display_name}: {target_path}')

## Summary

This notebook:
- Trained multiple classifiers (RF, XGBoost, SVM, Ensemble)
- Evaluated model performance
- Compared different approaches
- Analyzed feature importance
- Saved trained models

**Best Model (expected):** Ensemble (combining RF + XGBoost + SVM) — confirm against the comparison table in Section 7 after running the notebook.

**Next Steps:**
- Deploy model in Streamlit app
- Test with real railway acoustic data