# Feature Engineering - Time-Series Features

This notebook demonstrates the feature engineering process for predictive maintenance.

## Objectives
- Create time-series features (rolling statistics, trends, differences)
- Analyze feature importance and correlations
- Understand the feature creation process

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import config
from src.preprocess import preprocess_pipeline
from src.features import create_all_features, get_feature_columns

# Plot appearance shared by every figure in this notebook:
# seaborn's "whitegrid" theme plus a wider default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Processed Data

In [None]:
# Reuse the cached preprocessed splits when both pickles exist; if either file
# is missing, rebuild both splits by running the preprocessing pipeline.
# NOTE(review): pd.read_pickle can execute arbitrary code, so only load
# pickles produced by this project.
processed_dir = config.DATA_PROCESSED_DIR
train_pickle = processed_dir / "train_processed.pkl"
val_pickle = processed_dir / "val_processed.pkl"

try:
    train_df, val_df = pd.read_pickle(train_pickle), pd.read_pickle(val_pickle)
except FileNotFoundError:
    print("Processed data not found. Running preprocessing pipeline...")
    train_df, val_df, scaler = preprocess_pipeline()
    print("Preprocessing complete!")
else:
    print("Loaded processed data from disk")

# Quick sanity check on what was loaded
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")

## 2. Create Time-Series Features

In [None]:
# Run the same feature-engineering routine on both splits
print("Creating features for training data...")
train_features = create_all_features(train_df)

print("\nCreating features for validation data...")
val_features = create_all_features(val_df)

# Column counts are read after both calls have finished, then compared to
# report how many columns feature engineering added to the training frame.
n_original_cols = len(train_df.columns)
n_feature_cols = len(train_features.columns)
n_new_cols = n_feature_cols - n_original_cols

print(f"\nOriginal columns: {n_original_cols}")
print(f"Feature columns: {n_feature_cols}")
print(f"New features created: {n_new_cols}")

## 3. Feature Analysis

In [None]:
# Names of the model-input columns, as selected by get_feature_columns
feature_cols = get_feature_columns(train_features)
n_feature_cols = len(feature_cols)
print(f"Total feature columns: {n_feature_cols}")

# Preview only the first 20 names to keep the output short
sample_names = feature_cols[:20]
print("\nSample feature names:")
print(sample_names)

In [None]:
# Feature correlation with RUL
feature_cols = get_feature_columns(train_features)

# Correlate each feature with the target directly. Building the full
# feature-by-feature matrix just to read one column is quadratic in the number
# of features, and it also puts RUL's self-correlation (1.0) at the top of the
# ranking.
rul_corr_all = train_features[feature_cols].corrwith(train_features['RUL'])

# Features with no variance yield NaN correlations. sort_values places NaN
# last, so they would otherwise fill the tail and hide the strongest negative
# correlations.
n_undefined_rul = int(rul_corr_all.isna().sum())
if n_undefined_rul:
    print(f"Skipped {n_undefined_rul} features with undefined correlation (e.g. constant columns)")

correlations = rul_corr_all.dropna().sort_values(ascending=False)

print("Top 20 features most positively correlated with RUL:")
print(correlations.head(20))

# Reverse the tail so the strongest negative correlation is listed first
print("\nTop 20 features most negatively correlated with RUL:")
print(correlations.tail(20).iloc[::-1])

In [None]:
# Feature correlation with failure
# Correlate each feature with the label directly: this avoids computing the
# full feature-by-feature matrix and keeps 'failure' itself (correlation 1.0)
# out of the ranking.
failure_corr_all = train_features[feature_cols].corrwith(train_features['failure'])

# Drop undefined (NaN) correlations, which arise for zero-variance columns
n_undefined_failure = int(failure_corr_all.isna().sum())
if n_undefined_failure:
    print(f"Skipped {n_undefined_failure} features with undefined correlation (e.g. constant columns)")

correlations_failure = failure_corr_all.dropna().sort_values(ascending=False)

print("Top 20 features correlated with failure:")
print(correlations_failure.head(20))

## 4. Visualize Feature Examples

In [None]:
# Visualize rolling features for one engine.
# Engine 1 is the default example. If it is not present in the training split,
# fall back to the first engine that is, rather than drawing empty plots.
sample_engine_id = 1
available_engine_ids = train_features['engine_id'].unique()
if sample_engine_id not in available_engine_ids:
    sample_engine_id = available_engine_ids[0]
    print(f"Engine 1 not found in training data; plotting engine {sample_engine_id} instead")

sample_engine = train_features[train_features['engine_id'] == sample_engine_id].sort_values('cycle')
cycles = sample_engine['cycle']

sensor_col = 'sensor_2'
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Top-left panel: raw sensor signal overlaid with its 10-cycle rolling mean
ax = axes[0, 0]
ax.plot(cycles, sample_engine[sensor_col], label='Original', alpha=0.7)
ax.plot(cycles, sample_engine[f'{sensor_col}_rolling_mean_10'],
        label='Rolling Mean (10)', linewidth=2)
ax.set_title(f'{sensor_col} - Original vs Rolling Mean')
ax.set_ylabel('Value')

# Remaining panels each show one derived feature:
# (axes, column, legend label, title suffix, y-label, colour, draw zero line)
derived_panels = [
    (axes[0, 1], f'{sensor_col}_rolling_std_10', 'Rolling Std (10)',
     'Rolling Standard Deviation', 'Std', 'orange', False),
    (axes[1, 0], f'{sensor_col}_rolling_slope_10', 'Rolling Slope (10)',
     'Rolling Slope (Trend)', 'Slope', 'green', True),
    (axes[1, 1], f'{sensor_col}_diff', 'First-order Diff',
     'First-order Difference', 'Difference', 'purple', True),
]
for ax, column, label, title, ylabel, color, show_zero_line in derived_panels:
    ax.plot(cycles, sample_engine[column], label=label, linewidth=2, color=color)
    if show_zero_line:
        # Zero reference line separates increasing from decreasing values
        ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)
    ax.set_title(f'{sensor_col} - {title}')
    ax.set_ylabel(ylabel)

# Formatting common to all four panels
for ax in axes.flat:
    ax.set_xlabel('Cycle')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(results_dir / 'feature_examples.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Save Feature-Engineered Data

In [None]:
# Persist both feature-engineered splits next to the preprocessed data,
# creating the output directory first if it does not exist yet.
output_dir = config.DATA_PROCESSED_DIR
output_dir.mkdir(parents=True, exist_ok=True)

for pickle_name, features_frame in (
    ("train_features.pkl", train_features),
    ("val_features.pkl", val_features),
):
    features_frame.to_pickle(output_dir / pickle_name)

print(
    "Feature-engineered data saved!",
    f"Train features shape: {train_features.shape}",
    f"Validation features shape: {val_features.shape}",
    sep="\n",
)