# Data Exploration - NASA C-MAPSS Dataset

This notebook explores the NASA C-MAPSS Turbofan Engine Degradation Dataset.

## Objectives
- Understand dataset structure and statistics
- Analyze sensor distributions
- Visualize degradation patterns
- Examine class imbalance for failure prediction

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import config
from src.preprocess import load_train_data, calculate_rul_labels, create_failure_labels

# Notebook-wide plotting defaults: a 12x6 default figure size and seaborn's
# "whitegrid" style. Cells that pass their own figsize override the default.
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style("whitegrid")

## 1. Load and Inspect Data

In [None]:
# Read the training set through the project loader; every later cell builds on `df`.
df = load_train_data()

# Report the frame's dimensions and column names, then preview the leading rows.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)
df.head()

In [None]:
# Basic statistics
# Cycle count of each engine, taken as the highest `cycle` value recorded for it.
# Computed once and reused for the total / mean / min / max below.
cycles_per_engine = df.groupby('engine_id')['cycle'].max()

print("Dataset Info:")
print(f"Number of engines: {df['engine_id'].nunique()}")
# Total is the sum of the per-engine cycle counts. Summing the `cycle` column
# directly would add up the cycle index values themselves rather than count cycles.
print(f"Total cycles: {cycles_per_engine.sum()}")
print(f"Average cycles per engine: {cycles_per_engine.mean():.1f}")
print(f"Min cycles per engine: {cycles_per_engine.min()}")
print(f"Max cycles per engine: {cycles_per_engine.max()}")

## 2. Calculate RUL and Failure Labels

In [None]:
# Run the two project labelling helpers in sequence (RUL first, then failure).
# The rest of this cell, and later cells, read the 'RUL' and 'failure' columns.
# NOTE(review): `df` is rebound here, so re-running this cell feeds an already
# labelled frame back into the helpers -- confirm they tolerate that.
df = create_failure_labels(calculate_rul_labels(df))

# Summary statistics of the RUL column.
print("RUL Statistics:")
print(df['RUL'].describe())

# Counts per failure label, followed by the mean of the label as a percentage.
print("\nFailure Label Distribution:")
print(df['failure'].value_counts())
print(f"\nFailure rate: {df['failure'].mean()*100:.2f}%")

## 3. Sensor Analysis

In [None]:
# Per-sensor summary table, most variable sensors first.
# `sensor_cols` is reused by the correlation cell further down.
sensor_cols = [name for name in df.columns if name.startswith('sensor_')]
sensor_frame = df[sensor_cols]

sensor_stats = (
    sensor_frame.describe().T                 # one row per sensor
    .assign(variance=sensor_frame.var())      # aligned on the sensor name index
    .sort_values('variance', ascending=False)
)
sensor_stats

## 4. Visualize Degradation Patterns

In [None]:
# Plot RUL against cycle for the first five engine ids found in the data,
# one panel per engine, with the configured failure threshold as a dashed line.
sample_engines = df['engine_id'].unique()[:5]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for ax, engine_id in zip(axes, sample_engines):
    # Sort by cycle so the line is drawn in chronological order.
    engine_data = df[df['engine_id'] == engine_id].sort_values('cycle')
    ax.plot(engine_data['cycle'], engine_data['RUL'], linewidth=2)
    ax.axhline(y=config.FAILURE_THRESHOLD, color='r', linestyle='--',
               label=f'Failure threshold ({config.FAILURE_THRESHOLD})')
    ax.set_title(f'Engine {engine_id} - RUL over Cycles')
    ax.set_xlabel('Cycle')
    ax.set_ylabel('RUL')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide the grid panels that did not receive an engine (2x3 grid, at most 5 engines).
for ax in axes[len(sample_engines):]:
    ax.axis('off')

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError when the
# target directory does not exist (e.g. on a fresh checkout).
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(results_dir / 'rul_degradation_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pairwise correlation between all sensor columns, drawn as a heatmap.
# Any sensor column with zero variance yields NaN correlations, which the
# heatmap leaves blank.
sensor_corr = df[sensor_cols].corr()

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(sensor_corr, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Sensor Correlation Matrix')
plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError when the
# target directory does not exist (e.g. on a fresh checkout).
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(results_dir / 'sensor_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Class Imbalance Analysis

In [None]:
# Failure counts and failure rate per RUL range.
rul_bins = [0, 10, 20, 30, 50, 100, 200, 500]
rul_bin_labels = [f'{lo}-{hi}' for lo, hi in zip(rul_bins[:-1], rul_bins[1:])]

# pd.cut builds right-closed intervals by default, so without include_lowest
# the first bin is (0, 10] and rows with RUL == 0 are assigned NaN and dropped
# from the table. include_lowest=True makes the first bin [0, 10].
# NOTE(review): assumes RUL reaches 0 on an engine's last recorded cycle --
# confirm against calculate_rul_labels. RUL values above 500 still map to NaN.
df['RUL_bin'] = pd.cut(df['RUL'], bins=rul_bins, labels=rul_bin_labels,
                       include_lowest=True)

# observed=False keeps every bin in the table even when it is empty, and pins
# that behaviour explicitly instead of relying on the pandas default.
failure_by_rul = df.groupby('RUL_bin', observed=False)['failure'].agg(['count', 'sum', 'mean'])
failure_by_rul.columns = ['Total', 'Failures', 'Failure_Rate']
failure_by_rul

In [None]:
# Two-panel view of class imbalance: label counts (left) and the RUL
# histogram with the configured failure threshold marked (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
label_ax, rul_ax = axes

# Failure distribution.
# value_counts() orders by frequency, so the positional colours would follow
# whichever label is more common. Sorting by label value keeps the bar order
# and the colour assignment tied to the label itself.
failure_counts = df['failure'].value_counts().sort_index()
failure_counts.plot(kind='bar', ax=label_ax, color=['skyblue', 'salmon'])
label_ax.set_title('Failure Label Distribution')
label_ax.set_xlabel('Failure (0=No, 1=Yes)')
label_ax.set_ylabel('Count')
label_ax.tick_params(axis='x', rotation=0)

# RUL distribution
rul_ax.hist(df['RUL'], bins=50, edgecolor='black', alpha=0.7)
rul_ax.axvline(x=config.FAILURE_THRESHOLD, color='r', linestyle='--',
               label=f'Failure threshold ({config.FAILURE_THRESHOLD})')
rul_ax.set_title('RUL Distribution')
rul_ax.set_xlabel('Remaining Useful Life (Cycles)')
rul_ax.set_ylabel('Frequency')
rul_ax.legend()
rul_ax.grid(True, alpha=0.3)

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError when the
# target directory does not exist (e.g. on a fresh checkout).
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(results_dir / 'class_imbalance.png', dpi=300, bbox_inches='tight')
plt.show()