# 02 - Feature Engineering: CMAPSS Dataset
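We generate rolling statistical features (mean, standard deviation, and cycle-to-cycle difference) over time for a selected subset of each engine's sensors, and add a binary early-failure label, to enable early failure prediction.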

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style globally for plots drawn in this notebook.
sns.set(style='whitegrid')

## Load preprocessed data with RUL

In [6]:
# Read the FD001 training table (already augmented with an RUL column) and
# preview the first few rows.
train_csv_path = '../data/CMAPSSData/train_FD001_with_rul.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,unit,cycle,op1,op2,op3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


## Create Binary Label: Early Failure (within 30 cycles)

In [7]:
# A row is a positive example (label = 1) when its remaining useful life
# is 30 cycles or fewer; everything else is labelled 0.
within_failure_horizon = df['RUL'].le(30)
df['label'] = within_failure_horizon.astype(int)

# Preview the identifying columns next to the new label.
preview_cols = ['unit','cycle','RUL','label']
df[preview_cols].head()

Unnamed: 0,unit,cycle,RUL,label
0,1,1,191,0
1,1,2,190,0
2,1,3,189,0
3,1,4,188,0
4,1,5,187,0


## Generate Rolling Features for Sensors

In [10]:
# Per-engine rolling statistics for a hand-picked subset of sensors.
sensor_cols = [f'sensor{i}' for i in [2,3,7,11]]
window_size = 5


def _window_mean(readings):
    """Trailing rolling mean over one unit's readings (partial windows allowed)."""
    return readings.rolling(window=window_size, min_periods=1).mean()


def _window_std(readings):
    """Trailing rolling standard deviation over one unit's readings."""
    return readings.rolling(window=window_size, min_periods=1).std()


for col in sensor_cols:
    # Group by 'unit' so windows and differences never span two engines.
    readings_by_unit = df.groupby('unit')[col]
    df[f'{col}_mean'] = readings_by_unit.transform(_window_mean)
    df[f'{col}_std'] = readings_by_unit.transform(_window_std)
    df[f'{col}_diff'] = readings_by_unit.diff()

# Drop the raw selected sensors and the operating-setting columns, then discard
# any row that still contains a NaN.
# NOTE(review): the first cycle of every unit has no previous reading, so its
# *_diff (and single-sample *_std) is NaN and dropna() removes that row --
# the preview therefore starts at each unit's second cycle.
raw_input_cols = sensor_cols + ['op1','op2','op3']
df_featured = df.drop(columns=raw_input_cols).dropna()
df_featured.head()

Unnamed: 0,unit,cycle,sensor1,sensor4,sensor5,sensor6,sensor8,sensor9,sensor10,sensor12,...,sensor2_diff,sensor3_mean,sensor3_std,sensor3_diff,sensor7_mean,sensor7_std,sensor7_diff,sensor11_mean,sensor11_std,sensor11_diff
1,1,2,518.67,1403.14,14.62,21.61,2388.04,9044.07,1.3,522.28,...,0.33,1590.76,1.499066,2.12,554.055,0.431335,-0.61,47.48,0.014142,0.02
2,1,3,518.67,1404.2,14.62,21.61,2388.08,9052.94,1.3,522.42,...,0.2,1589.836667,1.918654,-3.83,554.123333,0.327159,0.51,47.41,0.121655,-0.22
3,1,4,518.67,1401.87,14.62,21.61,2388.11,9049.48,1.3,522.86,...,0.0,1588.075,3.855909,-5.2,554.205,0.313103,0.19,47.34,0.171659,-0.14
4,1,5,518.67,1406.22,14.62,21.61,2388.06,9055.15,1.3,522.19,...,0.02,1587.03,4.075678,0.06,554.164,0.286234,-0.45,47.328,0.151063,0.15
5,1,6,518.67,1398.37,14.62,21.61,2388.02,9049.68,1.3,521.68,...,-0.27,1585.984,3.885831,1.62,554.226,0.362671,0.67,47.266,0.141527,-0.12


## Save Feature Dataset

In [11]:
# Persist the engineered feature table (without the DataFrame index) and
# confirm on stdout.
features_csv_path = '../outputs/FD001_features_labeled.csv'
df_featured.to_csv(features_csv_path, index=False)
print('Feature dataset saved.')

Feature dataset saved.
