In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# Column names as defined by NASA
columns = ['engine_id', 'cycle'] + \
          [f'op_setting_{i}' for i in range(1, 4)] + \
          [f'sensor_{i}' for i in range(1, 22)]

# Load training data
train_df = pd.read_csv(
    '../data/raw/train_FD001.txt',
    sep=' ',
    header=None
)

# Drop completely empty columns (IMPORTANT FIX)
train_df = train_df.dropna(axis=1, how='all')

# Assign column names
train_df.columns = columns

train_df.head()


Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [5]:
# Calculate max cycle per engine
max_cycle = train_df.groupby('engine_id')['cycle'].max()

# Create RUL column
train_df['RUL'] = train_df.apply(
    lambda row: max_cycle[row['engine_id']] - row['cycle'],
    axis=1
)

train_df[['engine_id', 'cycle', 'RUL']].head()
 

Unnamed: 0,engine_id,cycle,RUL
0,1,1,191.0
1,1,2,190.0
2,1,3,189.0
3,1,4,188.0
4,1,5,187.0


In [7]:
FAILURE_THRESHOLD = 24  # cycles (~24 hours)

train_df['failure'] = (train_df['RUL'] <= FAILURE_THRESHOLD).astype(int)

train_df['failure'].value_counts(normalize=True) * 100



0    87.882313
1    12.117687
Name: failure, dtype: float64