In [1]:
"""
FD001 Preprocessing Pipeline
Step 1: Load data
Step 2: Add RUL labels to training data
Step 3: Feature selection (drop constant sensors)
Step 4: Normalise (z-score, fit on train, apply to test)
"""

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Relative path to the folder holding train_FD001.txt, test_FD001.txt and
# RUL_FD001.txt; it is resolved against the notebook's working directory.
DATA_DIR = "6. Turbofan Engine Degradation Simulation Data Set/CMAPSSData"   # folder containing the .txt files
# Upper bound applied to the training RUL labels (see add_rul_column).
MAX_RUL  = 125 # piecewise-linear RUL cap (cycles)

In [2]:
# Load data
def load_data(data_dir: str):
    """
    Loads train_FD001.txt, test_FD001.txt and RUL_FD001.txt from ``data_dir``
    and prints a short summary of what was read.

    The files have no headers so column names are assigned manually.
    26 columns total:
        col 1     - unit number  (engine ID)        -> 'unit'
        col 2     - cycle        (timestep)         -> 'cycle'
        cols 3-5  - 3 operating settings            -> 'os1'..'os3'
        cols 6-26 - 21 sensor readings              -> 's1'..'s21'

    Parameters
    ----------
    data_dir : str
        Folder that contains the three FD001 text files.

    Returns
    -------
    train : pd.DataFrame  full run-to-failure trajectories
    test  : pd.DataFrame  trajectories cut before failure
    rul   : pd.Series     true RUL at last cycle of each test engine

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` if any of the three files is missing.
    """
    col_names = (
        ['unit', 'cycle', 'os1', 'os2', 'os3'] +
        [f's{i}' for i in range(1, 22)]
    )

    # Train/test files are whitespace-delimited with no header row.
    train = pd.read_csv(
        f'{data_dir}/train_FD001.txt',
        sep=r'\s+', header=None, names=col_names
    )
    test = pd.read_csv(
        f'{data_dir}/test_FD001.txt',
        sep=r'\s+', header=None, names=col_names
    )
    # One value per line; keep only the Series, not the one-column frame.
    rul = pd.read_csv(
        f'{data_dir}/RUL_FD001.txt',
        header=None, names=['RUL']
    )['RUL']

    # NOTE: adjacent string literals are concatenated with no separator, so
    # every continued literal must carry its own trailing space.
    print(f"Train -> {train['unit'].nunique()} engines, "
          f"{len(train):,} rows, {train.shape[1]} columns")
    print(f"Test -> {test['unit'].nunique()} engines, "
          f"{len(test):,} rows,  {test.shape[1]} columns")
    print(f"RUL -> {len(rul)} ground-truth values")

    print("\nTrain cycle range per engine:")
    # Last recorded cycle of each engine = length of its trajectory.
    cycles = train.groupby('unit')['cycle'].max()
    print(f"min={cycles.min()} max={cycles.max()} "
          f"mean={cycles.mean():.1f} std={cycles.std():.1f}")

    print("\nTest RUL range:")
    print(f"min={rul.min()} max={rul.max()} "
          f"mean={rul.mean():.1f} std={rul.std():.1f}")

    return train, test, rul

In [3]:
def add_rul_column(df: pd.DataFrame, max_rul: int) -> pd.DataFrame:
    """
    Returns a copy of the training DataFrame with a capped 'RUL' column added.

    For each engine the true RUL at cycle t is:
        RUL(t) = max_cycle_for_this_engine - t

    For example, if engine #1 runs for 200 cycles:
        cycle 1   → RUL = 199
        cycle 100 → RUL = 100
        cycle 200 → RUL = 0    (failure)

    Piecewise-linear cap:
        Any RUL above max_rul is clipped to max_rul.
        This means early healthy cycles all get the same label (max_rul),
        telling the model "don't try to be precise here, the engine is fine".
        The model then focuses its capacity on the degradation zone
        where accurate prediction actually matters.

    Parameters
    ----------
    df      : training DataFrame (must have 'unit' and 'cycle' columns);
              it is not modified, a copy is returned
    max_rul : upper cap on RUL label (typically 125 for FD001)

    Returns
    -------
    Copy of df with a new 'RUL' column; the column keeps the dtype of
    'cycle' (integer labels for integer cycles).

    Raises
    ------
    KeyError if 'unit' or 'cycle' is missing from df.
    """
    df = df.copy()

    # Last observed cycle per engine = failure point. transform('max')
    # broadcasts it back onto every row of that engine, aligned on the
    # original index, so no per-row Python lookup is needed. This also avoids
    # the float upcast that row-wise apply() causes on mixed-dtype frames.
    failure_cycle = df.groupby('unit')['cycle'].transform('max')

    # RUL at each row = failure cycle minus current cycle, then apply the
    # piecewise-linear cap.
    df['RUL'] = (failure_cycle - df['cycle']).clip(upper=max_rul)

    at_cap = df['RUL'] == max_rul
    below_cap = df['RUL'] < max_rul

    print(f"Max RUL cap applied: {max_rul} cycles")
    print(f"RUL column range: {df['RUL'].min():.0f} - {df['RUL'].max():.0f}")
    # Each continued literal carries its own leading/trailing space; adjacent
    # string literals are concatenated with no separator.
    print(f"Rows capped at {max_rul}: "
          f"{at_cap.sum():,} "
          f"({at_cap.mean()*100:.1f}% of rows)")
    print(f"Rows in degradation: "
          f"{below_cap.sum():,} "
          f"({below_cap.mean()*100:.1f}% of rows)")

    return df

In [None]:
def main():
    """
    Runs the preprocessing steps in order: load, label, select features,
    normalise.

    Reads the module-level DATA_DIR and MAX_RUL constants.

    NOTE(review): select_features() and normalize() are not defined in the
    cells visible here — confirm they are defined before this cell runs.
    NOTE(review): no return statement or call to main() is visible in this
    cell; the function may continue beyond what is shown.
    """
    # Step 1 — Load the train/test frames and the test-set ground-truth RUL
    train, test, rul = load_data(DATA_DIR)

    # Step 2 — RUL labels (training frame only), capped at MAX_RUL
    train = add_rul_column(train, max_rul=MAX_RUL)

    # Step 3 — Feature selection
    # presumably drops constant sensors, per the pipeline description at the
    # top of the notebook; verify against select_features()
    train, test, kept_sensors, dropped_sensors = select_features(train, test)

    # Step 4 — Normalise the kept sensor columns
    # presumably z-score fitted on train and applied to test, per the pipeline
    # description at the top of the notebook; verify against normalize()
    train, test, scaler = normalize(train, test, feature_cols=kept_sensors)
