In [5]:
# 02_preprocessing.ipynb (multi-dataset)

import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import MinMaxScaler

# =========================
# Parameters
# =========================
SEQ_LEN = 30
MAX_RUL = 125
DATA_DIR = "../data/raw/"
PROCESSED_DIR = "../data/processed/"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# =========================
# Column definitions
# =========================
sensor_cols = [f'sensor_{i}' for i in range(1, 22)]
op_cols = ['op_setting_1', 'op_setting_2', 'op_setting_3']
cols = ['engine_id', 'cycle'] + op_cols + sensor_cols

# =========================
# Helper functions
# =========================
def gen_sequence(df, seq_length, features):
    data_array = df[features].values
    num_elements = data_array.shape[0]
    for start, stop in zip(range(0, num_elements - seq_length), range(seq_length, num_elements)):
        yield data_array[start:stop, :]

def gen_labels(df, seq_length, label):
    label_array = df[label].values
    return label_array[seq_length:]

def preprocess_dataset(fd_id):
    print(f"\n🔹 Processing {fd_id} ...")

    # File paths
    train_file = os.path.join(DATA_DIR, f"train_{fd_id}.txt")
    test_file = os.path.join(DATA_DIR, f"test_{fd_id}.txt")
    rul_file = os.path.join(DATA_DIR, f"RUL_{fd_id}.txt")

    # -------------------------
    # Training data
    # -------------------------
    train_df = pd.read_csv(train_file, sep=' ', header=None)
    train_df = train_df.dropna(axis=1, how='all')
    train_df.columns = cols

    # Compute RUL for train
    max_cycles = train_df.groupby('engine_id')['cycle'].max().reset_index()
    max_cycles.columns = ['engine_id', 'max_cycle']
    train_df = train_df.merge(max_cycles, on='engine_id')
    train_df['RUL'] = train_df['max_cycle'] - train_df['cycle']
    train_df['RUL'] = train_df['RUL'].clip(upper=MAX_RUL)

    # Scale
    scaler = MinMaxScaler()
    train_df[op_cols + sensor_cols] = scaler.fit_transform(train_df[op_cols + sensor_cols])

    # Save scaler
    joblib.dump(scaler, os.path.join(PROCESSED_DIR, f"scaler_{fd_id}.pkl"))

    # Generate train sequences
    feature_cols = op_cols + sensor_cols
    sequence_list, label_list = [], []
    for eid in train_df['engine_id'].unique():
        engine_df = train_df[train_df['engine_id'] == eid]
        seqs = list(gen_sequence(engine_df, SEQ_LEN, feature_cols))
        labs = gen_labels(engine_df, SEQ_LEN, 'RUL')
        sequence_list.extend(seqs)
        label_list.extend(labs)

    X_train = np.array(sequence_list)
    y_train = np.array(label_list)

    # Save arrays
    np.save(os.path.join(PROCESSED_DIR, f"X_train_{fd_id}.npy"), X_train)
    np.save(os.path.join(PROCESSED_DIR, f"y_train_{fd_id}.npy"), y_train)

    print(f"✅ {fd_id}: X_train {X_train.shape}, y_train {y_train.shape}")

    # -------------------------
    # Test data
    # -------------------------
    test_df = pd.read_csv(test_file, sep=' ', header=None)
    test_df = test_df.dropna(axis=1, how='all')
    test_df.columns = cols

    # True RUL values
    rul_df = pd.read_csv(rul_file, sep=' ', header=None)
    rul_df = rul_df.dropna(axis=1, how='all')
    rul_df.columns = ['RUL']

    # Compute test RUL
    max_cycles = test_df.groupby('engine_id')['cycle'].max().reset_index()
    max_cycles.columns = ['engine_id', 'max_cycle']
    test_df = test_df.merge(max_cycles, on='engine_id')
    test_df = test_df.merge(rul_df, left_on='engine_id', right_index=True)
    test_df['RUL'] = test_df['RUL'] + test_df['max_cycle'] - test_df['cycle']
    test_df['RUL'] = test_df['RUL'].clip(upper=MAX_RUL)

    # Apply same scaler
    test_df[op_cols + sensor_cols] = scaler.transform(test_df[op_cols + sensor_cols])

    # Generate test sequences
    sequence_list, label_list = [], []
    for eid in test_df['engine_id'].unique():
        engine_df = test_df[test_df['engine_id'] == eid]
        seqs = list(gen_sequence(engine_df, SEQ_LEN, feature_cols))
        labs = gen_labels(engine_df, SEQ_LEN, 'RUL')
        sequence_list.extend(seqs)
        label_list.extend(labs)

    X_test = np.array(sequence_list)
    y_test = np.array(label_list)

    # Save arrays
    np.save(os.path.join(PROCESSED_DIR, f"X_test_{fd_id}.npy"), X_test)
    np.save(os.path.join(PROCESSED_DIR, f"y_test_{fd_id}.npy"), y_test)

    print(f"✅ {fd_id}: X_test {X_test.shape}, y_test {y_test.shape}")


# =========================
# Run for all FD sets
# =========================
for fd_id in ["FD001", "FD002", "FD003", "FD004"]:
    preprocess_dataset(fd_id)



🔹 Processing FD001 ...
✅ FD001: X_train (17631, 30, 24), y_train (17631,)
✅ FD001: X_test (9928, 30, 24), y_test (9928,)

🔹 Processing FD002 ...
✅ FD002: X_train (45959, 30, 24), y_train (45959,)
✅ FD002: X_test (26159, 30, 24), y_test (26159,)

🔹 Processing FD003 ...
✅ FD003: X_train (21720, 30, 24), y_train (21720,)
✅ FD003: X_test (13379, 30, 24), y_test (13379,)

🔹 Processing FD004 ...
✅ FD004: X_train (53779, 30, 24), y_train (53779,)
✅ FD004: X_test (33593, 30, 24), y_test (33593,)
