In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from typing import Tuple


In [2]:
random.seed(42)
np.random.seed(42)

DROPPED_COLUMNS_001 = ['oper_set_3','temp_fan_inlet','engine_px_ratio','demanded_fan_speed', 
                       'demanded_corr_fan_speed','px_fan_inlet','px_by_duct','fuel_air_ratio']
RENAMING_DICT = {0: "engine_num", 1: "cycle_num", 2: "oper_set_1", 3: "oper_set_2", 4: "oper_set_3", 5: "temp_fan_inlet",
                 6: "temp_lpc_outlet", 7: "temp_hpc_outlet", 8: "temp_lpt_outlet", 9: "px_fan_inlet", 10: "px_by_duct", 11: "px_hpc_outlet",
                 12: "phys_fan_speed", 13: "phys_core_speed", 14: "engine_px_ratio", 15: "stat_px_hpc_out", 16: "fuel_flow_ratio", 17: "corr_fan_speed",
                 18: "corr_core_speed", 19: "bypass_ratio", 20: "fuel_air_ratio", 21: "bleed_enthalpy", 22: "demanded_fan_speed", 23: "demanded_corr_fan_speed",
                 24: "hpt_coolant_bleed", 25: "lpt_coolant_bleed"}
FEATURES_TO_SCALE = ['oper_set_1', 'oper_set_2', 'temp_lpc_outlet', 'temp_hpc_outlet', 'temp_lpt_outlet', 
                     'px_hpc_outlet', 'phys_fan_speed', 'phys_core_speed', 'stat_px_hpc_out', 
                     'fuel_flow_ratio', 'corr_fan_speed', 'bypass_ratio', 'bleed_enthalpy', 
                     'hpt_coolant_bleed', 'lpt_coolant_bleed']


In [3]:
# Function to load and preprocess data
def load_and_preprocess_data(train_path: str, test_path: str, rul_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    train = pd.read_csv(train_path, sep="\s+", header=None)
    test = pd.read_csv(test_path, sep="\s+", header=None)
    rul = pd.read_csv(rul_path, header=None)
    
    train.rename(columns=RENAMING_DICT, inplace=True)
    test.rename(columns=RENAMING_DICT, inplace=True)
    rul.rename(columns={0: "rul"}, inplace=True)
    
    train.drop(columns=DROPPED_COLUMNS_001, inplace=True)
    test.drop(columns=DROPPED_COLUMNS_001, inplace=True)
    
    train['rul'] = train.groupby('engine_num')['cycle_num'].apply(lambda x: x.max() - x).values
    train.drop(columns="cycle_num", inplace=True)
    test.drop(columns="cycle_num", inplace=True)
    
    return train, test, rul


In [4]:
# Load datasets
train_001, test_001, rul_001 = load_and_preprocess_data("../CMAPSS Nasa Data set/train_FD001.txt",
                                                        "../CMAPSS Nasa Data set/test_FD001.txt",
                                                        "../CMAPSS Nasa Data set/RUL_FD001.txt")


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  train['rul'] = train.groupby('engine_num')['cycle_num'].apply(lambda x: x.max() - x).values


In [5]:
# Feature Scaling
scaler = StandardScaler()
train_features = train_001.drop(columns=['engine_num', 'rul'])
train_features_scaled = scaler.fit_transform(train_features)
train_001_scaled = pd.DataFrame(train_features_scaled, columns=train_features.columns)
train_001_scaled['rul'] = train_001['rul'].values


In [6]:
# Model Training and Evaluation
def train_evaluate_model(X: pd.DataFrame, y: pd.Series) -> Tuple[RandomForestRegressor, dict]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # GridSearchCV for hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200],
        'max_features': ['auto', 'sqrt'],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    }
    
    rf_model = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2')
    grid_search.fit(X_train, y_train)
    
    best_rf_model = grid_search.best_estimator_
    
    y_train_pred = best_rf_model.predict(X_train)
    y_test_pred = best_rf_model.predict(X_test)
    
    results = {
        "Training RMSE": np.sqrt(mean_squared_error(y_train, y_train_pred)),
        "Training MAE": mean_absolute_error(y_train, y_train_pred),
        "Training R-squared": r2_score(y_train, y_train_pred),
        "Test RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
        "Test MAE": mean_absolute_error(y_test, y_test_pred),
        "Test R-squared": r2_score(y_test, y_test_pred)
    }
    
    return best_rf_model, results, y_test, y_test_pred

X = train_001_scaled.drop(columns='rul')
y = train_001_scaled['rul']

best_model, results, y_test, y_test_pred = train_evaluate_model(X, y)


Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [7]:
# Output Metrics
print("Random Forest Model Performance")
for metric, value in results.items():
    print(f"{metric}: {value:.2f}")

# Assessing overfitting or underfitting
training_error = np.abs(results["Training R-squared"] - results["Test R-squared"])
print(f"Training error (R-squared difference): {training_error:.2f}")

if training_error < 0.1:
    print("The model is well-generalized and balanced.")
elif results["Training R-squared"] > results["Test R-squared"]:
    print("The model may be overfitting on the training data.")
else:
    print("The model may be underfitting or requires further optimization.")


Random Forest Model Performance
Training RMSE: 20.83
Training MAE: 14.28
Training R-squared: 0.91
Test RMSE: 41.00
Test MAE: 29.29
Test R-squared: 0.63
Training error (R-squared difference): 0.28
The model may be overfitting on the training data.
