In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score


In [2]:
np.random.seed(42)

DROPPED_COLUMNS_001 = ['oper_set_3', 'temp_fan_inlet', 'engine_px_ratio', 'demanded_fan_speed', 
                       'demanded_corr_fan_speed', 'px_fan_inlet', 'px_by_duct', 'fuel_air_ratio']
RENAMING_DICT = {0: "engine_num", 1: "cycle_num", 2: "oper_set_1", 3: "oper_set_2", 4: "oper_set_3", 
                 5: "temp_fan_inlet", 6: "temp_lpc_outlet", 7: "temp_hpc_outlet", 8: "temp_lpt_outlet", 
                 9: "px_fan_inlet", 10: "px_by_duct", 11: "px_hpc_outlet", 12: "phys_fan_speed", 
                 13: "phys_core_speed", 14: "engine_px_ratio", 15: "stat_px_hpc_out", 16: "fuel_flow_ratio", 
                 17: "corr_fan_speed", 18: "corr_core_speed", 19: "bypass_ratio", 20: "fuel_air_ratio", 
                 21: "bleed_enthalpy", 22: "demanded_fan_speed", 23: "demanded_corr_fan_speed", 
                 24: "hpt_coolant_bleed", 25: "lpt_coolant_bleed"}


In [3]:
# Data Import and Cleaning
train_001 = pd.read_csv("../CMAPSS Nasa Data set/train_FD001.txt", sep="\s+", header=None)
test_001 = pd.read_csv("../CMAPSS Nasa Data set/test_FD001.txt", sep="\s+", header=None)
rul_001 = pd.read_csv("../CMAPSS Nasa Data set/RUL_FD001.txt", header=None)

# Rename columns
train_001.rename(columns=RENAMING_DICT, inplace=True)
test_001.rename(columns=RENAMING_DICT, inplace=True)
rul_001.rename(columns={0: "rul"}, inplace=True)

# Drop columns with constant values
train_001.drop(columns=DROPPED_COLUMNS_001, inplace=True)
test_001.drop(columns=DROPPED_COLUMNS_001, inplace=True)

# Drop columns with high correlation
train_001.drop(columns=["corr_core_speed"], inplace=True)
test_001.drop(columns=["corr_core_speed"], inplace=True)

# Add RUL to training data
train_001['rul'] = train_001.groupby('engine_num')['cycle_num'].apply(lambda x: x.max() - x).values

# Drop cycle number as it doesn't help in prediction
train_001.drop(columns="cycle_num", inplace=True)
test_001.drop(columns="cycle_num", inplace=True)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  train_001['rul'] = train_001.groupby('engine_num')['cycle_num'].apply(lambda x: x.max() - x).values


In [4]:
# Define features and target variable
X = train_001.drop(columns=['engine_num', 'rul'])
y = train_001['rul']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize Linear Regression model
lr_model = LinearRegression()

# Train-test split for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the model
lr_model.fit(X_train, y_train)

# Predict on the training set and test set
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

# Evaluate model performance on training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Evaluate model performance on test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Cross-validation to assess generalization performance
cv_scores = cross_val_score(lr_model, X_scaled, y, cv=5, scoring='r2')
mean_cv_score = np.mean(cv_scores)

# Print model performance metrics
print("Linear Regression Model Performance")
print(f"Training RMSE: {rmse_train:.2f}")
print(f"Training MAE: {mae_train:.2f}")
print(f"Training R-squared: {r2_train:.2f}")
print(f"Test RMSE: {rmse_test:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Test R-squared: {r2_test:.2f}")
print(f"Cross-validation R-squared: {mean_cv_score:.2f}")

# Assessing overfitting or underfitting
training_error = np.abs(r2_train - mean_cv_score)
print(f"Training error (R-squared difference from cross-validation): {training_error:.2f}")

if training_error < 0.1:
    print("The model is well-generalized and balanced.")
elif r2_train > r2_test:
    print("The model may be overfitting on the training data.")
else:
    print("The model may be underfitting or requires further optimization.")


Linear Regression Model Performance
Training RMSE: 44.76
Training MAE: 34.16
Training R-squared: 0.58
Test RMSE: 44.40
Test MAE: 34.10
Test R-squared: 0.57
Cross-validation R-squared: 0.57
Training error (R-squared difference from cross-validation): 0.01
The model is well-generalized and balanced.
