In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

# Data Train Process

In [2]:
df = pd.read_csv('CMAPSSData/train_FD003.csv')
df

Unnamed: 0,unit_number,cycles,op_setting_1,op_setting_2,op_setting_3,sm_1,sm_2,sm_3,sm_4,sm_5,...,sm_12,sm_13,sm_14,sm_15,sm_16,sm_17,sm_18,sm_19,sm_20,sm_21
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.50,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.00,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,1,4,-0.0020,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,1,5,0.0016,0.0000,100.0,518.67,641.68,1588.63,1397.65,14.62,...,522.58,2388.03,8147.80,8.3869,0.03,392,2388,100.0,39.14,23.4583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24715,100,148,-0.0016,-0.0003,100.0,518.67,643.78,1596.01,1424.11,14.62,...,519.66,2388.30,8138.08,8.5036,0.03,394,2388,100.0,38.44,22.9631
24716,100,149,0.0034,-0.0003,100.0,518.67,643.29,1596.38,1429.14,14.62,...,519.91,2388.28,8144.36,8.5174,0.03,395,2388,100.0,38.50,22.9746
24717,100,150,-0.0016,0.0004,100.0,518.67,643.84,1604.53,1431.41,14.62,...,519.44,2388.24,8135.95,8.5223,0.03,396,2388,100.0,38.39,23.0682
24718,100,151,-0.0023,0.0004,100.0,518.67,643.94,1597.56,1426.57,14.62,...,520.01,2388.26,8141.24,8.5148,0.03,395,2388,100.0,38.31,23.0753


In [3]:
def piecewise_linear_rul(df, rul_max=128):
    def compute_rul(cycles):
        max_cycle = cycles.max()
        rul = max_cycle - cycles
        return np.where(rul > rul_max, rul_max, rul)

    df['RUL'] = df.groupby('unit_number')['cycles'].transform(compute_rul)
    return df


def natural_rul(df):
    df['RUL'] = df.groupby('unit_number')['cycles'].transform(
        lambda x: x.max() - x
    )
    return df

piecewise_linear_rul(df)
# natural_rul(df)

Unnamed: 0,unit_number,cycles,op_setting_1,op_setting_2,op_setting_3,sm_1,sm_2,sm_3,sm_4,sm_5,...,sm_13,sm_14,sm_15,sm_16,sm_17,sm_18,sm_19,sm_20,sm_21,RUL
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537,128
1,1,2,0.0008,-0.0003,100.0,518.67,642.50,1584.69,1396.89,14.62,...,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491,128
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,2388.00,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669,128
3,1,4,-0.0020,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951,128
4,1,5,0.0016,0.0000,100.0,518.67,641.68,1588.63,1397.65,14.62,...,2388.03,8147.80,8.3869,0.03,392,2388,100.0,39.14,23.4583,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24715,100,148,-0.0016,-0.0003,100.0,518.67,643.78,1596.01,1424.11,14.62,...,2388.30,8138.08,8.5036,0.03,394,2388,100.0,38.44,22.9631,4
24716,100,149,0.0034,-0.0003,100.0,518.67,643.29,1596.38,1429.14,14.62,...,2388.28,8144.36,8.5174,0.03,395,2388,100.0,38.50,22.9746,3
24717,100,150,-0.0016,0.0004,100.0,518.67,643.84,1604.53,1431.41,14.62,...,2388.24,8135.95,8.5223,0.03,396,2388,100.0,38.39,23.0682,2
24718,100,151,-0.0023,0.0004,100.0,518.67,643.94,1597.56,1426.57,14.62,...,2388.26,8141.24,8.5148,0.03,395,2388,100.0,38.31,23.0753,1


In [4]:
X_xgb = df.drop(columns=['unit_number', 'cycles', 'RUL'])
y_xgb = df['RUL'] + 1

print(np.isnan(X_xgb).any()) 
print(np.isnan(y_xgb).any()) 

op_setting_1    False
op_setting_2    False
op_setting_3    False
sm_1            False
sm_2            False
sm_3            False
sm_4            False
sm_5            False
sm_6            False
sm_7            False
sm_8            False
sm_9            False
sm_10           False
sm_11           False
sm_12           False
sm_13           False
sm_14           False
sm_15           False
sm_16           False
sm_17           False
sm_18           False
sm_19           False
sm_20           False
sm_21           False
dtype: bool
False


In [5]:
from sklearn.model_selection import GridSearchCV

# Inisialisasi model XGBoost
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:gamma',
    gamma=0.1,   
    reg_lambda=3,
    subsample=0.7,
)

# Range hyperparameter yang diuji
param_grid = {
    'n_estimators': [50, 250, 400, 500],
    'max_depth': [3, 5, 7, 9, 13, 15, 17, 20],
    'learning_rate': [0.001, 0.05, 0.1, 0.2, 0.3],
    'min_child_weight': [1,3, 5, 7, 10]
}

# Grid search
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_xgb, y_xgb)

# Hasil hyperparameter terbaik
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Latih model XGBoost dengan parameter terbaik
xgb_model = XGBRegressor(**best_params)
xgb_model.fit(X_xgb, y_xgb)

Fitting 3 folds for each of 800 candidates, totalling 2400 fits
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.1s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.1s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.1s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=250; total time=   0.3s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=250; total time=   0.3s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=250; total time=   0.3s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.4s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.5s
[CV] END learning_rate=0.001, max_depth=3, min_child_weight=1, n_estimators=400; total time=   0.5s
[CV] END learning_rate=0.001, max_depth

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 250}


In [6]:
features = [col for col in df.columns if col not in ['unit_number', 'cycles', 'RUL']]

In [7]:
# feature_importances = xgb_model.feature_importances_
# # Pilih fitur dengan importance > threshold
# threshold = 0.02  # Sesuaikan threshold sesuai kebutuhan
# selected_features = np.array(features)[feature_importances > threshold]

# print("Fitur yang terpilih:", selected_features)

# Urutkan fitur berdasarkan nilai feature importance
sorted_indices = np.argsort(xgb_model.feature_importances_)[::-1]  # Urutkan dari terbesar ke terkecil
top_n = 12  # Jumlah fitur terbaik yang diinginkan

# Ambil fitur terbaik
selected_features = np.array(features)[sorted_indices[:top_n]]

print("Fitur yang terpilih:", selected_features)


Fitur yang terpilih: ['sm_11' 'sm_17' 'sm_9' 'sm_13' 'sm_4' 'sm_8' 'sm_14' 'sm_12' 'sm_6'
 'sm_7' 'sm_2' 'sm_3']
