In [89]:
'''from google.colab import drive
drive.mount('/content/drive')'''

"from google.colab import drive\ndrive.mount('/content/drive')"

In [90]:
#!pip install catboost

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [92]:
#myfolder = "./"
myfolder = "./data/"

# **Columns' names**

In [93]:
#Columns' names
'''
1)  unit number
2)	time, in cycles
3)	operational setting 1
4)	operational setting 2
5)	operational setting 3
6)	sensor measurement  1
7)	sensor measurement  2
...
26)	sensor measurement  21
'''
unitNames = ['UnitNumber']
timeCycles = ["TimeInCycles"]
operSets = ["OperSet"+str(i) for i in range(1,4)] # 1,2 et 3
sensorMes = ["SensorMes"+str(j) for j in range(1, 22)] # de 1 à 21
columnsNames = unitNames + timeCycles + operSets +sensorMes

# **Datasets loading**

In [94]:
def data_loading(x):
  train_path = myfolder + "train_"+ x +".txt"
  test_path = myfolder + "test_"+ x +".txt"
  rul_path = myfolder + "RUL_"+ x +".txt"
  train = pd.read_csv(train_path, delim_whitespace=True, names=columnsNames)
  test = pd.read_csv(test_path, delim_whitespace=True, names=columnsNames)
  rul = pd.read_csv(rul_path, delim_whitespace=True, names=["RUL_FD"])
  return train, test, rul

train_fd002, test_fd002, rul_fd002 = data_loading("FD002")

In [95]:
# Forcer l'affichage de toutes les colonnes
pd.set_option('display.max_columns', None)

train_fd002.head(3)

Unnamed: 0,UnitNumber,TimeInCycles,OperSet1,OperSet2,OperSet3,SensorMes1,SensorMes2,SensorMes3,SensorMes4,SensorMes5,SensorMes6,SensorMes7,SensorMes8,SensorMes9,SensorMes10,SensorMes11,SensorMes12,SensorMes13,SensorMes14,SensorMes15,SensorMes16,SensorMes17,SensorMes18,SensorMes19,SensorMes20,SensorMes21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,8.0,194.64,2222.65,8341.91,1.02,42.02,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,5.71,138.51,2211.57,8303.96,1.02,42.2,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,9.02,175.71,1915.11,8001.42,0.94,36.69,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723


# **RUL column generation for train and test set**

In [96]:
def rul_train_generation(x):
  rul = pd.DataFrame(x.groupby('UnitNumber')['TimeInCycles'].max()).reset_index()
  rul.columns = ['UnitNumber', 'max']
  x = x.merge(rul, on=['UnitNumber'], how='left')
  x['RUL'] = x['max'] - x['TimeInCycles']
  x.drop('max', axis=1, inplace=True)
  return x

train_fd002 = rul_train_generation(train_fd002)

In [97]:
train_fd002.head(3)

Unnamed: 0,UnitNumber,TimeInCycles,OperSet1,OperSet2,OperSet3,SensorMes1,SensorMes2,SensorMes3,SensorMes4,SensorMes5,SensorMes6,SensorMes7,SensorMes8,SensorMes9,SensorMes10,SensorMes11,SensorMes12,SensorMes13,SensorMes14,SensorMes15,SensorMes16,SensorMes17,SensorMes18,SensorMes19,SensorMes20,SensorMes21,RUL
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,8.0,194.64,2222.65,8341.91,1.02,42.02,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071,148
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,5.71,138.51,2211.57,8303.96,1.02,42.2,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665,147
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,9.02,175.71,1915.11,8001.42,0.94,36.69,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723,146


In [98]:
def rul_test_generation(x, rul):

  rul["UnitNumber"] = rul.index + 1 # +1 pour que UnitNumber demarre de 1 au lieu de 0, car il s'agit du numero des moteur

  x = x.merge(rul, on=['UnitNumber'], how='left')

  max_cycle = pd.DataFrame(x.groupby('UnitNumber')['TimeInCycles'].max()).reset_index()
  max_cycle.columns = ['UnitNumber', 'max']
  x = x.merge(max_cycle, on=['UnitNumber'], how='left')
  x['RUL'] = x['RUL_FD'] + x['max'] - x['TimeInCycles']
  x.drop(['max', 'RUL_FD'], axis=1, inplace=True)

  return x

test_fd002 = rul_test_generation(test_fd002, rul_fd002)

In [99]:
test_fd002.head(3)

Unnamed: 0,UnitNumber,TimeInCycles,OperSet1,OperSet2,OperSet3,SensorMes1,SensorMes2,SensorMes3,SensorMes4,SensorMes5,SensorMes6,SensorMes7,SensorMes8,SensorMes9,SensorMes10,SensorMes11,SensorMes12,SensorMes13,SensorMes14,SensorMes15,SensorMes16,SensorMes17,SensorMes18,SensorMes19,SensorMes20,SensorMes21,RUL
0,1,1,9.9987,0.2502,100.0,489.05,605.03,1497.17,1304.99,10.52,15.49,394.54,2318.96,8763.8,1.26,45.61,371.69,2388.18,8114.1,8.6476,0.03,369,2319,100.0,28.42,17.1551,275
1,1,2,20.0026,0.7,100.0,491.19,607.82,1481.2,1246.11,9.35,13.66,334.36,2323.95,8713.21,1.08,44.26,315.32,2388.12,8053.06,9.2405,0.02,364,2324,100.0,24.29,14.8039,274
2,1,3,35.0045,0.84,100.0,449.44,556.0,1359.08,1128.36,5.48,8.0,193.55,2222.67,8340.2,1.02,41.8,183.04,2387.75,8053.04,9.3472,0.02,333,2223,100.0,14.98,8.9125,273


# **CLASS BALANCING**

In [100]:
'''element_counts = test_fd002[test_fd002['UnitNumber']==1]
len(element_counts)'''

"element_counts = test_fd002[test_fd002['UnitNumber']==1]\nlen(element_counts)"

In [101]:
import random

def selection_aleatoire(df, sample_size, rand_state):
    unique_values = df["UnitNumber"].unique()
    selected_rows = []
    for value in unique_values:
        rows = df[df["UnitNumber"] == value]
        if len(rows) < sample_size : # si la taille de l'echantillon donnée est superieur au nombre total de ligne pour un moteur, reinitialiser la valeur
            sample_size = len(rows)
        random_sample = rows.sample(n=sample_size, random_state = rand_state)  # Sélectionne 50 lignes aléatoires
        selected_rows.append(random_sample)
    result = pd.concat(selected_rows)
    return result

# **Data normalization**

In [102]:
def normalised_df(train, test):

  from sklearn.preprocessing import MinMaxScaler

  # Instancier l'objet MinMaxScaler pour normaliser les données
  scaler = MinMaxScaler()

  # Normaliser train
  train_scaled = scaler.fit_transform(train)
  train_df = pd.DataFrame(train_scaled)

  # Normaliser test
  test_scaled = scaler.fit_transform(test)
  test_df = pd.DataFrame(test_scaled)


  train_df.columns = train.columns
  test_df.columns = test.columns
    
  return train_df, test_df

# **Data splitting**

In [103]:
def data_split(train, test):

  # data split
  X_train = train.drop('RUL', axis=1)
  Y_train = train['RUL']
  X_test = test.drop('RUL', axis=1)
  Y_test = test['RUL']
    
  return X_train, Y_train, X_test, Y_test


# **The XGB Regressor model**

In [137]:
import time

def my_XGB_Regressor(x, y):

    # Time tracking, Operation time (min)
    t = time.process_time()


    mse_test_list = []
    rmse_test_list = []
    mae_test_list = []
    mape_test_list = []
    
    for j in range(1, 11):
        
        train_selected = selection_aleatoire(x, 50, j)
        test_selected = selection_aleatoire(y, 25, j)
        

        normalized_train_df, normalized_test_df =  normalised_df(train_selected, test_selected)

        X_train, Y_train, X_test, Y_test = data_split(normalized_train_df, normalized_test_df)

        # Créer le modèle XGBoost Regressor
        model = XGBRegressor()

        # Define the parameters to test in the grid search
        '''param_grid = {
            'n_estimators': [100, 500, 1000],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.3], #np.logspace(-3,-1,10)
            'subsample': [0.5, 0.7, 0.9],
            'colsample_bytree': [0.5, 0.7, 0.9],
            }'''
        param_grid = {
            'n_estimators': [100],
            'max_depth': [3],
            'learning_rate': [0.01], #np.logspace(-3,-1,10)
            'subsample': [0.5],
            'colsample_bytree': [0.5],
            }

        # Create the GridSearchCV object
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Perform the grid search on the training data
        grid_search.fit(X_train, Y_train)

        
        # Créer l'objet GridSearchCV
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Perform the grid search on the training data
        grid_search.fit(X_train, Y_train)

        print("\n********************** Test No ",j,"/10:**********************")

        # Display the best parameters found
        print("Best parameters found:")
        print(grid_search.best_params_)

        '''#-------------Train---------------------
        # Prédire les valeurs en utilisant le modèle KNN Regressor pour les données Train
        y_pred_train = grid_search.predict(X_train)
        # Afficher l'erreur quadratique moyenne et le coefficient de détermination R2
        mse_train = mean_squared_error(Y_train, y_pred_train)
        rmse_train = np.sqrt(mse_train)
        print('Train:==========================================================')
        print('MSE : ',mse_train * 100,'%')
        print('RMSE : ',rmse_train * 100,'%')'''

        #-------------Test---------------------
        # Predict the values using the XGBoost Regressor model for Test data
        y_pred_test = grid_search.predict(X_test)
        # Display the mean squared error and the determination R2 coefficient
        mse_test = mean_squared_error(Y_test, y_pred_test)
        rmse_test = np.sqrt(mse_test)
        mae_test = mean_absolute_error(Y_test, y_pred_test)
        mape_test = np.mean(np.abs((Y_test - y_pred_test) / Y_test)) * 100
        
        print('MSE : ',mse_test * 100,'%')
        print('RMSE : ',rmse_test * 100,'%')
        print('MAE : ',mae_test * 100,'%')
        print('MAPE : ',mape_test,'%')

        #listes
        mse_test_list.append(mse_test)
        rmse_test_list.append(rmse_test)
        mae_test_list.append(mae_test)
        mape_test_list.append(mape_test)
        
        print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        print("time: " , (time.process_time()-t),"sec")
        print('')
    
    print('All MSE: ',mse_test_list)
    print('mean MSE', np.mean(mse_test_list), " == ",np.mean(mse_test_list)*100,"%\n")
    
    print('All RMSE: ',rmse_test_list)
    print('\nAll MAE: ',mae_test_list)
    print('\n******************************************************************************************')
    print('******************************************************************************************')
    print('***************** mean RMSE', np.mean(rmse_test_list), " ==> ",np.mean(rmse_test_list)*100,"% ***************")
    print('******************************************************************************************')
    print('***************** mean MAE : ',np.mean(mae_test_list), " ==> ", np.mean(mae_test_list) * 100,"% *************")
    print('******************************************************************************************')
    print('******************************************************************************************\n')
    print('mean MAPE :', np.mean(mape_test_list),'%')

    print("\n\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("Total time: " , (time.process_time()-t),"sec")

In [138]:
my_XGB_Regressor(train_fd002, test_fd002)


********************** Test No  1 /10:**********************
Best parameters found:
{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
MSE :  1.6832058862070656 %
RMSE :  12.973842477103942 %
MAE :  10.433505475702999 %
MAPE :  inf %
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
time:  19.578125 sec


********************** Test No  2 /10:**********************
Best parameters found:
{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
MSE :  1.844497336664852 %
RMSE :  13.58122725185339 %
MAE :  10.738228623444181 %
MAPE :  inf %
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
time:  39.09375 sec


********************** Test No  3 /10:**********************
Best parameters found:
{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
MSE :  1.8859472257049579 %
RMSE :  13.732979377050553 %
MAE :  10.86543

In [134]:
import time
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def my_XGB_Regressor2(x, y):

    # Time tracking, Operation time (min)
    t = time.process_time()

    mse_test_list = []
    rmse_test_list = []
    mae_test_list = []
    mape_test_list = []
    
    for j in range(1, 11):
        train_selected = selection_aleatoire(x, 50, j)
        test_selected = selection_aleatoire(y, 25, j)

        normalized_train_df, normalized_test_df =  normalised_df(train_selected, test_selected)

        X_train, Y_train, X_test, Y_test = data_split(normalized_train_df, normalized_test_df)

        # Create the XGBoost Regressor model
        model = XGBRegressor(
            n_estimators=100,
            max_depth=3,
            learning_rate=0.01,
            subsample=0.5,
            colsample_bytree=0.5,
        )

        # Train the model
        model.fit(X_train, Y_train)

        print("\n********************** Test No ",j,"/10:**********************")

        #-------------Test---------------------
        # Predict the values using the XGBRegressor model for the Test data
        y_pred_test = model.predict(X_test)

        # Display the mean squared error and the determination R2 coefficient
        mse_test = mean_squared_error(Y_test, y_pred_test)
        rmse_test = np.sqrt(mse_test)
        mae_test = mean_absolute_error(Y_test, y_pred_test)
        mape_test = np.mean(np.abs((Y_test - y_pred_test) / Y_test)) * 100
        s_score = compute_s_score(Y_test, y_pred_test)

        print('MSE : ',mse_test * 100,'%')
        print('RMSE : ',rmse_test * 100,'%')
        print('MAE : ',mae_test * 100,'%')
        print('MAPE : ',mape_test,'%')

        #score
        mse_test_list.append(mse_test)
        rmse_test_list.append(rmse_test)
        mae_test_list.append(mae_test)
        mape_test_list.append(mape_test)
        
        print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        print("time: " , (time.process_time()-t),"sec")
        print('')
    
    print('All MSE: ',mse_test_list)
    print('mean MSE', np.mean(mse_test_list), " == ",np.mean(mse_test_list)*100,"%\n")
    
    print('All RMSE: ',rmse_test_list)
    print('\nAll MAE: ',mae_test_list)
    print('\n******************************************************************************************')
    print('******************************************************************************************')
    print('***************** mean RMSE', np.mean(rmse_test_list), " ==> ",np.mean(rmse_test_list)*100,"% ***************")
    print('******************************************************************************************')
    print('***************** mean MAE : ',np.mean(mae_test_list), " ==> ", np.mean(mae_test_list) * 100,"% *************")
    print('******************************************************************************************')
    print('******************************************************************************************\n')
    print('mean MAPE :', np.mean(mape_test_list),'%')

    print("\n\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("Total time: " , (time.process_time()-t),"sec")

In [135]:
my_XGB_Regressor2(train_fd002, test_fd002)


********************** Test No  1 /10:**********************
MSE :  1.6832058862070656 %
RMSE :  12.973842477103942 %
MAE :  10.433505475702999 %
MAPE :  inf %
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
time:  2.703125 sec


********************** Test No  2 /10:**********************
MSE :  1.844497336664852 %
RMSE :  13.58122725185339 %
MAE :  10.738228623444181 %
MAPE :  inf %
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
time:  6.21875 sec


********************** Test No  3 /10:**********************
MSE :  1.8859472257049579 %
RMSE :  13.732979377050553 %
MAE :  10.865432705268002 %
MAPE :  inf %
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
time:  9.34375 sec


********************** Test No  4 /10:**********************
MSE :  1.9162815512896811 %
RMSE :  13.842982161693632 %
MAE :  10.910975071425305 %
MAPE :  inf %
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
time:  12.375 sec


********************** Test No  5