In [None]:
'''from google.colab import drive
drive.mount('/content/drive')'''

In [None]:
#!pip install catboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
#myfolder = "./"
myfolder = "../data/CMaps/"

# **Columns' names**

In [None]:
#Columns' names
'''
1)  unit number
2)	time, in cycles
3)	operational setting 1
4)	operational setting 2
5)	operational setting 3
6)	sensor measurement  1
7)	sensor measurement  2
...
26)	sensor measurement  21
'''
unitNames = ['UnitNumber']
timeCycles = ["TimeInCycles"]
operSets = ["OperSet"+str(i) for i in range(1,4)] # 1,2 et 3
sensorMes = ["SensorMes"+str(j) for j in range(1, 22)] # de 1 à 21
columnsNames = unitNames + timeCycles + operSets +sensorMes

# **Datasets loading**

In [None]:
def data_loading(x):
  train_path = myfolder + "train_"+ x +".txt"
  test_path = myfolder + "test_"+ x +".txt"
  rul_path = myfolder + "RUL_"+ x +".txt"
  train = pd.read_csv(train_path, delim_whitespace=True, names=columnsNames)
  test = pd.read_csv(test_path, delim_whitespace=True, names=columnsNames)
  rul = pd.read_csv(rul_path, delim_whitespace=True, names=["RUL_FD"])
  return train, test, rul

train_fd001, test_fd001, rul_fd001 = data_loading("FD001")
train_fd002, test_fd002, rul_fd002 = data_loading("FD002")
train_fd003, test_fd003, rul_fd003 = data_loading("FD003")
train_fd004, test_fd004, rul_fd004 = data_loading("FD004")

In [None]:
# Forcer l'affichage de toutes les colonnes
pd.set_option('display.max_columns', None)

train_fd004.head(3)

# **RUL column generation for train and test set**

In [None]:
def rul_train_generation(x):
  rul = pd.DataFrame(x.groupby('UnitNumber')['TimeInCycles'].max()).reset_index()
  rul.columns = ['UnitNumber', 'max']
  x = x.merge(rul, on=['UnitNumber'], how='left')
  x['RUL'] = x['max'] - x['TimeInCycles']
  x.drop('max', axis=1, inplace=True)
  return x

train_fd001 = rul_train_generation(train_fd001)
train_fd002 = rul_train_generation(train_fd002)
train_fd003 = rul_train_generation(train_fd003)
train_fd004 = rul_train_generation(train_fd004)

In [None]:
train_fd004.head(3)

In [None]:
def rul_test_generation(x, rul):

  rul["UnitNumber"] = rul.index + 1 # +1 pour que UnitNumber demarre de 1 au lieu de 0, car il s'agit du numero des moteur

  x = x.merge(rul, on=['UnitNumber'], how='left')

  max_cycle = pd.DataFrame(x.groupby('UnitNumber')['TimeInCycles'].max()).reset_index()
  max_cycle.columns = ['UnitNumber', 'max']
  x = x.merge(max_cycle, on=['UnitNumber'], how='left')
  x['RUL'] = x['RUL_FD'] + x['max'] - x['TimeInCycles']
  x.drop(['max', 'RUL_FD'], axis=1, inplace=True)

  return x

test_fd001 = rul_test_generation(test_fd001, rul_fd001)
test_fd002 = rul_test_generation(test_fd002, rul_fd002)
test_fd003 = rul_test_generation(test_fd003, rul_fd003)
test_fd004 = rul_test_generation(test_fd004, rul_fd004)

In [None]:
test_fd004.head(3)

In [None]:
print("train_FD001: ",train_fd001.shape, ". test_FD001: ", test_fd001.shape,". RUL: ", rul_fd001.shape)
print("train_FD002: ",train_fd002.shape, ". test_FD002: ", test_fd002.shape,". RUL: ", rul_fd002.shape)
print("train_FD003: ",train_fd003.shape, ". test_FD003: ", test_fd003.shape,". RUL: ", rul_fd003.shape)
print("train_FD004: ",train_fd004.shape, ". test_FD004: ", test_fd004.shape,". RUL: ", rul_fd004.shape)

In [None]:
print("train_FD001: ",train_fd001.shape, ". test_FD001: ", test_fd001.shape,". nbr moteurs", len(train_fd001['UnitNumber'].unique()))
print("train_FD002: ",train_fd002.shape, ". test_FD002: ", test_fd002.shape,". nbr moteurs", len(train_fd002['UnitNumber'].unique()))
print("train_FD003: ",train_fd003.shape, ". test_FD003: ", test_fd003.shape,". nbr moteurs", len(train_fd003['UnitNumber'].unique()))
print("train_FD004: ",train_fd004.shape, ". test_FD004: ", test_fd004.shape,". nbr moteurs", len(train_fd004['UnitNumber'].unique()))

In [None]:
len(train_fd002['UnitNumber'].unique())

# **Bivariate Correlation Analysis**

In [None]:
def correl(df):
    sns.heatmap(df.corr(),annot=True,cmap='RdYlGn',linewidths=0.2)
    fig=plt.gcf()
    fig.set_size_inches(20,20)
    plt.show()


def correlation_analyz(df, seuil):
  # Calculer la corrélation entre les variables avec différentes méthodes
  corr_pearson = df.corr(method='pearson')
  corr_spearman = df.corr(method='spearman')
  corr_kendall = df.corr(method='kendall')

  # Colonnes avec corrélation plus faible que le seuil pour chaque méthode
  colonnes_faible_corr_pearson = corr_pearson.loc[corr_pearson['RUL'].abs() < seuil].index.tolist()
  colonnes_faible_corr_spearman = corr_spearman.loc[corr_spearman['RUL'].abs() < seuil].index.tolist()
  colonnes_faible_corr_kendall = corr_kendall.loc[corr_kendall['RUL'].abs() < seuil].index.tolist()

  # Afficher les colonnes avec corrélation plus faible que le seuil pour chaque méthode
  print("Colonnes avec corrélation plus faible que", seuil, "pour la corrélation de Pearson:")
  print(colonnes_faible_corr_pearson)
  print("\nColonnes avec corrélation plus faible que", seuil, "pour la corrélation de Spearman:")
  print(colonnes_faible_corr_spearman)
  print("\nColonnes avec corrélation plus faible que", seuil, "pour la corrélation de Kendall:")
  print(colonnes_faible_corr_kendall)

In [None]:
correl(train_fd001)

In [17]:
correlation_analyz(train_fd001, 0.6)

Colonnes avec corrélation plus faible que 0.6 pour la corrélation de Pearson:
['UnitNumber', 'OperSet1', 'OperSet2', 'SensorMes3', 'SensorMes6', 'SensorMes8', 'SensorMes9', 'SensorMes13', 'SensorMes14']

Colonnes avec corrélation plus faible que 0.6 pour la corrélation de Spearman:
['UnitNumber', 'OperSet1', 'OperSet2', 'SensorMes6', 'SensorMes8', 'SensorMes9', 'SensorMes13', 'SensorMes14']

Colonnes avec corrélation plus faible que 0.6 pour la corrélation de Kendall:
['UnitNumber', 'TimeInCycles', 'OperSet1', 'OperSet2', 'SensorMes2', 'SensorMes3', 'SensorMes4', 'SensorMes6', 'SensorMes7', 'SensorMes8', 'SensorMes9', 'SensorMes11', 'SensorMes12', 'SensorMes13', 'SensorMes14', 'SensorMes15', 'SensorMes17', 'SensorMes20', 'SensorMes21']


In [18]:
# Les colonnes à supprimer
operSets_to_drop = ["OperSet"+str(i) for i in [1, 2]] # 1 et 2
sensorMes_to_drop = ["SensorMes"+str(j) for j in [3, 6, 8, 9, 13, 14]]
cols_to_drop = operSets_to_drop + sensorMes_to_drop

# Suppression dans train et test
drop_cols=True
if(drop_cols == True):
  train_fd001 = train_fd001.drop(cols_to_drop, axis=1)
  test_fd001 = test_fd001.drop(cols_to_drop, axis=1)

In [19]:
train_fd001.columns, len(train_fd001.columns)

(Index(['UnitNumber', 'TimeInCycles', 'OperSet3', 'SensorMes1', 'SensorMes2',
        'SensorMes4', 'SensorMes5', 'SensorMes7', 'SensorMes10', 'SensorMes11',
        'SensorMes12', 'SensorMes15', 'SensorMes16', 'SensorMes17',
        'SensorMes18', 'SensorMes19', 'SensorMes20', 'SensorMes21', 'RUL'],
       dtype='object'),
 19)

In [None]:
correl(train_fd002)

In [21]:
correlation_analyz(train_fd002, 0.03)

Colonnes avec corrélation plus faible que 0.03 pour la corrélation de Pearson:
['UnitNumber', 'OperSet1', 'OperSet2', 'OperSet3', 'SensorMes1', 'SensorMes2', 'SensorMes3', 'SensorMes5', 'SensorMes6', 'SensorMes7', 'SensorMes8', 'SensorMes9', 'SensorMes10', 'SensorMes12', 'SensorMes13', 'SensorMes17', 'SensorMes18', 'SensorMes19', 'SensorMes20', 'SensorMes21']

Colonnes avec corrélation plus faible que 0.03 pour la corrélation de Spearman:
['UnitNumber', 'OperSet1', 'OperSet2', 'OperSet3', 'SensorMes1', 'SensorMes5', 'SensorMes10', 'SensorMes18', 'SensorMes19']

Colonnes avec corrélation plus faible que 0.03 pour la corrélation de Kendall:
['UnitNumber', 'OperSet1', 'OperSet2', 'OperSet3', 'SensorMes1', 'SensorMes5', 'SensorMes10', 'SensorMes18', 'SensorMes19']


In [22]:
# Les colonnes à supprimer
operSets_to_drop = ["OperSet"+str(i) for i in range(1,4)] # 1,2 et 3
sensorMes_to_drop = ["SensorMes"+str(j) for j in [1, 5, 10, 18, 19]]
cols_to_drop = operSets_to_drop + sensorMes_to_drop

# Suppression dans train et test
drop_cols=True
if(drop_cols == True):
  train_fd002 = train_fd002.drop(cols_to_drop, axis=1)
  test_fd002 = test_fd002.drop(cols_to_drop, axis=1)

In [23]:
train_fd002.columns, len(train_fd002.columns)

(Index(['UnitNumber', 'TimeInCycles', 'SensorMes2', 'SensorMes3', 'SensorMes4',
        'SensorMes6', 'SensorMes7', 'SensorMes8', 'SensorMes9', 'SensorMes11',
        'SensorMes12', 'SensorMes13', 'SensorMes14', 'SensorMes15',
        'SensorMes16', 'SensorMes17', 'SensorMes20', 'SensorMes21', 'RUL'],
       dtype='object'),
 19)

In [None]:
correl(train_fd003)

In [25]:
correlation_analyz(train_fd003, 0.4)

Colonnes avec corrélation plus faible que 0.4 pour la corrélation de Pearson:
['UnitNumber', 'OperSet1', 'OperSet2', 'SensorMes6', 'SensorMes7', 'SensorMes10', 'SensorMes12', 'SensorMes15', 'SensorMes20', 'SensorMes21']

Colonnes avec corrélation plus faible que 0.4 pour la corrélation de Spearman:
['UnitNumber', 'OperSet1', 'OperSet2', 'SensorMes6', 'SensorMes7', 'SensorMes12', 'SensorMes15', 'SensorMes20', 'SensorMes21']

Colonnes avec corrélation plus faible que 0.4 pour la corrélation de Kendall:
['UnitNumber', 'OperSet1', 'OperSet2', 'SensorMes6', 'SensorMes7', 'SensorMes10', 'SensorMes12', 'SensorMes14', 'SensorMes15', 'SensorMes20', 'SensorMes21']


In [26]:
# Les colonnes à supprimer
operSets_to_drop = ["OperSet"+str(i) for i in [1, 2]] # 1 et 2
sensorMes_to_drop = ["SensorMes"+str(j) for j in [6, 7, 10, 12, 15, 20, 21]]
cols_to_drop = operSets_to_drop + sensorMes_to_drop

# Suppression dans train et test
drop_cols=True
if(drop_cols == True):
  train_fd003 = train_fd003.drop(cols_to_drop, axis=1)
  test_fd003 = test_fd003.drop(cols_to_drop, axis=1)

In [27]:
train_fd003.columns, len(train_fd003.columns)

(Index(['UnitNumber', 'TimeInCycles', 'OperSet3', 'SensorMes1', 'SensorMes2',
        'SensorMes3', 'SensorMes4', 'SensorMes5', 'SensorMes8', 'SensorMes9',
        'SensorMes11', 'SensorMes13', 'SensorMes14', 'SensorMes16',
        'SensorMes17', 'SensorMes18', 'SensorMes19', 'RUL'],
       dtype='object'),
 18)

In [None]:
correl(train_fd004)

In [29]:
correlation_analyz(train_fd004, 0.03)

Colonnes avec corrélation plus faible que 0.03 pour la corrélation de Pearson:
['UnitNumber', 'OperSet1', 'OperSet2', 'OperSet3', 'SensorMes1', 'SensorMes2', 'SensorMes5', 'SensorMes6', 'SensorMes7', 'SensorMes8', 'SensorMes9', 'SensorMes10', 'SensorMes12', 'SensorMes13', 'SensorMes15', 'SensorMes18', 'SensorMes19', 'SensorMes20', 'SensorMes21']

Colonnes avec corrélation plus faible que 0.03 pour la corrélation de Spearman:
['UnitNumber', 'OperSet1', 'OperSet2', 'OperSet3', 'SensorMes1', 'SensorMes5', 'SensorMes7', 'SensorMes12', 'SensorMes15', 'SensorMes18', 'SensorMes19', 'SensorMes20', 'SensorMes21']

Colonnes avec corrélation plus faible que 0.03 pour la corrélation de Kendall:
['UnitNumber', 'OperSet1', 'OperSet2', 'OperSet3', 'SensorMes1', 'SensorMes5', 'SensorMes6', 'SensorMes7', 'SensorMes12', 'SensorMes15', 'SensorMes18', 'SensorMes19', 'SensorMes20', 'SensorMes21']


In [30]:
# Les colonnes à supprimer
operSets_to_drop = ["OperSet"+str(i) for i in range(1,4)] # 1,2 et 3
sensorMes_to_drop = ["SensorMes"+str(j) for j in [1, 5, 6, 7, 12, 15, 18, 19, 20, 21]]
cols_to_drop = operSets_to_drop + sensorMes_to_drop

# Suppression dans train et test
drop_cols=True
if(drop_cols == True):
  train_fd004 = train_fd004.drop(cols_to_drop, axis=1)
  test_fd004 = test_fd004.drop(cols_to_drop, axis=1)

In [31]:
train_fd004.columns, len(train_fd004.columns)

(Index(['UnitNumber', 'TimeInCycles', 'SensorMes2', 'SensorMes3', 'SensorMes4',
        'SensorMes8', 'SensorMes9', 'SensorMes10', 'SensorMes11', 'SensorMes13',
        'SensorMes14', 'SensorMes16', 'SensorMes17', 'RUL'],
       dtype='object'),
 14)

# **Data normalization**

In [32]:
def normalised_df(train, test):

  from sklearn.preprocessing import MinMaxScaler

  # Instancier l'objet MinMaxScaler pour normaliser les données
  scaler = MinMaxScaler()

  # Normaliser train
  train_scaled = scaler.fit_transform(train)
  train_df = pd.DataFrame(train_scaled)

  # Normaliser test
  test_scaled = scaler.fit_transform(test)
  test_df = pd.DataFrame(test_scaled)


  train_df.columns = train.columns
  test_df.columns = test.columns
    
  return train_df, test_df

In [33]:
train_fd001, test_fd001 = normalised_df(train_fd001, test_fd001)
train_fd002, test_fd002 = normalised_df(train_fd002, test_fd002)
train_fd003, test_fd003 = normalised_df(train_fd003, test_fd003)
train_fd004, test_fd004 = normalised_df(train_fd004, test_fd004)

In [34]:
train_fd004.head(3)

Unnamed: 0,UnitNumber,TimeInCycles,SensorMes2,SensorMes3,SensorMes4,SensorMes8,SensorMes9,SensorMes10,SensorMes11,SensorMes13,SensorMes14,SensorMes16,SensorMes17,RUL
0,0.0,0.0,0.130347,0.272082,0.212586,0.626983,0.269578,0.205128,0.458604,0.993111,0.550773,0.0,0.28866,0.590406
1,0.0,0.001845,0.647971,0.634407,0.511781,0.862888,0.601411,0.358974,0.641234,0.992395,0.481761,0.0,0.608247,0.588561
2,0.0,0.00369,0.123646,0.271245,0.222481,0.62711,0.265759,0.205128,0.456169,0.993056,0.531031,0.0,0.278351,0.586716


# **Data splitting**

In [35]:
def data_split(train, test):

  # data split
  X_train = train.drop('RUL', axis=1)
  Y_train = train['RUL']
  X_test = test.drop('RUL', axis=1)
  Y_test = test['RUL']
    
  return X_train, Y_train, X_test, Y_test


# **The XGBoost Regressor model**

In [36]:
def my_xgboost_regressor(train, test):

    X_train, Y_train, X_test, Y_test = data_split(train, test)

    
    # Créer le modèle XGBoost Regressor
    model = XGBRegressor()

    # Définir les paramètres à tester dans la recherche par grille
    param_grid = {
        'n_estimators': [100, 500, 1000],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3], #np.logspace(-3,-1,10)
        'subsample': [0.5, 0.7, 0.9],
        'colsample_bytree': [0.5, 0.7, 0.9],
        }

    # Créer l'objet GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

    # Effectuer la recherche par grille sur les données d'entraînement
    grid_search.fit(X_train, Y_train)

    # Afficher les meilleurs paramètres trouvés
    print("Meilleurs paramètres trouvés :")
    print(grid_search.best_params_)
    
    
    
    '''#-------------Train---------------------
    # Prédire les valeurs en utilisant le modèle KNN Regressor pour les données Train
    y_pred_train = grid_search.predict(X_train)
    # Afficher l'erreur quadratique moyenne et le coefficient de détermination R2
    mse_train = mean_squared_error(Y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    mae_train = mean_absolute_error(Y_train, y_pred_train)
    mape_train = np.mean(np.abs((Y_train - y_pred_train) / Y_train)) * 100
    r2_train = r2_score(Y_train, y_pred_train)
    adjusted_r2_train = 1 - (1-r2_train)*(len(Y_train)-1)/(len(Y_train)-X_train.shape[1]-1)

    print('\n=============================Train=============================')
    print('MSE : ',mse_train * 100,'%')
    print('RMSE : ',rmse_train * 100,'%')
    print('MAE : ',mae_train * 100,'%')
    print('MAPE : ',mape_train,'%')
    print('R2 : ',r2_train * 100,'%')
    print('Adjusted R2 : ', adjusted_r2_train * 100,'%')'''

    #-------------Test---------------------
    # Prédire les valeurs en utilisant le modèle KNN Regressor pour les données Test
    y_pred_test = grid_search.predict(X_test)
    # Afficher l'erreur quadratique moyenne et le coefficient de détermination R2
    mse_test = mean_squared_error(Y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    mae_test = mean_absolute_error(Y_test, y_pred_test)
    mape_test = np.mean(np.abs((Y_test - y_pred_test) / Y_test)) * 100
    r2_test = r2_score(Y_test, y_pred_test)
    adjusted_r2_test = 1 - (1-r2_test)*(len(Y_test)-1)/(len(Y_test)-X_test.shape[1]-1)

    print('\n=============================Test=============================')
    print('MSE : ',mse_test * 100,'%')
    print('RMSE : ',rmse_test * 100,'%')
    print('MAE : ',mae_test * 100,'%')
    print('MAPE : ',mape_test,'%')
    print('R2 : ',r2_test * 100,'%')
    print('Adjusted R2 : ', adjusted_r2_test * 100,'%')

In [37]:
my_xgboost_regressor(train_fd001, test_fd001)

Meilleurs paramètres trouvés :
{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}

MSE :  2.347415117814772 %
RMSE :  15.321276440997897 %
MAE :  11.171493064850031 %
MAPE :  inf %
R2 :  25.165767075427723 %
Adjusted R2 :  25.06276056073459 %


In [None]:
my_xgboost_regressor(train_fd002, test_fd002)

In [None]:
my_xgboost_regressor(train_fd003, test_fd003)

In [None]:
my_xgboost_regressor(train_fd004, test_fd004)