In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from scipy import stats
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Load the datasets
df_train_raw = pd.read_csv('PM_train.txt', sep=" ", header=None)
df_test_raw = pd.read_csv('PM_train.txt', sep=" ", header=None)

In [3]:
df_train_raw.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
12637,64,157,0.0002,0.0001,100.0,518.67,642.93,1588.94,1413.55,14.62,21.61,553.88,2388.12,9060.82,1.3,47.49,521.51,2388.09,8141.14,8.454,0.03,393,2388,100.0,38.71,23.2891,,
16449,82,72,-0.0046,-0.0003,100.0,518.67,642.21,1590.88,1393.9,14.62,21.61,554.75,2388.01,9066.91,1.3,47.35,522.48,2388.04,8145.54,8.3838,0.03,393,2388,100.0,39.1,23.4313,,
13687,69,57,0.0017,0.0004,100.0,518.67,642.68,1593.8,1405.01,14.62,21.61,553.22,2388.06,9063.66,1.3,47.43,521.24,2388.07,8135.48,8.4233,0.03,393,2388,100.0,39.01,23.2959,,


In [4]:
df_test_raw.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
19849,96,298,0.0001,0.0,100.0,518.67,643.34,1602.08,1428.24,14.62,21.61,552.46,2388.17,9067.22,1.3,47.87,520.98,2388.15,8138.05,8.4852,0.03,396,2388,100.0,38.57,23.1607,,
1001,5,155,0.0005,-0.0002,100.0,518.67,642.32,1584.17,1399.99,14.62,21.61,554.56,2388.01,9079.84,1.3,47.32,521.83,2387.99,8155.6,8.4703,0.03,393,2388,100.0,38.76,23.3817,,
4251,21,84,0.0002,0.0003,100.0,518.67,642.06,1587.43,1403.28,14.62,21.61,554.3,2388.05,9053.42,1.3,47.46,521.27,2388.08,8130.74,8.4299,0.03,393,2388,100.0,38.87,23.3318,,


In [5]:
df_train = df_train_raw.iloc[:, :-2]
df_test = df_test_raw.iloc[:, :-2]
df_train.columns = df_test.columns = [
    'AssetId', 'Runtime', 'Setting1', 'Setting2', 'Setting3', 
    'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5', 'Tag6', 'Tag7', 'Tag8', 
    'Tag9', 'Tag10', 'Tag11', 'Tag12', 'Tag13', 'Tag14', 'Tag15', 
    'Tag16', 'Tag17', 'Tag18', 'Tag19', 'Tag20', 'Tag21'
    ]

In [6]:
df_train.sample(5)

Unnamed: 0,AssetId,Runtime,Setting1,Setting2,Setting3,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag10,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag18,Tag19,Tag20,Tag21
6100,31,114,-0.0007,-0.0005,100.0,518.67,642.47,1573.92,1398.89,14.62,21.61,554.7,2388.07,9058.71,1.3,47.31,522.43,2388.06,8141.55,8.3988,0.03,391,2388,100.0,39.04,23.4133
9597,49,102,0.0019,-0.0002,100.0,518.67,642.21,1595.3,1406.31,14.62,21.61,553.05,2388.16,9032.99,1.3,47.56,521.08,2388.17,8124.37,8.4576,0.03,394,2388,100.0,38.6,23.3487
16687,83,96,-0.0026,-0.0003,100.0,518.67,642.09,1580.45,1394.44,14.62,21.61,554.26,2387.97,9061.83,1.3,47.24,522.14,2387.96,8142.41,8.3875,0.03,393,2388,100.0,38.98,23.4204
3299,16,204,-0.0006,-0.0001,100.0,518.67,643.86,1598.86,1427.74,14.62,21.61,551.12,2388.24,9059.76,1.3,47.99,519.63,2388.31,8131.7,8.4722,0.03,396,2388,100.0,38.41,23.0637
14340,72,3,0.0,0.0001,100.0,518.67,642.39,1588.19,1402.37,14.62,21.61,553.84,2388.04,9043.02,1.3,47.39,521.66,2388.05,8127.28,8.4303,0.03,393,2388,100.0,38.99,23.3593


In [7]:
def get_RUL(df):
    
    # Determine o valor máximo de "runtime" para cada máquina
    max_runtime_per_machine = df.groupby('AssetId')['Runtime'].transform('max')
    # Subtraia o valor atual de "runtime" do valor máximo para calcular a RUL
    df['RUL'] = max_runtime_per_machine - df['Runtime']
    
    return df

df_train = get_RUL(df_train)
df_test = get_RUL(df_test)

In [8]:
df_train.loc[:, 'Setting1':'Tag21'].describe().round(4)

Unnamed: 0,Setting1,Setting2,Setting3,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag10,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag18,Tag19,Tag20,Tag21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,-0.0,0.0,100.0,518.67,642.6809,1590.5231,1408.9338,14.62,21.6098,553.3677,2388.0967,9065.2429,1.3,47.5412,521.4135,2388.0962,8143.7527,8.4421,0.03,393.2107,2388.0,100.0,38.8163,23.2897
std,0.0022,0.0003,0.0,0.0,0.5001,6.1311,9.0006,0.0,0.0014,0.8851,0.071,22.0829,0.0,0.2671,0.7376,0.0719,19.0762,0.0375,0.0,1.5488,0.0,0.0,0.1807,0.1083
min,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,21.6,549.85,2387.9,9021.73,1.3,46.85,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,21.61,552.81,2388.05,9053.1,1.3,47.35,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,21.61,553.44,2388.09,9060.66,1.3,47.51,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,21.61,554.01,2388.14,9069.42,1.3,47.7,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,21.61,556.06,2388.56,9244.59,1.3,48.53,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


In [9]:
df_test.loc[:, 'Setting1':'Tag21'].describe().round(4)

Unnamed: 0,Setting1,Setting2,Setting3,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag10,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag18,Tag19,Tag20,Tag21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,-0.0,0.0,100.0,518.67,642.6809,1590.5231,1408.9338,14.62,21.6098,553.3677,2388.0967,9065.2429,1.3,47.5412,521.4135,2388.0962,8143.7527,8.4421,0.03,393.2107,2388.0,100.0,38.8163,23.2897
std,0.0022,0.0003,0.0,0.0,0.5001,6.1311,9.0006,0.0,0.0014,0.8851,0.071,22.0829,0.0,0.2671,0.7376,0.0719,19.0762,0.0375,0.0,1.5488,0.0,0.0,0.1807,0.1083
min,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,21.6,549.85,2387.9,9021.73,1.3,46.85,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,21.61,552.81,2388.05,9053.1,1.3,47.35,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,21.61,553.44,2388.09,9060.66,1.3,47.51,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,21.61,554.01,2388.14,9069.42,1.3,47.7,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,21.61,556.06,2388.56,9244.59,1.3,48.53,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


In [10]:
zero_std_filter = df_test.loc[:, 'Setting1':'Tag21'].describe().loc['std'] == 0

In [11]:
zero_std_columns = df_test.loc[:, 'Setting1':'Tag21'].columns[zero_std_filter].values

In [12]:
zero_std_columns

array(['Setting3', 'Tag1', 'Tag10', 'Tag18', 'Tag19'], dtype=object)

In [13]:
df_test.drop(columns=zero_std_columns)

Unnamed: 0,AssetId,Runtime,Setting1,Setting2,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag20,Tag21,RUL
0,1,1,-0.0007,-0.0004,641.82,1589.70,1400.60,14.62,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,39.06,23.4190,191
1,1,2,0.0019,-0.0003,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,39.00,23.4236,190
2,1,3,-0.0043,0.0003,642.35,1587.99,1404.20,14.62,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,38.95,23.3442,189
3,1,4,0.0007,0.0000,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,642.37,1582.85,1406.22,14.62,21.61,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,0.03,393,38.90,23.4044,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,643.49,1597.98,1428.63,14.62,21.61,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,0.03,397,38.49,22.9735,4
20627,100,197,-0.0016,-0.0005,643.54,1604.50,1433.58,14.62,21.61,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,0.03,395,38.30,23.1594,3
20628,100,198,0.0004,0.0000,643.42,1602.46,1428.18,14.62,21.61,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,0.03,398,38.44,22.9333,2
20629,100,199,-0.0011,0.0003,643.23,1605.26,1426.53,14.62,21.61,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,0.03,395,38.29,23.0640,1


In [14]:
def drop_zero_std_columns(df):
    zero_std_filter = df.loc[:, 'Setting1':'Tag21'].describe().loc['std'] == 0
    zero_std_columns = df_test.loc[:, 'Setting1':'Tag21'].columns[zero_std_filter].values
    df = df.drop(columns=zero_std_columns)
    
    return df

In [15]:
df_train = drop_zero_std_columns(df_train)
df_test = drop_zero_std_columns(df_test)

In [16]:
df_train.loc[:, 'Setting1':'Tag21'].isna().sum().to_frame().T

Unnamed: 0,Setting1,Setting2,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag20,Tag21
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
df_test.loc[:, 'Setting1':'Tag21'].isna().sum().to_frame().T

Unnamed: 0,Setting1,Setting2,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag20,Tag21
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
df_train.sample(3)

Unnamed: 0,AssetId,Runtime,Setting1,Setting2,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag20,Tag21,RUL
4536,22,174,0.0007,0.0002,643.3,1599.02,1424.88,14.62,21.61,551.61,2388.27,9052.29,48.0,520.54,2388.21,8128.82,8.5059,0.03,395,38.8,23.1571,28
16364,81,227,0.0048,0.0002,643.53,1595.27,1435.9,14.62,21.61,551.73,2388.16,9056.9,47.88,520.05,2388.23,8127.82,8.5257,0.03,396,38.57,23.1584,13
5481,28,17,-0.0005,-0.0001,641.88,1581.72,1398.89,14.62,21.61,555.02,2387.99,9065.75,47.09,522.48,2387.99,8136.66,8.3755,0.03,392,39.1,23.4449,148


In [19]:
# # Defina um nível de significância para o teste (geralmente 0.05)
# alpha = 0.05

# # Loop através das colunas do DataFrame
# for column in df.loc[:, 'Setting1':'Tag21']:
#     # Realize o teste de Shapiro-Wilk
#     stat, p_value = stats.shapiro(df[column])
    
#     # Verifique se o valor-p (p-value) é menor que o nível de significância
#     if p_value > alpha:
#         print(f"A variável {column} segue uma distribuição normal (p-value={p_value:.4f})")
#     else:
#         print(f"A variável {column} não segue uma distribuição normal (p-value={p_value:.4f})")


In [20]:
# Defina o fator de escala para identificar outliers (geralmente 1.5)
scale_factor = 1.5

# Crie um DataFrame para armazenar as porcentagens de outliers
outlier_percentages = []

# Loop através das colunas do DataFrame
for column in df_train.loc[:, 'Setting1':'Tag21']:
    # Calcule o IQR (Intervalo Interquartil)
    Q1 = df_train[column].quantile(0.25)
    Q3 = df_train[column].quantile(0.75)
    IQR = Q3 - Q1
    
    # Calcule os limites inferior e superior para identificar outliers
    lower_limit = Q1 - scale_factor * IQR
    upper_limit = Q3 + scale_factor * IQR
    
    # Identifique os outliers
    outliers = df_train[(df_train[column] < lower_limit) | (df_train[column] > upper_limit)]
    
    # Calcule a porcentagem de outliers
    percentage = round(100*len(outliers) / len(df_train),2)
    
    # Adicione as informações ao DataFrame de porcentagens de outliers
    #outlier_percentages = outlier_percentages.append({'Variável': column, 'Porcentagem de Outliers': percentage}, ignore_index=True)
    outlier_percentages.append(percentage)
    
outlier_percentages = pd.DataFrame(outlier_percentages)
# Imprima o DataFrame de porcentagens de outliers
outlier_percentages.index = df_train.loc[:, 'Setting1':'Tag21'].columns
outlier_percentages.columns = ['Outliers (%)']
outlier_percentages.T

Unnamed: 0,Setting1,Setting2,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag20,Tag21
Outliers (%),0.51,0.0,0.62,0.8,0.58,0.0,1.97,0.53,1.55,8.17,0.81,0.71,0.78,7.48,0.58,0.0,0.39,0.57,0.66


In [21]:
# Standardize the selected columns
scaler = StandardScaler()
df_train.loc[:, 'Setting1':'Tag21'] = scaler.fit_transform(df_train.loc[:, 'Setting1':'Tag21'])
df_test.loc[:, 'Setting1':'Tag21'] = scaler.fit_transform(df_test.loc[:, 'Setting1':'Tag21'])

In [22]:
from random import sample
assets_ids = df_train['AssetId'].unique().tolist()
training_ids = sample(assets_ids, k=int(len(assets_ids)*0.75))
validation_ids = list(filter(lambda x: x not in training_ids, assets_ids))

# Split the DataFrame into training and validation datasets
train_df = df_train[df_train['AssetId'].isin(training_ids)]
val_df = df_train[df_train['AssetId'].isin(validation_ids)] 

# Print the shapes of the resulting datasets
print("Training dataset shape:", train_df.shape)
print("Validation dataset shape:", val_df.shape)

X_train = train_df.drop(columns='RUL')
y_train = train_df['RUL']
X_val = val_df.drop(columns='RUL')
y_val = val_df['RUL']

Training dataset shape: (15466, 22)
Validation dataset shape: (5165, 22)


In [23]:
# Reshape data for LSTM input (assuming a 3D input shape)
sequence_length = 20  # Length of sequences to use for prediction
X_train_lstm = np.array([X_train[i:i+sequence_length] for i in range(len(X_train) - sequence_length + 1)])
X_test_lstm = np.array([X_val[i:i+sequence_length] for i in range(len(X_val) - sequence_length + 1)])
y_train_lstm = y_train.iloc[sequence_length - 1:]
y_test_lstm = y_val.iloc[sequence_length - 1:]

In [24]:
# Build an LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(sequence_length, X_train.shape[1])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

In [25]:
# Train the model
model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x292723e82d0>

In [26]:
# Make predictions on the test set
y_pred = model.predict(X_test_lstm)



In [27]:
# Evaluate the model (you can use different evaluation metrics)
mse = mean_squared_error(y_test_lstm, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 2097.023450846419


In [28]:
prediction = pd.Series(y_pred.flatten()).round(0).astype(int)

In [29]:
prediction = prediction.rename('RUL_prediction')

In [32]:
real = pd.concat([val_df['AssetId'], y_test_lstm], axis=1, join='inner').reset_index(drop=True)

In [33]:
df_real_prediction = pd.concat([real, prediction], axis=1)

In [34]:
df_real_prediction[(df_real_prediction['RUL'] <=20) | (df_real_prediction['RUL_prediction'] <=20)]

Unnamed: 0,AssetId,RUL,RUL_prediction
130,6,38,13
147,6,21,15
148,6,20,17
149,6,19,17
150,6,18,14
...,...,...,...
5141,100,4,12
5142,100,3,13
5143,100,2,12
5144,100,1,13


In [35]:
red_zone_total = len(df_real_prediction[df_real_prediction['RUL_prediction'] <=20])

In [36]:
red_zone_hit = (df_real_prediction[df_real_prediction['RUL_prediction'] <=20]['RUL'] < 20).sum()

In [37]:
round(100*red_zone_hit/red_zone_total, 2)

82.79