# LIBRARIES

In [21]:
import pandas as pd
pd.set_option('display.max_columns', None)
from lifelines import KaplanMeierFitter
from lifelines.utils import concordance_index
from lifelines import CoxPHFitter
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import lightgbm as lgb
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from random import sample

# LOADING DATA

In [2]:
# Load the datasets
df_train_raw = pd.read_csv('../data/PM_train.txt', sep=" ", header=None)
df_test_raw = pd.read_csv('../data/PM_test.txt', sep=" ", header=None)

# DATA WRANGLING

In [3]:
# Visualização de 3 linhas aleatórias do dataset de treino
df_train_raw.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
14201,71,72,-0.0049,-0.0001,100.0,518.67,642.68,1590.85,1404.46,14.62,21.61,553.6,2388.14,9053.82,1.3,47.57,520.99,2388.08,8132.41,8.4391,0.03,393,2388,100.0,38.64,23.2927,,
11934,60,165,-0.0012,-0.0005,100.0,518.67,643.43,1601.09,1425.62,14.62,21.61,552.69,2388.16,9119.65,1.3,48.19,520.61,2388.16,8191.59,8.5099,0.03,397,2388,100.0,38.45,23.0497,,
6767,34,156,-0.0002,0.0001,100.0,518.67,642.93,1587.54,1413.29,14.62,21.61,552.81,2388.09,9103.29,1.3,47.76,521.35,2388.11,8173.79,8.4224,0.03,394,2388,100.0,38.65,23.1507,,


In [4]:
# Visualização de 3 linhas aleatórias do dataset de teste
df_test_raw.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
6227,49,266,0.0001,-0.0003,100.0,518.67,643.08,1593.08,1411.54,14.62,21.61,552.8,2388.15,9061.67,1.3,47.67,521.49,2388.11,8137.36,8.459,0.03,393,2388,100.0,38.57,23.205,,
7607,59,66,0.0003,0.0001,100.0,518.67,642.25,1585.21,1406.96,14.62,21.61,554.56,2388.05,9061.7,1.3,47.12,522.33,2388.03,8134.46,8.4097,0.03,393,2388,100.0,38.98,23.415,,
6381,51,43,-0.0014,-0.0005,100.0,518.67,642.7,1592.52,1406.07,14.62,21.61,553.37,2388.06,9053.0,1.3,47.41,521.85,2388.09,8133.51,8.4388,0.03,391,2388,100.0,38.88,23.3894,,


In [5]:
# As duas últimas colunas dos datasets estão vazias, então irei retirá-las.
df_train = df_train_raw.iloc[:, :-2]
df_test = df_test_raw.iloc[:, :-2]

In [6]:
# Renomeando as colunas
df_train.columns = df_test.columns = [
    'AssetId', 'Runtime', 'Setting1', 'Setting2', 'Setting3', 
    'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5', 'Tag6', 'Tag7', 'Tag8', 
    'Tag9', 'Tag10', 'Tag11', 'Tag12', 'Tag13', 'Tag14', 'Tag15', 
    'Tag16', 'Tag17', 'Tag18', 'Tag19', 'Tag20', 'Tag21'
    ]

In [7]:
df_train.sample(3)

Unnamed: 0,AssetId,Runtime,Setting1,Setting2,Setting3,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag10,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag18,Tag19,Tag20,Tag21
8992,46,198,0.0042,-0.0002,100.0,518.67,643.29,1595.46,1416.63,14.62,21.61,553.27,2388.14,9056.1,1.3,47.54,521.54,2388.11,8133.56,8.4578,0.03,394,2388,100.0,38.87,23.2968
3518,17,214,0.0002,-0.0,100.0,518.67,643.01,1595.01,1414.55,14.62,21.61,553.74,2388.07,9095.88,1.3,47.51,521.34,2388.04,8175.45,8.4108,0.03,393,2388,100.0,38.74,23.1943
7320,38,5,-0.0051,-0.0004,100.0,518.67,642.69,1587.7,1412.24,14.62,21.61,553.37,2388.14,9043.0,1.3,47.36,521.65,2388.09,8127.32,8.4261,0.03,392,2388,100.0,38.64,23.4375


Transformando a coluna Runtime na variável RUL, que será o target do modelo:

In [8]:
# Determinando o valor máximo de "runtime" para cada máquina
max_runtime_per_asset = df_train.groupby('AssetId')['Runtime'].transform('max')

# Subtraindo o valor atual de "runtime" do valor máximo para calcular a RUL
df_train['RUL'] = max_runtime_per_asset - df_train['Runtime']

Análise Descritiva dos Dados

In [9]:
df_train.loc[:, 'Setting1':'Tag21'].describe().round(4)

Unnamed: 0,Setting1,Setting2,Setting3,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,Tag7,Tag8,Tag9,Tag10,Tag11,Tag12,Tag13,Tag14,Tag15,Tag16,Tag17,Tag18,Tag19,Tag20,Tag21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,-0.0,0.0,100.0,518.67,642.6809,1590.5231,1408.9338,14.62,21.6098,553.3677,2388.0967,9065.2429,1.3,47.5412,521.4135,2388.0962,8143.7527,8.4421,0.03,393.2107,2388.0,100.0,38.8163,23.2897
std,0.0022,0.0003,0.0,0.0,0.5001,6.1311,9.0006,0.0,0.0014,0.8851,0.071,22.0829,0.0,0.2671,0.7376,0.0719,19.0762,0.0375,0.0,1.5488,0.0,0.0,0.1807,0.1083
min,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,21.6,549.85,2387.9,9021.73,1.3,46.85,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,21.61,552.81,2388.05,9053.1,1.3,47.35,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,21.61,553.44,2388.09,9060.66,1.3,47.51,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,21.61,554.01,2388.14,9069.42,1.3,47.7,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,21.61,556.06,2388.56,9244.59,1.3,48.53,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


A partir da análise descritiva dos dados, é possível ver que algumas features não apresentam variação em seus valores, indicado pelo std (Standard Deviation) = 0.0000. Portanto, essas features não irão contribuir em nada com a preditibilidade do modelo. Irei retirá-las do dataset.

In [10]:
# retirando as colunas com std igual a ouase igual a zero
def drop_zero_std_columns(df):
    zero_std_filter = df.loc[:, 'Setting1':'Tag21'].describe().loc['std'] < 0.01
    zero_std_columns = df_test.loc[:, 'Setting1':'Tag21'].columns[zero_std_filter].values
    df = df.drop(columns=zero_std_columns)
    
    return df

df_train = drop_zero_std_columns(df_train)
df_test = drop_zero_std_columns(df_test)

Checando a presença de nulos nos datasets

In [11]:
# Checando a presença de nulos do dataset de treino
df_train.isna().sum().to_frame().T

Unnamed: 0,AssetId,Runtime,Tag2,Tag3,Tag4,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag17,Tag20,Tag21,RUL
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Checando a presença de outliers no dataset de treino

In [12]:
# Checando a presença de outliers

# Defina o fator de escala para identificar outliers (geralmente 1.5)
scale_factor = 1.5

# Crie um DataFrame para armazenar as porcentagens de outliers
outlier_percentages = []

# Loop através das colunas do DataFrame
for column in df_train:
    # Calcule o IQR (Intervalo Interquartil)
    Q1 = df_train[column].quantile(0.25)
    Q3 = df_train[column].quantile(0.75)
    IQR = Q3 - Q1
    
    # Calcule os limites inferior e superior para identificar outliers
    lower_limit = Q1 - scale_factor * IQR
    upper_limit = Q3 + scale_factor * IQR
    
    # Identifique os outliers
    outliers = df_train[(df_train[column] < lower_limit) | (df_train[column] > upper_limit)]
    
    # Calcule a porcentagem de outliers
    percentage = round(100*len(outliers) / len(df_train),2)
    
    # Adicione as informações ao DataFrame de porcentagens de outliers
    #outlier_percentages = outlier_percentages.append({'Variável': column, 'Porcentagem de Outliers': percentage}, ignore_index=True)
    outlier_percentages.append(percentage)
    
outlier_percentages = pd.DataFrame(outlier_percentages)
# Imprima o DataFrame de porcentagens de outliers
outlier_percentages.index = df_train.columns
outlier_percentages.columns = ['Outliers (%)']
outlier_percentages.T

Unnamed: 0,AssetId,Runtime,Tag2,Tag3,Tag4,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag17,Tag20,Tag21,RUL
Outliers (%),0.0,0.5,0.62,0.8,0.58,0.53,1.55,8.17,0.81,0.71,0.78,7.48,0.58,0.39,0.57,0.66,0.5


Manterei os outliers pois eles podem ser o indicativo que a máquina não está operando bem, e portanto, são importantes para que o modelo identifique esses regimes de operação.

## DATA ANALYSIS

In [13]:
df_train[['AssetId', 'Runtime']].groupby('AssetId').max().describe().astype(int).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Runtime,100,206,46,128,177,199,229,362


In [14]:
df_test[['AssetId', 'Runtime']].groupby('AssetId').max().describe().astype(int).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Runtime,100,130,53,31,88,133,164,303


In [96]:
# Data
x = df_train.groupby('AssetId')['RUL'].max()

# Create histogram trace
histogram_trace = go.Histogram(
    x=x,
    xbins=dict(
        start=0  # Set the start value of the x-axis bins to 0
    ),
    nbinsx=20,
    marker=dict(color='green')
)

# Create layout
layout = go.Layout(
    title={'text': 'Histograma RUL',
           'x': 0.5,
           'y': 0.95,
           'xanchor':'center',
           'yanchor':'top'},
    xaxis=dict(title='RUL'),
    yaxis=dict(title='Frequency'),
    height=600,
    width=600
)

# Create figure
fig = go.Figure(data=[histogram_trace], layout=layout)

# Show the figure
fig.show()

In [97]:
# Boxplot de RUL por AssetId

data = df_train[['AssetId', 'RUL']].groupby('AssetId').max()

# Criação do boxplot
fig = go.Figure()
fig.add_trace(go.Box(y=data['RUL'], name='RUL', marker=dict(color='green')))

# Layout
fig.update_layout(
    title={'text': 'Boxplot de RUL por AssetId',
           'x': 0.5,
           'y': 0.95,
           'xanchor':'center',
           'yanchor':'top'},
    yaxis=dict(title='RUL'),
    height=400,
    width=400
)

# Exibir o boxplot
fig.show()

In [15]:
df_failure = df_train.copy()
df_failure['Failure'] = 0
df_failure.loc[df_failure['RUL'] == 0, 'Failure'] = 1
df_failure.sample(3)

Unnamed: 0,AssetId,Runtime,Tag2,Tag3,Tag4,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag17,Tag20,Tag21,RUL,Failure
2030,10,117,642.24,1582.51,1400.54,554.26,2388.03,9063.34,47.41,521.92,2388.06,8145.35,8.3974,393,39.02,23.4015,105,0
4924,25,45,642.81,1584.99,1408.14,553.23,2388.12,9045.09,47.48,521.72,2388.13,8126.84,8.4262,394,38.83,23.3319,185,0
4464,22,102,642.71,1589.02,1406.67,552.65,2388.2,9040.86,47.67,520.53,2388.15,8124.05,8.4826,394,38.82,23.3046,100,0


In [16]:
prob_survival = (100 - df_failure['Runtime'].value_counts().sort_index()).reset_index()

In [17]:
prob_survival.to_csv('prob_survival.csv', index=False, sep=';')

In [18]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=prob_survival['Runtime'],
                         y=prob_survival['count'],
                         name='Probabilidade de Falha')
              )
fig.update_layout(
    title=f'Análise da Probabilidade de Falha ao Longo dos Ciclos de Operação',
    height=600
)

fig.show()

In [19]:
cph = CoxPHFitter()
cph.fit(df_failure.drop(columns='RUL'),
        duration_col='Runtime',
        event_col='Failure',
        cluster_col='AssetId'
        )
cph.print_summary()


0,1
model,lifelines.CoxPHFitter
duration col,'Runtime'
event col,'Failure'
cluster col,'AssetId'
robust variance,True
baseline estimation,breslow
number of observations,20631
number of events observed,100
partial log-likelihood,-497.59
time fit was run,2023-10-05 16:01:05 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
Tag2,0.36,1.43,0.34,-0.3,1.02,0.74,2.76,0.0,1.07,0.28,1.81
Tag3,0.03,1.04,0.03,-0.02,0.09,0.98,1.09,0.0,1.24,0.21,2.22
Tag4,0.16,1.17,0.03,0.1,0.21,1.11,1.23,0.0,5.81,<0.005,27.27
Tag7,-0.57,0.56,0.25,-1.06,-0.08,0.35,0.92,0.0,-2.3,0.02,5.54
Tag8,2.24,9.41,2.77,-3.19,7.68,0.04,2157.36,0.0,0.81,0.42,1.26
Tag9,0.0,1.0,0.02,-0.04,0.04,0.97,1.04,0.0,0.2,0.84,0.24
Tag11,4.39,80.5,0.92,2.59,6.19,13.28,487.94,0.0,4.77,<0.005,19.07
Tag12,-1.22,0.3,0.32,-1.84,-0.6,0.16,0.55,0.0,-3.84,<0.005,12.99
Tag13,3.9,49.32,2.69,-1.38,9.18,0.25,9654.14,0.0,1.45,0.15,2.76
Tag14,0.01,1.01,0.02,-0.04,0.05,0.96,1.05,0.0,0.36,0.72,0.47

0,1
Concordance,0.97
Partial AIC,1023.17
log-likelihood ratio test,480.92 on 14 df
-log2(p) of ll-ratio test,308.91


# MODEL

### Preparação dos datasets para o modelo

In [23]:
# Split the DataFrame into training and validation datasets
assets_ids = df_train['AssetId'].unique().tolist()
training_ids = sample(assets_ids, k=int(len(assets_ids)*0.75))
validation_ids = list(filter(lambda x: x not in training_ids, assets_ids))

train_df = df_train[df_train['AssetId'].isin(training_ids)]
val_df = df_train[df_train['AssetId'].isin(validation_ids)] 

# Print the shapes of the resulting datasets
print("Training dataset shape:", train_df.shape)
print("Validation dataset shape:", val_df.shape)

X_train = train_df.drop(columns='RUL')
y_train = train_df['RUL']
X_val = val_df.drop(columns='RUL')
y_val = val_df['RUL']

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

Training dataset shape: (15557, 17)
Validation dataset shape: (5074, 17)


### Ajustando os parâmetros do modelo

In [39]:
params = {
    "objective": "poisson",
    "boosting_type": "gbdt",
    "metric": "Cox",
    "verbose": 0,
    "random_state": 42
}

num_round = 100

In [40]:
survival_model = lgb.train(params,
                           train_data,
                           num_round,
                           )

In [41]:
survival_model.save_model('../models/model.pkl')

<lightgbm.basic.Booster at 0x237f3b14c10>

In [42]:
survival_model = lgb.Booster(model_file='../models/model.pkl')

In [43]:
# Make predictions on test data
y_pred = survival_model.predict(X_val, 
                                num_iteration=survival_model.best_iteration
                                )

In [44]:
# Evaluate the model using the concordance index (C-index)
c_index = concordance_index(y_val, y_pred)
print(f"Concordance Index (C-index): {round(c_index, 2)}")

Concordance Index (C-index): 0.82


In [45]:
mean_squared_error(y_val, y_pred).round(2)

1884.6

In [46]:
mean_absolute_percentage_error(y_val + 1, y_pred)

0.3722569039171073

In [47]:
results = pd.concat([pd.Series(y_val).reset_index(drop=True),
                     pd.Series(y_pred)
                     ],
                    axis=1
                    )

In [48]:
results['Diff'] = results['RUL'] - results[0]

In [49]:
abs((results['Diff'] + 1)/(results['RUL'] + 1)).mean()

0.3722569039171073

In [171]:
prediction = pd.Series(y_pred.flatten()).round(0).astype(int)

In [172]:
prediction = prediction.rename('RUL_prediction')

In [176]:
real = pd.concat([val_df[['AssetId', 'Runtime']],
                  y_val
                  ],
                 axis=1,
                 join='inner'
                 ).reset_index(drop=True)

In [177]:
df_real_prediction = pd.concat([real, prediction], axis=1)

In [178]:
df_real_prediction[(df_real_prediction['RUL'] <=20) | (df_real_prediction['RUL_prediction'] <=20)]

Unnamed: 0,AssetId,Runtime,RUL,RUL_prediction
171,1,172,20,32
172,1,173,19,23
173,1,174,18,21
174,1,175,17,21
175,1,176,16,25
...,...,...,...,...
5120,95,279,4,10
5121,95,280,3,7
5122,95,281,2,13
5123,95,282,1,11


In [179]:
red_zone_total = len(df_real_prediction[df_real_prediction['RUL_prediction'] <=20])

In [180]:
red_zone_hit = (df_real_prediction[df_real_prediction['RUL_prediction'] <=20]['RUL'] < 20).sum()

In [181]:
model_accuracy = round(100*red_zone_hit/red_zone_total, 2)

In [226]:
model_accuracy

88.89

In [227]:
df_train[['AssetId', 'RUL']].groupby('AssetId').max().quantile(0.1)

RUL    153.9
Name: 0.1, dtype: float64

In [182]:
df_train[['AssetId', 'RUL']].groupby('AssetId').max().quantile(round((100 - model_accuracy)/100, 4))

RUL    154.9989
Name: 0.1111, dtype: float64

In [228]:
df_real_prediction[(df_real_prediction['RUL'] <=20) & (df_real_prediction['RUL_prediction'] <=154)]

Unnamed: 0,AssetId,Runtime,RUL,RUL_prediction
171,1,172,20,32
172,1,173,19,23
173,1,174,18,21
174,1,175,17,21
175,1,176,16,25
...,...,...,...,...
5120,95,279,4,10
5121,95,280,3,7
5122,95,281,2,13
5123,95,282,1,11


In [229]:
df_real_prediction[df_real_prediction['RUL_prediction'] <=20]['RUL'] < 20

176     True
177     True
178     True
180     True
181     True
        ... 
5120    True
5121    True
5122    True
5123    True
5124    True
Name: RUL, Length: 477, dtype: bool

In [44]:
# métrica deve ser acerto ou erro para cada um dos 100 e não para todas as medições

# fazer uma matriz de confusão para acerto quando RUL menor que 20 ciclos

# comparar com a matriz de confusão do modelo naive

# gerar algumas análises de dados do dataset

# montar apresentação

In [230]:
accuracy_zone_lgbm = df_real_prediction.copy()#[(df_real_prediction['RUL'] <=20) | (df_real_prediction['RUL_prediction'] <=20)].copy()

In [231]:
accuracy_zone_naive = df_real_prediction.copy()#[(df_real_prediction['RUL'] <=20) | (df_real_prediction['Runtime'] <=154)].copy()

In [232]:
accuracy_zone_lgbm['Result'] = ''
accuracy_zone_naive['Result'] = ''
accuracy_zone_lgbm.loc[(accuracy_zone_lgbm['RUL'] <=20) & (accuracy_zone_lgbm['RUL_prediction'] <=20), 'Result'] = 'TP'
accuracy_zone_naive.loc[(accuracy_zone_naive['RUL'] <=20) & (accuracy_zone_naive['Runtime'] <= 154), 'Result'] = 'TP'

In [260]:
accuracy_zone_naive

Unnamed: 0,AssetId,Runtime,RUL,RUL_prediction,Result
0,1,1,191,248,FP
1,1,2,190,229,FP
2,1,3,189,250,FP
3,1,4,188,249,FP
4,1,5,187,233,FP
...,...,...,...,...,...
5120,95,279,4,10,FN
5121,95,280,3,7,FN
5122,95,281,2,13,FN
5123,95,282,1,11,FN


In [233]:
accuracy_zone_lgbm.loc[(accuracy_zone_lgbm['RUL'] > 20) & (accuracy_zone_lgbm['RUL_prediction'] <=20), 'Result'] = 'FP'
accuracy_zone_naive.loc[(accuracy_zone_naive['RUL'] > 20) & (accuracy_zone_naive['Runtime'] <= 154), 'Result'] = 'FP'

In [234]:
accuracy_zone_lgbm.loc[(accuracy_zone_lgbm['RUL'] <= 20) & (accuracy_zone_lgbm['RUL_prediction'] > 20), 'Result'] = 'FN'
accuracy_zone_naive.loc[(accuracy_zone_naive['RUL'] <= 20) & (accuracy_zone_naive['Runtime'] > 154), 'Result'] = 'FN'

In [235]:
accuracy_zone_lgbm.loc[(accuracy_zone_lgbm['RUL'] > 20) & (accuracy_zone_lgbm['RUL_prediction'] > 20), 'Result'] = 'TN'
accuracy_zone_naive.loc[(accuracy_zone_naive['RUL'] > 20) & (accuracy_zone_naive['Runtime'] > 154), 'Result'] = 'TN'

In [236]:
accuracy_zone_lgbm['Result'].value_counts(normalize=True).mul(100).round(2)

Result
TN    88.84
TP     8.39
FN     1.85
FP     0.92
Name: proportion, dtype: float64

In [237]:
accuracy_zone_naive['Result'].value_counts(normalize=True).mul(100).round(2)

Result
FP    72.96
TN    16.80
FN     8.57
TP     1.68
Name: proportion, dtype: float64

In [238]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')['RUL'].max().min()

11

In [239]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')['RUL'].max().idxmin()

65

In [240]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')['RUL'].max().head(60)

AssetId
1     15
11    18
14    25
26    22
32    32
33    19
34    18
37    24
43    19
45    13
46    21
53    21
58    15
59    23
63    18
65    11
68    21
70    13
73    38
75    47
80    19
81    33
83    24
94    25
95    14
Name: RUL, dtype: int64

In [241]:
accuracy_zone_naive.groupby('AssetId')['Runtime'].max().between(0, 154).sum()

3

In [266]:
len(accuracy_zone_naive.groupby('AssetId').size())

25

In [263]:
accuracy_zone_naive.groupby('AssetId')['Runtime'].max().between(155, 174).sum()

3

In [267]:
accuracy_zone_naive.groupby('AssetId')['Runtime'].max().between(175, 400).sum()

19

In [268]:
accuracy_zone_naive.groupby('AssetId')['Runtime'].max().between(195, 400).sum()

15

In [272]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL'] == 0].groupby('AssetId')['RUL_prediction'].max().between(21, 400).sum()

0

In [273]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')['RUL'].max().between(0, 20).sum()

12

In [297]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')['RUL'].max().between(21, 400).sum()

13

In [275]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')['RUL'].max().between(41, 400).sum()

1

In [276]:
accuracy_zone_lgbm.groupby('AssetId')[['Runtime', 'RUL_prediction']].max()

Unnamed: 0_level_0,Runtime,RUL_prediction
AssetId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,192,262
11,240,205
14,180,177
26,199,211
32,191,175
33,200,201
34,195,185
37,170,179
43,207,202
45,158,202


In [277]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')[['RUL', 'RUL_prediction']].max().sort_values('RUL').to_csv('RULvsRUL_predictionML.csv', sep=';')

In [290]:
error_margin = accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')[['RUL', 'RUL_prediction']].max()

In [291]:
error_margin['ML_Error'] = error_margin['RUL'] - error_margin['RUL_prediction']

In [292]:
naive_error_margin = accuracy_zone_naive.groupby('AssetId')['RUL'].max().sort_values() - 154

In [293]:
naive_error_margin = naive_error_margin.rename('Naive_Error')

In [294]:
error_margin = error_margin.merge(naive_error_margin, on='AssetId').sort_values('Naive_Error')

In [296]:
error_margin.sort_values('Naive_Error').reset_index().to_csv('MarginError.csv', sep=';')

In [285]:
accuracy_zone_lgbm[accuracy_zone_lgbm['RUL_prediction'] <= 20].groupby('AssetId')[['RUL', 'RUL_prediction']].max().sort_values('RUL')

Unnamed: 0_level_0,RUL,RUL_prediction
AssetId,Unnamed: 1_level_1,Unnamed: 2_level_1
65,11,17
70,13,17
45,13,18
95,14,19
1,15,18
58,15,20
11,18,20
63,18,20
34,18,20
43,19,20


In [286]:
accuracy_zone_naive.groupby('AssetId')['RUL'].max().sort_values().to_csv('RULvsRuntimeNaive.csv', sep=';')

In [287]:
accuracy_zone_naive.groupby('AssetId')['RUL'].max().reset_index().sort_values('RUL').reset_index(drop=True)

Unnamed: 0,AssetId,RUL
0,70,136
1,58,146
2,65,152
3,45,157
4,37,169
5,63,173
6,14,179
7,80,184
8,32,190
9,1,191


In [288]:
data = accuracy_zone_naive.groupby('AssetId')['RUL'].max().reset_index().sort_values('RUL').reset_index(drop=True)

In [289]:
data['Repair'] = 154

In [257]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=data['AssetId'].astype(str),
                         y=data['RUL']
                         )
             )
fig.add_trace(go.Scatter(x=data['AssetId'].astype(str),
                         y=data['Repair']
                         )
             )
    
# Layout
fig.update_layout(
    #title=f'Comportamento do valor médio de alguns sensores das 100 máquinas ao longo dos ciclos de operação',
    #xaxis=dict(title='Runtime'),
    height=600,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    #xaxis_title='Runtime'
    showlegend=False,
    xaxis=dict(showgrid=False,
               tickfont=dict(color='white',
                             size=24
                             ),
               
               #categoryorder='array'
               ),
    yaxis=dict(showgrid=False,
               tickfont=dict(color='white',
                             size=24
                             ),
               
               )
)
fig.update_traces(marker=dict(color='yellow'))

In [258]:
accuracy_zone_naive.groupby('AssetId')['RUL'].max().reset_index().sort_values('RUL')

Unnamed: 0,AssetId,RUL
17,70,136
12,58,146
15,65,152
9,45,157
7,37,169
14,63,173
2,14,179
20,80,184
4,32,190
0,1,191


In [259]:
accuracy_zone_naive

Unnamed: 0,AssetId,Runtime,RUL,RUL_prediction,Result
0,1,1,191,248,FP
1,1,2,190,229,FP
2,1,3,189,250,FP
3,1,4,188,249,FP
4,1,5,187,233,FP
...,...,...,...,...,...
5120,95,279,4,10,FN
5121,95,280,3,7,FN
5122,95,281,2,13,FN
5123,95,282,1,11,FN


In [194]:
accuracy_zone_lgbm.groupby('AssetId')[]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000019B1F5E8BD0>

In [299]:
# Group the DataFrame by "AssetId" and get the index of the row with the maximum "runtime"
max_runtime_indices = df_test.groupby('AssetId')['Runtime'].idxmax()

# Get the rows with the maximum "runtime" for each "AssetId"
max_runtime_rows = df_test.loc[max_runtime_indices]

# Print the resulting DataFrame
max_runtime_rows

Unnamed: 0,AssetId,Runtime,Tag2,Tag3,Tag4,Tag7,Tag8,Tag9,Tag11,Tag12,Tag13,Tag14,Tag15,Tag17,Tag20,Tag21
30,1,31,642.58,1581.22,1398.91,554.42,2388.08,9056.40,47.23,521.79,2388.06,8130.11,8.4024,393,38.81,23.3552
79,2,49,642.55,1586.59,1410.83,553.52,2388.10,9044.77,47.67,521.74,2388.09,8126.90,8.4505,391,38.81,23.2618
205,3,126,642.88,1589.75,1418.89,552.59,2388.16,9049.26,47.88,520.83,2388.14,8131.46,8.4119,395,38.93,23.2740
311,4,106,642.78,1594.53,1406.88,552.64,2388.13,9051.30,47.65,521.88,2388.11,8133.64,8.4634,395,38.58,23.2581
409,5,98,642.27,1589.94,1419.36,553.29,2388.10,9053.99,47.46,521.00,2388.15,8125.74,8.4362,394,38.75,23.4117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12545,96,97,642.30,1590.88,1397.94,553.99,2388.03,9062.41,47.14,522.30,2388.01,8148.24,8.4110,391,38.96,23.4606
12679,97,134,642.59,1582.96,1410.92,554.05,2388.06,9076.36,47.38,521.58,2388.06,8155.48,8.4500,395,38.61,23.2953
12800,98,121,642.68,1599.51,1415.47,553.44,2388.13,9062.34,47.66,521.53,2388.09,8146.39,8.4235,394,38.76,23.3608
12897,99,97,642.00,1585.03,1397.98,554.75,2388.01,9067.16,47.26,521.82,2388.02,8150.38,8.4003,391,38.95,23.3595
