In [35]:
import pandas as pd
import re

import pickle

# Which extruder's pre-processed export to load.
extruder = 'EX2'

# The CSV stores the timestamp in its first, unnamed column; parse it and use
# it as the index (named 'Date and Time'), then drop every row with a missing
# value in any column.
timestamp_col = 'Unnamed: 0'
ex_data = (
    pd.read_csv(f"data/{extruder}_processed.csv")
    .assign(**{timestamp_col: lambda frame: pd.to_datetime(frame[timestamp_col])})
    .set_index(timestamp_col)
    .rename_axis('Date and Time')
    .dropna()
)

In [36]:
# Mark the first WARMUP_HOURS of the record as not well sampled.
# NOTE(review): presumably because the longest look-back window used for the
# rolling features is 48 hours, so earlier rows lack a full history - confirm.
WARMUP_HOURS = 48

min_time = ex_data.index.min()
well_sampled_cutoff = min_time + pd.Timedelta(hours=WARMUP_HOURS)

# Only rows before the cut-off are overwritten; later rows keep whatever
# 'well_sampled' value they already carry.
ex_data.loc[ex_data.index < well_sampled_cutoff, 'well_sampled'] = False

# Last expression of the cell, so the start time and cut-off are displayed.
min_time, well_sampled_cutoff

In [37]:
# Trailing-window statistics of 'Extruder Pressure'.
# For every window two columns are added to ex_data:
#   pressure-<label>_avg  (rolling mean)
#   pressure-<label>_var  (rolling variance)
# where <label> is the offset alias spelled out ('5T' -> '5min', '1H' -> '1hour').
pressure = ex_data['Extruder Pressure']

for time_window in ['5T', '1H', '3H', '6H', '12H', '24H', '48H']:
    label_append = time_window.replace('T', 'min').replace('H', 'hour')

    # Build the time-based rolling view once and reuse it for both statistics.
    windowed_pressure = pressure.rolling(time_window)

    ex_data[f'pressure-{label_append}_avg'] = windowed_pressure.mean()
    ex_data[f'pressure-{label_append}_var'] = windowed_pressure.var()

In [38]:
# Source column -> prefix of the derived rolling-mean column.
# The trailing space in 'Extruder Thrust ' is part of the column name as it is
# used throughout this notebook.
colname_map = {
    'Extruder Die Temp': 'die_temp-',
    'Extruder Thrust ': 'thrust-',
    'Screw Speed Output': 'screw_speed_output-',
}

# Same columns, same order as the mapping above.
variables_to_lag = list(colname_map)

# Despite the name "lag", each derived column is a trailing rolling mean:
# '<prefix><label>_avg', e.g. 'die_temp-15min_avg' or 'thrust-1hour_avg'.
for time_window in ['15T', '1H', '3H', '6H']:
    label_append = time_window.replace('T', 'min').replace('H', 'hour')
    for lagvar, prefix in colname_map.items():
        ex_data[f'{prefix}{label_append}_avg'] = (
            ex_data[lagvar].rolling(time_window).mean()
        )

In [39]:
# Ground-truth labels for evaluation.
# Maps a horizon label to its length in hours; a 1-minute horizon ('1min')
# existed in the original table but is disabled.
minute_horizons = (5, 10, 15, 30)
hour_horizons = (1, 2, 3, 6, 12, 24)

stop_target_to_hours_map = {
    **{f'{minutes}min': minutes / 60 for minutes in minute_horizons},
    **{f'{hours}hour': hours for hours in hour_horizons},
}

# '<label>_hazard' is True where 'hours_to_hazard' is strictly below the horizon.
for window, horizon_hours in stop_target_to_hours_map.items():
    ex_data[f'{window}_hazard'] = ex_data['hours_to_hazard'].lt(horizon_hours)

In [40]:
# Preview the first rows of the frame after the rolling features and hazard
# labels have been added.
# NOTE(review): in the captured output the first three rows show 6.952558e-310
# in 'Screw Speed Output' and 'Discharge Conveyor Speed (%)' - a denormal float
# that looks like uninitialised memory from the upstream processing step.
# TODO confirm and clean at the source.
ex_data.head()

Unnamed: 0_level_0,Extruder Pressure,Screw Speed Output,Screw Speed,Extruder Die Temp,Extruder Thrust,Feed Screw Current (Amps),Discharge Conveyor Current (Amps),Discharge Conveyor Speed (%),sample_gap_minutes,48H_max_gap_backward,...,5min_hazard,10min_hazard,15min_hazard,30min_hazard,1hour_hazard,2hour_hazard,3hour_hazard,6hour_hazard,12hour_hazard,24hour_hazard
Date and Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-07-17 16:06:00,0.0,6.952558e-310,0.0,0.0,0.0,0.0,0.0,6.952558e-310,0.333333,0.333333,...,False,False,False,False,False,False,False,False,False,False
2023-07-17 16:06:30,0.0,6.952558e-310,0.0,0.0,0.0,0.0,0.0,6.952558e-310,0.333333,0.333333,...,False,False,False,False,False,False,False,False,False,False
2023-07-17 16:07:00,0.0,6.952558e-310,0.0,0.0,0.0,0.0,0.0,6.952558e-310,0.333333,0.333333,...,False,False,False,False,False,False,False,False,False,False
2023-07-17 16:07:30,3354.5,35.0,27.765884,297.515625,62.653107,4.77,0.94,80.0,1.316667,1.316667,...,False,False,False,False,False,False,False,False,False,False
2023-07-17 16:08:00,3458.941406,35.0,27.765884,297.515625,62.652573,4.77,0.94,80.0,0.05,1.316667,...,False,False,False,False,False,False,False,False,False,False


In [41]:
# Named feature sets used as model inputs.
input_features = {
    # Raw process signals only.
    'basevars_only': [
        'Extruder Pressure',
        'Screw Speed Output',
        'Screw Speed',
        'Extruder Die Temp',
        'Extruder Thrust ',
    ],
}

# Raw signals followed by a subset of the derived rolling statistics.
input_features['fullvar'] = [
    *input_features['basevars_only'],
    'pressure-1hour_avg',
    'pressure-12hour_avg',
    'pressure-5min_var',
    'pressure-3hour_var',
    'pressure-6hour_var',
    'die_temp-15min_avg',
    'thrust-15min_avg',
    'die_temp-1hour_avg',
    'thrust-1hour_avg',
    'die_temp-6hour_avg',
    'thrust-6hour_avg',
]

# Horizon keys shared by both per-extruder tables below (note: 'hr', not 'hour').
_horizon_keys = ['5min', '10min', '15min', '30min', '1hr', '2hr', '3hr', '6hr', '12hr']

# (feature set name, model name) chosen per horizon for extruder 1.
# NOTE(review): presumably the winners of an earlier model comparison - confirm.
best_models_ex1 = dict(zip(_horizon_keys, [
    ('basevars_only', 'Gradient Boosting'),
    ('basevars_only', 'Random Forest'),
    ('basevars_only', 'Gradient Boosting'),
    ('basevars_only', 'Gradient Boosting'),
    ('basevars_only', 'Gradient Boosting'),
    ('basevars_only', 'Gradient Boosting'),
    ('fullvar', 'Support Vector Machine'),
    ('fullvar', 'Support Vector Machine'),
    ('fullvar', 'Support Vector Machine'),
]))

# (feature set name, model name) chosen per horizon for extruder 2.
best_models_ex2 = dict(zip(_horizon_keys, [
    ('basevars_only', 'Logistic Regression'),
    ('basevars_only', 'Random Forest'),
    ('basevars_only', 'Support Vector Machine'),
    ('basevars_only', 'Support Vector Machine'),
    ('basevars_only', 'Support Vector Machine'),
    ('basevars_only', 'Support Vector Machine'),
    ('basevars_only', 'Support Vector Machine'),
    ('fullvar', 'MLP'),
    ('fullvar', 'Support Vector Machine'),
]))

In [42]:
# Inspect one stored model, indexed by feature set -> target column -> model name.
# NOTE(review): `model_collection` is not assigned in any cell visible in this
# section of the notebook; unless another cell builds or loads it first, this
# line raises NameError on a fresh kernel. TODO confirm where it comes from.
model_collection['basevars_only']['5min_hazard']['Logistic Regression']

In [43]:
# Debug check that inputs, labels and predictions have matching lengths.
# NOTE(review): `tmp_input`, `tmp_groundtruth` and `probs` are first assigned
# inside the evaluation loop further down, so this cell only works after that
# loop has already been run once (it fails under Restart & Run All).
tmp_input.shape, tmp_groundtruth.shape, probs.shape

((6493, 16), (6493,), (6493,))

In [44]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, confusion_matrix

# Hold-out evaluation: precision, recall and confusion matrix of the selected
# model for every hazard target.
#
# NOTE(review): `selected_models` is not defined in any cell visible here. It is
# read as {target_column: {'input_features': [...], 'model': fitted_estimator}};
# confirm that the cell building or loading it runs before this one.

# Test window: rows on/after the cut-off date, within 96 hours of a hazard,
# and flagged as well sampled.
test_cutoff = pd.to_datetime('12/1/2023')
test_data = ex_data[
    (ex_data.index >= test_cutoff)
    & (ex_data['hours_to_hazard'] < 96)
    & (ex_data['well_sampled'])
]

# Scores at or above this value are counted as the positive class.
threshold = 0.5


for targ_key in selected_models:
    print(targ_key)
    # Use a dedicated name here so the module-level `input_features` dict of
    # named feature sets is not overwritten by a plain list of columns.
    feature_cols = selected_models[targ_key]['input_features']

    tmp_input = test_data[feature_cols].values
    tmp_groundtruth = test_data[targ_key].values
    # Fraction of positive rows in the test window for this target.
    print(tmp_groundtruth.sum() / len(tmp_groundtruth))

    tmp_model = selected_models[targ_key]['model']

    # Translate the target column name into the key style of best_models_ex2,
    # e.g. '1hour_hazard' -> '1hr', '5min_hazard' -> '5min'.
    # NOTE(review): best_models_ex2 has no '24hr' entry, so a '24hour_hazard'
    # target in selected_models would raise KeyError below - confirm it is absent.
    time_key = targ_key.replace('hour', 'hr')
    time_key = time_key.replace('_hazard', '')

    if best_models_ex2[time_key][1] == 'Support Vector Machine':
        # The SVM branch calls predict(), so `probs` presumably holds hard class
        # labels rather than probabilities; thresholding leaves 0/1 labels as-is.
        probs = tmp_model.predict(tmp_input)
    else:
        # Column 1 is taken as the positive-class probability.
        probs = tmp_model.predict_proba(tmp_input)[:,1]

    class_predictions = (probs >= threshold).astype(int)

    # Calculate precision and recall
    precision = precision_score(tmp_groundtruth, class_predictions)
    recall = recall_score(tmp_groundtruth, class_predictions)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(tmp_groundtruth, class_predictions)

    # Display results
    print("Precision:", precision)
    print("Recall:", recall)
    print("Confusion Matrix:\n", conf_matrix)
    print()
    print()

5min_hazard
0.0017954786582877662
Precision: 1.0
Recall: 0.09090909090909091
Confusion Matrix:
 [[12231     0]
 [   20     2]]

10min_hazard
0.0034277319840039172
Precision: 0.021
Recall: 1.0
Confusion Matrix:
 [[10253  1958]
 [    0    42]]

15min_hazard
0.004896759977148454
Precision: 1.0
Recall: 0.38333333333333336
Confusion Matrix:
 [[12193     0]
 [   37    23]]

30min_hazard
0.009793519954296908
Precision: 1.0
Recall: 0.19166666666666668
Confusion Matrix:
 [[12133     0]
 [   97    23]]

1hour_hazard
0.019587039908593815
Precision: 1.0
Recall: 0.09583333333333334
Confusion Matrix:
 [[12013     0]
 [  217    23]]

2hour_hazard
0.03917407981718763
Precision: 1.0
Recall: 0.04791666666666667
Confusion Matrix:
 [[11773     0]
 [  457    23]]

3hour_hazard
0.05876111972578144
Precision: 1.0
Recall: 0.03194444444444444
Confusion Matrix:
 [[11533     0]
 [  697    23]]

6hour_hazard
0.11752223945156288
Precision: 0.1566973029983426
Recall: 0.7222222222222222
Confusion Matrix:
 [[5216 559

In [15]:
# Debug peek at the model name looked up for the current `time_key`.
# NOTE(review): `time_key` is only set inside the evaluation loop, and this
# cell's execution count (15) is lower than the cells above it (35-44), so the
# shown output comes from a different execution order.
best_models_ex2[time_key][1]

'Logistic Regression'

In [17]:
# Debug peek at the estimator left in `tmp_model` by the evaluation loop;
# depends on that loop having run (no value exists on a fresh kernel).
tmp_model

In [18]:
# Debug peek at the horizon key left in `time_key` by the evaluation loop;
# depends on that loop having run (no value exists on a fresh kernel).
time_key

'5min'