# 09 - LSTM Preprocessing

In [None]:
import os
import sys

# Put the directory two levels above the current working directory on the
# import path so that the project modules can be imported from the notebook.
root = os.path.abspath('../..')
if root not in sys.path:
    sys.path.append(root)

In [None]:
import pandas as pd
import numpy as np

import torch
from sklearn.preprocessing import StandardScaler

In [None]:
# The split file stores pickled Python objects (the groups are pandas
# DataFrames), which `torch.load` refuses to read when `weights_only=True`
# (the default in recent PyTorch releases). Unpickling can execute arbitrary
# code, so only load files that were produced by this project.
data_dict = torch.load(
    os.path.join(root, 'project/data/splitted_data.pt'),
    weights_only=False,
)
data_dict.keys()

In [None]:
# Unpack the features and targets of each split from the loaded dictionary.
X_train, y_train = data_dict['X_train'], data_dict['y_train']
X_val, y_val = data_dict['X_val'], data_dict['y_val']
X_test, y_test = data_dict['X_test'], data_dict['y_test']

In [None]:
# Preview the first rows of the first training group.
X_train[0].head()

In [None]:
# Inspect the target entry stored at the same position as the first training group.
y_train[0]

We will start off by encoding the `timestamp` feature. Since we plan on using LSTM models, we will encode the components of the timestamp in a cyclical (sine/cosine) format. We do this because we have time-series data and want to capture the cyclical behaviour of time: hours 23 and 0 then end up close to each other, rather than far apart as they would be if we encoded them as the raw values 23 and 0.

In [None]:
def encode_timestamp_to_cyclical_features(df):
    """Replace the ``timestamp`` column with numeric time features.

    The timestamp is parsed as a UTC datetime and turned into:

    * ``year`` - the calendar year as an integer,
    * ``<part>_sin`` / ``<part>_cos`` for month (period 12), day of week
      (period 7), hour (period 24), minute (period 60) and second (period 60),
    * ``unix_time`` - whole seconds since 1970-01-01 00:00:00 UTC.

    Args:
        df: DataFrame with a ``timestamp`` column that ``pd.to_datetime`` can
            parse. The frame passed in is left unmodified.

    Returns:
        A new DataFrame containing the remaining original columns followed by
        the derived features; ``timestamp`` is removed.

    Raises:
        KeyError: if ``df`` has no ``timestamp`` column.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    encoded = df.copy()
    timestamps = pd.to_datetime(encoded['timestamp'], format='mixed', utc=True)

    encoded['year'] = timestamps.dt.year

    # (output prefix, raw component, length of one full cycle)
    cyclical_parts = (
        ('month', timestamps.dt.month, 12),
        ('day', timestamps.dt.dayofweek, 7),
        ('hour', timestamps.dt.hour, 24),
        ('minute', timestamps.dt.minute, 60),
        ('second', timestamps.dt.second, 60),
    )
    for prefix, component, period in cyclical_parts:
        angle = 2 * np.pi * component / period
        encoded[f'{prefix}_sin'] = np.sin(angle)
        encoded[f'{prefix}_cos'] = np.cos(angle)

    # Floor-dividing the offset from the epoch by one second gives whole
    # seconds regardless of the resolution (ns/us/ms/s) pandas chose when
    # parsing, unlike dividing the raw integer representation by 10**9.
    epoch = pd.Timestamp('1970-01-01', tz='UTC')
    encoded['unix_time'] = (timestamps - epoch) // pd.Timedelta(seconds=1)

    # Drop the raw timestamp; the intermediate component names are listed too
    # so that same-named columns in the input are removed as before.
    return encoded.drop(
        columns=['timestamp', 'month', 'day_of_week', 'hour', 'minute', 'second'],
        errors='ignore',
    )

In [None]:
# Replace the raw timestamp of every group with the derived time features.
X_train = list(map(encode_timestamp_to_cyclical_features, X_train))
X_val = list(map(encode_timestamp_to_cyclical_features, X_val))
X_test = list(map(encode_timestamp_to_cyclical_features, X_test))

In [None]:
# Preview the newly derived time features of the first training group.
X_train[0][['month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'second_sin', 'second_cos', 'unix_time']].head()

In [None]:
# List the columns that still have object dtype, i.e. are not numerically encoded yet.
X_train[0].select_dtypes(include=['object']).columns

We're now left with `machine_shdr_execution` and `Machine_state_machine`. We will encode these columns as integer category indices so that they can be fed to embedding layers in our model. This will allow the model to learn the relationships between the different execution statuses and machine states.

In [None]:
# Vocabulary of execution statuses; the position in the list is the integer
# code used for the embedding. 'PROGRAM_STOPPED\r' (with a trailing carriage
# return) is listed separately from 'PROGRAM_STOPPED', so the two variants
# receive different codes.
unqiue_executions = [
    'ACTIVE', 'FEED_HOLD', 'INTERRUPTED', 'OPTIONAL_STOP',
    'PROGRAM_STOPPED', 'PROGRAM_STOPPED\r',
    'READY', 'STOPPED', 'UNAVAILABLE', 'WAIT',
    'PROGRAM_COMPLETED',
]
unqiue_executions_to_int = dict(zip(unqiue_executions, range(len(unqiue_executions))))
unqiue_executions_to_int

For the machines we have chosen, these are all the unique values found across all of the execution time-series data. The code used to collect them was written in a temporary test file and is not included in this notebook; it is as follows:

```python

unique_execution_values = set()
for machine in machine_external_ids:
    exe = client.time_series.data.retrieve(external_id=f'{machine}_shdr_execution', limit=None).to_pandas()
    unique_execution_values.update(exe[f'{machine}_shdr_execution'].unique())

unique_execution_values
```

In [None]:
def encode_execution_feature(df):
    """Return a copy of ``df`` with ``machine_shdr_execution`` as integer codes.

    The codes are looked up in the module-level ``unqiue_executions_to_int``
    dictionary.

    Args:
        df: DataFrame with a ``machine_shdr_execution`` column holding the
            execution status strings. The frame passed in is left unmodified.

    Returns:
        A new DataFrame in which ``machine_shdr_execution`` holds integers.

    Raises:
        ValueError: if the column contains a value (including a missing
            value) that has no entry in ``unqiue_executions_to_int``.
    """
    statuses = df['machine_shdr_execution']
    codes = statuses.map(unqiue_executions_to_int)

    # Values without a dictionary entry map to NaN; report them by name
    # instead of letting the integer cast fail with an opaque error.
    unmapped = codes.isna()
    if unmapped.any():
        unknown = statuses[unmapped].unique().tolist()
        raise ValueError(f"Unknown machine_shdr_execution value(s): {unknown!r}")

    encoded = df.copy()
    encoded['machine_shdr_execution'] = codes.astype(int)
    return encoded

X_train = [encode_execution_feature(group) for group in X_train]
X_val = [encode_execution_feature(group) for group in X_val]
X_test = [encode_execution_feature(group) for group in X_test]

In [None]:
# Spot-check: distinct execution codes present in one training group after encoding.
X_train[28]['machine_shdr_execution'].unique()

In [None]:
# Vocabulary of machine states; the position in the list is the integer code
# used for the embedding.
unique_states = [
    'INCYCLE',
    'IDLE',
    'MANUAL MODE',
    'POWER OFF',
    'CAM CYCLE',
    'MDI MODE',
    'MDI CYCLE',
    'FEEDHOLD',
    'PROGRAM STOP',
    'M0',
    'ESTOP',
    'ALARM',
    'OPTIONAL STOP'
]
unique_state_to_int = {state: idx for idx, state in enumerate(unique_states)}

def encode_state_feature(df):
    """Return a copy of ``df`` with ``Machine_state_machine`` as integer codes.

    The codes are looked up in ``unique_state_to_int``.

    Args:
        df: DataFrame with a ``Machine_state_machine`` column holding the
            machine state strings. The frame passed in is left unmodified.

    Returns:
        A new DataFrame in which ``Machine_state_machine`` holds integers.

    Raises:
        ValueError: if the column contains a value (including a missing
            value) that has no entry in ``unique_state_to_int``.
    """
    states = df['Machine_state_machine']
    codes = states.map(unique_state_to_int)

    # Values without a dictionary entry map to NaN; report them by name
    # instead of letting the integer cast fail with an opaque error.
    unmapped = codes.isna()
    if unmapped.any():
        unknown = states[unmapped].unique().tolist()
        raise ValueError(f"Unknown Machine_state_machine value(s): {unknown!r}")

    encoded = df.copy()
    encoded['Machine_state_machine'] = codes.astype(int)
    return encoded

In [None]:
# Integer-encode the machine state column of every group in each split.
X_train = list(map(encode_state_feature, X_train))
X_val = list(map(encode_state_feature, X_val))
X_test = list(map(encode_state_feature, X_test))

In [None]:
# Spot-check: distinct machine state codes present in one training group after encoding.
X_train[4]['Machine_state_machine'].unique()

In [None]:
# Re-check for remaining object-dtype columns now that both categorical columns are encoded.
X_train[0].select_dtypes(include=['object']).columns

We have now encoded all the categorical features and can proceed to impute the missing values and scale the data.

We will start by declaring two lists: one for the numerical features that we will scale, and one for the categorical features that will be fed to embedding layers in the model.

In [None]:
# Show all column names of the first training group.
X_train[0].columns

In [None]:
# Numerical features that are standardised with the StandardScaler.
# The order of this list is the column order handed to the scaler and the
# order of the leading columns in the numerical arrays built later on.
cols_to_scale = [
    'machine_shdr_Zpos', 
    'machine_shdr_Fact_Numeric',
    'machine_shdr_Frapidovr_Numeric', 
    'machine_shdr_Xpos',
    'machine_shdr_Zfrt_Numeric', 
    'machine_shdr_Cpos',
    'machine_shdr_Tool_group_Numeric', 
    'machine_shdr_Zabs',
    'machine_shdr_Cload', 
    'machine_shdr_S2rpm_Numeric',
    'accumulated_workorder_downtime_machine', 
    'accumulated_uptime_machine',
    'machine_shdr_total_time_Numeric', 
    'machine_shdr_sequenceNum_Numeric',
    'machine_shdr_Xfrt_Numeric', 
    'machine_shdr_auto_time_Numeric',
    'machine_shdr_R172_Numeric', 
    'machine_shdr_Xload_Numeric',
    'machine_shdr_Wfrt_Numeric', 
    'accumulated_workorder_uptime_machine',
    'machine_shdr_Tool_number_Numeric', 
    'accumulated_downtime_machine', 
    'machine_shdr_Zload_Numeric',
    'machine_shdr_Yfrt_Numeric', 
    'machine_shdr_Fovr_Numeric',
    'machine_shdr_Yload_Numeric', 
    'machine_shdr_S2temp_Numeric',
    'machine_shdr_Sovr_Numeric', 
    'machine_shdr_Xabs', 
    'machine_shdr_Bload', 
    'machine_shdr_Yabs',
    'machine_shdr_Ypos', 
    'machine_shdr_S2load_Numeric',
    'machine_shdr_cut_time_Numeric', 
    'machine_shdr_Wload_Numeric',
    'machine_shdr_Srpm_Numeric', 
    'machine_shdr_Bpos',
    'machine_shdr_Sload_Numeric', 
    'machine_shdr_Stemp_Numeric', 
    'unix_time'   
]

# Integer-coded categorical features; they are kept out of the scaling step
# and exported as a separate array.
embedding_columns = ['Machine_state_machine', 'machine_shdr_execution']

In [None]:
# Columns that are neither scaled nor embedded, in their original order.
other_columns = list(
    filter(
        lambda name: name not in cols_to_scale and name not in embedding_columns,
        X_train[0].columns,
    )
)
other_columns

In [None]:
# Stack all training groups row-wise into one frame; it is used to fit the scaler.
X_train_concat_before_scaling = pd.concat(X_train, axis=0)

In [None]:
# Fit the scaler on the concatenated training groups only, so that the
# validation and test data do not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train_concat_before_scaling[cols_to_scale])

def scale_group(df):
    """Standardise the ``cols_to_scale`` columns of ``df`` in place and return ``df``.

    Uses the module-level ``scaler`` that was fitted on the training data.
    """
    standardised = scaler.transform(df[cols_to_scale])
    df[cols_to_scale] = standardised
    return df

X_train = list(map(scale_group, X_train))
X_val = list(map(scale_group, X_val))
X_test = list(map(scale_group, X_test))

In [None]:
# Sanity check: after scaling, the training data should show a mean close to 0
# and a standard deviation close to 1 for every scaled column.
X_train_concat_after_scaling = pd.concat(X_train, axis=0)
X_train_concat_after_scaling[cols_to_scale].describe().loc[['mean', 'std']].round(2)

The only thing left to do now is to separate each sample into the embedding features and the rest of the features. Then we need to convert the data to NumPy arrays and save the preprocessed data to the data folder.

In [None]:
def _to_num_and_cat_arrays(groups):
    """Split every group into a numerical array and a categorical array.

    The numerical array holds the ``cols_to_scale`` columns followed by the
    ``other_columns``; the categorical array holds the ``embedding_columns``.
    """
    numeric_arrays = [frame[cols_to_scale + other_columns].values for frame in groups]
    categorical_arrays = [frame[embedding_columns].values for frame in groups]
    return numeric_arrays, categorical_arrays

X_train_num, X_train_cat = _to_num_and_cat_arrays(X_train)
X_val_num, X_val_cat = _to_num_and_cat_arrays(X_val)
X_test_num, X_test_cat = _to_num_and_cat_arrays(X_test)

In [None]:
# Look at the numerical and categorical arrays of the first training group.
X_train_num[0], X_train_cat[0]

In [None]:
# Shapes of the numerical and categorical arrays of the first training group.
X_train_num[0].shape, X_train_cat[0].shape  

In [None]:
# Confirm the container type produced by `.values` for both arrays.
type(X_train_num[0]), type(X_train_cat[0])

In [None]:
# The targets were not transformed in this notebook; inspect the first one again.
y_train[0]

In [None]:
# Check the Python type of a single target entry.
type(y_train[0])

Everything looks good, so we can now save all preprocessed arrays, together with the fitted scaler, to a .pt file.

In [None]:
data_folder = os.path.join(root, 'project/data/')

# Persist the numerical arrays, categorical arrays and targets of every split,
# together with the fitted scaler, in a single file.
torch.save(
    dict(
        X_train_num=X_train_num,
        X_train_cat=X_train_cat,
        y_train=y_train,
        X_val_num=X_val_num,
        X_val_cat=X_val_cat,
        y_val=y_val,
        X_test_num=X_test_num,
        X_test_cat=X_test_cat,
        y_test=y_test,
        scaler=scaler,
    ),
    os.path.join(data_folder, 'preprocessed_lstm_data.pt'),
)