# 05 - Linear Pre-Processing

In this notebook we continue pre-processing our split data, preparing it for two models: a logistic regression model that will act as our benchmark, and an XGBoost model.

**Note**: Scaling is not performed here, as it is done in the Python files for both models, using `sklearn.preprocessing.StandardScaler` inside the `sklearn.pipeline.Pipeline` object. This is done to avoid data leakage: the scaler is fitted on the training data only and then used to transform the training, validation, and test data.

In [None]:
import sys
import os

# Put the directory two levels above the working directory on sys.path
# so that the project's modules can be imported from this notebook.
root = os.path.abspath('../..')
if root not in sys.path:
    sys.path.append(root)

In [None]:
import torch
import numpy as np
import pandas as pd

In [None]:
# Load the pre-split dataset.
# The file holds arbitrary pickled Python objects, not just tensors (the cells
# below index into the splits and call `.head()` on their elements), so the full
# unpickler is required. PyTorch >= 2.6 defaults to `weights_only=True`, which
# refuses such objects, hence the explicit `weights_only=False`.
# NOTE: full unpickling can execute arbitrary code - only load files you trust.
data_dict = torch.load(
    os.path.join(root, 'project/data/splitted_data.pt'),
    weights_only=False,
)
data_dict.keys()

In [None]:
# Pull the feature windows and the labels of each split out of the loaded dictionary.
X_train, X_val, X_test = (data_dict[key] for key in ('X_train', 'X_val', 'X_test'))

y_train, y_val, y_test = (data_dict[key] for key in ('y_train', 'y_val', 'y_test'))

In [None]:
# Peek at the first rows of the first training window
X_train[0].head()

In [None]:
# Label belonging to the first training window
y_train[0]

In [None]:
# Known categories for each categorical column, keyed by column name.
# The order of each list is significant: it fixes the order of the per-category
# features produced by extract_categorical_features / compress_window.
vocab = {
    'machine_shdr_execution': [
        'ACTIVE',
        'FEED_HOLD',
        'INTERRUPTED',
        'OPTIONAL_STOP',
        'PROGRAM_STOPPED',
        # NOTE(review): kept as a category distinct from 'PROGRAM_STOPPED';
        # presumably some raw values carry a trailing carriage return - confirm.
        'PROGRAM_STOPPED\r',
        'READY',
        'STOPPED',
        'UNAVAILABLE',
        'WAIT',
        'PROGRAM_COMPLETED',
    ],
    'Machine_state_machine': [
        'INCYCLE',
        'IDLE',
        'MANUAL MODE',
        'POWER OFF',
        'CAM CYCLE',
        'MDI MODE',
        'MDI CYCLE',
        'FEEDHOLD',
        'PROGRAM STOP',
        'M0',
        'ESTOP',
        'ALARM',
        'OPTIONAL STOP'
    ]
}

In [None]:
def extract_categorical_features(sensor_data, column_name, categories=None):
    """Return the relative frequency of every known category in a window.

    Args:
        sensor_data: sized iterable with the categorical values of one column
            for a single window.
        column_name: name of the column; used to look up the category list in
            the module-level ``vocab`` and in error messages.
        categories: optional explicit list of categories. When omitted,
            ``vocab[column_name]`` is used.

    Returns:
        A list of floats, one per category and in category order, holding the
        fraction of entries in ``sensor_data`` equal to that category. An
        empty window yields all zeros instead of dividing by zero.

    Raises:
        KeyError: if ``column_name`` is not in ``vocab`` (and no explicit
            ``categories`` is given), or if ``sensor_data`` contains a value
            that is not one of the categories.
    """
    if categories is None:
        categories = vocab[column_name]

    total = len(sensor_data)
    if total == 0:
        # Nothing to count: avoid 0 / 0 and report "no category observed".
        return [0.0] * len(categories)

    counts = dict.fromkeys(categories, 0)
    for val in sensor_data:
        if val not in counts:
            # Same exception type as a plain dict lookup, but say which column is affected.
            raise KeyError(
                f'Unknown category {val!r} in column {column_name!r}'
            )
        counts[val] += 1

    return [counts[cat] / total for cat in categories]

In [None]:
def compress_window(sub_df):
    """Collapse one window of sensor readings into a flat feature vector.

    Columns are processed by dtype, in this order:

    * ``float64`` columns contribute their mean, standard deviation
      (``np.std`` defaults), minimum and maximum over the window.
    * ``int64`` columns contribute only the value found in the first row.
    * ``object`` columns: ``timestamp`` contributes the number of seconds
      between the first and the last row (named ``downtime_duration``);
      every other column contributes one relative frequency per category
      listed in ``vocab`` for that column.

    Args:
        sub_df: pandas DataFrame holding the rows of a single window.

    Returns:
        A ``(features, names)`` pair: a NumPy array with the feature values
        and a list with the matching feature names, in the same order.
    """
    values, labels = [], []

    # float64 columns: four summary statistics each
    summaries = (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max))
    for col in sub_df.select_dtypes(include=['float64']).columns:
        readings = sub_df[col].values
        for suffix, reducer in summaries:
            values.append(reducer(readings))
            labels.append(f'{col}_{suffix}')

    # int64 columns: value of the first row only
    for col in sub_df.select_dtypes(include=['int64']).columns:
        values.append(sub_df[col].values[0])
        labels.append(f'{col}_first')

    # object columns: the timestamp becomes a duration, the rest category frequencies
    for col in sub_df.select_dtypes(include=['object']).columns:
        readings = sub_df[col].values
        if col != 'timestamp':
            values.extend(extract_categorical_features(readings, col))
            labels.extend(f'{col}_{cat}' for cat in vocab[col])
            continue
        elapsed = pd.to_datetime(readings[-1]) - pd.to_datetime(readings[0])
        values.append(elapsed.total_seconds())
        labels.append('downtime_duration')

    return np.array(values), labels

In [None]:
# Run compress_window once on the first training window to capture the feature names;
# every window is compressed in the next cell.
sample_features, feature_names = compress_window(X_train[0]) # initialize the feature names

In [None]:
# Compress every window of each split into its feature vector.
# compress_window returns (features, names); only the features ([0]) are kept here.
# Building the tuples directly, instead of unpacking `zip(*...)`, keeps an empty
# split from raising ValueError and avoids assigning to `_`, which IPython uses
# for the last cell output.
X_train_features = tuple(compress_window(sub_df)[0] for sub_df in X_train)
X_val_features = tuple(compress_window(sub_df)[0] for sub_df in X_val)
X_test_features = tuple(compress_window(sub_df)[0] for sub_df in X_test)

In [None]:
# Display the generated feature names
feature_names

In [None]:
# Sanity check: how strongly does the window duration alone correlate with the label?
# X_train_features is still a tuple of per-window arrays here, so stack it into one array first.
X_train_features_array = np.array(X_train_features)

# Column of the feature matrix that holds 'downtime_duration'
downtime_index = feature_names.index('downtime_duration')
downtime_duration_feature = X_train_features_array[:, downtime_index]

# np.corrcoef returns the correlation matrix of its two inputs; [0, 1] is the feature/label entry
correlation = np.corrcoef(downtime_duration_feature, y_train)[0, 1]
print(f'Correlation between downtime_duration and target label: {correlation:.2f}')

In [None]:
# Turn each split's sequence of per-window feature vectors into a single NumPy array.
X_train_features, X_val_features, X_test_features = (
    np.array(split_features)
    for split_features in (X_train_features, X_val_features, X_test_features)
)

In [None]:
# Shapes of the train and test feature arrays
X_train_features.shape, X_test_features.shape

In [None]:
# Feature vector of the first training window
X_train_features[0]

In [None]:
# Shapes of the feature arrays of all three splits
X_train_features.shape, X_val_features.shape, X_test_features.shape

In [None]:
# Shapes of the labels of all three splits
y_train.shape, y_val.shape, y_test.shape

### Saving the data

In [None]:
# Bundle the engineered features, the labels and the feature names into one file.
# No scaler is stored: scaling happens later, inside each model's pipeline.
output_path = os.path.join(root, 'project/data/logistic_xgboost_data.pt')
dataset = {
    'X_train': X_train_features,
    'y_train': y_train,
    'X_val': X_val_features,
    'y_val': y_val,
    'X_test': X_test_features,
    'y_test': y_test,
    'feature_names': feature_names,
}
torch.save(dataset, output_path)