# Predictive Maintenance - Label Bias

## Setup

In [761]:
### IMPORT LIBRARIES
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, normalize
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import SGDClassifier
from sklearn import tree

### READ DATA
##### Needs to be ran from the project directory
train_measurements = pd.read_csv('../datasets/measurements.csv')
train_measurements = train_measurements.sort_values(by=['measurement_time'], ascending=[True])

train_failures = pd.read_csv('../datasets/failures.csv')
train_failures = train_failures.sort_values(by=['failure_time'], ascending=[True])

## Data Preprocessing

In [762]:
train_measurements.measurement_time = pd.to_datetime(train_measurements.measurement_time, format="%Y-%m-%d %H:%M:%S")
train_failures.failure_time = pd.to_datetime(train_failures.failure_time)

### MERGE NEXT FAILURE TO MEASUREMENTS
train_combined = pd.merge_asof(
    train_measurements,
    train_failures,
    left_on='measurement_time',
    right_on='failure_time',
    by='gadget_id',
    direction='forward',
)

### TRANSFORM COLUMNS
train_combined['time_to_fail'] = train_combined['failure_time']-train_combined['measurement_time']
train_combined['fail_in_1h'] = np.where(train_combined['time_to_fail']<pd.Timedelta(hours=1), 1, 0)

### CALCULATE RUNNING MEASURES
train_combined = train_combined.reset_index(drop=True)
train_combined = train_combined.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

train_combined['temperature_6h_std'] = train_combined.groupby('gadget_id')['temperature'].rolling(6).std(ddof=0).reset_index(drop=True)
train_combined['pressure_6h_mean'] = train_combined.groupby('gadget_id')['pressure'].rolling(6).mean().reset_index(drop=True)

train_combined.to_csv('../datasets/train_combined.csv', index=False)

In [763]:
# specify labels
X = ['vibration_y', 'pressure_6h_mean', 'temperature_6h_std']
y = 'fail_in_1h'
cols = X + [y]

df_to_split = train_combined.copy()
df_to_split = df_to_split.dropna(subset=cols)
df_to_split = df_to_split.reset_index(drop=True)

##### Create binary bins to 
binner = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='kmeans')
binner.fit(df_to_split[X])
arr_bins= binner.transform(df_to_split[X])
df_bins = pd.DataFrame(arr_bins)

X = list(df_bins.columns)
cols = X + [y]

df_to_split = pd.concat([df_to_split, df_bins], axis=1)

### Split balanced dataset randomly into Train and Validation 

In [764]:
# create test set (environmental dataset)
df_test = df_to_split[df_to_split['gadget_id'].isin([5,6])].reset_index(drop=True)

# create training, validation set
df_to_split = df_to_split[df_to_split['gadget_id'].isin([1,2,3,4])].reset_index(drop=True)

print(df_to_split.shape)
print(df_test.shape)




(651, 43)
(327, 43)


In [768]:
# Use 70% of remaining data set for training
df_train = df_to_split.sample(frac = 0.7)

# Remaining 30% used for validation test
df_validation = df_to_split.drop(df_train.index)


df_train = df_train.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
df_validation = df_validation.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])


df_train = df_train.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)

print("Balanced")
print("--------------")
print("Training dataset samples: ", df_train.shape)
print("Validation dataset samples: ",df_validation.shape)

Training dataset samples:  (456, 43)
Validation dataset samples:  (195, 43)


### Split unbalanced dataset randomly into Train and Validation

In [765]:
# create csv to manually unbalance data
# df_random = df_to_split.sample(frac=1).reset_index(drop=True)
# df_random.to_csv('../datasets/train_combined_unbalanced.csv', index=False)

df_unbalanced_to_split = pd.read_csv('../datasets/train_combined_unbalanced.csv')
df_unbalanced_to_split = df_unbalanced_to_split.sort_values(by=['measurement_time'], ascending=[True])

df_unbalanced_to_split.measurement_time = pd.to_datetime(df_unbalanced_to_split.measurement_time, format="%Y-%m-%d %H:%M:%S")
df_unbalanced_to_split.failure_time = pd.to_datetime(df_unbalanced_to_split.failure_time)

df_unbalanced_to_split = df_unbalanced_to_split.reset_index(drop=True)
df_unbalanced_to_split = df_unbalanced_to_split.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

df_unbalanced_to_split.shape

(651, 43)

In [771]:
# Use 70% of remaining data set for training
df_train_unbalanced = df_unbalanced_to_split.sample(frac = 0.7)

# Remaining 30% used for validation test
df_validation_unbalanced = df_unbalanced_to_split.drop(df_train.index)


df_train_unbalanced = df_train_unbalanced.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
df_validation_unbalanced = df_validation_unbalanced.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])


df_train_unbalanced = df_train_unbalanced.reset_index(drop=True)
df_validation_unbalanced = df_validation_unbalanced.reset_index(drop=True)

print("Unbalanced")
print("--------------")
print("Training dataset samples: ", df_train_unbalanced.shape)
print("Validation dataset samples: ",df_validation_unbalanced.shape)

Unbalanced
--------------
Training dataset samples:  (456, 43)
Validation dataset samples:  (195, 43)


# Training, Validation and Test - SVM

In [767]:
### PREDICTION PARAMETERS
w0 = 1
w1 = 8
pos_label = 1

### SVM
svm = SVC(
    class_weight={0:w0, 1:w1},
    C=1,
    random_state=42,
    kernel='linear'
)
# fit model
svm.fit(df_train[X], df_train[y])

# make prediction on validation set
val_pred = svm.predict(df_val_test[X])

accuracy = accuracy_score(df_val_test['fail_in_1h'], val_pred )
print("validation accuracy", round(accuracy,3))

# make prediction on test set
test_pred = svm.predict(df_test[X])

accuracy2 = accuracy_score(df_test['fail_in_1h'], test_pred )
print("test accuracy", round(accuracy2, 3))

validation accuracy 0.723
test accuracy 0.783
