# Predictive Maintenance - Label Bias

## Setup

In [25]:
### IMPORT LIBRARIES
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, normalize
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import SGDClassifier
from sklearn import tree

### READ DATA
##### Needs to be ran from the project directory
train_measurements = pd.read_csv('../../datasets/IoT/measurements.csv')
train_measurements = train_measurements.sort_values(by=['measurement_time'], ascending=[True])

train_failures = pd.read_csv('../../datasets/IoT/failures.csv')
train_failures = train_failures.sort_values(by=['failure_time'], ascending=[True])

## Data Preprocessing

In [26]:
train_measurements.measurement_time = pd.to_datetime(train_measurements.measurement_time, format="%Y-%m-%d %H:%M:%S")
train_failures.failure_time = pd.to_datetime(train_failures.failure_time)

### MERGE NEXT FAILURE TO MEASUREMENTS
train_combined = pd.merge_asof(
    train_measurements,
    train_failures,
    left_on='measurement_time',
    right_on='failure_time',
    by='gadget_id',
    direction='forward',
)

### TRANSFORM COLUMNS
train_combined['time_to_fail'] = train_combined['failure_time']-train_combined['measurement_time']
train_combined['fail_in_1h'] = np.where(train_combined['time_to_fail']<pd.Timedelta(hours=1), 1, 0)

### CALCULATE RUNNING MEASURES
train_combined = train_combined.reset_index(drop=True)
train_combined = train_combined.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

train_combined['temperature_6h_std'] = train_combined.groupby('gadget_id')['temperature'].rolling(6).std(ddof=0).reset_index(drop=True)
train_combined['pressure_6h_mean'] = train_combined.groupby('gadget_id')['pressure'].rolling(6).mean().reset_index(drop=True)

train_combined.to_csv('../../datasets/IoT/train_combined.csv', index=False)

In [27]:
# specify labels
X = ['vibration_y', 'pressure_6h_mean', 'temperature_6h_std']
y = 'fail_in_1h'
cols = X + [y]

df_to_split = train_combined.copy()
df_to_split = df_to_split.dropna(subset=cols)
df_to_split = df_to_split.reset_index(drop=True)

##### Create binary bins to 
binner = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='kmeans')
binner.fit(df_to_split[X])
arr_bins= binner.transform(df_to_split[X])
df_bins = pd.DataFrame(arr_bins)

X = list(df_bins.columns)
cols = X + [y]

df_to_split = pd.concat([df_to_split, df_bins], axis=1)
df_to_split['temperature_6h_std']

0      3.453742
1      3.803752
2      3.679122
3      9.337838
4      3.253625
         ...   
973    2.523321
974    4.888758
975    3.862833
976    5.541126
977    4.835146
Name: temperature_6h_std, Length: 978, dtype: float64

### Split balanced dataset randomly into Train and Validation 

In [28]:
# create test set (environmental dataset)
df_test = df_to_split[df_to_split['gadget_id'].isin([5,6])].reset_index(drop=True)

# create training, validation set
df_to_split = df_to_split[df_to_split['gadget_id'].isin([1,2,3,4])].reset_index(drop=True)

print(df_to_split.shape)
print(df_test.shape)

(651, 43)
(327, 43)


In [29]:
# Use 70% of remaining data set for training
df_train = df_to_split.sample(frac = 0.7).copy()

# Remaining 30% used for validation test
df_validation = df_to_split.drop(df_train.index).copy()


df_train = df_train.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
df_validation = df_validation.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])


df_train = df_train.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)

print("Balanced")
print("--------------")
print("Training dataset samples: ", df_train.shape)
print("Validation dataset samples: ",df_validation.shape)

Balanced
--------------
Training dataset samples:  (456, 43)
Validation dataset samples:  (195, 43)


### Split unbalanced dataset randomly into Train and Validation

In [30]:
# create csv to manually unbalance data
df_random = df_to_split.sample(frac=1).reset_index(drop=True).copy()

length=32
start=32
for i in range(length):
    df_random.at[i, 'fail_in_1h']=110

for i in range(start, start+length):
    df_random.at[i, 'temperature_6h_std']=110


df_unbalanced_to_split = df_random.sort_values(by=['measurement_time'], ascending=[True])

# df_unbalanced_to_split.measurement_time = pd.to_datetime(df_unbalanced_to_split.measurement_time, format="%Y-%m-%d %H:%M:%S")
# df_unbalanced_to_split.failure_time = pd.to_datetime(df_unbalanced_to_split.failure_time)

df_unbalanced_to_split = df_unbalanced_to_split.reset_index(drop=True)
df_unbalanced_to_split = df_unbalanced_to_split.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

df_unbalanced_to_split.to_csv('../datasets/train_combined_unbalanced.csv', index=False)


In [31]:
# Use 70% of remaining data set for training
df_train_unbalanced = df_unbalanced_to_split.sample(frac = 0.7)

# Remaining 30% used for validation test
df_validation_unbalanced = df_unbalanced_to_split.drop(df_train.index)


df_train_unbalanced = df_train_unbalanced.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
df_validation_unbalanced = df_validation_unbalanced.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])


df_train_unbalanced = df_train_unbalanced.reset_index(drop=True)
df_validation_unbalanced = df_validation_unbalanced.reset_index(drop=True)

print("Unbalanced")
print("--------------")
print("Training dataset samples: ", df_train_unbalanced.shape)
print("Validation dataset samples: ",df_validation_unbalanced.shape)

Unbalanced
--------------
Training dataset samples:  (456, 43)
Validation dataset samples:  (195, 43)


# Training, Validation and Test - SVM

## Balanced

In [32]:
### PREDICTION PARAMETERS
w0 = 1
w1 = 8
pos_label = 1

### SVM
svm = SVC(
    class_weight={0:w0, 1:w1},
    C=1,
    random_state=42,
    kernel='linear'
)

# fit model
svm.fit(df_train[X], df_train[y])

# make prediction on validation set
val_pred = svm.predict(df_validation[X])

accuracy_val = accuracy_score(df_validation['fail_in_1h'], val_pred )
print("validation accuracy", round(accuracy_val,3))

# make prediction on test set
test_pred = svm.predict(df_test[X])

accuracy_test = accuracy_score(df_test['fail_in_1h'], test_pred )
print("test accuracy", round(accuracy_test, 3))

validation accuracy 0.713
test accuracy 0.786


## Unbalanced

In [33]:
# fit model
svm.fit(df_train_unbalanced[X], df_train_unbalanced[y])

# make prediction on validation set
val_pred_unbalanced = svm.predict(df_validation_unbalanced[X])

accuracy_val_unbalanced = accuracy_score(df_validation_unbalanced['fail_in_1h'],val_pred_unbalanced )
print("validation accuracy", round(accuracy_val_unbalanced,3))

# make prediction on test set
test_pred_unbalanced = svm.predict(df_test[X])

accuracy_test_unbalanced = accuracy_score(df_test['fail_in_1h'], test_pred_unbalanced)
print("test accuracy", round(accuracy_test_unbalanced, 3))

validation accuracy 0.744
test accuracy 0.792


# Tests


In [34]:
data = {'First':  [1,2,3,4,5,6],
        'Second': [9,8,7,6,5,4],
        'Third': [1,2,1,2,2,0]}

df = pd.DataFrame (data, columns = ['First','Second','Third'])

print (df)

   First  Second  Third
0      1       9      1
1      2       8      2
2      3       7      1
3      4       6      2
4      5       5      2
5      6       4      0


In [35]:
for i in df.index:
    if df.at[i, 'Third'] == 0 or df.at[i, 'Third'] == 2:
        df.at[i, 'First']=99
print(df)

   First  Second  Third
0      1       9      1
1     99       8      2
2      3       7      1
3     99       6      2
4     99       5      2
5     99       4      0


In [36]:
# length = 3
# for i in range(length):
#     df.at[i, 'Second']=7

# print(df)