## Setup

In [1]:
### IMPORT LIBRARIES
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, normalize
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import SGDClassifier
from sklearn import tree

### READ DATA
##### Needs to be ran from the project directory
train_measurements = pd.read_csv('../../datasets/IoT/measurements.csv')
train_measurements = train_measurements.sort_values(by=['measurement_time'], ascending=[True])

train_failures = pd.read_csv('../../datasets/IoT/failures.csv')
train_failures = train_failures.sort_values(by=['failure_time'], ascending=[True])

## Data Preprocessing

In [2]:
train_measurements.measurement_time = pd.to_datetime(train_measurements.measurement_time, format="%Y-%m-%d %H:%M:%S")
train_failures.failure_time = pd.to_datetime(train_failures.failure_time)

### MERGE NEXT FAILURE TO MEASUREMENTS
train_combined = pd.merge_asof(
    train_measurements,
    train_failures,
    left_on='measurement_time',
    right_on='failure_time',
    by='gadget_id',
    direction='forward',
)

### TRANSFORM COLUMNS
train_combined['time_to_fail'] = train_combined['failure_time']-train_combined['measurement_time']
train_combined['fail_in_1h'] = np.where(train_combined['time_to_fail']<pd.Timedelta(hours=1), 1, 0)

### CALCULATE RUNNING MEASURES
train_combined = train_combined.reset_index(drop=True)
train_combined = train_combined.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

train_combined['temperature_6h_std'] = train_combined.groupby('gadget_id')['temperature'].rolling(6).std(ddof=0).reset_index(drop=True)
train_combined['pressure_6h_mean'] = train_combined.groupby('gadget_id')['pressure'].rolling(6).mean().reset_index(drop=True)

train_combined.to_csv('../../datasets/IoT/train_combined.csv', index=False)

In [3]:
# specify labels
X = ['vibration_y', 'pressure_6h_mean', 'temperature_6h_std']
y = 'fail_in_1h'
cols = X + [y]

df_to_split = train_combined.copy()
df_to_split = df_to_split.dropna(subset=cols)
df_to_split = df_to_split.reset_index(drop=True)

##### Create binary bins to 
binner = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='kmeans')
binner.fit(df_to_split[X])
arr_bins= binner.transform(df_to_split[X])
df_bins = pd.DataFrame(arr_bins)

X = list(df_bins.columns)
cols = X + [y]

df_to_split = pd.concat([df_to_split, df_bins], axis=1)
df_to_split.shape

(978, 43)

## Create differing environmental data

In [4]:
# Shuffle complete dataset
df_to_split = df_to_split.sample(frac=1).reset_index(drop=True).copy()

# Use 20% of complete dataset as a common test dataset for both biased and unbiased
df_test1 = df_to_split.sample(frac = 0.2).copy()
df_train_val = df_to_split.drop(df_test1.index).copy()

df_test2 = df_test1.copy()
for i in df_test2.index:
    df_test2.at[i, 'pressure_6h_mean'] += 100


## Datashift Dataset

In [5]:
# train consists of gadgets 1 -4
df_train_bias = df_to_split[df_to_split['gadget_id'].isin([1,2,3,4])].reset_index(drop=True).copy()

# test consists of gfadgets 5 and 6
df_validation_bias = df_to_split[df_to_split['gadget_id'].isin([5,6])].reset_index(drop=True).copy()

for i in df_train_bias.index:
    df_train_bias.at[i, 'temperature_6h_std'] = 1

# # Validation consists of 70% of df_train dataset
# # First shuffle df_train
# df_train_val_bias = df_train_val_bias.sample(frac=1).reset_index(drop=True)

# # Use 70% of remaining data set for training
# df_train_bias = df_train_val_bias.sample(frac = 0.7)

# # Remaining 30% used for validation test
# df_validation_bias = df_train_val_bias.drop(df_train_bias.index)

# Reorder datasets by measurement time and gadget id
df_train_bias = df_train_bias.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
df_validation_bias = df_validation_bias.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
# df_test2 = df_test2.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

df_train_bias = df_train_bias.reset_index(drop=True)
df_validation_bias = df_validation_bias.reset_index(drop=True)
# df_test2 = df_test2.reset_index(drop=True)

print("Datashift Dataset")
print("--------------")
print("Training dataset samples: ", df_train_bias.shape)
print("Validation dataset samples: ",df_validation_bias.shape)
print("Test dataset samples: ",df_test2.shape)

Datashift Dataset
--------------
Training dataset samples:  (651, 43)
Validation dataset samples:  (327, 43)
Test dataset samples:  (196, 43)


## Balanced Dataset

In [6]:
# Shuffle complete dataset
# df_train_val_test_balanced = df_to_split.sample(frac=1).reset_index(drop=True)

df_train_balanced = df_train_val.sample(frac = 0.8)
df_validation_balanced = df_train_val.drop(df_train_balanced.index)



# # 60-20-20 split for train-validation-test repectively
# df_train_balanced = df_train_val_test_balanced.sample(frac = 0.7)
# df_val_test_balanced = df_train_val_test_balanced.drop(df_train_balanced.index)

# # half the remaining remaining 40% of dataset
# df_validation_balanced = df_val_test_balanced.sample(frac = 0.7)
# df_test_balanced = df_val_test_balanced.drop(df_validation_balanced.index)

# Reorder datasets by measurement time and gadget id
df_train_balanced = df_train_balanced.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
df_validation_balanced = df_validation_balanced.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])
df_test1 = df_test1.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

df_train_balanced = df_train_balanced.reset_index(drop=True)
df_validation_balanced = df_validation_balanced.reset_index(drop=True)
df_test1 = df_test1.reset_index(drop=True)

print("Balanced Dataset")
print("--------------")
print("Training dataset samples: ", df_train_balanced.shape)
print("Validation dataset samples: ",df_validation_balanced.shape)
print("Test dataset samples: ",df_test1.shape)

Balanced Dataset
--------------
Training dataset samples:  (626, 43)
Validation dataset samples:  (156, 43)
Test dataset samples:  (196, 43)


# Training, Validation and Test - SVM

In [7]:
### PREDICTION PARAMETERS
w0 = 1
w1 = 8
pos_label = 1

### SVM
svm = SVC(
    class_weight={0:w0, 1:w1},
    C=1,
    random_state=42,
    kernel='linear'
)

### RANDOM FOREST MODEL
random_forest = RandomForestClassifier(
    min_samples_leaf=7,
    random_state=45,
    n_estimators=50,
    class_weight={0:w0, 1:w1}
)

## Balanced

In [8]:
# fit model
random_forest.fit(df_train_balanced[X], df_train_balanced[y])

# make prediction on validation set
val_pred_balanced = random_forest.predict(df_validation_balanced[X])

accuracy_val_balanced = accuracy_score(df_validation_balanced['fail_in_1h'], val_pred_balanced )
precision_val_balanced = precision_score(df_validation_balanced['fail_in_1h'], val_pred_balanced, zero_division=0, pos_label=pos_label)
recall_val_balanced = recall_score(df_validation_balanced['fail_in_1h'], val_pred_balanced, pos_label=pos_label)

print("validation accuracy", round(accuracy_val_balanced,3))
print("validation precision", round(precision_val_balanced, 3))
print("validation recall", round(recall_val_balanced, 3))

# # make prediction on test set
test_pred_balanced = random_forest.predict(df_test1[X])

accuracy_test_balanced = accuracy_score(df_test1['fail_in_1h'], test_pred_balanced )
precision_test_balanced = precision_score(df_test1['fail_in_1h'], test_pred_balanced, zero_division=0, pos_label=pos_label)
recall_test_balanced = recall_score(df_test1['fail_in_1h'], test_pred_balanced, pos_label=pos_label)

print("test accuracy", round(accuracy_test_balanced, 3))
print("test precision", round(precision_test_balanced, 3))
print("test recall", round(recall_test_balanced, 3))

validation accuracy 0.769
validation precision 0.357
validation recall 1.0
test accuracy 0.74
test precision 0.254
test recall 0.944


## Biased

In [9]:
# fit model
random_forest.fit(df_train_bias[X], df_train_bias[y])

# make prediction on validation set
val_pred_biased = random_forest.predict(df_validation_bias[X])

accuracy_val_biased = accuracy_score(df_validation_bias['fail_in_1h'], val_pred_biased )
precision_val_biased = precision_score(df_validation_bias['fail_in_1h'], val_pred_biased, zero_division=0, pos_label=pos_label)
recall_val_biased = recall_score(df_validation_bias['fail_in_1h'], val_pred_biased, pos_label=pos_label)

print("validation accuracy", round(accuracy_val_biased,3))
print("validation precision", round(precision_val_biased, 3))
print("validation recall", round(recall_val_biased, 3))

# make prediction on test set
test_pred_biased = random_forest.predict(df_test2[X])

accuracy_test_biased = accuracy_score(df_test2['fail_in_1h'], test_pred_biased )
precision_test_biased = precision_score(df_test2['fail_in_1h'], test_pred_biased, zero_division=0, pos_label=pos_label)
recall_test_biased = recall_score(df_test2['fail_in_1h'], test_pred_biased, pos_label=pos_label)

print("test accuracy", round(accuracy_test_biased, 3))
print("test precision", round(precision_test_biased, 3))
print("test recall", round(recall_test_biased, 3))

validation accuracy 0.783
validation precision 0.311
validation recall 1.0
test accuracy 0.74
test precision 0.261
test recall 1.0
