In [1]:
### IMPORT LIBRARIES
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, normalize
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import SGDClassifier
from sklearn import tree

### READ DATA
##### Needs to be ran from the project directory
train_measurements = pd.read_csv('../../datasets/IoT/measurements.csv')
train_measurements = train_measurements.sort_values(by=['measurement_time'], ascending=[True])

train_failures = pd.read_csv('../../datasets/IoT/failures.csv')
train_failures = train_failures.sort_values(by=['failure_time'], ascending=[True])

In [2]:
train_measurements.measurement_time = pd.to_datetime(train_measurements.measurement_time, format="%Y-%m-%d %H:%M:%S")
train_failures.failure_time = pd.to_datetime(train_failures.failure_time)

### MERGE NEXT FAILURE TO MEASUREMENTS
train_combined = pd.merge_asof(
    train_measurements,
    train_failures,
    left_on='measurement_time',
    right_on='failure_time',
    by='gadget_id',
    direction='forward',
)

### TRANSFORM COLUMNS
train_combined['time_to_fail'] = train_combined['failure_time']-train_combined['measurement_time']
train_combined['fail_in_1h'] = np.where(train_combined['time_to_fail']<pd.Timedelta(hours=1), 1, 0)

### CALCULATE RUNNING MEASURES
train_combined = train_combined.reset_index(drop=True)
train_combined = train_combined.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

train_combined['temperature_6h_std'] = train_combined.groupby('gadget_id')['temperature'].rolling(6).std(ddof=0).reset_index(drop=True)
train_combined['pressure_6h_mean'] = train_combined.groupby('gadget_id')['pressure'].rolling(6).mean().reset_index(drop=True)

train_combined.to_csv('../../datasets/IoT/train_combined.csv', index=False)

In [3]:
### SPLIT TO TRAIN AND TEST
X = ['vibration_y', 'pressure_6h_mean', 'temperature_6h_std']
y = 'fail_in_1h'
cols = X + [y]

df_to_split = train_combined.copy()
df_to_split = df_to_split.dropna(subset=cols)
df_to_split = df_to_split.reset_index(drop=True)
df_to_split.shape

(978, 13)

In [4]:
##### Create binary bins to 
binner = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='kmeans')
binner.fit(df_to_split[X])
arr_bins= binner.transform(df_to_split[X])
df_bins = pd.DataFrame(arr_bins)

X = list(df_bins.columns)
cols = X + [y]

df_to_split = pd.concat([df_to_split, df_bins], axis=1)

df_train = df_to_split.sample(frac=0.7)
df_test = df_to_split.drop(df_train.index)
df_test1 = df_test[df_test['gadget_id'].isin([1,2,3])].reset_index(drop=True).copy()
df_test2 = df_test[df_test['gadget_id'].isin([4,5,6])].reset_index(drop=True).copy()

print(f"Training data: {df_train.shape}")
print(f"Test data: {df_test.shape}")
print(f"Test1 data: {df_test.shape}")

Training data: (685, 43)
Test data: (293, 43)
Test1 data: (293, 43)


In [5]:
df_1 = df_train[df_train['gadget_id'].isin([1,2,3])].reset_index(drop=True).copy()
df_2 = df_train[df_train['gadget_id'].isin([4,5,6])].reset_index(drop=True).copy()

print(f"Training data: {df_train.shape}")
print(f"DF1 data: {df_1.shape}")
print(f"DF2 data: {df_2.shape}")
print(f"DF1+2 data: {df_2.shape[0]+df_1.shape[0]}")

Training data: (685, 43)
DF1 data: (337, 43)
DF2 data: (348, 43)
DF1+2 data: 685


In [6]:
### PREDICTION PARAMETERS
w0 = 1
w1 = 8
pos_label = 1

### LOGISTIC REGRESSION MODEL
log_regr = LogisticRegression(class_weight={0:w0, 1:w1})

### SVM
svm = SVC(
    class_weight={0:w0, 1:w1},
    C=1,
    random_state=42,
    kernel='linear'
)

In [7]:
svm.fit(df_train[X], df_train[y])
test1 = svm.predict(df_test[X])
accuracy1 = accuracy_score(df_test['fail_in_1h'], test1 )
cls1 = classification_report(df_test['fail_in_1h'], test1)

print("accuracy", round(accuracy1,3))
print("classification report: \n", cls1)

accuracy 0.747
classification report: 
               precision    recall  f1-score   support

           0       0.97      0.73      0.83       254
           1       0.33      0.85      0.47        39

    accuracy                           0.75       293
   macro avg       0.65      0.79      0.65       293
weighted avg       0.88      0.75      0.79       293



In [8]:
svm.fit(df_1[X], df_1[y])
test2 = svm.predict(df_test[X])
accuracy2 = accuracy_score(df_test['fail_in_1h'], test2 )
cls2 = classification_report(df_test['fail_in_1h'], test2)
print("Full Test Set: \n")
print("accuracy2", round(accuracy2,3))
print("classification report: \n", cls2)

test2 = svm.predict(df_test1[X])
accuracy2 = accuracy_score(df_test1['fail_in_1h'], test2 )
cls2 = classification_report(df_test1['fail_in_1h'], test2)
print("Same Samples from Test Set: \n")
print("accuracy2", round(accuracy2,3))
print("classification report: \n", cls2)

test2 = svm.predict(df_2[X])
accuracy2 = accuracy_score(df_2['fail_in_1h'], test2 )
cls2 = classification_report(df_2['fail_in_1h'], test2)
print("Other training Set (DF2): \n")
print("accuracy2", round(accuracy2,3))
print("classification report: \n", cls2)

Full Test Set: 

accuracy2 0.761
classification report: 
               precision    recall  f1-score   support

           0       0.96      0.76      0.85       254
           1       0.33      0.77      0.46        39

    accuracy                           0.76       293
   macro avg       0.64      0.76      0.65       293
weighted avg       0.87      0.76      0.80       293

Same Samples from Test Set: 

accuracy2 0.795
classification report: 
               precision    recall  f1-score   support

           0       0.95      0.80      0.87       131
           1       0.37      0.75      0.49        20

    accuracy                           0.79       151
   macro avg       0.66      0.78      0.68       151
weighted avg       0.88      0.79      0.82       151

Other training Set (DF2): 

accuracy2 0.73
classification report: 
               precision    recall  f1-score   support

           0       0.98      0.72      0.83       316
           1       0.23      0.84      0

In [9]:
svm.fit(df_2[X], df_2[y])
test3 = svm.predict(df_test[X])
accuracy3 = accuracy_score(df_test['fail_in_1h'], test3 )
cls3 = classification_report(df_test['fail_in_1h'], test3)
print("Full Test Set: \n")
print("accuracy3", round(accuracy3,3))
print("classification report: \n", cls3)

test3 = svm.predict(df_1[X])
accuracy3 = accuracy_score(df_1['fail_in_1h'], test3 )
cls3 = classification_report(df_1['fail_in_1h'], test3)
print("Same Samples from Test Set: \n")
print("accuracy3", round(accuracy3,3))
print("classification report: \n", cls3)

test3 = svm.predict(df_test2[X])
accuracy3 = accuracy_score(df_test2['fail_in_1h'], test3 )
cls3 = classification_report(df_test2['fail_in_1h'], test3)
print("Different Samples from Test Set: \n")
print("accuracy3", round(accuracy3,3))
print("classification report: \n", cls3)

Full Test Set: 

accuracy3 0.782
classification report: 
               precision    recall  f1-score   support

           0       0.94      0.80      0.86       254
           1       0.33      0.64      0.44        39

    accuracy                           0.78       293
   macro avg       0.63      0.72      0.65       293
weighted avg       0.86      0.78      0.81       293

Same Samples from Test Set: 

accuracy3 0.813
classification report: 
               precision    recall  f1-score   support

           0       0.98      0.81      0.89       306
           1       0.30      0.81      0.44        31

    accuracy                           0.81       337
   macro avg       0.64      0.81      0.67       337
weighted avg       0.91      0.81      0.85       337

Different Samples from Test Set: 

accuracy3 0.746
classification report: 
               precision    recall  f1-score   support

           0       0.94      0.76      0.84       123
           1       0.30      0.6