# Predictive Maintenance - Label Bias

## Setup

In [2]:
### IMPORT LIBRARIES
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, normalize
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import SGDClassifier
from sklearn import tree

### READ DATA
##### Needs to be ran from the project directory
train_measurements = pd.read_csv('../datasets/measurements.csv')
train_measurements = train_measurements.sort_values(by=['measurement_time'], ascending=[True])

train_failures = pd.read_csv('../datasets/failures.csv')
train_failures = train_failures.sort_values(by=['failure_time'], ascending=[True])

## Data Preprocessing

In [3]:
train_measurements.measurement_time = pd.to_datetime(train_measurements.measurement_time, format="%Y-%m-%d %H:%M:%S")
train_failures.failure_time = pd.to_datetime(train_failures.failure_time)

### MERGE NEXT FAILURE TO MEASUREMENTS
train_combined = pd.merge_asof(
    train_measurements,
    train_failures,
    left_on='measurement_time',
    right_on='failure_time',
    by='gadget_id',
    direction='forward',
)

### TRANSFORM COLUMNS
train_combined['time_to_fail'] = train_combined['failure_time']-train_combined['measurement_time']
train_combined['fail_in_1h'] = np.where(train_combined['time_to_fail']<pd.Timedelta(hours=1), 1, 0)

### CALCULATE RUNNING MEASURES
train_combined = train_combined.reset_index(drop=True)
train_combined = train_combined.sort_values(by=['gadget_id', 'measurement_time'], ascending=[True, True])

train_combined['temperature_6h_std'] = train_combined.groupby('gadget_id')['temperature'].rolling(6).std(ddof=0).reset_index(drop=True)
train_combined['pressure_6h_mean'] = train_combined.groupby('gadget_id')['pressure'].rolling(6).mean().reset_index(drop=True)

train_combined.to_csv('../datasets/train_combined.csv', index=False)

In [6]:
train = train_combined.sample(frac = 0.5)
train.shape
train.head()

Unnamed: 0,measurement_time,gadget_id,vibration_x,vibration_y,pressure,temperature,Unnamed: 10,Unnamed: 11,failure_time,time_to_fail,fail_in_1h,temperature_6h_std,pressure_6h_mean
87,2020-07-15 13:59:59.930,3,10,2,16.0,63.335759,,,2020-07-15 19:17:09.459192460,0 days 05:17:09.529192460,0,4.259392,16.333333
279,2020-07-16 21:59:59.770,6,4,4,16.0,78.177636,,,2020-07-18 14:10:29.953822220,1 days 16:10:30.183822220,0,4.639621,14.666667
748,2020-07-20 03:59:59.505,6,7,2,16.0,72.904098,,,2020-07-21 05:21:41.791182960,1 days 01:21:42.286182960,0,5.310451,14.916667
919,2020-07-21 08:59:59.505,5,6,2,16.0,78.762979,,,NaT,NaT,0,3.924165,14.833333
999,2020-07-21 21:59:59.505,1,4,2,12.0,16.359486,,,NaT,NaT,0,5.183439,14.916667
