In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Read the raw sensor log into a DataFrame.
# (NumPy alternative that was tried: np.genfromtxt('sensor.csv', delimiter=',', dtype=None))
df = pd.read_csv(filepath_or_buffer='sensor.csv')

In [None]:
# Print each column's dtype and non-null count to spot columns with missing values
df.info()

In [None]:
# Remove columns that are not used in the rest of the analysis.
# NOTE(review): 'Unnamed: 0' is presumably the row index saved into the CSV — confirm.
unused_columns = ['Unnamed: 0', 'sensor_00', 'sensor_15', 'sensor_50', 'sensor_51']
df = df.drop(columns=unused_columns)

In [None]:
# One stacked panel per column, all sharing the x axis
df.plot(
    figsize=(20, 50),
    sharex=True,
    subplots=True,
);

In [None]:
# Number of rows per machine status label (the author notes the data set is highly imbalanced)
df['machine_status'].value_counts()

In [None]:
# Encode the machine status strings as numbers; any other value falls back to 0
status_levels = {'NORMAL': 1, 'BROKEN': 0, 'RECOVERING': 0.5}
conditions = [df['machine_status'] == status for status in status_levels]
choices = list(status_levels.values())
df['Operation'] = np.select(conditions, choices, default=0)

In [None]:
# Machine status over time: 1 = operational, 0.5 = maintenance (recovering), 0 = broken
status_ax = df.set_index('timestamp')['Operation'].plot(figsize=(13, 1))
status_ax.set_ylabel('Machine Status');

### Time Lag

In [None]:
# https://www.kaggle.com/code/ryanholbrook/time-series-as-features
# Models fitted on trend and seasonality features learn *time dependence*:
# how the series varies with the time index itself.
# The goal here is different: we want the models to learn *serial dependence*,
# i.e. how an observation relates to the observations that came before it (lag features).

In [None]:
# Lag feature: each row carries the sensor_01 reading of the previous row
previous_reading = df['sensor_01'].shift(periods=1)
df = df.assign(Lag_1=previous_reading)

In [None]:
# Scatter of sensor_01 against its one-step lag, with a fitted regression line
sns.regplot(
    data=df,
    x='Lag_1',
    y='sensor_01',
    ci=None,
    scatter_kws={'color': '0.25'},
);

In [None]:
# Parse the timestamp column into datetime values
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
# Fill each missing value with the next valid observation in its column.
# `bfill()` replaces the deprecated `DataFrame.backfill()` alias.
# Note: `fillna('backfill')` passes the string as the fill *value* (not the method),
# which is why it turned the affected columns into object dtype.
df = df.bfill()

In [None]:
# Moving average of sensor_01 over a centred window of m samples
m = 7
centred_window = df['sensor_01'].rolling(window=m, center=True)
df['rolling_mean_sensor_01'] = centred_window.mean()

smoothed_ax = sns.lineplot(data=df, x='timestamp', y='rolling_mean_sensor_01', label=' Sensor Signal')
smoothed_ax.set_title('For Specific Sensor')

## Machine Learning

In [None]:
# Scaler that maps every feature onto the [0, 1] range (the MinMaxScaler default)
mms = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Sensor columns to rescale: sensor_01 .. sensor_49, skipping sensor_15
to_convert = [f'sensor_{number:02d}' for number in range(1, 50) if number != 15]

In [None]:
# Fit the scaler on the selected sensor columns and wrap the scaled array in a DataFrame.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the train/test split.
sensor_values = df[to_convert]
df_scaled = pd.DataFrame(mms.fit(sensor_values).transform(sensor_values))


In [None]:
# Swap the raw sensor columns for their min-max-scaled versions.
# The scaled values are relabelled with the original sensor names and the frame's row index:
# leaving default integer column labels (0..47) next to the string-named columns makes
# recent scikit-learn versions reject the feature matrix, because feature names must all
# be strings.
scaled_sensors = pd.DataFrame(df_scaled.to_numpy(), columns=to_convert, index=df.index)
df = pd.concat([df.drop(columns=to_convert), scaled_sensors], axis=1)

In [None]:
# Fill the remaining gaps in both directions.
# A backward fill alone leaves NaNs at the *end* of a column (the centred rolling mean
# has no value for its last rows), and those rows land in the test set where
# LinearRegression refuses NaN input. `bfill()` also replaces the deprecated `backfill()`.
df = df.bfill().ffill()

In [None]:
# Summarise the missing-value mask per column
missing_mask = df.isna()
missing_mask.describe()

In [None]:
# Index the frame by timestamp so rows can be selected by time
df = df.set_index('timestamp')

In [None]:
# Chronological train/test split: rows before the cut-off train, rows from the cut-off onward test
split_time = "2018-06-09 10:40:00"
before_split = df.index < split_time
from_split = df.index >= split_time
train_df = df.loc[before_split]
test_df = df.loc[from_split]

In [None]:
# Features are every column except the status labels; the target is the numeric 'Operation' column
label_columns = ['machine_status', 'Operation']

X_train = train_df.drop(columns=label_columns)
X_test = test_df.drop(columns=label_columns)
y_train = train_df['Operation']
y_test = test_df['Operation']

In [None]:
# A good overview how the traces look now
# train_df.plot(subplots = True, sharex = True, figsize= (12,30));
# X_train.info()

### Linear Regression

In [None]:
# Fit an ordinary least-squares model on the training window and predict the test window
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [None]:
# Put the true status and the prediction side by side and plot both over time
y_test_plot = y_test.to_frame().assign(y_pred=y_pred)
prediction_ax = y_test_plot.plot.line(figsize=(15, 2))
prediction_ax.set_title('Linear Regression Prediction');

In [None]:
# Root mean squared error of the linear regression on the test window
linear_mse = mean_squared_error(y_pred, y_test)
print('RMSE for Linear Regression: ', "%.3f" % linear_mse ** (1/2))

In [None]:
# Show the test target. `y_test` is already the 'Operation' Series, so it has no
# `.Operation` attribute — accessing it raised AttributeError.
y_test

### Random Forest

In [None]:
# Fit a random forest on the training window and predict the test window.
# A fixed random_state makes the forest (and the RMSE reported below) reproducible
# across runs; n_jobs=-1 only parallelises the fit and does not change the result.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Rebuild the comparison frame from y_test so this cell does not depend on the frame
# left behind by the linear-regression plot, then plot truth vs. prediction.
y_test_plot = pd.DataFrame(y_test.copy())
y_test_plot['y_pred'] = y_pred.tolist()
y_test_plot.plot.line(figsize=(15,2));
plt.title('Random Forest Prediction');

In [None]:
# Root mean squared error of the random forest on the test window.
# `y_test` is already the target Series; `y_test.Operation` raised AttributeError.
print('RMSE for Random Forest: ', "%.3f" % mean_squared_error(y_test, y_pred)**(1/2))

In conclusion, the random forest shows a lower root mean square error (RMSE) than the linear regression model, so it predicts the underlying machine status better. However, these examples are not predictions in advance: both algorithms predict the machine failure within a minute, which is too short
to take action or carry out a maintenance procedure. Therefore, a shift in the window function is necessary.