# Regression: 
    - Predict the Remaining Useful Life (RUL), or Time to Failure (TTF).
# Binary classification: 
    - Predict if an asset will fail within certain time frame (e.g. days).
# Multi-class classification:
    - Predict if an asset will fail in different time windows: E.g., fails in window [1, w0] days; fails in the window [w0+1,w1] days; not fail within w1 days

In [1]:
%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import fileinput
import re

from io import StringIO
from sklearn.preprocessing import MinMaxScaler

In [2]:
train_file = "Data/PM_train.txt"
test_file = "Data/PM_test.txt"
colnames = ["id","cycle","setting1","setting2","setting3","s1","s2","s3","s4","s5","s6","s7",
           "s8","s9","s10","s11","s12","s13","s14","s15","s16","s17","s18","s19","s20","s21"]

In [3]:
def move_col_last(df,col_name):
  return pd.concat([df.drop(col_name,axis=1),df[col_name]], axis=1)

def move_col_first(df,col_name):
  return pd.concat([df[col_name],df.drop(col_name,axis=1)], axis=1)

def transfer_data(text_file_path,col_names):
  # user defined variables to set the windows for classifcation
  w1 = 30
  w0 = 15
  # window size (window_size>=2),  most recent sensor values
  window_size = 5
  
  # read in the file
  with open(train_file, 'r') as f:
    filedata = f.read()
  
  # remove white space from the end of line
  filedata = re.sub('\s*$', '',filedata,flags = re.M)
  
  # filedata is a string that pretends to be a file input
  file_as_string = StringIO(filedata)

  dataset = pd.read_table(file_as_string,sep=" ",names=colnames)
  
  # get the maximum cycle number for each id
  d1 = dataset.groupby(["id"])["cycle"].max().to_frame()
  d1.columns = ['max']

  # add column with the name 'id' (just copy the index)
  d1['id'] = d1.index
  d2 = pd.merge(dataset, d1, on='id')
  
  # generate the column RUL based on the values of columns "max" and "cycle"
  d2['RUL'] = d2['max'] - d2['cycle']
  
  # exclude column "max" from the data frame
  d2 = d2.drop('max', 1)

  # genearte label1 and label2
  dataset = d2
  dataset['label1'] = np.where(d2['RUL'] <= w1, 1, 0)
  dataset['label2'] = np.where(d2['RUL'] <= w0, 2, (np.where(d2['RUL'] <= w1,1,0)))
  
  # exclude comuns id, cycle, setting1,setting2,setting3, and last 3 columns
  # only the 21 sensor columns are kept in the data frame
  n_pre_sensor_columns = 5 # id, cycle, setting1,setting2,setting3
  n_after_sensor_columns = 3 #RUL, label1, label2
  n_col = len(dataset.columns)

  data = dataset[dataset.columns[n_pre_sensor_columns:n_col - n_after_sensor_columns]]
  n_sensor=len(data.columns)

  ids = dataset.id.unique()
  n_id = len(ids) # 100

  a = ["a" + str(i) for i in range(1,n_sensor+1)] # average
  sd = ["sd" + str(i) for i in range(1,n_sensor+1)] # standard deviation

  df = None
  for i in range(1,n_id+1):
    # get the subset of the data that only contains the sensor columns for the id i
    subset_rolling_mean = data[dataset.id==i].rolling(window_size,axis=0,min_periods=1).mean()
    subset_rolling_mean.columns = a
    subset_rolling_std = data[dataset.id==i].rolling(window_size,axis=0,min_periods=1).std().fillna(0)
    subset_rolling_std.columns = sd
    subset = pd.concat([subset_rolling_mean,subset_rolling_std], axis=1)
    df1 = pd.concat([dataset[dataset.id == i],subset], axis=1)
    df1 = move_col_last(df1,['RUL', 'label1','label2'])

    if df is None:
      df = df1
    else:
      df = df.append(df1,ignore_index=True)
  df2 = df


  # Exclude column names: id
  df3 = df2.drop('id', 1)
  df4 = df3.drop(['RUL','label1','label2'], 1)
  
  scaler = MinMaxScaler()
  df5 = pd.DataFrame(scaler.fit_transform(df4), index=df4.index, columns=df4.columns)
  
  df6 = df5.dropna(axis=1)
  
  return df6

In [4]:
# read in the file
with open(train_file, 'r') as f:
  filedata = f.read()
  
# remove white space from the end of line
filedata = re.sub('\s*$', '',filedata,flags = re.M)

In [5]:
colnames = ["id","cycle","setting1","setting2","setting3","s1","s2","s3","s4","s5","s6","s7",
           "s8","s9","s10","s11","s12","s13","s14","s15","s16","s17","s18","s19","s20","s21"]

# filedata is a string that pretends to be a file input
filedata = StringIO(filedata)

dataset = pd.read_table(filedata,sep=" ",names=colnames)

In [6]:
# get the maximum cycle number for each id
d1 = dataset.groupby(["id"])["cycle"].max().to_frame()
d1.columns = ['max']

# add column with the name 'id' (just copy the index)
d1['id'] = d1.index
d2 = pd.merge(dataset, d1, on='id')


In [7]:
# generate the column RUL based on the values of columns "max" and "cycle"
d2['RUL'] = d2['max'] - d2['cycle']

In [8]:
# exclude column "max" from the data frame
d2 = d2.drop('max', 1)

In [9]:
# user defined variables to set the windows for classifcation
w1 = 30
w0 = 15

# genearte label1 and label2
dataset = d2
dataset['label1'] = np.where(d2['RUL'] <= w1, 1, 0)
dataset['label2'] = np.where(d2['RUL'] <= w0, 2, (np.where(d2['RUL'] <= w1,1,0)))

In [10]:
# exclude comuns id, cycle, setting1,setting2,setting3, and last 3 columns
# only the 21 sensor columns are kept in the data frame
n_pre_sensor_columns = 5 # id, cycle, setting1,setting2,setting3
n_after_sensor_columns = 3 #RUL, label1, label2
n_col = len(dataset.columns)

data = dataset[dataset.columns[n_pre_sensor_columns:n_col - n_after_sensor_columns]]
n_sensor=len(data.columns)

ids = dataset.id.unique()
n_id = len(ids) # 100

a = ["a" + str(i) for i in range(1,n_sensor+1)] # average
sd = ["sd" + str(i) for i in range(1,n_sensor+1)] # standard deviation

In [11]:
# window size (window_size>=2),  most recent sensor values
window_size = 5
df = None
for i in range(1,n_id+1):
  # get the subset of the data that only contains the sensor columns for the id i
  subset_rolling_mean = data[dataset.id==i].rolling(window_size,axis=0,min_periods=1).mean()
  subset_rolling_mean.columns = a
  subset_rolling_std = data[dataset.id==i].rolling(window_size,axis=0,min_periods=1).std().fillna(0)
  subset_rolling_std.columns = sd
  subset = pd.concat([subset_rolling_mean,subset_rolling_std], axis=1)
  df1 = pd.concat([dataset[dataset.id == i],subset], axis=1)
  df1 = move_col_last(df1,['RUL', 'label1','label2'])
  
  if df is None:
    df = df1
  else:
    df = df.append(df1,ignore_index=True)
df2 = df


# Exclude column names: id
df3 = df2.drop('id', 1)

In [12]:
df4 = df3.drop(['RUL','label1','label2'], 1)

In [13]:
scaler = MinMaxScaler()

df5 = pd.DataFrame(scaler.fit_transform(df4), index=df4.index, columns=df4.columns)

In [14]:
df6 = df5.dropna(axis=1)

In [15]:
#train_df = df6
test_df = df6

In [16]:
test_df.head()

Unnamed: 0,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,s6,...,sd12,sd13,sd14,sd15,sd16,sd17,sd18,sd19,sd20,sd21
0,0.0,0.45977,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00277,0.609195,0.25,0.0,0.0,0.283133,0.453019,0.352633,0.0,1.0,...,0.488189,0.160656,0.508246,0.132258,0.0,0.0,0.0,0.0,0.130435,0.017365
2,0.00554,0.252874,0.75,0.0,0.0,0.343373,0.369523,0.370527,0.0,1.0,...,0.450405,0.120224,0.374753,0.116172,0.0,0.408248,0.0,0.0,0.169323,0.237961
3,0.00831,0.54023,0.5,0.0,0.0,0.343373,0.256159,0.331195,0.0,1.0,...,0.552267,0.133773,0.307559,0.427568,0.0,0.353553,0.0,0.0,0.234642,0.202745
4,0.01108,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,1.0,...,0.481694,0.11762,0.267278,0.394651,0.0,0.387298,0.0,0.0,0.226338,0.178837


# Machine Learning

In [24]:
X = df6
y = df3['RUL']

#Xt = test_df
#yt = df3['RUL']

In [25]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [26]:
rng = np.random.RandomState(1)
regr_1 = DecisionTreeRegressor(max_depth=16)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)

In [27]:
regr_1.fit(X, y)
regr_2.fit(X, y)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=300,
         random_state=<mtrand.RandomState object at 0x1121f0e58>)

In [28]:
# Predict
y_1 = regr_1.predict(Xt[1:10])
y_2 = regr_2.predict(Xt[1:10])

In [135]:
y_2[4] 
yt[4]

26