In [1]:
import numpy as np
import pandas as pd
import featuretools as ft

# Column names for the raw recordings file: engine id, cycle counter,
# three operational settings, then 26 sensor-measurement slots.
operational_settings = ['operational_setting_{}'.format(i + 1) for i in range(3)]
sensor_columns = ['sensor_measurement_{}'.format(i + 1) for i in range(26)]
cols = ['engine_no', 'time_in_cycles'] + operational_settings + sensor_columns

# The file has no header row. `header=None` is the supported way to say so;
# a negative value such as `header=-1` is rejected by newer pandas releases.
data = pd.read_csv('data/RUL_test.txt', sep=' ', header=None, names=cols)

# Drop the last five named columns (sensor_measurement_22 .. sensor_measurement_26).
# NOTE(review): presumably these slots are empty in the raw file -- confirm.
data = data.drop(cols[-5:], axis=1)


In [2]:
# Use the labels that ship with the test set.
# The training problem seems to require building our own labels;
# the provided ones are used for now just to get DFS set up.

# The file has no header row. `header=None` is the supported way to say so;
# a negative value such as `header=-1` is rejected by newer pandas releases.
label = pd.read_csv('data/RUL_test_truth.txt', sep=' ', header=None, names=['label'], index_col=False)

# Shift the default 0-based row index so that it starts at 1.
# NOTE(review): presumably this lines the rows up with engine_no -- confirm.
label.index = label.index + 1
label.tail()

Unnamed: 0,label
244,35
245,131
246,194
247,112
248,26


In [3]:
# Build the entity set that DFS will run on.
es = ft.EntitySet('Dataset')

# `data` has no column named 'index', so ask featuretools to create it
# explicitly (make_index=True) instead of relying on implicit index creation.
es.entity_from_dataframe(dataframe=data,
                         entity_id='recordings',
                         index='index',
                         make_index=True)

# Split out an 'engines' entity keyed by engine_no.
es.normalize_entity(base_entity_id='recordings',
                    new_entity_id='engines',
                    index='engine_no')

# Split out a 'cycles' entity keyed by time_in_cycles.
es.normalize_entity(base_entity_id='recordings',
                    new_entity_id='cycles',
                    index='time_in_cycles')

# Peek at the last few rows of the engines entity.
es['engines'].df.tail()



Unnamed: 0_level_0,engine_no
engine_no,Unnamed: 1_level_1
244,244
245,245
246,246
247,247
248,248


In [4]:
# Run Deep Feature Synthesis with 'engines' as the target entity.
# `fm` receives the first element of the returned pair and `features` the second.
fm, features = ft.dfs(
    entityset=es,
    target_entity='engines',
    verbose=True,
)

Built 147 features
Elapsed: 00:04 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 1/1 cutoff times


In [5]:
# Display the last five rows of the feature matrix.
fm.tail(5)

Unnamed: 0_level_0,SUM(recordings.operational_setting_1),SUM(recordings.operational_setting_2),SUM(recordings.operational_setting_3),SUM(recordings.sensor_measurement_1),SUM(recordings.sensor_measurement_2),SUM(recordings.sensor_measurement_3),SUM(recordings.sensor_measurement_4),SUM(recordings.sensor_measurement_5),SUM(recordings.sensor_measurement_6),SUM(recordings.sensor_measurement_7),...,MEAN(recordings.sensor_measurement_15),MEAN(recordings.sensor_measurement_16),MEAN(recordings.sensor_measurement_17),MEAN(recordings.sensor_measurement_18),MEAN(recordings.sensor_measurement_19),MEAN(recordings.sensor_measurement_20),MEAN(recordings.sensor_measurement_21),COUNT(recordings),NUM_UNIQUE(recordings.time_in_cycles),MODE(recordings.time_in_cycles)
engine_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
244,4277.5685,102.9437,17160.0,87187.14,106697.5,260780.05,220916.27,1504.0,2168.3,52763.72,...,9.320834,0.023152,347.48913,2224.413043,97.461033,21.159891,12.688186,184,184,1
245,1617.1666,37.8299,5860.0,29499.36,36066.65,88015.23,74113.02,471.83,677.88,16452.41,...,9.32637,0.022857,342.206349,2208.952381,97.36873,19.465238,11.663252,63,63,1
246,785.066,18.7845,2780.0,13637.44,16744.41,40889.89,34327.23,216.62,313.67,7656.48,...,9.285124,0.021724,345.344828,2237.793103,98.441034,19.513103,11.731172,29,29,1
247,2119.2642,51.2044,8720.0,44577.78,54545.79,133557.0,113608.16,780.24,1123.96,27303.4,...,9.354521,0.023617,348.37234,2221.159574,97.274574,21.367021,12.826918,94,94,1
248,6565.811,158.7555,26260.0,133152.65,163020.8,398572.35,338090.47,2285.99,3296.15,80192.03,...,9.327237,0.023203,347.950178,2226.096085,97.533025,21.04274,12.628865,281,281,1


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Baseline: random forest on the DFS feature matrix, scored with 3-fold
# cross-validation using negative mean absolute error.

# Pick each row's label by its engine_no (the index of `fm`) instead of
# trusting that `fm` and `label` happen to be in the same row order.
y = label.loc[fm.index, 'label'].values

# Fix the seed so the reported score is reproducible across runs.
reg = RandomForestRegressor(random_state=0)
scores = cross_val_score(reg, fm, y, cv=3, scoring='neg_mean_absolute_error')
print('Average Negative MAE: {:.2f} | Standard dev: {:.3f}'.format(np.mean(scores), np.std(scores)))

Average Negative MAE: -26.86 | Standard dev: 1.368
