In [1]:
import numpy as np
import pandas as pd
import featuretools as ft

# Column layout of the raw file: engine id, cycle counter, 3 operational
# settings, then sensor readings. 26 sensor names are declared, but the last
# five are dropped right after loading, leaving sensor_measurement_1..21.
# NOTE(review): the dropped columns are presumably empty padding produced by
# trailing separators in the file -- confirm against the raw data.
operational_settings = ['operational_setting_{}'.format(i + 1) for i in range(3)]
sensor_columns = ['sensor_measurement_{}'.format(i + 1) for i in range(26)]
cols = ['engine_no', 'time_in_cycles'] + operational_settings + sensor_columns

# header=None: the file has no header row, so the first line is data.
# (header=-1 is rejected with a ValueError by pandas >= 1.0.)
data = pd.read_csv('data/RUL_train.txt', sep=' ', header=None, names=cols)

data = data.drop(columns=cols[-5:])
# Explicit row-id column; used later as the entity index and cutoff instance id.
data['index'] = data.index
data.head()

Unnamed: 0,engine_no,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,...,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,index
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367,0
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552,1
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213,2
3,1,4,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,...,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176,3
4,1,5,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,...,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754,4


In [2]:
# Label = number of recordings that remain for the same engine after this row
# (the engine's last row gets 0). cumcount(ascending=False) numbers the rows
# of each group from len(group) - 1 down to 0 and returns a Series aligned on
# data's index, so the result is correct even when rows of different engines
# are interleaved or not sorted by engine_no.
# NOTE(review): assumes rows within an engine are in time_in_cycles order.
data['label'] = data.groupby('engine_no').cumcount(ascending=False)

# Synthetic timestamps, one second per row in file order, so the frame has a
# time index to build the entity set and cutoff times on.
data['time'] = pd.date_range('1/1/2000', periods=data.shape[0], freq='s')
data.tail()

Unnamed: 0,engine_no,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,...,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,index,label,time
61244,249,251,9.9998,0.25,100.0,489.05,605.33,1516.36,1315.28,10.52,...,8.4541,0.03,372,2319,100.0,29.11,17.5234,61244,4,2000-01-01 17:00:44
61245,249,252,0.0028,0.0015,100.0,518.67,643.42,1598.92,1426.77,14.62,...,8.2221,0.03,396,2388,100.0,39.38,23.7151,61245,3,2000-01-01 17:00:45
61246,249,253,0.0029,0.0,100.0,518.67,643.68,1607.72,1430.56,14.62,...,8.2525,0.03,395,2388,100.0,39.78,23.827,61246,2,2000-01-01 17:00:46
61247,249,254,35.0046,0.84,100.0,449.44,555.77,1381.29,1148.18,5.48,...,9.0515,0.02,337,2223,100.0,15.26,9.0774,61247,1,2000-01-01 17:00:47
61248,249,255,42.003,0.84,100.0,445.0,549.85,1369.75,1147.45,3.91,...,9.1207,0.02,333,2212,100.0,10.66,6.4341,61248,0,2000-01-01 17:00:48


In [3]:
# Cutoff table for DFS: one row per recording, holding its id ('index'), its
# own timestamp ('time') and the label generated in the previous cell.
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of `data` (avoids SettingWithCopyWarning).
cutoff_times = data[['index', 'time']].copy()
# pop() also removes 'label' from `data`, so the target is not part of the
# frame that becomes the 'recordings' entity. This makes the cell
# non-idempotent: re-running it without re-running the previous cell raises
# KeyError because 'label' is already gone.
cutoff_times['label'] = data.pop('label')


In [4]:
# Entity set with a single base entity, 'recordings', holding every row of
# `data`; rows are identified by 'index' and ordered in time by 'time'.
es = ft.EntitySet('Dataset')
es.entity_from_dataframe(entity_id='recordings',
                         dataframe=data,
                         index='index',
                         time_index='time')

# Derive one parent entity per key column of 'recordings':
#   'engines' keyed by engine_no, 'cycles' keyed by time_in_cycles.
for parent_id, key_column in (('engines', 'engine_no'),
                              ('cycles', 'time_in_cycles')):
    es.normalize_entity(base_entity_id='recordings',
                        new_entity_id=parent_id,
                        index=key_column)

# Show the last few rows of the derived 'engines' entity.
es['engines'].df.tail()

Unnamed: 0_level_0,engine_no,first_recordings_time
engine_no,Unnamed: 1_level_1,Unnamed: 2_level_1
245,245,2000-01-01 16:43:53
246,246,2000-01-01 16:47:18
247,247,2000-01-01 16:49:59
248,248,2000-01-01 16:53:30
249,249,2000-01-01 16:56:34


In [5]:
from featuretools.primitives import Sum, Mean, Std, Skew

# Options for Deep Feature Synthesis, collected in one place for readability.
dfs_options = dict(
    entityset=es,
    target_entity='recordings',
    # Aggregations only; no transform primitives are applied.
    agg_primitives=[Sum, Mean, Std, Skew],
    trans_primitives=[],
    # One cutoff per recording; the extra 'label' column of cutoff_times
    # shows up as the 'label' column of the resulting feature matrix.
    cutoff_time=cutoff_times,
    # NOTE(review): presumably buckets cutoff times into 2400-second windows
    # to speed up the calculation -- confirm against the featuretools docs.
    approximate='2400s',
    verbose=True,
)
fm, features = ft.dfs(**dfs_options)

Built 218 features
Elapsed: 01:56 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 26/26 cutoff times


In [6]:
fm.tail()

Unnamed: 0_level_0,label,engine_no,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,...,cycles.SKEW(recordings.sensor_measurement_12),cycles.SKEW(recordings.sensor_measurement_13),cycles.SKEW(recordings.sensor_measurement_14),cycles.SKEW(recordings.sensor_measurement_15),cycles.SKEW(recordings.sensor_measurement_16),cycles.SKEW(recordings.sensor_measurement_17),cycles.SKEW(recordings.sensor_measurement_18),cycles.SKEW(recordings.sensor_measurement_19),cycles.SKEW(recordings.sensor_measurement_20),cycles.SKEW(recordings.sensor_measurement_21)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
61244,4,249,251,9.9998,0.25,100.0,489.05,605.33,1516.36,1315.28,...,0.455307,-1.887063,-1.312319,1.047247,0.358569,-0.046641,-1.232073,-1.887074,0.408226,0.406635
61245,3,249,252,0.0028,0.0015,100.0,518.67,643.42,1598.92,1426.77,...,0.54575,-1.773,-1.28078,1.008679,0.341506,-0.152103,-1.273836,-1.773008,0.49317,0.491553
61246,2,249,253,0.0029,0.0,100.0,518.67,643.68,1607.72,1430.56,...,0.890095,-2.091687,-1.475044,1.152309,1.17002,0.236143,-1.337283,-2.091699,0.807553,0.810568
61247,1,249,254,35.0046,0.84,100.0,449.44,555.77,1381.29,1148.18,...,0.659687,-2.074994,-1.350534,0.985689,0.494451,0.117784,-1.275352,-2.075006,0.605143,0.599462
61248,0,249,255,42.003,0.84,100.0,445.0,549.85,1369.75,1147.45,...,0.599164,-2.45671,-1.607869,1.16467,0.505179,0.088678,-1.432496,-2.456731,0.545119,0.539896


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

RANDOM_STATE = 0  # fixed seed so fold scores and importances are reproducible
N_SPLITS = 10     # number of forward-chaining train/test folds
TOP_K = 5         # how many of the most important features to print per fold

# Feature matrix with missing values replaced by 0; the target is split off.
# NOTE(review): 'engine_no' is an identifier but stays in X as a feature (it
# ranks among the top importances in the printed output) -- consider dropping.
X = fm.copy().fillna(0)
y = X.pop('label')

Xp = np.array(X)
yp = np.array(y)
# TimeSeriesSplit always trains on earlier rows and tests on later ones.
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

for train_index, test_index in tscv.split(Xp):
    X_train, X_test = Xp[train_index], Xp[test_index]
    y_train, y_test = yp[train_index], yp[test_index]
    reg = RandomForestRegressor(random_state=RANDOM_STATE)
    reg.fit(X_train, y_train)
    preds = reg.predict(X_test)
    # Documented argument order is (y_true, y_pred).
    print('Mean Abs Error: {:.2f}'.format(mean_absolute_error(y_test, preds)))
    # (importance, column name) pairs, most important first.
    feature_imps = sorted(zip(reg.feature_importances_, X.columns), reverse=True)
    print('Feature Importances:')
    for rank, (imp, name) in enumerate(feature_imps[:TOP_K], start=1):
        print('{}: {} [{:.3f}]'.format(rank, name, imp))
    print('-----\n')


Mean Abs Error: 38.41
Feature Importances:
1: time_in_cycles [0.624]
2: engine_no [0.211]
3: sensor_measurement_13 [0.079]
4: sensor_measurement_15 [0.014]
5: sensor_measurement_11 [0.010]
-----

Mean Abs Error: 52.12
Feature Importances:
1: time_in_cycles [0.541]
2: engine_no [0.182]
3: sensor_measurement_13 [0.091]
4: sensor_measurement_11 [0.034]
5: sensor_measurement_15 [0.031]
-----

Mean Abs Error: 47.11
Feature Importances:
1: time_in_cycles [0.473]
2: engine_no [0.167]
3: sensor_measurement_13 [0.122]
4: sensor_measurement_15 [0.038]
5: sensor_measurement_11 [0.038]
-----

Mean Abs Error: 55.99
Feature Importances:
1: time_in_cycles [0.450]
2: sensor_measurement_13 [0.128]
3: engine_no [0.126]
4: sensor_measurement_15 [0.049]
5: sensor_measurement_11 [0.044]
-----

Mean Abs Error: 57.74
Feature Importances:
1: time_in_cycles [0.463]
2: sensor_measurement_13 [0.126]
3: engine_no [0.098]
4: sensor_measurement_15 [0.049]
5: sensor_measurement_11 [0.046]
-----

Mean Abs Error: 37.8

In [None]:
|