In [1]:
import numpy as np
import pandas as pd
import featuretools as ft

# Column layout of the raw recordings file: engine id, cycle counter,
# three operational settings, then the sensor channels.
operational_settings = ['operational_setting_{}'.format(i + 1) for i in range(3)]
sensor_columns = ['sensor_measurement_{}'.format(i + 1) for i in range(26)]
cols = ['engine_no', 'time_in_cycles'] + operational_settings + sensor_columns

# The file has no header row. `header=None` is the supported spelling;
# the previous `header=-1` is rejected by current pandas releases.
data = pd.read_csv('data/RUL_train.txt', sep=' ', header=None, names=cols)

# Drop the last five named columns (sensor_measurement_22 .. 26).
# NOTE(review): presumably these are empty padding columns produced by the
# space-separated layout -- confirm against the raw file.
data = data.drop(cols[-5:], axis=1)

# Explicit row-id column; used later as the entity index.
data['index'] = data.index
data.head()

Unnamed: 0,engine_no,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,...,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,index
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367,0
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552,1
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213,2
3,1,4,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,...,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176,3
4,1,5,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,...,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754,4


In [2]:
# Label every recording with the number of recordings that follow it for the
# same engine: the first row of an engine gets (n - 1), its last row gets 0.
# `cumcount(ascending=False)` returns a Series aligned on the frame's index,
# so the labels stay attached to the right rows even if an engine's rows are
# not stored contiguously or in engine order (the previous list concatenation
# relied on that ordering).
gb = data.groupby('engine_no')
labels = gb.cumcount(ascending=False)
data['label'] = labels

# Synthetic time index: one second per row, in row order, starting 2000-01-01.
data['time'] = pd.date_range('1/1/2000', periods=data.shape[0], freq='s')
data.tail()

Unnamed: 0,engine_no,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,...,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,index,label,time
61244,249,251,9.9998,0.25,100.0,489.05,605.33,1516.36,1315.28,10.52,...,8.4541,0.03,372,2319,100.0,29.11,17.5234,61244,4,2000-01-01 17:00:44
61245,249,252,0.0028,0.0015,100.0,518.67,643.42,1598.92,1426.77,14.62,...,8.2221,0.03,396,2388,100.0,39.38,23.7151,61245,3,2000-01-01 17:00:45
61246,249,253,0.0029,0.0,100.0,518.67,643.68,1607.72,1430.56,14.62,...,8.2525,0.03,395,2388,100.0,39.78,23.827,61246,2,2000-01-01 17:00:46
61247,249,254,35.0046,0.84,100.0,449.44,555.77,1381.29,1148.18,5.48,...,9.0515,0.02,337,2223,100.0,15.26,9.0774,61247,1,2000-01-01 17:00:47
61248,249,255,42.003,0.84,100.0,445.0,549.85,1369.75,1147.45,3.91,...,9.1207,0.02,333,2212,100.0,10.66,6.4341,61248,0,2000-01-01 17:00:48


In [3]:
# Build the cutoff-time table handed to DFS: one row per recording with its
# id, its timestamp and its label.
# NOTE: the labels used here are the `label` column computed on this training
# frame in an earlier cell (not labels shipped with the test set).
# Using these for now to get DFS set up.
#
# `.copy()` gives an independent frame, so adding a column below does not
# write into a slice of `data` (which raises SettingWithCopyWarning).
cutoff_times = data[['index', 'time']].copy()
# `pop` also removes `label` from `data`, so it is not part of the frame that
# is later loaded into the EntitySet.
cutoff_times['label'] = data.pop('label')


In [4]:
# Training EntitySet: a single `recordings` entity built from `data`, from
# which two parent entities are split off by key column.
es = ft.EntitySet('Dataset')
es.entity_from_dataframe(entity_id='recordings',
                         dataframe=data,
                         index='index',
                         time_index='time')

# (new entity id, key column in `recordings`) for each derived entity.
derived_entities = (
    ('engines', 'engine_no'),
    ('cycles', 'time_in_cycles'),
)
for derived_id, key_column in derived_entities:
    es.normalize_entity(base_entity_id='recordings',
                        new_entity_id=derived_id,
                        index=key_column)

es['engines'].df.tail()

Unnamed: 0_level_0,engine_no,first_recordings_time
engine_no,Unnamed: 1_level_1,Unnamed: 2_level_1
245,245,2000-01-01 16:43:53
246,246,2000-01-01 16:47:18
247,247,2000-01-01 16:49:59
248,248,2000-01-01 16:53:30
249,249,2000-01-01 16:56:34


In [10]:
from featuretools.primitives import Sum, Mean, Std, Skew

# Deep Feature Synthesis on the `recordings` entity.
# Only aggregation primitives are requested (no transform primitives), and
# only the first 2000 rows of the cutoff-time table are evaluated.
dfs_options = {
    'entityset': es,
    'target_entity': 'recordings',
    'agg_primitives': [Sum, Mean, Std, Skew],
    'trans_primitives': [],
    'cutoff_time': cutoff_times.iloc[:2000],
    'approximate': '4s',
    'verbose': True,
}
fm, features = ft.dfs(**dfs_options)

Built 218 features
Elapsed: 02:07 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 250/250 cutoff times


In [6]:
# Inspect the last rows of the feature matrix returned by DFS.
fm.tail()

Unnamed: 0_level_0,label,engine_no,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,...,cycles.SKEW(recordings.sensor_measurement_12),cycles.SKEW(recordings.sensor_measurement_13),cycles.SKEW(recordings.sensor_measurement_14),cycles.SKEW(recordings.sensor_measurement_15),cycles.SKEW(recordings.sensor_measurement_16),cycles.SKEW(recordings.sensor_measurement_17),cycles.SKEW(recordings.sensor_measurement_18),cycles.SKEW(recordings.sensor_measurement_19),cycles.SKEW(recordings.sensor_measurement_20),cycles.SKEW(recordings.sensor_measurement_21)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
495,124,2,175,20.0057,0.7003,100.0,491.19,608.04,1488.24,1247.27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,123,2,176,42.0036,0.8417,100.0,445.0,549.78,1347.81,1123.17,...,1.886785e-16,0.0,0.0,0.0,-1.058791e-15,0.0,0.0,0.0,0.0,2.78559e-16
497,122,2,177,25.0024,0.62,60.0,462.54,536.99,1255.83,1053.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,121,2,178,0.002,0.0,100.0,518.67,643.05,1590.25,1405.24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
499,120,2,179,25.0043,0.6212,60.0,462.54,535.99,1262.9,1050.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Features / target: missing feature values are replaced by 0 and the
# `label` column is split off as the regression target.
X = fm.copy().fillna(0)
y = X.pop('label')

Xp = np.array(X)
yp = np.array(y)

# Forward-chaining splits: each fold trains on an initial segment of the rows
# and evaluates on the segment that follows it.
tscv = TimeSeriesSplit(n_splits=6)

for train_index, test_index in tscv.split(Xp):
    X_train, X_test = Xp[train_index], Xp[test_index]
    y_train, y_test = yp[train_index], yp[test_index]

    # Fixed seed so the printed scores and importances are reproducible
    # across kernel restarts.
    reg = RandomForestRegressor(random_state=0)
    reg.fit(X_train, y_train)
    preds = reg.predict(X_test)

    # scikit-learn metric signature is (y_true, y_pred).
    print('Mean Abs Error: {:.2f}'.format(mean_absolute_error(y_test, preds)))

    # (importance, feature name) pairs, largest importance first.
    feature_imps = sorted(zip(reg.feature_importances_, X.columns), reverse=True)
    print('Feature Importances:')
    for rank, (importance, feature_name) in enumerate(feature_imps[0:5], start=1):
        print('{}: {} [{:.3f}]'.format(rank, feature_name, importance))
    print('-----\n')


Mean Abs Error: 73.73
Feature Importances:
1: time_in_cycles [0.483]
2: engines.SUM(recordings.sensor_measurement_18) [0.086]
3: engines.SUM(recordings.sensor_measurement_21) [0.077]
4: engines.SUM(recordings.sensor_measurement_4) [0.077]
5: engines.MEAN(recordings.sensor_measurement_11) [0.076]
-----

Mean Abs Error: 18.89
Feature Importances:
1: time_in_cycles [0.362]
2: engines.SUM(recordings.sensor_measurement_19) [0.167]
3: engines.SUM(recordings.operational_setting_3) [0.076]
4: engines.SUM(recordings.sensor_measurement_5) [0.076]
5: engines.SUM(recordings.sensor_measurement_18) [0.076]
-----

Mean Abs Error: 21.86
Feature Importances:
1: time_in_cycles [0.350]
2: engines.SUM(recordings.sensor_measurement_20) [0.167]
3: engines.SUM(recordings.sensor_measurement_7) [0.092]
4: engines.SUM(recordings.sensor_measurement_11) [0.088]
5: engines.SUM(recordings.sensor_measurement_14) [0.085]
-----

Mean Abs Error: 9.57
Feature Importances:
1: engines.SUM(recordings.sensor_measurement_12)

In [15]:
# Load the test recordings with the same column layout as the training file.
# `header=None` marks the file as header-less; the previous `header=-1` is
# rejected by current pandas releases.
test_data = pd.read_csv('data/RUL_test.txt', sep=' ', header=None, names=cols)
# Same trailing-column drop and explicit row id as for the training frame.
test_data = test_data.drop(cols[-5:], axis=1)
test_data['index'] = test_data.index

# Test EntitySet with the same entity ids and key columns as the training
# one. No time index is set here.
es2 = ft.EntitySet('Dataset2')
es2.entity_from_dataframe(dataframe=test_data,
                          entity_id='recordings',
                          index='index')

es2.normalize_entity(base_entity_id='recordings',
                     new_entity_id='engines',
                     index='engine_no')

# Last expression of the cell: its return value is what the notebook displays.
es2.normalize_entity(base_entity_id='recordings',
                     new_entity_id='cycles',
                     index='time_in_cycles')


Entityset: Dataset2
  Entities:
    recordings (shape = [41214, 27])
    engines (shape = [248, 1])
    cycles (shape = [486, 1])
  Relationships:
    recordings.engine_no -> engines.engine_no
    recordings.time_in_cycles -> cycles.time_in_cycles

In [25]:
# Evaluate the feature definitions returned by `ft.dfs` (`features`) against
# the test EntitySet, producing the test feature matrix.
fm2 = ft.calculate_feature_matrix(
    features=features,
    entityset=es2,
    verbose=True,
)

Elapsed: 00:09 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 1/1 cutoff times


In [27]:
# Score the test feature matrix with `reg` (the regressor left over from the
# last cross-validation fold).
# The training features were NaN-filled with 0 before fitting, so the same
# fill is applied here; otherwise missing aggregates would reach the model
# unprocessed.
preds = reg.predict(fm2.fillna(0))

out = fm2.copy()
out['predictions'] = preds

# Predictions for engine 1 only; a single `.loc` selects rows and columns in
# one step instead of chained indexing.
out.loc[out['engine_no'] == 1, ['engine_no', 'predictions']]

Unnamed: 0_level_0,engine_no,predictions
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,77.3
1,1,77.2
2,1,77.2
3,1,77.2
4,1,77.4
5,1,77.2
6,1,77.3
7,1,77.4
8,1,77.3
9,1,77.3
