# Feature Engineering

## Libraries

In [1]:
import numpy as np
import pandas as pd
from cnr_methods import get_simplified_data, transform_data

# Feature Engineering Library for Time Series
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table

# Feature Selection Libraries
from lofo import LOFOImportance, Dataset, plot_importance
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, mean_absolute_error
import xgboost as xgb

## Read Data

For this pipeline, only Training Set will be used.

In [2]:
full_data = get_simplified_data()
full_data = full_data[full_data['Set']=='Train']
y_train = pd.read_csv('Data/Y_train.csv')

As done in the other Notebooks, we will transform the Column 'Time' to Datetime format and set as the index of the dataset.

In [3]:
full_data['Time'] = pd.to_datetime(full_data['Time'],dayfirst=True)
full_data = full_data.set_index('Time')

In [4]:
full_data.head()

Unnamed: 0_level_0,ID,WF,U_100m,V_100m,U_10m,V_10m,T,CLCT,Set
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-05-01 01:00:00,1,WF1,-2.2485,-3.2578,1.254603,-0.289687,286.44,82.543144,Train
2018-05-01 02:00:00,2,WF1,-2.4345,-1.4461,2.490908,-0.41337,286.26,99.990844,Train
2018-05-01 03:00:00,3,WF1,-1.220571,-0.266871,0.997093,-1.415138,286.575,98.367235,Train
2018-05-01 04:00:00,4,WF1,3.7065,-6.2174,0.689598,-0.961441,284.78,94.860604,Train
2018-05-01 05:00:00,5,WF1,3.8134,-5.4446,0.290994,-0.294963,284.46,95.905879,Train


To simplify the work, we will generate features for just one Wind Farm. When doing modelling, the features, as the models, will be generated for all Wind Farms separately.

In [5]:
WF = 'WF1'
data = full_data[full_data['WF']==WF]
y_train = y_train[y_train['ID'].isin(data['ID'])]

## Feature Creation

### Wind Speed Vector

In [6]:
feature_data = data[['ID','WF','U_100m','V_100m','U_10m','V_10m','T','CLCT','Set']]
feature_data['Wind Speed 100m'] = np.sqrt(feature_data['U_100m']**2 + feature_data['V_100m']**2)
feature_data['Wind Direction 100m'] = np.arctan(feature_data['V_100m']/feature_data['U_100m'])
feature_data['Wind Speed 10m'] = np.sqrt(feature_data['U_10m']**2 + feature_data['V_10m']**2)
feature_data['Wind Direction 10m'] = np.arctan(feature_data['V_10m']/feature_data['U_10m'])
feature_data = feature_data.drop(['U_100m','V_100m','U_10m','V_10m'],axis=1)

Changing Reference for Negative Angles:

In [7]:
feature_data['Wind Direction 100m'] = feature_data['Wind Direction 100m'].apply(lambda x: 360 + x if x < 0 else x)
feature_data['Wind Direction 10m'] = feature_data['Wind Direction 10m'].apply(lambda x: 360 + x if x < 0 else x)

Using Wind Speed and Direction instead of U and V, we will create some variables over the Numerical Variables from the simplified data.

In [8]:
feature_data[feature_data['Wind Direction 100m'] < 0]['Wind Direction 100m'] = 360 - feature_data[feature_data['Wind Direction 100m'] < 0]['Wind Direction 100m']

In [9]:
features = ['T', 'CLCT', 'Wind Speed 100m','Wind Direction 100m', 'Wind Speed 10m', 'Wind Direction 10m']

### Time-Relative Variables

Here, we get Values for Last Week and Month for each Numerical Feature.

In [10]:
for column in features:
    feature_data[column + '_last_week'] = feature_data[column].shift(7)
    feature_data[column + '_last_month'] = feature_data[column].shift(30)

Get the Number of Month:

In [11]:
feature_data['Month_Number'] = feature_data.index.month

Month Statistics:

In [12]:
mean = feature_data.groupby('Month_Number').mean()[features]
median = feature_data.groupby('Month_Number').median()[features]
variance = feature_data.groupby('Month_Number').var()[features]

In [13]:
mean.columns = mean.columns + '_Month_Mean'
median.columns = median.columns + '_Month_Median'
variance.columns = variance.columns + '_Month_Variance'

In [14]:
feature_data = feature_data.merge(mean,on='Month_Number',how='left')
feature_data = feature_data.merge(median,on='Month_Number',how='left')
feature_data = feature_data.merge(variance,on='Month_Number',how='left')

In [15]:
feature_data.index = data.index

### Distance from Features

Distance of Position of Max and Min (Already on Tsfresh, check it later):

In [16]:
for column in features:
    feature_data[column + '_Distance_Max'] = feature_data.index - feature_data[column].idxmax()
    feature_data[column + '_Distance_Min'] = feature_data.index - feature_data[column].idxmin()
    feature_data[column + '_Distance_Max'] = feature_data[column + '_Distance_Max'].apply(lambda x : x.days)
    feature_data[column + '_Distance_Min'] = feature_data[column + '_Distance_Min'].apply(lambda x : x.days)

### Rolling Window Variables

### Wavelet Transformations (Check)

## Tsfresh

Now we use Tsfresh, a Python Library that automates Feature Engineering for Time Series Data. We generate new features for all the columns on the Simplified Data, as done below.

In [17]:
data = data[['ID','WF','U_100m','V_100m','U_10m','V_10m','T','CLCT','Set']]

In [18]:
tsfresh_data = pd.DataFrame()
for variable in ['U_100m','V_100m','U_10m','V_10m','T','CLCT']: 
    df_shift, y = make_forecasting_frame(data[variable],kind=variable,max_timeshift=20,rolling_direction=1)
    X = extract_features(df_shift, column_id="id", column_sort="time", column_value="value", impute_function=impute,show_warnings=False,n_jobs=3)
    X['Feature'] = variable
    tsfresh_data = tsfresh_data.append(X)

Feature Extraction: 100%|██████████| 15/15 [02:14<00:00,  8.96s/it]
Feature Extraction: 100%|██████████| 15/15 [02:18<00:00,  9.21s/it]
Feature Extraction: 100%|██████████| 15/15 [02:13<00:00,  8.87s/it]
Feature Extraction: 100%|██████████| 15/15 [02:12<00:00,  8.87s/it]
Feature Extraction: 100%|██████████| 15/15 [02:08<00:00,  8.58s/it]
Feature Extraction: 100%|██████████| 15/15 [01:45<00:00,  7.03s/it]


Process tsfresh_data to pass column 'Features' to the other columns

In [19]:
tsfresh_data = tsfresh_data.pivot(columns='Feature')

In [20]:
tsfresh_data.columns = tsfresh_data.columns.map('{0[0]}|{0[1]}'.format)

In [21]:
tsfresh_data.head()

Unnamed: 0_level_0,value__abs_energy|CLCT,value__abs_energy|T,value__abs_energy|U_100m,value__abs_energy|U_10m,value__abs_energy|V_100m,value__abs_energy|V_10m,value__absolute_sum_of_changes|CLCT,value__absolute_sum_of_changes|T,value__absolute_sum_of_changes|U_100m,value__absolute_sum_of_changes|U_10m,...,value__variance|U_100m,value__variance|U_10m,value__variance|V_100m,value__variance|V_10m,value__variance_larger_than_standard_deviation|CLCT,value__variance_larger_than_standard_deviation|T,value__variance_larger_than_standard_deviation|U_100m,value__variance_larger_than_standard_deviation|U_10m,value__variance_larger_than_standard_deviation|V_100m,value__variance_larger_than_standard_deviation|V_10m
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-05-01 02:00:00,6813.370572,82047.8736,5.055752,1.57403,10.613261,0.083919,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-05-01 03:00:00,16811.539518,163992.6612,10.982542,7.778652,12.704466,0.254793,17.447701,0.18,0.186,1.236305,...,0.008649,0.382112,0.820564,0.003824,1.0,0.0,0.0,0.0,0.0,0.0
2018-05-01 04:00:00,26487.652359,246117.891825,12.472335,8.772845,12.775686,2.257409,19.07131,0.495,1.399929,2.73012,...,0.284984,0.425138,1.513166,0.253942,1.0,0.0,0.0,0.0,1.0,0.0
2018-05-01 05:00:00,35486.186521,327217.540225,26.210477,9.248391,51.431749,3.181777,22.577941,2.29,6.327,3.037614,...,6.250924,0.467797,5.034489,0.202684,1.0,0.0,1.0,0.0,1.0,0.0
2018-05-01 06:00:00,44684.124127,408135.031825,40.752497,9.333069,81.075418,3.268781,23.623216,2.61,6.4339,3.436218,...,8.045999,0.556415,5.149121,0.198239,1.0,0.0,1.0,0.0,1.0,0.0


In [22]:
tsfresh_data = tsfresh_data.fillna(0)

## Feature Selection

In [23]:
final_features = feature_data.merge(tsfresh_data,left_on=feature_data.index,right_on=tsfresh_data.index,how='left')

final_features = final_features.merge(y_train,on='ID',how='left')

final_features = final_features.rename({'key_0':'Date'},axis=1)

In [24]:
features = final_features.drop(['ID','WF','Set','Date','Production'],axis=1).columns

In [25]:
final_features = transform_data(final_features[features],1)
final_features = final_features.fillna(0)

In [26]:
final_features

Unnamed: 0,T,CLCT,Wind Speed 100m,Wind Direction 100m,Wind Speed 10m,Wind Direction 10m,T_last_week,T_last_month,CLCT_last_week,CLCT_last_month,...,value__variance|U_100m,value__variance|U_10m,value__variance|V_100m,value__variance|V_10m,value__variance_larger_than_standard_deviation|CLCT,value__variance_larger_than_standard_deviation|T,value__variance_larger_than_standard_deviation|U_100m,value__variance_larger_than_standard_deviation|U_10m,value__variance_larger_than_standard_deviation|V_100m,value__variance_larger_than_standard_deviation|V_10m
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.000629,0.191758,-0.334998,-0.589742,0.673441,0.000174,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.001100,-0.016371,-0.818177,-0.912297,-0.377457,-0.002205,0.000000,0.000000,0.0,0.0,...,inf,inf,inf,inf,inf,0.0,0.0,0.0,0.0,0.0
3,-0.006283,-0.036299,1.756731,7.419152,-0.380569,0.000023,0.000000,0.000000,0.0,0.0,...,3.494991,0.106700,0.611967,4.195722,0.0,0.0,0.0,0.0,inf,0.0
4,-0.001124,0.010959,-0.085198,0.000204,-1.049264,0.000436,0.000000,0.000000,0.0,0.0,...,3.088050,0.095619,1.202108,-0.225455,0.0,0.0,inf,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6234,-0.002239,0.000000,-0.031532,-0.046461,-0.114314,-5.451013,0.003513,-0.000544,0.0,-inf,...,-0.033698,-0.035140,0.069542,0.289901,0.0,0.0,0.0,0.0,0.0,0.0
6235,0.002080,0.000000,-0.341882,-0.000802,-0.107440,-0.069196,0.001692,-0.002763,0.0,0.0,...,-0.051091,-0.055341,0.053464,0.214786,0.0,0.0,0.0,0.0,0.0,0.0
6236,-0.005176,0.000000,0.162839,-0.090606,-0.077536,-0.108617,-0.000030,-0.000102,0.0,0.0,...,-0.180314,-0.062798,0.046493,0.158688,0.0,0.0,0.0,0.0,0.0,0.0
6237,-0.001007,inf,-0.151016,-0.137888,-0.067829,0.057593,-0.001697,-0.001582,0.0,0.0,...,-0.229285,-0.116145,-0.042519,0.116120,0.0,0.0,0.0,0.0,0.0,0.0


For a faster Feature Selection, here we do Five Subsamples of Data, and the Final Importance of Features will be the mean of the Importance calculated for each Subsample.

In [47]:
final_features = final_features.sample(frac=0.05,random_state=0)

In [48]:
cv = KFold(n_splits=4, shuffle=False, random_state=0)

In [49]:
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [50]:
xgboost = xgb.XGBRegressor(tree_method='gpu_hist',max_depth=5,n_jobs=3)

In [51]:
dataset = Dataset(df=final_features, target="Production", features=features)

In [52]:
lofo_imp = LOFOImportance(dataset, cv=cv, scoring=scorer,model=xgboost)

In [53]:
importance_df = lofo_imp.get_importance()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
plot_importance(importance_df, figsize=(12, 20))

In [58]:
final_features

Unnamed: 0,T,CLCT,Wind Speed 100m,Wind Direction 100m,Wind Speed 10m,Wind Direction 10m,T_last_week,T_last_month,CLCT_last_week,CLCT_last_month,...,value__variance|U_10m,value__variance|V_100m,value__variance|V_10m,value__variance_larger_than_standard_deviation|CLCT,value__variance_larger_than_standard_deviation|T,value__variance_larger_than_standard_deviation|U_100m,value__variance_larger_than_standard_deviation|U_10m,value__variance_larger_than_standard_deviation|V_100m,value__variance_larger_than_standard_deviation|V_10m,Production
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,-0.000629,0.191758,-0.334998,-0.589742,0.673441,0.000174,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.252763
2,0.001100,-0.016371,-0.818177,-0.912297,-0.377457,-0.002205,0.000000,0.000000,0.0,0.0,...,inf,inf,inf,inf,0.0,0.0,0.0,0.0,0.0,1.145132
3,-0.006283,-0.036299,1.756731,7.419152,-0.380569,0.000023,0.000000,0.000000,0.0,0.0,...,0.106700,0.611967,4.195722,0.0,0.0,0.0,0.0,inf,0.0,0.572519
4,-0.001124,0.010959,-0.085198,0.000204,-1.049264,0.000436,0.000000,0.000000,0.0,0.0,...,0.095619,1.202108,-0.225455,0.0,0.0,inf,0.0,0.0,0.0,0.050010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6234,-0.002239,0.000000,-0.031532,-0.046461,-0.114314,-5.451013,0.003513,-0.000544,0.0,-inf,...,-0.035140,0.069542,0.289901,0.0,0.0,0.0,0.0,0.0,0.0,-1.163151
6235,0.002080,0.000000,-0.341882,-0.000802,-0.107440,-0.069196,0.001692,-0.002763,0.0,0.0,...,-0.055341,0.053464,0.214786,0.0,0.0,0.0,0.0,0.0,0.0,0.182322
6236,-0.005176,0.000000,0.162839,-0.090606,-0.077536,-0.108617,-0.000030,-0.000102,0.0,0.0,...,-0.062798,0.046493,0.158688,0.0,0.0,0.0,0.0,0.0,0.0,-2.379546
6237,-0.001007,inf,-0.151016,-0.137888,-0.067829,0.057593,-0.001697,-0.001582,0.0,0.0,...,-0.116145,-0.042519,0.116120,0.0,0.0,0.0,0.0,0.0,0.0,-0.223144
