# Feature Engineering Part 2

Here, the Feature Engineering process defined in Part 1 is applied to the full dataset.

## Libraries

In [1]:
import os

import numpy as np 
import pandas as pd 
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute

from cnr_methods import get_simplified_data, transform_data

## Feature Engineering Functions

Here, the whole process created in Part 1 is organized into functions before being applied to the data.

In [2]:
def get_selected_features(n_features):
    """Return the first ``n_features`` feature names from the importance file.

    The names are read from the 'feature' column of
    'Feature Selection/Importance_WF1.csv', resolved relative to the current
    working directory. The rows are taken in file order; presumably the file
    is already sorted by importance, since nothing is sorted here.

    Parameters
    ----------
    n_features : int
        Number of leading rows to keep.

    Returns
    -------
    pandas.Series
        The 'feature' values of the first ``n_features`` rows.
    """
    # Build the path with os.path.join so the separator is right on every OS;
    # the previous hard-coded backslash only resolved on Windows.
    importance_path = os.path.join('Feature Selection', 'Importance_WF1.csv')
    importance = pd.read_csv(importance_path)
    return importance.iloc[:n_features]['feature']

In [3]:
def get_manual_features(feature_data):
    """Build the hand-crafted features for one time-indexed weather frame.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        Frame with a DatetimeIndex and the columns 'T', 'CLCT', 'U_100m',
        'V_100m', 'U_10m' and 'V_10m'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input on the same index in which the six base columns
        are replaced by:

        * wind speed and direction at 100m and 10m,
        * row-lagged copies of each base column,
        * 'Month_Number' / 'Quarter_Number' plus the per-month and
          per-quarter mean, median and variance of each base column,
        * sine/cosine encodings of day, hour, minute, day of year and
          day of week,
        * the distance, in whole days, from each row to the timestamp of
          each base column's maximum and minimum.

        Any other input column is passed through untouched.

    Notes
    -----
    Wind direction is the angle of the (U, V) vector in degrees, measured
    counter-clockwise from the +U axis and wrapped to 0-360. It is computed
    with ``arctan2`` so that the quadrant is kept and U == 0 is handled.
    """
    # Work on a copy: the caller usually passes a slice of a bigger frame and
    # must not see the helper columns added below.
    feature_data = feature_data.copy()
    index = feature_data.index
    features = ['T', 'CLCT', 'U_100m', 'V_100m', 'U_10m', 'V_10m']

    # Wind Speed Vector
    for height in ('100m', '10m'):
        u = feature_data['U_' + height].to_numpy()
        v = feature_data['V_' + height].to_numpy()
        feature_data['Wind Speed ' + height] = np.sqrt(u**2 + v**2)
        # arctan2 returns radians in (-pi, pi]; convert to degrees before
        # wrapping negatives so the 360 offset is in the same unit.
        feature_data['Wind Direction ' + height] = np.degrees(np.arctan2(v, u)) % 360

    # Time Relative Variables
    # NOTE(review): shift() lags by rows, not by calendar time. On an hourly
    # index these are 7-row and 30-row lags rather than a week and a month;
    # confirm the intended horizon before changing the column names.
    for column in features:
        feature_data[column + '_last_week'] = feature_data[column].shift(7)
        feature_data[column + '_last_month'] = feature_data[column].shift(30)

    feature_data['Month_Number'] = index.month  # Month Number
    feature_data['Quarter_Number'] = index.quarter  # Quarter Number

    # (grouping key, statistic, column suffix), merged in this order.
    # NOTE(review): '_Quarterh_Median' looks like a typo but is kept because
    # feature lists produced elsewhere may refer to these exact column names.
    aggregates = [
        ('Month_Number', 'mean', '_Month_Mean'),
        ('Month_Number', 'median', '_Month_Median'),
        ('Month_Number', 'var', '_Month_Variance'),
        ('Quarter_Number', 'mean', '_Quarter_Mean'),
        ('Quarter_Number', 'median', '_Quarterh_Median'),
        ('Quarter_Number', 'var', '_Quarter_Variance'),
    ]
    for key, statistic, suffix in aggregates:
        # Select the base columns before aggregating so that non-numeric
        # columns in the frame never reach the reduction.
        stats = feature_data.groupby(key)[features].agg(statistic).add_suffix(suffix)
        feature_data = feature_data.merge(stats, on=key, how='left')
    # merge() does not keep the datetime index; the left join preserves row
    # order, so the original index can be put back positionally.
    feature_data.index = index

    # Periodical Features
    # (name, position within the cycle, cycle length)
    cycles = [
        ('day', index.day - 1, index.days_in_month),
        ('hour', index.hour, 24),
        ('minute', index.minute, 60),
        ('dayofyear', index.dayofyear - 1, 365),
        ('dayofweek', index.dayofweek, 7),
    ]
    for name, position, period in cycles:
        angle = np.asarray(2 * np.pi * position / period, dtype=float)
        feature_data['cos_' + name] = np.cos(angle)
        feature_data['sin_' + name] = np.sin(angle)

    # Distance from Max and Min
    for column in features:
        values = feature_data[column]
        # `.days` floors towards negative infinity, so rows earlier than the
        # extreme by less than a day get -1.
        feature_data[column + '_Distance_Max'] = np.asarray(
            (index - values.idxmax()).days, dtype='int64')
        feature_data[column + '_Distance_Min'] = np.asarray(
            (index - values.idxmin()).days, dtype='int64')

    # Dropping Base Features ('Month_Number' and 'Quarter_Number' are kept)
    feature_data = feature_data.drop(columns=features)

    return feature_data

In [4]:
def get_tsfresh_features(data, max_timeshift=20, n_jobs=3):
    """Extract tsfresh features for each weather variable and join them side by side.

    For each of 'U_100m', 'V_100m', 'U_10m', 'V_10m', 'T' and 'CLCT' the
    column is turned into a forecasting frame with
    ``make_forecasting_frame`` and passed to ``extract_features`` with
    ``impute`` as the imputing function. The per-variable results are stacked,
    pivoted so that every (tsfresh feature, variable) pair becomes one column
    named '<tsfresh feature>|<variable>', and columns holding a single
    distinct value are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the six variable columns.
    max_timeshift : int, default 20
        Forwarded to ``make_forecasting_frame``.
    n_jobs : int, default 3
        Forwarded to ``extract_features``.

    Returns
    -------
    pandas.DataFrame
        One column per retained '<tsfresh feature>|<variable>' pair.
    """
    variables = ['U_100m', 'V_100m', 'U_10m', 'V_10m', 'T', 'CLCT']

    extracted = []
    for variable in variables:
        # The second return value (the forecasting target) is not used here.
        df_shift, _ = make_forecasting_frame(
            data[variable],
            kind=variable,
            max_timeshift=max_timeshift,
            rolling_direction=1,
        )
        X = extract_features(
            df_shift,
            column_id="id",
            column_sort="time",
            column_value="value",
            impute_function=impute,
            show_warnings=False,
            n_jobs=n_jobs,
        )
        X['Feature'] = variable
        extracted.append(X)

    # Stack once at the end: DataFrame.append is gone from pandas 2.0 and
    # re-copied the accumulated frame on every iteration.
    tsfresh_data = pd.concat(extracted)

    # Formatting the Data
    tsfresh_data = tsfresh_data.pivot(columns='Feature')
    tsfresh_data.columns = tsfresh_data.columns.map('{0[0]}|{0[1]}'.format)
    tsfresh_data = tsfresh_data.loc[:, tsfresh_data.nunique() != 1]  # Remove Constant Columns

    return tsfresh_data

In [5]:
def get_features_data(X, selected_features):
    """Engineer features separately for every 'WF' group and keep the selected columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame with a 'WF' column and the columns required by
        ``get_manual_features`` and ``get_tsfresh_features``.
    selected_features : list-like
        Column labels to keep in the result.

    Returns
    -------
    pandas.DataFrame
        ``X`` joined column-wise with the engineered features, restricted to
        ``selected_features``.
    """
    per_group = []
    for WF in X['WF'].unique():
        X_WF = X[X['WF'] == WF]

        X_manual = get_manual_features(X_WF)
        X_tsfresh = get_tsfresh_features(X_WF)
        per_group.append(pd.concat([X_manual, X_tsfresh], axis=1))

    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all previous groups on every iteration.
    engineered = pd.concat(per_group, axis=0) if per_group else pd.DataFrame()

    # NOTE(review): get_manual_features passes non-base columns such as 'WF'
    # through, so those labels appear twice after this join; selecting one of
    # them would return both copies.
    feature_data = pd.concat([X, engineered], axis=1)

    return feature_data[selected_features]

## Process Application

Here, the feature engineering is applied to the full data.

In [6]:
# Load the data; the second return value is stored as the training target.
X, y_train = get_simplified_data()

# Split the rows by the value of the 'Set' column.
X_train = X.loc[X['Set'] == 'Train']
X_test = X.loc[X['Set'] == 'Test']

In [7]:
# Keep the first 50 feature names listed in the importance file.
selected_features = get_selected_features(n_features=50)

In [8]:
# Build the engineered feature matrix for the training rows.
# NOTE(review): this rebinds X_train, so re-running the cell passes the
# already-transformed frame back in; re-run the loading cell first.
X_train = get_features_data(X=X_train, selected_features=selected_features)
# The equivalent call for the test rows is currently disabled:
#X_test = get_features_data(X_test,selected_features)

Feature Extraction: 100%|██████████| 15/15 [02:29<00:00,  9.96s/it]
Feature Extraction: 100%|██████████| 15/15 [02:27<00:00,  9.85s/it]
Feature Extraction: 100%|██████████| 15/15 [02:28<00:00,  9.90s/it]
Feature Extraction: 100%|██████████| 15/15 [02:31<00:00, 10.10s/it]
Feature Extraction: 100%|██████████| 15/15 [02:21<00:00,  9.46s/it]
Feature Extraction: 100%|██████████| 15/15 [01:51<00:00,  7.46s/it]
Feature Extraction: 100%|██████████| 15/15 [02:28<00:00,  9.89s/it]
Feature Extraction: 100%|██████████| 15/15 [02:28<00:00,  9.91s/it]
Feature Extraction: 100%|██████████| 15/15 [02:27<00:00,  9.86s/it]
Feature Extraction: 100%|██████████| 15/15 [02:28<00:00,  9.91s/it]
Feature Extraction: 100%|██████████| 15/15 [02:24<00:00,  9.64s/it]
Feature Extraction: 100%|██████████| 15/15 [01:58<00:00,  7.87s/it]
Feature Extraction: 100%|██████████| 15/15 [02:28<00:00,  9.92s/it]
Feature Extraction: 100%|██████████| 15/15 [02:27<00:00,  9.86s/it]
Feature Extraction: 100%|██████████| 15/15 [02:2

In [9]:
# Display the engineered training features.
X_train

Unnamed: 0,Wind Speed 100m,value__energy_ratio_by_chunks__num_segments_10__segment_focus_9|V_100m,value__friedrich_coefficients__m_3__r_30__coeff_2|V_10m,value__time_reversal_asymmetry_statistic__lag_3|V_100m,value__c3__lag_2|V_10m,value__cid_ce__normalize_False|V_10m,value__quantile__q_0.1|V_100m,"value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_3__w_5|U_100m","value__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4|U_10m","value__fft_coefficient__coeff_9__attr_""real""|V_10m",...,"value__fft_coefficient__coeff_5__attr_""angle""|V_10m",value__quantile__q_0.3|T,value__minimum|U_10m,"value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_9__w_20|CLCT","value__agg_linear_trend__f_agg_""mean""__chunk_len_5__attr_""slope""|CLCT","value__fft_coefficient__coeff_8__attr_""imag""|U_10m","value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_2__w_10|CLCT",Wind Speed 10m,"value__linear_trend__attr_""stderr""|V_100m","value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_11__w_2|V_100m"
2018-05-01 01:00:00,3.958410,,,,,,,,,,...,,,,,,,,1.287613,,
2018-05-01 02:00:00,2.831607,0.000000,-0.170318,0.000000,0.000000,0.000000,-3.257800,1.301759,0.000000,-0.159832,...,19.832130,286.440000,1.254603,129.261055,0.000000,-0.036940,82.978398,2.524975,0.058691,-0.138579
2018-05-01 03:00:00,1.908954,0.000000,-0.170318,0.000000,0.000000,0.123682,-3.076630,1.301759,0.000000,-0.159832,...,19.832130,286.314000,1.254603,129.261055,0.000000,-0.036940,82.978398,1.731130,0.000000,-0.138579
2018-05-01 04:00:00,7.238384,0.000000,-0.004963,0.000000,0.000000,1.009375,-2.895460,1.301759,1.236305,-0.159832,...,19.832130,286.368000,0.997093,129.261055,0.000000,-0.036940,53.994460,1.183180,0.351995,-0.138579
2018-05-01 05:00:00,6.647232,0.000000,8.690463,0.000000,0.000000,1.106652,-5.329520,-0.057668,1.236305,-0.159832,...,19.832130,286.112000,0.689598,129.261055,0.000000,-0.036940,101.230389,0.414344,1.182930,-0.138579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-01-15 20:00:00,5.555136,0.052024,5.559128,164.200976,-121.642750,3.201610,-9.877035,0.895961,0.010199,-2.359539,...,170.743397,275.608000,-0.557474,85.957936,-5.518759,0.231413,56.902084,1.116643,0.032627,2.297913
2019-01-15 21:00:00,4.466715,0.049091,1.021078,182.541877,-114.762898,3.249738,-9.877035,1.053627,-0.016723,-1.381516,...,100.635951,275.577000,-0.557474,80.414405,-5.821506,-0.004651,56.132273,0.822711,0.030140,1.567985
2019-01-15 22:00:00,4.599713,0.041070,-0.305104,201.078845,-107.245347,3.254980,-9.877035,1.285723,-0.065147,-2.585641,...,115.336272,275.478245,-0.557474,77.610008,-8.022825,-0.212426,62.209185,0.707360,0.030125,0.392285
2019-01-15 23:00:00,4.094362,0.035231,-0.631844,209.261386,-99.095078,3.250074,-9.877035,1.339524,-0.121936,-1.630878,...,147.219146,275.207028,-0.557474,74.414578,-10.504460,0.050066,68.421053,0.545934,0.029514,-0.609207
