# Feature Engineering Part 2

Here, the Feature Engineering process defined in Part 1 is applied to the full dataset.

## Libraries

In [1]:
import numpy as np 
import pandas as pd 
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute
from cnr_methods import get_simplified_data, transform_data

## Feature Engineering Functions

Here, the whole process created in Part 1 is organized into functions before being applied to the data.

In [2]:
def get_selected_features(n_features):
    """Return the first ``n_features`` feature names from the importance file.

    Reads ``Importance_WF1.csv`` from the ``Feature Selection`` directory,
    resolved relative to the current working directory, and returns the first
    ``n_features`` entries of its ``feature`` column in file order.

    Parameters
    ----------
    n_features : int or None
        Number of leading rows to keep; ``None`` keeps every row.

    Returns
    -------
    pandas.Series
        The ``feature`` column restricted to the first ``n_features`` rows.

    Notes
    -----
    Presumably the file is already sorted by decreasing importance, so that the
    leading rows are the most important features - confirm against the notebook
    that writes it.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring `os` into scope.
    import os

    # Build the path with the platform separator instead of a hard-coded
    # backslash, which only resolves on Windows.
    importance_path = os.path.join('Feature Selection', 'Importance_WF1.csv')
    ranking = pd.read_csv(importance_path)
    return ranking['feature'].iloc[:n_features]

In [3]:
def get_manual_features(feature_data):
    """Build hand-crafted time-series features from one frame of weather data.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex`` that contains at least the columns
        ``T``, ``CLCT``, ``U_100m``, ``V_100m``, ``U_10m`` and ``V_10m``.
        Any other column is passed through unchanged. The input frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as the input, in which the six base
        columns listed above have been dropped and the following columns added:

        * ``Wind Speed <h>`` / ``Wind Direction <h>`` for ``h`` in 100m, 10m;
        * ``<col>_lag_{7,14,21}_days``;
        * ``Month_Number`` plus ``<col>_Last_Month_Mean`` and
          ``<col>Last_Month_Variance``;
        * sine/cosine encodings of day, hour, minute, day of year, day of week;
        * ``<col>_Distance_Max`` / ``<col>_Distance_Min``;
        * rolling (7 and 14 rows) mean and variance;
        * expanding-window minimum and maximum.

    Notes
    -----
    * ``shift(n)`` and ``rolling(window=n)`` operate on *rows*; the ``_days``
      naming is only accurate if the index is sampled once per day.
    * The "last month" statistics are pooled per calendar month over the whole
      frame (all years together) and then attached to the following calendar
      month; they are not restricted to the chronologically preceding month.
    """
    # Work on a copy: callers pass slices of a larger frame, and adding columns
    # to a slice mutates/aliases the caller's data and triggers
    # SettingWithCopyWarning.
    feature_data = feature_data.copy()

    index = feature_data.index
    features = ['T', 'CLCT', 'U_100m', 'V_100m', 'U_10m', 'V_10m']

    # Wind Speed Vector
    for height in ('100m', '10m'):
        u_component = feature_data['U_' + height]
        v_component = feature_data['V_' + height]
        feature_data['Wind Speed ' + height] = np.sqrt(u_component ** 2 + v_component ** 2)
        # arctan2 keeps the quadrant and is defined when U == 0. The angle is
        # converted to degrees before being wrapped, so the 360 wrap and the
        # angle share the same unit.
        # NOTE(review): this is the mathematical angle of the (U, V) vector
        # (counter-clockwise from the +U axis), not the meteorological
        # "direction the wind blows from" - confirm which one is wanted.
        feature_data['Wind Direction ' + height] = (
            np.degrees(np.arctan2(v_component, u_component)) % 360
        )

    # Time Relative Variables

    for column in features:
        for lag in (7, 14, 21):
            feature_data[column + '_lag_' + str(lag) + '_days'] = feature_data[column].shift(lag)

    feature_data['Month_Number'] = feature_data.index.month  # Month Number

    # Select the numeric base columns *before* aggregating, so non-numeric
    # pass-through columns (e.g. string identifiers) are never averaged.
    by_month = feature_data.groupby('Month_Number')[features]
    mean_month = by_month.mean()
    variance_month = by_month.var()

    mean_month.columns = mean_month.columns + '_Last_Month_Mean'
    # NOTE(review): this suffix lacks the leading underscore used by the mean
    # columns. It is kept as-is because renaming the output columns could break
    # consumers that already reference these names.
    variance_month.columns = variance_month.columns + 'Last_Month_Variance'

    # Both frames are indexed by Month_Number, so an index join lines them up.
    month_data = mean_month.join(variance_month, how='left').reset_index()
    # Re-key each month's statistics onto the following month (December -> January).
    month_data['Month_Number'] = month_data['Month_Number'] + 1
    month_data['Month_Number'] = month_data['Month_Number'].replace({13: 1})

    # merge() returns a frame with a fresh integer index; a left merge keeps the
    # row order of the left frame, so the original index can be put back.
    feature_data = feature_data.merge(month_data, on='Month_Number', how='left')
    feature_data.index = index

    # Periodical Features

    day = np.asarray(index.day)
    hour = np.asarray(index.hour)
    minute = np.asarray(index.minute)
    dayofweek = np.asarray(index.dayofweek)
    dayofyear = np.asarray(index.dayofyear)
    days_in_month = np.asarray(index.days_in_month)

    feature_data["cos_day"], feature_data["sin_day"] = (
        np.cos(2 * np.pi * (day - 1) / days_in_month),
        np.sin(2 * np.pi * (day - 1) / days_in_month),
    )

    feature_data["cos_hour"], feature_data["sin_hour"] = (
        np.cos(2 * np.pi * hour / 24),
        np.sin(2 * np.pi * hour / 24),
    )

    feature_data["cos_minute"], feature_data["sin_minute"] = (
        np.cos(2 * np.pi * minute / 60),
        np.sin(2 * np.pi * minute / 60),
    )

    feature_data["cos_dayofyear"], feature_data["sin_dayofyear"] = (
        np.cos(2 * np.pi * (dayofyear - 1) / 365),
        np.sin(2 * np.pi * (dayofyear - 1) / 365),
    )

    feature_data["cos_dayofweek"], feature_data["sin_dayofweek"] = (
        np.cos(2 * np.pi * dayofweek / 7),
        np.sin(2 * np.pi * dayofweek / 7),
    )

    # Distance from Max and Min

    for column in features:
        # Whole days between each row's timestamp and the timestamp at which the
        # column reaches its overall maximum / minimum (negative before it).
        time_of_max = feature_data[column].idxmax()
        time_of_min = feature_data[column].idxmin()
        feature_data[column + '_Distance_Max'] = np.asarray((index - time_of_max).days)
        feature_data[column + '_Distance_Min'] = np.asarray((index - time_of_min).days)

    # Rolling Window Statistics

    for column in features:
        feature_data[column + '_Rolling_7_Window_Mean'] = feature_data[column].rolling(window=7).mean()
        feature_data[column + '_Rolling_14_Window_Mean'] = feature_data[column].rolling(window=14).mean()
        feature_data[column + '_Rolling_7_Window_Variance'] = feature_data[column].rolling(window=7).var()
        feature_data[column + '_Rolling_14_Window_Variance'] = feature_data[column].rolling(window=14).var()

    # Expanded Window Statistics

    for column in features:
        expanding_window = feature_data[column].expanding()
        feature_data[column + '_Expanded_Window_Min'] = expanding_window.min()
        # Stored under its own name so the running minimum is no longer
        # overwritten by the running maximum.
        feature_data[column + '_Expanded_Window_Max'] = expanding_window.max()

    # Dropping Base Features
    feature_data = feature_data.drop(features, axis=1)

    return feature_data

In [4]:
def get_features_data(X):
    """Apply ``get_manual_features`` per wind farm and attach the result to ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame with a ``WF`` column identifying the wind farm of each row,
        plus whatever columns ``get_manual_features`` requires.

    Returns
    -------
    pandas.DataFrame
        ``X`` concatenated column-wise with the engineered features. Columns
        that exist in both frames (for example ``WF``) appear twice; the caller
        is expected to de-duplicate them.
    """
    engineered_parts = []
    row_positions = []
    for WF in X['WF'].unique():
        in_farm = (X['WF'] == WF).values

        # Hand over a copy so the feature builder can add columns freely
        # without writing into a slice of X.
        X_WF = get_manual_features(X[in_farm].copy())

        X_WF['WF'] = WF
        engineered_parts.append(X_WF)
        # Remember where these rows sit in X so the original order can be restored.
        row_positions.append(np.flatnonzero(in_farm))

    if not engineered_parts:
        # Empty input: nothing to engineer, and pd.concat rejects an empty list.
        return pd.concat([X, pd.DataFrame()], axis=1)

    # One concat over the collected parts instead of growing a frame in the loop.
    feature_data = pd.concat(engineered_parts, axis=0)

    # Put the engineered rows back in X's row order. With identical indexes the
    # column-wise concat below aligns positionally, which also works when the
    # same timestamp occurs for several wind farms.
    restore_order = np.argsort(np.concatenate(row_positions), kind='stable')
    feature_data = feature_data.iloc[restore_order]

    feature_data = pd.concat([X, feature_data], axis=1)

    return feature_data

## Process Application

Here, the Feature Engineering is applied to the full data.

In [5]:
# get_simplified_data() returns a pair, unpacked here as the feature frame and
# the training target.
X, y_train = get_simplified_data()

# Separate the rows by the value of their 'Set' column.
X_train = X.loc[X['Set'] == 'Train']
X_test = X.loc[X['Set'] == 'Test']

In [6]:
# Build the engineered features for the training rows.
# X_train is overwritten, so re-running this cell on its own feeds the
# already-engineered frame back into get_features_data.
X_train = get_features_data(X_train)

In [7]:
# Build the engineered features for the test rows.
# X_test is overwritten, so re-running this cell on its own feeds the
# already-engineered frame back into get_features_data.
X_test = get_features_data(X_test)

In [8]:
# Tag every row with the split it belongs to before stacking the two frames.
X_train['Set'] = 'Train'
X_test['Set'] = 'Test'

# Stack train on top of test.
feature_data = pd.concat([X_train, X_test], axis=0)

# Keep only the first occurrence of each column name.
feature_data = feature_data.loc[:, np.logical_not(feature_data.columns.duplicated())]

# NOTE(review): the destination is an absolute, machine-specific path; the cell
# only runs on a machine where this directory exists.
feature_data.to_csv(
    r'C:\Users\andre_\OneDrive\Documentos\Feature Selection\Selected_Features_Data.csv'
)