In [2]:
import numpy as np 
import pandas as pd 
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute
from cnr_methods import get_simplified_data, transform_data

In [3]:
def _cyclical_pair(values, period):
    """Return the (cos, sin) encoding of `values` on a cycle of length `period`."""
    angle = 2 * np.pi * np.asarray(values, dtype=float) / np.asarray(period, dtype=float)
    return np.cos(angle), np.sin(angle)


def get_manual_features(feature_data):
    """Build hand-crafted features from the raw weather columns.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex`` (the code reads ``.month``,
        ``.hour``, ``.days_in_month``, ... from the index) that contains the
        columns ``T``, ``CLCT``, ``U_100m``, ``V_100m``, ``U_10m`` and ``V_10m``.
        The frame is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Same index as the input. Every other input column is kept, and the six
        base columns above are replaced by:

        * wind speed and wind direction at 100 m and 10 m,
        * lagged copies of the base columns (7 and 30 rows back),
        * per-month and per-quarter mean / median / variance of the base columns,
        * cos/sin encodings of day, hour, minute, day of year and day of week,
        * distance in days to the timestamp of each base column's max and min.

    Raises
    ------
    KeyError
        If one of the required base columns is missing.
    """
    # Work on a copy: the caller usually passes a slice of a larger frame, and
    # writing new columns into it would mutate the caller's data (and trigger
    # SettingWithCopyWarning).
    feature_data = feature_data.copy()
    index = feature_data.index
    features = ['T', 'CLCT', 'U_100m', 'V_100m', 'U_10m', 'V_10m']

    # Wind Speed Vector
    for level in ('100m', '10m'):
        u_component = feature_data['U_' + level]
        v_component = feature_data['V_' + level]
        feature_data['Wind Speed ' + level] = np.sqrt(u_component ** 2 + v_component ** 2)
        # arctan2 keeps the quadrant and tolerates U == 0; the angle is converted
        # to degrees *before* wrapping into [0, 360) so units are not mixed.
        # This is the mathematical angle of the (U, V) vector.
        # NOTE(review): confirm whether the meteorological convention is wanted.
        feature_data['Wind Direction ' + level] = np.degrees(np.arctan2(v_component, u_component)) % 360

    # Time Relative Variables
    # NOTE(review): shift() moves by rows, so 7 / 30 mean "last week" / "last
    # month" only if the data has one row per day - confirm the sampling rate.
    for column in features:
        feature_data[column + '_last_week'] = feature_data[column].shift(7)
        feature_data[column + '_last_month'] = feature_data[column].shift(30)

    feature_data['Month_Number'] = index.month
    feature_data['Quarter_Number'] = index.quarter

    # (grouping key, aggregation, column suffix). The '_Quarterh_Median' spelling
    # is kept as-is because previously selected feature names may depend on it.
    period_statistics = [
        ('Month_Number', 'mean', '_Month_Mean'),
        ('Month_Number', 'median', '_Month_Median'),
        ('Month_Number', 'var', '_Month_Variance'),
        ('Quarter_Number', 'mean', '_Quarter_Mean'),
        ('Quarter_Number', 'median', '_Quarterh_Median'),
        ('Quarter_Number', 'var', '_Quarter_Variance'),
    ]
    # Select the base columns *before* aggregating so that unrelated (possibly
    # non-numeric) columns are never fed to mean/median/var.
    statistic_frames = [
        (key, feature_data.groupby(key)[features].agg(statistic).add_suffix(suffix))
        for key, statistic, suffix in period_statistics
    ]
    for key, statistics in statistic_frames:
        # join(on=key) matches the key column against the statistics' index and
        # keeps the original row index and order.
        feature_data = feature_data.join(statistics, on=key)

    # Periodical Features
    feature_data["cos_day"], feature_data["sin_day"] = _cyclical_pair(index.day - 1, index.days_in_month)
    feature_data["cos_hour"], feature_data["sin_hour"] = _cyclical_pair(index.hour, 24)
    feature_data["cos_minute"], feature_data["sin_minute"] = _cyclical_pair(index.minute, 60)
    feature_data["cos_dayofyear"], feature_data["sin_dayofyear"] = _cyclical_pair(index.dayofyear - 1, 365)
    feature_data["cos_dayofweek"], feature_data["sin_dayofweek"] = _cyclical_pair(index.dayofweek, 7)

    # Distance from Max and Min (whole days, negative before the extreme)
    for column in features:
        feature_data[column + '_Distance_Max'] = np.asarray((index - feature_data[column].idxmax()).days)
        feature_data[column + '_Distance_Min'] = np.asarray((index - feature_data[column].idxmin()).days)

    # Dropping Base Features and the helper grouping keys
    columns_to_drop = features + ['Month_Number', 'Quarter_Number']
    return feature_data.drop(columns=columns_to_drop)

In [4]:
def get_tsfresh_features(data, features_dict):
    """Extract rolling-window tsfresh features for each raw weather variable.

    For every variable a forecasting frame is built with
    ``make_forecasting_frame`` (``max_timeshift=20``, ``rolling_direction=1``)
    and passed to ``extract_features`` with ``features_dict`` as
    ``kind_to_fc_parameters``. The per-variable results are then combined into
    one wide frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``U_100m``, ``V_100m``, ``U_10m``, ``V_10m``,
        ``T`` and ``CLCT``.
    features_dict : dict
        Forwarded unchanged to ``extract_features`` as ``kind_to_fc_parameters``.

    Returns
    -------
    pandas.DataFrame
        One row per tsfresh id, columns named ``'<tsfresh column>|<variable>'``.
        Columns holding exactly one distinct value are removed.
    """
    variables = ['U_100m', 'V_100m', 'U_10m', 'V_10m', 'T', 'CLCT']

    # Collect the per-variable frames and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and re-copied the whole frame on every call.
    extracted_frames = []
    for variable in variables:
        # The target series returned alongside the frame is not needed here.
        df_shift, _ = make_forecasting_frame(data[variable], kind=variable, max_timeshift=20, rolling_direction=1)
        extracted = extract_features(
            df_shift,
            column_id="id",
            column_sort="time",
            column_value="value",
            impute_function=impute,
            show_warnings=False,
            n_jobs=3,
            kind_to_fc_parameters=features_dict,
        )
        extracted['Feature'] = variable
        extracted_frames.append(extracted)
    tsfresh_data = pd.concat(extracted_frames)

    # Formatting the Data: one column per (tsfresh column, variable) pair,
    # flattened to '<tsfresh column>|<variable>'.
    tsfresh_data = tsfresh_data.pivot(columns='Feature')
    tsfresh_data.columns = tsfresh_data.columns.map('{0[0]}|{0[1]}'.format)

    # Remove Constant Columns
    tsfresh_data = tsfresh_data.loc[:, tsfresh_data.nunique() != 1]

    return tsfresh_data

In [5]:
def get_selected_features(n_features, importance_path=None):
    """Return the first ``n_features`` entries of the ``feature`` column.

    Parameters
    ----------
    n_features : int
        Number of leading rows to keep from the importance file.
    importance_path : str or path-like, optional
        CSV file with a ``feature`` column. Defaults to
        ``'Feature Selection/Importance_WF1.csv'`` relative to the working
        directory.

    Returns
    -------
    pandas.Series
        The ``feature`` values of the first ``n_features`` rows, in file order.
    """
    if importance_path is None:
        # Forward slash instead of a backslash: resolves on Windows and POSIX.
        importance_path = 'Feature Selection/Importance_WF1.csv'
    importance = pd.read_csv(importance_path)
    return importance['feature'].iloc[:n_features]

## Tests

In [6]:
# Load the feature frame and the training target from the project helper.
X, y_train = get_simplified_data()

In [7]:
# Keep the first 50 entries of the feature importance file.
selected_features = get_selected_features(n_features=50)

In [8]:
# tsfresh-derived columns are named '<tsfresh column>|<variable>' (see
# get_tsfresh_features); every name without a '|' is a hand-crafted feature.
tsfresh_features = [feature for feature in selected_features if '|' in feature]
manual_features = [feature for feature in selected_features if '|' not in feature]

In [9]:
# Keep only the part before the first '|', i.e. drop the variable suffix.
tsfresh_features = [name.partition('|')[0] for name in tsfresh_features]

In [10]:
# Turn the selected tsfresh column names back into an extraction settings
# dictionary, later passed to extract_features as kind_to_fc_parameters.
features_dict = tsfresh.feature_extraction.settings.from_columns(
    tsfresh_features
)

In [12]:
# Build the feature matrix and target one wind farm at a time, then stack them.
X_parts = []
y_parts = []
for WF in X['WF'].unique():
    # Copy the slice so later column assignments never write into X.
    X_WF = X[X['WF'] == WF].copy()
    # Filter into a per-farm variable: overwriting y_train here would leave
    # only the first farm's (already transformed) rows for every later farm
    # and make the cell non-repeatable.
    y_WF = y_train[y_train.index.isin(X_WF['ID'])]

    X_tsfresh = get_tsfresh_features(X_WF, features_dict)
    X_manual = get_manual_features(X_WF)
    X_WF = pd.concat([X_manual, X_tsfresh], axis=1)

    X_parts.append(transform_data(X_WF))
    y_parts.append(transform_data(y_WF))

# Concatenate once instead of growing the frames inside the loop.
feature_data = pd.concat(X_parts) if X_parts else pd.DataFrame()
y = pd.concat(y_parts) if y_parts else pd.DataFrame()

Feature Extraction: 100%|██████████| 15/15 [01:17<00:00,  5.14s/it]


ValueError: Index contains duplicate entries, cannot reshape