# Feature Selection

### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Create DataFrame

In [2]:
# Name of the column the feature-selection steps below treat as the target.
target_column = 'price actual'

# Load the feature table; the path is relative to the notebook's working directory.
extracted_features = pd.read_csv('data/extracted_features.csv')

### PCA

In [3]:
# PCA-based feature selection.
#
# Standardize every column except 'time', fit a PCA that keeps enough
# components to explain PCA_EXPLAINED_VARIANCE of the variance, and for each
# retained component pick the original column with the largest absolute
# loading. The unique picks (plus 'time') form `features_df`.
#
# NOTE(review): only 'time' is dropped here, so the target column (if it is
# part of the table) takes part in the PCA and can itself be selected --
# confirm that this is intended.
PCA_EXPLAINED_VARIANCE = 0.99  # fraction of variance the kept components must explain

no_time_df = extracted_features.drop(columns=['time'])
X = no_time_df.values
sc = StandardScaler()
X_std = sc.fit_transform(X)

pca = PCA(n_components=PCA_EXPLAINED_VARIANCE)
X_pca = pca.fit_transform(X_std)
n_pcs = pca.n_components_

# pca.components_ has one row per retained component; argmax over axis 1 gives,
# for each component, the index of the column with the largest |loading|.
most_important = list(np.abs(pca.components_).argmax(axis=1))
initial_feature_names = no_time_df.columns
most_important_names = [initial_feature_names[idx] for idx in most_important]
most_important_names_set = set(most_important_names)

# Iterating a set of strings yields an order that can change from one kernel
# start to the next (string hash randomization), which made the column order
# of features_df non-reproducible. dict.fromkeys de-duplicates while keeping
# first-occurrence (i.e. component-rank) order.
ordered_unique_names = list(dict.fromkeys(most_important_names))

# Select all columns in one step instead of inserting them one at a time.
features_df = extracted_features[['time'] + ordered_unique_names].copy()

### Pearson Correlation

In [4]:
# Pearson-correlation feature selection.
#
# Keep every numeric column whose absolute Pearson correlation with the target
# is strictly greater than CORRELATION_THRESHOLD. The target column itself
# correlates at 1.0 with itself, so it is kept in the selection as well.
#
# NOTE(review): 'time' is not carried over into selected_features (unlike the
# PCA-based frame) -- confirm downstream consumers do not need it.
CORRELATION_THRESHOLD = 0.4

cor = extracted_features.corr(numeric_only=True)
cor_target = cor[target_column].abs()

relative_features = cor_target[cor_target > CORRELATION_THRESHOLD]
print(relative_features)

# Select all retained columns in one step instead of inserting them one by one
# into an empty frame; .copy() keeps the result independent of the source table.
selected_features = extracted_features[relative_features.index.tolist()].copy()

generation fossil brown coal/lignite                0.410184
generation fossil gas                               0.511821
generation fossil hard coal                         0.515519
generation hydro pumped storage consumption         0.468760
total load actual                                   0.449061
price actual                                        1.000000
generation fossil brown coal/lignite day mean       0.401129
generation fossil gas day mean                      0.428154
generation fossil hard coal day mean                0.472333
generation fossil hard coal day median              0.436126
generation fossil brown coal/lignite week mean      0.454132
generation fossil gas week mean                     0.427194
generation fossil hard coal week mean               0.483186
generation fossil brown coal/lignite week median    0.411003
generation fossil hard coal week median             0.463670
Name: price actual, dtype: float64


### Save to CSV

In [5]:
# Persist the selected columns; index=False leaves the row index out of the CSV.
selected_features.to_csv(path_or_buf='data/selected_features.csv', index=False)