In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # module for plotting 
import seaborn as sns
from matplotlib import cm
from IPython.display import display
from datetime import datetime
from math import sqrt

%config IPCompleter.greedy=True
%load_ext autoreload
%autoreload 2
%matplotlib inline

# sklearn
from sklearn.metrics import (classification_report, confusion_matrix,
accuracy_score, log_loss, mean_squared_error, mean_absolute_error, r2_score)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor # Ensemble method
from sklearn.feature_selection import (chi2, SelectKBest, f_regression, RFE) # Kbest method
from sklearn.feature_selection import RFE # Recursive feature elimation
from sklearn.linear_model import LinearRegression # Regression method
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans

import warnings  
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
df = pd.read_csv('data_readyfor_prediction.csv')
df.head(3)


Unnamed: 0,Date,Current1_Ah,Current2_Ah,Current3_Ah,Total_Ah,Power_kW,Holiday,temp_C,HR,windSpeed_m/s,windGust_m/s,pres_mbar,solarRad_W/m2
0,2017-01-01 00:00:00,180.769,194.099,17.472,392.34,74.897706,1,8.5325,74.966667,2.3445,3.27075,1029.0,1.355083
1,2017-01-01 01:00:00,175.629,196.974,21.183,393.785,75.173556,1,7.93,76.308333,2.692417,3.480333,1028.166667,1.477333
2,2017-01-01 02:00:00,172.005,196.694,20.03,388.728,74.208175,1,7.263417,80.275,3.30025,4.067167,1028.0,1.415083


In [90]:
def evaluation_metric(model, y_test, y_pred):
    if model == "test":
        print(None)
    elif model == "RMSE":
        result = sqrt(mean_squared_error(y_test, y_pred))
        return result        
        

In [91]:
df['Date']=pd.to_datetime(df['Date'])
df['Power-1']=df['Power_kW'].shift(1)

df = df.set_index('Date' , drop = True)
#df1 = df.drop (columns = ['Date'])
df2 = df1.dropna()

# 1 FEATURE SELECTION

- ** Filter method**: Selection methods that utilize statistical measuring to attribe a score for each feature. Features are classified according to a score function to be removed or maintained by the model. The highest the score feature the better is that feature to contributing to the model

In [92]:
print(" Feature data dimension: ", df2.shape)
Data=df2.values
Y=Data[:,4] #output
X=Data[:,[0,1,2,3,5,6,7,8,9,10,11,12]] #input

#df2.reset_index(drop=True, inplace=True)

 Feature data dimension:  (5692, 13)


In [93]:
df2.columns

Index(['Current1_Ah', 'Current2_Ah', 'Current3_Ah', 'Total_Ah', 'Power_kW',
       'Holiday', 'temp_C', 'HR', 'windSpeed_m/s', 'windGust_m/s', 'pres_mbar',
       'solarRad_W/m2', 'Power-1'],
      dtype='object')

In [94]:
#X = df2.drop(['Power_kW'], axis=1)
#Y = df2['Power_kW']


In [96]:
print(Y)

[75.1735565 74.2081752 73.7358886 ... 82.7047524 80.6184063 75.092424 ]


### 1.1 Kbest Model
- Extract best features of dataset. The SelectKBest method selects the features according to the k highest score.
#### SelectKBest for regression data
For regression we will set `f_regression` method as a scoring function. The target number of features to select is 5 We'll fit and transform the model on training X and Y data.


In [97]:
select = SelectKBest(score_func = f_regression, k=5)
z = select.fit_transform(X, Y) 
print("After selecting best 5 features:", z.shape) 

After selecting best 5 features: (5692, 5)


- To identify the selected features we can use `get_support()` function and filter out them from the features list. The z object contains selected X data. 

In [98]:

feature_names = list(df2.columns.values)

mask = select.get_support() # list of boleans
new_features = [] # The list of your K best features

for bool, feature in zip(mask, feature_names):
     if bool:
        new_features.append(feature)

print('All features: ', feature_names)
print('Selected best 5: ', new_features)

print(select.fit(X,Y).scores_)


All features:  ['Current1_Ah', 'Current2_Ah', 'Current3_Ah', 'Total_Ah', 'Power_kW', 'Holiday', 'temp_C', 'HR', 'windSpeed_m/s', 'windGust_m/s', 'pres_mbar', 'solarRad_W/m2', 'Power-1']
Selected best 5:  ['Current1_Ah', 'Current2_Ah', 'Current3_Ah', 'windSpeed_m/s', 'solarRad_W/m2']
[ 2.80026718e+04  1.09222657e+04  9.20893955e+02 -1.22026104e+17
  5.30860359e+01  9.86685135e+00  4.94787131e+01  1.19387615e+02
  1.24904617e+02  8.35316910e+00  1.98628164e+01  6.65809004e+03]


In [99]:
#mask = select.get_support(indices=True)
#X.iloc[:, mask]

**Wrapper method**: (selection method) - select a set of features, training and evaluate the model
### 1.2 Recursive Feature Elimination (RFE) 
Is used to select features by recursively considering smaller and smaller sets of features. First, the features are trained in a initial set of features (using one of machine learning model) and the importance of feature is obtained. Then, the least important features are prunned from the current set of features. That procedure is recursively repeated until the desired number of features is eventually reached. 

In [100]:
model = LinearRegression()
rfe = RFE(model,5)
fit = rfe.fit(X,Y)

print('Number of features: {}'.format(fit.n_features_))

Number of features: 5


In [101]:
#print('All features: ', feature_names)

print( "Feature Ranking (Linear Model, 5 features): %s" % (fit.ranking_))

Feature Ranking (Linear Model, 5 features): [1 1 1 1 1 7 8 2 3 5 4 6]


- SolarRad_W/m2
- Holiday
- HR
- windSpeed_m/s

### 1.3 Ensemble method
Learns whcih features better improve the performance of the model.

In [102]:
model = RandomForestRegressor()
model.fit(X,Y)
print(model.feature_importances_)


[4.10878058e-06 2.83065762e-06 5.27973414e-06 9.99963343e-01
 6.98335695e-07 2.29850417e-06 3.47402649e-06 1.59902698e-06
 1.15104055e-06 3.64690784e-06 2.40600423e-06 9.16400289e-06]


In [103]:
df2.columns

Index(['Current1_Ah', 'Current2_Ah', 'Current3_Ah', 'Total_Ah', 'Power_kW',
       'Holiday', 'temp_C', 'HR', 'windSpeed_m/s', 'windGust_m/s', 'pres_mbar',
       'solarRad_W/m2', 'Power-1'],
      dtype='object')

In [104]:
df2.to_csv('data_after_selection.csv')