In [1]:
#import libraries
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('mean-wind-speed.csv') #load the dataset

In [3]:
# to explicitly convert the date column to type DATETIME
data['date'] = pd.to_datetime(data['date'], dayfirst=True)
data.dtypes

date                            datetime64[ns]
total_precipitation                    float64
mean_relative_humidity                   int64
mean_air_temperature                   float64
percentage_possible_sunshine             int64
total_sunshine_duration                float64
solar_radiation                        float64
mean_wind_speed                        float64
dtype: object

In [4]:
data = data.set_index('date') #set the index of the dataset as the date

In [5]:
#define a function to output evaluation metrics
def regression_results(y_true, y_pred):
    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [6]:
data_mean_wind_speed = data[['mean_wind_speed']] # creating new dataframe from mean_wind_speed column
data_mean_wind_speed.loc[:,'last_month'] = data_mean_wind_speed.loc[:,'mean_wind_speed'].shift() # inserting new column with last month's mean_wind_speed values
data_mean_wind_speed = data_mean_wind_speed.dropna() # dropping NAs
data_mean_wind_speed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0_level_0,mean_wind_speed,last_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-02-01,5.6,5.0
2010-03-01,5.8,5.6
2010-04-01,5.4,5.8
2010-05-01,5.1,5.4
2010-06-01,5.4,5.1
...,...,...
2019-08-01,5.6,5.4
2019-09-01,6.2,5.6
2019-10-01,5.0,6.2
2019-11-01,5.5,5.0


In [7]:
X_train = data_mean_wind_speed[:'2018'].drop(['mean_wind_speed'], axis = 1) #separate features for training
y_train = data_mean_wind_speed.loc[:'2018', 'mean_wind_speed'] #separate target variable for training
X_test = data_mean_wind_speed['2019'].drop(['mean_wind_speed'], axis = 1) #separate features for training
y_test = data_mean_wind_speed.loc['2019', 'mean_wind_speed'] #separate target variable for testing

  X_test = data_mean_wind_speed['2019'].drop(['mean_wind_speed'], axis = 1) #separate features for training


In [8]:
#define a function to produce the root mean square error value
def rmse(actual, predict):
    predict = np.array(predict)
    actual = np.array(actual)
    distance = predict - actual
    square_distance = distance ** 2
    mean_square_distance = square_distance.mean()
    score = np.sqrt(mean_square_distance)
    return score
rmse_score = make_scorer(rmse, greater_is_better = False)

In [9]:
model = RandomForestRegressor() #set the model for use
param_search = { 
    'n_estimators': [20, 50, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(5,15)]
} #declare search parameters
tscv = TimeSeriesSplit(n_splits=10)

#run the grid-search
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = rmse_score) 
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_

y_true = y_test.values #store the actual values of the target variable 
y_pred = best_model.predict(X_test) # use the trained model to predict the target variable
regression_results(y_true, y_pred) # output the evaluation metrics

explained_variance:  0.1931
mean_squared_log_error:  0.0056
r2:  0.1898
MAE:  0.3682
MSE:  0.2086
RMSE:  0.4567
