In [78]:
#import libraries
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [79]:
data = pd.read_csv('total-sunshine-duration-Japanweather1.csv') #load the dataset

In [80]:
# to explicitly convert the date column to type DATETIME
data['date'] = pd.to_datetime(data['date'], dayfirst=True)
data.dtypes

date                            datetime64[ns]
solar_radiation                        float64
mean_relative_humidity                   int64
mean_air_temperature                   float64
mean_wind_speed                        float64
total_precipitation                    float64
percentage_possible_sunshine             int64
total_sunshine_duration                float64
dtype: object

In [81]:
data = data.set_index('date') #set the index of the dataset as the date

In [82]:
#define a function to output evaluation metrics
def regression_results(y_true, y_pred):
    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [94]:
data_total_sunshine_duration = data[['total_sunshine_duration', 'solar_radiation']] # creating new dataframe from total_sunshine_duration column
data_total_sunshine_duration.loc[:,'last_month'] = data_total_sunshine_duration.loc[:,'total_sunshine_duration'].shift() # inserting new column with last month's total_sunshine_duration values
data_total_sunshine_duration.loc[:,'last_month_solar_radiation'] = data_total_sunshine_duration.loc[:,'solar_radiation'].shift() # inserting new column with last month's total_sunshine_duration values
data_total_sunshine_duration = data_total_sunshine_duration.dropna() # dropping NAs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [95]:
del data_total_sunshine_duration['solar_radiation']
data_total_sunshine_duration

Unnamed: 0_level_0,total_sunshine_duration,last_month,last_month_solar_radiation
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-02-01,69.5,92.7,9.1
2010-03-01,134.9,69.5,9.2
2010-04-01,81.3,134.9,14.2
2010-05-01,111.8,81.3,13.4
2010-06-01,128.1,111.8,14.8
...,...,...,...
2019-08-01,191.3,196.2,19.2
2019-09-01,154.2,191.3,18.9
2019-10-01,198.8,154.2,15.7
2019-11-01,135.7,198.8,15.8


In [96]:
X_train = data_total_sunshine_duration[:'2018'].drop(['total_sunshine_duration'], axis = 1) #separate features for training
y_train = data_total_sunshine_duration.loc[:'2018', 'total_sunshine_duration'] #separate target variable for training
X_test = data_total_sunshine_duration['2019'].drop(['total_sunshine_duration'], axis = 1) #separate features for training
y_test = data_total_sunshine_duration.loc['2019', 'total_sunshine_duration'] #separate target variable for testing

  X_test = data_total_sunshine_duration['2019'].drop(['total_sunshine_duration'], axis = 1) #separate features for training


In [97]:
def rmse(actual, predict):
    predict = np.array(predict)
    actual = np.array(actual)
    distance = predict - actual
    square_distance = distance ** 2
    mean_square_distance = square_distance.mean()
    score = np.sqrt(mean_square_distance)
    return score
rmse_score = make_scorer(rmse, greater_is_better = False)

In [98]:
model = RandomForestRegressor()
param_search = { 
    'n_estimators': [20, 50, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(5,15)]
}
tscv = TimeSeriesSplit(n_splits=10)
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = rmse_score)
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_

In [93]:
y_true = y_test.values
y_pred = best_model.predict(X_test)
regression_results(y_true, y_pred)

explained_variance:  -0.2838
mean_squared_log_error:  0.1047
r2:  -0.2857
MAE:  36.8936
MSE:  1973.9787
RMSE:  44.4295


In [30]:
X_train = data_total_sunshine_duration.drop(['total_sunshine_duration'], axis = 1)
y_train = data_total_sunshine_duration.loc[:'2019', 'total_sunshine_duration']

In [31]:
y_train

date
2010-02-01     69.5
2010-03-01    134.9
2010-04-01     81.3
2010-05-01    111.8
2010-06-01    128.1
              ...  
2019-08-01    191.3
2019-09-01    154.2
2019-10-01    198.8
2019-11-01    135.7
2019-12-01    123.2
Name: total_sunshine_duration, Length: 119, dtype: float64

In [42]:
test_data = pd.read_csv('predicted-total-sunshine-duration.csv')
test_data = test_data.set_index('date')
X_test = test_data.drop(['total_sunshine_duration'], axis = 1)

# model = RandomForestRegressor()
# param_search = { 
#     'n_estimators': [20, 50, 100],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [i for i in range(5,15)]
# }
# tscv = TimeSeriesSplit(n_splits=10)
# gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = rmse_score)
# gsearch.fit(X_train, y_train)
# best_score = gsearch.best_score_
# best_model = gsearch.best_estimator_
y_pred = best_model.predict(X_test)
print(y_pred)

[142.07993214 152.77470717 134.43117795 142.81179043 150.96989467
 124.3523089  153.14833288 134.43117795 142.81179043 150.96989467]
