In [1]:
#import libraries
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('total-sunshine-duration-Japanweather1.csv') #load the dataset

In [3]:
# to explicitly convert the date column to type DATETIME
data['date'] = pd.to_datetime(data['date'], dayfirst=True)
data.dtypes

date                            datetime64[ns]
solar_radiation                        float64
mean_relative_humidity                 float64
mean_air_temperature                   float64
mean_wind_speed                        float64
total_precipitation                    float64
percentage_possible_sunshine           float64
total_sunshine_duration                float64
dtype: object

In [4]:
data = data.set_index('date') #set the index of the dataset as the date

In [5]:
#define a function to output evaluation metrics
def regression_results(y_true, y_pred):
    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [6]:
data_total_sunshine_duration = data[['total_sunshine_duration']] # creating new dataframe from total_sunshine_duration column
data_total_sunshine_duration.loc[:,'last_month'] = data_total_sunshine_duration.loc[:,'total_sunshine_duration'].shift() # inserting new column with last month's total_sunshine_duration values
data_total_sunshine_duration = data_total_sunshine_duration.dropna() # dropping NAs
data_total_sunshine_duration

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0_level_0,total_sunshine_duration,last_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-02-01,69.5,92.7
2010-03-01,134.9,69.5
2010-04-01,81.3,134.9
2010-05-01,111.8,81.3
2010-06-01,128.1,111.8
...,...,...
2021-01-01,112.8,116.6
2021-02-01,118.5,112.8
2021-03-01,120.5,118.5
2021-04-01,116.2,120.5


In [7]:
X_train = data_total_sunshine_duration[:'2018'].drop(['total_sunshine_duration'], axis = 1) #separate features for training
y_train = data_total_sunshine_duration.loc[:'2018', 'total_sunshine_duration'] #separate target variable for training
X_test = data_total_sunshine_duration['2019'].drop(['total_sunshine_duration'], axis = 1) #separate features for training
y_test = data_total_sunshine_duration.loc['2019', 'total_sunshine_duration'] #separate target variable for testing

  X_test = data_total_sunshine_duration['2019'].drop(['total_sunshine_duration'], axis = 1) #separate features for training


In [8]:
def rmse(actual, predict):
    predict = np.array(predict)
    actual = np.array(actual)
    distance = predict - actual
    square_distance = distance ** 2
    mean_square_distance = square_distance.mean()
    score = np.sqrt(mean_square_distance)
    return score
rmse_score = make_scorer(rmse, greater_is_better = False)

In [9]:
model = RandomForestRegressor()
param_search = { 
    'n_estimators': [20, 50, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(5,15)]
}
tscv = TimeSeriesSplit(n_splits=10)
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = rmse_score)
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_
y_true = y_test.values
y_pred = best_model.predict(X_test)
regression_results(y_true, y_pred)

KeyboardInterrupt: 

In [None]:
X_train = data_total_sunshine_duration.drop(['total_sunshine_duration'], axis = 1)
y_train = data_total_sunshine_duration.loc[:'2020', 'total_sunshine_duration']

In [None]:
X_train

In [None]:
y_train

In [None]:
test_data = pd.read_csv('predicted-total-sunshine-duration.csv')
test_data = test_data.set_index('date')
X_test = test_data.drop(['total_sunshine_duration'], axis = 1)
y_pred = best_model.predict(X_test)
print(y_pred)