# Linear models  

In this notebook we will be looking at ways to use linear models to predict electricity demand for the GTA. We will trying different combinations of features through best subset selection which in the end should give us a sense of the most important features.

In [1]:
import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.linear_model
import math

In [2]:
# Load the data (Also removing columns that contain repetitive information)
data = pd.read_csv("MergedDataset.csv", delimiter = ",").drop(['time', 'local_time'], axis=1)
data

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Date,Hour,Toronto,HOEP,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover,isWeekend,isHoliday
0,2004-01-01,1,4606,30.9,0.198,0.001,0.000,1.156,1.279,0.0,0.0,0.118,False,True
1,2004-01-01,2,4366,27.13,0.339,0.001,0.000,1.156,1.279,0.0,0.0,0.148,False,True
2,2004-01-01,3,4188,25.23,0.502,0.001,0.001,1.156,1.280,0.0,0.0,0.144,False,True
3,2004-01-01,4,4046,24.29,0.534,0.000,0.000,1.157,1.280,0.0,0.0,0.159,False,True
4,2004-01-01,5,3974,24.42,0.494,0.000,0.000,1.157,1.281,0.0,0.0,0.194,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131491,2018-12-31,20,5755,5.62,2.908,2.279,0.566,3.426,1.243,0.0,0.0,0.988,False,False
131492,2018-12-31,21,5544,8.95,2.383,1.402,0.263,3.773,1.242,0.0,0.0,0.981,False,False
131493,2018-12-31,22,5338,5.81,2.611,0.244,0.041,3.866,1.237,0.0,0.0,0.985,False,False
131494,2018-12-31,23,5091,2.87,3.384,0.283,0.076,3.872,1.227,0.0,0.0,0.990,False,False


In [3]:
#Checking the types of each column of our loaded data.
data.dtypes

Date                  object
Hour                   int64
Toronto                int64
HOEP                  object
temperature          float64
precipitation        float64
snowfall             float64
snow_mass            float64
air_density          float64
radiation_surface    float64
radiation_toa        float64
cloud_cover          float64
isWeekend               bool
isHoliday               bool
dtype: object

In [4]:
#Converting the 'Date' column from object to datetime.
data['Date'] = pd.to_datetime(data['Date'])

In [5]:
#Checking that our 'Date' column got converted to the right type
data.dtypes

Date                 datetime64[ns]
Hour                          int64
Toronto                       int64
HOEP                         object
temperature                 float64
precipitation               float64
snowfall                    float64
snow_mass                   float64
air_density                 float64
radiation_surface           float64
radiation_toa               float64
cloud_cover                 float64
isWeekend                      bool
isHoliday                      bool
dtype: object

In [6]:
#Updating whole data set to have 7 years studied total(5 years training data + 2 years testing data)
data = data[data['Date'].dt.year >= 2012]
data

Unnamed: 0,Date,Hour,Toronto,HOEP,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover,isWeekend,isHoliday
70128,2012-01-01,1,4834,21.71,0.672,0.032,0.004,6.048,1.267,0.0,0.0,0.538,True,True
70129,2012-01-01,2,4631,16.92,0.932,0.004,0.002,6.054,1.265,0.0,0.0,0.581,True,True
70130,2012-01-01,3,4442,11.23,1.319,0.006,0.004,6.059,1.262,0.0,0.0,0.676,True,True
70131,2012-01-01,4,4312,7.1,1.715,0.008,0.004,6.066,1.259,0.0,0.0,0.624,True,True
70132,2012-01-01,5,4224,-6.7,2.019,0.013,0.003,6.070,1.256,0.0,0.0,0.637,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131491,2018-12-31,20,5755,5.62,2.908,2.279,0.566,3.426,1.243,0.0,0.0,0.988,False,False
131492,2018-12-31,21,5544,8.95,2.383,1.402,0.263,3.773,1.242,0.0,0.0,0.981,False,False
131493,2018-12-31,22,5338,5.81,2.611,0.244,0.041,3.866,1.237,0.0,0.0,0.985,False,False
131494,2018-12-31,23,5091,2.87,3.384,0.283,0.076,3.872,1.227,0.0,0.0,0.990,False,False


In [7]:
# Create a new column 't' which is just the index of each date starting at 1
data.insert(0, 't', np.arange(len(data))) 

#Checking that the new column gets created and that it does what it's expected to
data

Unnamed: 0,t,Date,Hour,Toronto,HOEP,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover,isWeekend,isHoliday
70128,0,2012-01-01,1,4834,21.71,0.672,0.032,0.004,6.048,1.267,0.0,0.0,0.538,True,True
70129,1,2012-01-01,2,4631,16.92,0.932,0.004,0.002,6.054,1.265,0.0,0.0,0.581,True,True
70130,2,2012-01-01,3,4442,11.23,1.319,0.006,0.004,6.059,1.262,0.0,0.0,0.676,True,True
70131,3,2012-01-01,4,4312,7.1,1.715,0.008,0.004,6.066,1.259,0.0,0.0,0.624,True,True
70132,4,2012-01-01,5,4224,-6.7,2.019,0.013,0.003,6.070,1.256,0.0,0.0,0.637,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131491,61363,2018-12-31,20,5755,5.62,2.908,2.279,0.566,3.426,1.243,0.0,0.0,0.988,False,False
131492,61364,2018-12-31,21,5544,8.95,2.383,1.402,0.263,3.773,1.242,0.0,0.0,0.981,False,False
131493,61365,2018-12-31,22,5338,5.81,2.611,0.244,0.041,3.866,1.237,0.0,0.0,0.985,False,False
131494,61366,2018-12-31,23,5091,2.87,3.384,0.283,0.076,3.872,1.227,0.0,0.0,0.990,False,False


In [8]:
#Creating our train and test set 
data_train = data[data['Date'].dt.year < 2017]
data_test = data[data['Date'].dt.year >= 2017]
data_train

Unnamed: 0,t,Date,Hour,Toronto,HOEP,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover,isWeekend,isHoliday
70128,0,2012-01-01,1,4834,21.71,0.672,0.032,0.004,6.048,1.267,0.0,0.0,0.538,True,True
70129,1,2012-01-01,2,4631,16.92,0.932,0.004,0.002,6.054,1.265,0.0,0.0,0.581,True,True
70130,2,2012-01-01,3,4442,11.23,1.319,0.006,0.004,6.059,1.262,0.0,0.0,0.676,True,True
70131,3,2012-01-01,4,4312,7.1,1.715,0.008,0.004,6.066,1.259,0.0,0.0,0.624,True,True
70132,4,2012-01-01,5,4224,-6.7,2.019,0.013,0.003,6.070,1.256,0.0,0.0,0.637,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113971,43843,2016-12-31,20,5665,2.81,2.506,0.140,0.014,4.203,1.243,0.0,0.0,0.875,True,False
113972,43844,2016-12-31,21,5665,0.0,2.218,0.043,0.002,4.131,1.245,0.0,0.0,0.579,True,False
113973,43845,2016-12-31,22,5665,0.0,1.579,0.017,0.005,4.099,1.250,0.0,0.0,0.188,True,False
113974,43846,2016-12-31,23,5665,0.0,0.515,0.007,0.003,4.101,1.258,0.0,0.0,0.172,True,False


To account for the seasonality in the data it will not be enough to consider a linear model of the form $X_t = \beta_1 t + \beta_0$ as this would only pickup on the underlying linear trend in the data (for which we already suspect is relatively weak according to our intial plots and time series decomposition <b>Note to self: this may be subject to change depending on how the time series analysis goes</b>).

We need to introduce some seasonal behaviour in our linear model. One way to do this (<b>CITE HERE</b>) is through fourier feature bases. If $m$ is the seasonal period we suspect (e.g. daily, weekly, monthly, yearly) then we can introduce terms of the following form:
$X_t = \sum_{j=1}^P \beta_{2j-1} \sin(\frac{2 \pi j t}{m}) + \beta_{2j} \cos(\frac{2 \pi j t}{m})$
where $m$ is the the seasonal period and $P$ is the number of pairs of fourier series we have. We will bound $P$ to at most $\frac{m}{2}$. For daily seasonality $m=24$, for weekly $m=7 \times 24$, for monthly $m = 4 \times 7 \times 24$ (using the running assumption that we use a month as 4 weeks) and yearly as $365 \times 24 = 8760$.


In [9]:
#Starting our design matrix (essentially adding 't' from data_train)
design_matrix = np.array(data_train['t']).reshape(-1,1)

In [10]:
#1. Fourier feature bases for Daily seasonal period
m_day = 24
P_day = int(m_day/2)
j_day = np.arange(1, int(P_day))

# Generating the column names we need for the dataframe
titles_sin, titles_cos = ['daily_sin_' + str(x) for x in j_day], ['daily_cos_' + str(x) for x in j_day]

# Generating the number to which we will apply sin and cos afterwards
temp_multiply = (2 * np.pi * j_day)/m_day

# Generating the sin and cos values
daily_design_matrix_sin = pd.DataFrame(data = np.sin(temp_multiply * design_matrix), columns = titles_sin)
daily_design_matrix_cos = pd.DataFrame(data = np.cos(temp_multiply * design_matrix), columns = titles_cos)

# Merging the two dataframes to be a single dataframe
daily_design_matrix =  pd.concat([daily_design_matrix_sin, daily_design_matrix_cos], axis = 1)

# Reordering the columns in a more convenient way (sin and cos tuples with incrementing index j)
titles_daily = list(sum(zip(titles_sin, titles_cos), ()))
daily_design_matrix = daily_design_matrix[titles_daily]

In [11]:
# Get index for the day start which will be useful for hyperparameter later
start_day = 0

#Printing weekly design matrix
daily_design_matrix

Unnamed: 0,daily_sin_1,daily_cos_1,daily_sin_2,daily_cos_2,daily_sin_3,daily_cos_3,daily_sin_4,daily_cos_4,daily_sin_5,daily_cos_5,...,daily_sin_7,daily_cos_7,daily_sin_8,daily_cos_8,daily_sin_9,daily_cos_9,daily_sin_10,daily_cos_10,daily_sin_11,daily_cos_11
0,0.000000,1.000000,0.000000,1.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,1.0,0.000000,1.000000,...,0.000000,1.000000,0.000000e+00,1.0,0.000000e+00,1.000000e+00,0.000000,1.000000e+00,0.000000,1.000000
1,0.258819,0.965926,0.500000,8.660254e-01,7.071068e-01,7.071068e-01,8.660254e-01,0.5,0.965926,0.258819,...,0.965926,-0.258819,8.660254e-01,-0.5,7.071068e-01,-7.071068e-01,0.500000,-8.660254e-01,0.258819,-0.965926
2,0.500000,0.866025,0.866025,5.000000e-01,1.000000e+00,6.123234e-17,8.660254e-01,-0.5,0.500000,-0.866025,...,-0.500000,-0.866025,-8.660254e-01,-0.5,-1.000000e+00,-1.836970e-16,-0.866025,5.000000e-01,-0.500000,0.866025
3,0.707107,0.707107,1.000000,6.123234e-17,7.071068e-01,-7.071068e-01,1.224647e-16,-1.0,-0.707107,-0.707107,...,-0.707107,0.707107,-2.449294e-16,1.0,7.071068e-01,7.071068e-01,1.000000,3.061617e-16,0.707107,-0.707107
4,0.866025,0.500000,0.866025,-5.000000e-01,1.224647e-16,-1.000000e+00,-8.660254e-01,-0.5,-0.866025,0.500000,...,0.866025,0.500000,8.660254e-01,-0.5,3.673940e-16,-1.000000e+00,-0.866025,-5.000000e-01,-0.866025,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43843,-0.965926,0.258819,-0.500000,-8.660254e-01,7.071068e-01,-7.071068e-01,8.660254e-01,0.5,-0.258819,0.965926,...,-0.258819,-0.965926,8.660254e-01,-0.5,7.071068e-01,7.071068e-01,-0.500000,8.660254e-01,-0.965926,-0.258819
43844,-0.866025,0.500000,-0.866025,-5.000000e-01,1.409837e-12,-1.000000e+00,8.660254e-01,-0.5,0.866025,0.500000,...,-0.866025,0.500000,-8.660254e-01,-0.5,-3.046447e-12,-1.000000e+00,0.866025,-5.000000e-01,0.866025,0.500000
43845,-0.707107,0.707107,-1.000000,-3.006999e-12,-7.071068e-01,-7.071068e-01,6.013998e-12,-1.0,0.707107,-0.707107,...,0.707107,0.707107,-1.202800e-11,1.0,-7.071068e-01,7.071068e-01,-1.000000,-4.121058e-12,-0.707107,-0.707107
43846,-0.500000,0.866025,-0.866025,5.000000e-01,-1.000000e+00,-3.973181e-12,-8.660254e-01,-0.5,-0.500000,-0.866025,...,0.500000,-0.866025,8.660254e-01,-0.5,1.000000e+00,4.643586e-12,0.866025,5.000000e-01,0.500000,0.866025


In [12]:
#2. Fourier feature bases for Weekly seasonal period
m_week = 24 * 7
P_week = int(m_week/2)
j_week = np.array([i for i in range(1, int(P_week)) if i/m_week not in j_day/m_day])

# Generating the column names we need for the dataframe
titles_sin, titles_cos = ['weekly_sin_' + str(x) for x in j_week], ['weekly_cos_' + str(x) for x in j_week]

# Generating the number to which we will apply sin and cos afterwards
temp_multiply = (2 * np.pi * j_week)/m_week

# Generating the sin and cos values
weekly_design_matrix_sin = pd.DataFrame(data = np.sin(temp_multiply * design_matrix), columns = titles_sin)
weekly_design_matrix_cos = pd.DataFrame(data = np.cos(temp_multiply * design_matrix), columns = titles_cos)

# Merging the two dataframes to be a single dataframe
weekly_design_matrix =  pd.concat([weekly_design_matrix_sin, weekly_design_matrix_cos], axis = 1)

# Reordering the columns in a more convenient way (sin and cos tuples with a specific index j)
titles_weekly = list(sum(zip(titles_sin, titles_cos), ()))
weekly_design_matrix = weekly_design_matrix[titles_weekly]

In [13]:
# Get index for the week start which will be useful for hyperparameter later
start_week = start_day + len(daily_design_matrix.columns)

#Printing weekly design matrix
weekly_design_matrix

Unnamed: 0,weekly_sin_1,weekly_cos_1,weekly_sin_2,weekly_cos_2,weekly_sin_3,weekly_cos_3,weekly_sin_4,weekly_cos_4,weekly_sin_5,weekly_cos_5,...,weekly_sin_79,weekly_cos_79,weekly_sin_80,weekly_cos_80,weekly_sin_81,weekly_cos_81,weekly_sin_82,weekly_cos_82,weekly_sin_83,weekly_cos_83
0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,...,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
1,0.037391,0.999301,0.074730,0.997204,0.111964,0.993712,0.149042,0.988831,0.185912,0.982566,...,0.185912,-0.982566,0.149042,-0.988831,0.111964,-0.993712,0.074730,-0.997204,0.037391,-0.999301
2,0.074730,0.997204,0.149042,0.988831,0.222521,0.974928,0.294755,0.955573,0.365341,0.930874,...,-0.365341,0.930874,-0.294755,0.955573,-0.222521,0.974928,-0.149042,0.988831,-0.074730,0.997204
3,0.111964,0.993712,0.222521,0.974928,0.330279,0.943883,0.433884,0.900969,0.532032,0.846724,...,0.532032,-0.846724,0.433884,-0.900969,0.330279,-0.943883,0.222521,-0.974928,0.111964,-0.993712
4,0.149042,0.988831,0.294755,0.955573,0.433884,0.900969,0.563320,0.826239,0.680173,0.733052,...,-0.680173,0.733052,-0.563320,0.826239,-0.433884,0.900969,-0.294755,0.955573,-0.149042,0.988831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43843,-0.185912,0.982566,-0.365341,0.930874,-0.532032,0.846724,-0.680173,0.733052,-0.804598,0.593820,...,-0.804598,-0.593820,-0.680173,-0.733052,-0.532032,-0.846724,-0.365341,-0.930874,-0.185912,-0.982566
43844,-0.149042,0.988831,-0.294755,0.955573,-0.433884,0.900969,-0.563320,0.826239,-0.680173,0.733052,...,0.680173,0.733052,0.563320,0.826239,0.433884,0.900969,0.294755,0.955573,0.149042,0.988831
43845,-0.111964,0.993712,-0.222521,0.974928,-0.330279,0.943883,-0.433884,0.900969,-0.532032,0.846724,...,-0.532032,-0.846724,-0.433884,-0.900969,-0.330279,-0.943883,-0.222521,-0.974928,-0.111964,-0.993712
43846,-0.074730,0.997204,-0.149042,0.988831,-0.222521,0.974928,-0.294755,0.955573,-0.365341,0.930874,...,0.365341,0.930874,0.294755,0.955573,0.222521,0.974928,0.149042,0.988831,0.074730,0.997204


In [14]:
#3. Fourier feature bases for Yearly seasonal period
m_year = 24 * 365
P_year = 730 # Don't go all the way to m_year//2 (too slow)
j_year = np.array([i for i in range(1, int(P_year)) if i/m_year not in j_day/m_day or i not in j_week/m_week])

# Generating the column names we need for the dataframe
titles_sin, titles_cos = ['yearly_sin_' + str(x) for x in j_year], ['yearly_cos_' + str(x) for x in j_year]

# Generating the number to which we will apply sin and cos afterwards
temp_multiply = (2 * np.pi * j_year)/m_year

# Generating the sin and cos values
yearly_design_matrix_sin = pd.DataFrame(data = np.sin(temp_multiply * design_matrix), columns = titles_sin)
yearly_design_matrix_cos = pd.DataFrame(data = np.cos(temp_multiply * design_matrix), columns = titles_cos)

# Merging the two dataframes to be a single dataframe
yearly_design_matrix =  pd.concat([yearly_design_matrix_sin, yearly_design_matrix_cos], axis = 1)

# Reordering the columns in a more convenient way (sin and cos tuples with a specific index j)
titles_yearly = list(sum(zip(titles_sin, titles_cos), ()))
yearly_design_matrix = yearly_design_matrix[titles_yearly]

In [15]:
# Get index for the year start which will be useful for hyperparameter later
start_year = start_week + len(weekly_design_matrix.columns)

#Printing yearly design matrix
yearly_design_matrix

Unnamed: 0,yearly_sin_1,yearly_cos_1,yearly_sin_2,yearly_cos_2,yearly_sin_3,yearly_cos_3,yearly_sin_4,yearly_cos_4,yearly_sin_5,yearly_cos_5,...,yearly_sin_725,yearly_cos_725,yearly_sin_726,yearly_cos_726,yearly_sin_727,yearly_cos_727,yearly_sin_728,yearly_cos_728,yearly_sin_729,yearly_cos_729
0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,...,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
1,0.000717,1.000000,0.001435,0.999999,0.002152,0.999998,0.002869,0.999996,0.003586,0.999994,...,0.496891,0.867813,0.497513,0.867456,0.498135,0.867099,0.498757,0.866742,0.499379,0.866384
2,0.001435,0.999999,0.002869,0.999996,0.004304,0.999991,0.005738,0.999984,0.007173,0.999974,...,0.862417,0.506199,0.863142,0.504961,0.863866,0.503722,0.864587,0.502483,0.865307,0.501242
3,0.002152,0.999998,0.004304,0.999991,0.006455,0.999979,0.008607,0.999963,0.010759,0.999942,...,0.999942,0.010759,0.999963,0.008607,0.999979,0.006455,0.999991,0.004304,0.999998,0.002152
4,0.002869,0.999996,0.005738,0.999984,0.008607,0.999963,0.011476,0.999934,0.014345,0.999897,...,0.873109,-0.487526,0.871706,-0.490029,0.870297,-0.492528,0.868880,-0.495022,0.867456,-0.497513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43843,0.030837,0.999524,0.061645,0.998098,0.092394,0.995722,0.123056,0.992400,0.153600,0.988133,...,-0.361045,-0.932548,-0.389630,-0.920971,-0.417845,-0.908518,-0.445663,-0.895201,-0.473056,-0.881032
43844,0.031554,0.999502,0.063077,0.998009,0.094537,0.995521,0.125902,0.992043,0.157143,0.987576,...,-0.776694,-0.629878,-0.796183,-0.605056,-0.814878,-0.579632,-0.832762,-0.553630,-0.849817,-0.527078
43845,0.032271,0.999479,0.064508,0.997917,0.096679,0.995316,0.128748,0.991677,0.160684,0.987006,...,-0.987006,-0.160684,-0.991677,-0.128748,-0.995316,-0.096679,-0.997917,-0.064508,-0.999479,-0.032271
43846,0.032988,0.999456,0.065940,0.997824,0.098820,0.995105,0.131593,0.991304,0.164222,0.986423,...,-0.936379,0.350991,-0.924291,0.381689,-0.911197,0.411972,-0.897111,0.441806,-0.882048,0.471160


In [16]:
#Adding all our newly generated period design_matrices to our original one
design_matrix = pd.concat([daily_design_matrix, weekly_design_matrix, yearly_design_matrix], axis = 1)
design_matrix

Unnamed: 0,daily_sin_1,daily_cos_1,daily_sin_2,daily_cos_2,daily_sin_3,daily_cos_3,daily_sin_4,daily_cos_4,daily_sin_5,daily_cos_5,...,yearly_sin_725,yearly_cos_725,yearly_sin_726,yearly_cos_726,yearly_sin_727,yearly_cos_727,yearly_sin_728,yearly_cos_728,yearly_sin_729,yearly_cos_729
0,0.000000,1.000000,0.000000,1.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,1.0,0.000000,1.000000,...,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
1,0.258819,0.965926,0.500000,8.660254e-01,7.071068e-01,7.071068e-01,8.660254e-01,0.5,0.965926,0.258819,...,0.496891,0.867813,0.497513,0.867456,0.498135,0.867099,0.498757,0.866742,0.499379,0.866384
2,0.500000,0.866025,0.866025,5.000000e-01,1.000000e+00,6.123234e-17,8.660254e-01,-0.5,0.500000,-0.866025,...,0.862417,0.506199,0.863142,0.504961,0.863866,0.503722,0.864587,0.502483,0.865307,0.501242
3,0.707107,0.707107,1.000000,6.123234e-17,7.071068e-01,-7.071068e-01,1.224647e-16,-1.0,-0.707107,-0.707107,...,0.999942,0.010759,0.999963,0.008607,0.999979,0.006455,0.999991,0.004304,0.999998,0.002152
4,0.866025,0.500000,0.866025,-5.000000e-01,1.224647e-16,-1.000000e+00,-8.660254e-01,-0.5,-0.866025,0.500000,...,0.873109,-0.487526,0.871706,-0.490029,0.870297,-0.492528,0.868880,-0.495022,0.867456,-0.497513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43843,-0.965926,0.258819,-0.500000,-8.660254e-01,7.071068e-01,-7.071068e-01,8.660254e-01,0.5,-0.258819,0.965926,...,-0.361045,-0.932548,-0.389630,-0.920971,-0.417845,-0.908518,-0.445663,-0.895201,-0.473056,-0.881032
43844,-0.866025,0.500000,-0.866025,-5.000000e-01,1.409837e-12,-1.000000e+00,8.660254e-01,-0.5,0.866025,0.500000,...,-0.776694,-0.629878,-0.796183,-0.605056,-0.814878,-0.579632,-0.832762,-0.553630,-0.849817,-0.527078
43845,-0.707107,0.707107,-1.000000,-3.006999e-12,-7.071068e-01,-7.071068e-01,6.013998e-12,-1.0,0.707107,-0.707107,...,-0.987006,-0.160684,-0.991677,-0.128748,-0.995316,-0.096679,-0.997917,-0.064508,-0.999479,-0.032271
43846,-0.500000,0.866025,-0.866025,5.000000e-01,-1.000000e+00,-3.973181e-12,-8.660254e-01,-0.5,-0.500000,-0.866025,...,-0.936379,0.350991,-0.924291,0.381689,-0.911197,0.411972,-0.897111,0.441806,-0.882048,0.471160


In [17]:
# Add columns for each of the fourier bases from j = 1 to P and for each different seasonality
# for j = 1 to m/2, but do this in ranges:
# for m = 24 use the range [1,...12]
# For m = 7*24 use a longer range like [1, 6, 12, ..., P] (so we don't blow up training)
# For m = 4*7*42 use a longer range [1, 12, ... P] (remember P = m/2 use P - 1 to be safe actually)
# Again the same thing for m = 8760 use a larger ranger something divisible by 8760 like 10, 20
day_range = list(range(P_day))
week_range = list(range(0, int(len(titles_weekly)/2)+1, 12))
year_range = list(range(0, int(len(titles_yearly)/2)+1, 243))
#print(day_range, '\n\n', week_range, '\n\n', year_range, '\n\n')

In [18]:
# For each possible pair train the model with cross validation with the following function
#The model is the sklearn model, the n_splits is the number of folds (try to find a reasonable
# size but i suspect we won't be able to do it a large number of splits, there's just
# too much data! )
# X is the transformed set of features and y is the target electricity demand
# def rolling_window_time_series(model, n_splits, X, y):
#     # Keep a running total of the mean mse
#     mse_cv = 0
#     N = 1
#     ts_splitter = sklearn.model_selection.TimeSeriesSplit(n_splits=n_splits)
#     for train, valid in ts_splitter.split(X):
#         # For each train, valid fold fit it on train
#         model.fit(X[train], y[train])
#         # Predict on valid
#         y_pred = model.predict(X[valid])
#         # Find mse of this fold and then add it to the running average
#         mse_fold = sklearn.metrics.mean_squared_error(y_true=y[valid], y_pred=y_pred)
#         # This is to keep a running average (instead of appending to an array and then
#         # taking the mean)
#         if N == 1:
#             mse_cv = mse_fold
#         else:
#             # This can be checked by expanding the formula for the mean
#             mse_cv += (mse_fold - mse_cv)/N
#         N+=1
#     # Return mse and rmse
#     return mse_cv, math.sqrt(mse_cv)

# 1. So high level it's like a for loop over all possible combinations of parameters
# 2. For each of those combinations train the model using cross validation
# Record the average mse => 2. should be done by the function rolling_window
mse = pd.DataFrame(columns = ['year_end_index', 'week_end_index', 'day_end_index', 'mse', 'sqrt_mse'])
lr = sklearn.linear_model.LinearRegression(fit_intercept= False)#We 
for p_year in year_range:
    for p_week in week_range:
        for p_day in day_range:
            #Defining our X and Y 
            X = design_matrix[titles_daily[:2*p_day] + titles_weekly[:2*p_week] + titles_yearly[:2*p_year]]
            y = np.arange(0,len(X))
            #Training our model
            model = lr.fit(X, y) #THIS KEEPS CRASHING CUZ OF X, but it looks legit....
            #Checking the model's accuracy
            tscv = sklearn.model_selection.TimeSeriesSplit(n_splits = 10)
            #scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring=mse)
            #Storing the model's accuracy in adataframe
            mse = mse.append({'year_end_index': 2*p_year, 'week_end_index':2*p_week, 'day_end_index':2*p_day, 'mse': 0, 'sqrt_mse': 0}, ignore_index=True)
#print(len(design_matrix[titles_daily[:2*p_day] + titles_weekly[:2*p_week] + titles_yearly[:2*p_year]].index))
mse

ValueError: at least one array or dtype is required

In [None]:
# Once you get the optimal fourier series linear_model
# Do best subset selection on the 10 features so 2^10 combinations now added to the 
# best model you found above