In [364]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np

### Load and handle missing data

In [417]:
df = pd.read_csv('weather_by_time.csv')
pivoted = df.copy(deep=True)

In [418]:
pivoted['month'] = pd.to_datetime(pivoted['day']).dt.month
pivoted['year'] = pd.to_datetime(pivoted['day']).dt.year
pivoted['day'] = pd.to_datetime(pivoted['day']).dt.dayofyear

In [419]:
pivoted = pivoted.set_index(['year', 'month', 'day', 'hour'])

In [426]:
print("Originally contains data from approx " + str(len(df.columns) / 5) + " weather stations")

Originally contains data from approx 566.8 weather stations


***Drop stations with more than or equal to 10 days of missing data***

In [425]:
by_day_df = pivoted.groupby(['year', 'month', 'day']).sum(min_count=1)
by_day_df = by_day_df.dropna(thresh=len(by_day_df) - 9, axis=1)
pivoted = pivoted[pivoted.columns.intersection(by_day_df.columns)]

In [428]:
print("Used data from approx " + str(len(pivoted.columns) / 5) + " weather stations")

Used data from approx 330.2 weather stations


***Fill in missing data*** 

Fill in full days of missing data by taking the average of the day before and after where there were data 

In [429]:
test = by_day_df.copy(deep=True)
test[test.notnull()] = False
test[test.isnull()] = True

In [430]:
time_col = pd.DataFrame(index=pivoted.index)
time_col = time_col.reset_index()

In [None]:
test = pd.merge(test, time_col, on=['year', 'month', 'day'])
test = test.set_index(['year', 'month', 'day', 'hour'])

In [14]:
fill_missing_days = (by_day_df.ffill()+by_day_df.bfill())/2
fill_missing_days = fill_missing_days.bfill().ffill()

In [15]:
fill_missing_days = fill_missing_days.reset_index()
avg_day_with_hourly_data = pd.merge(fill_missing_days, time_col, on=['year', 'month', 'day'])
avg_day_with_hourly_data = avg_day_with_hourly_data.set_index(['year', 'month', 'day', 'hour'])

In [16]:
avg_day_with_hourly_data[test==False] = False

In [17]:
pivoted = pivoted.fillna(avg_day_with_hourly_data)
pivoted = pivoted.replace(False, np.nan)

In [18]:
by_hour = (pivoted.ffill()+pivoted.bfill())/2
by_hour = by_hour.bfill().ffill()

Fill in hours by taking average before and after

In [19]:
weather_df = (pivoted.ffill()+pivoted.bfill())/2
weather_df = weather_df.bfill().ffill()
weather_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,air-temp|72011354829,air-temp|72013704867,air-temp|72019854813,air-temp|72026654809,air-temp|72027504872,air-temp|72027763843,air-temp|72027803704,air-temp|72028404877,air-temp|72029703730,air-temp|72031703735,...,wind_speed|74466504838,wind_speed|74466653944,wind_speed|74467204862,wind_speed|74570013840,wind_speed|74594013705,wind_speed|74594693786,wind_speed|74598013702,wind_speed|74671013806,wind_speed|74671693808,wind_speed|74693093737
year,month,day,hour,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2016,1,2,0,-2.5,-0.3,-7.2,-3.0,-3.00,8.5,6.0,-2.7,9.0,0.9,...,5.1,3.6,1.5,5.7,3.6,2.1,2.6,2.6,1.5,0.5
2016,1,2,1,-2.2,-0.8,-6.1,-3.0,-2.80,7.6,5.0,-2.7,8.0,0.3,...,4.1,4.6,2.6,5.1,2.6,2.1,3.1,3.1,1.5,1.0
2016,1,2,2,-2.0,-1.5,-6.1,-3.0,-2.80,6.2,4.0,-2.9,6.0,-2.0,...,4.6,5.1,3.1,4.6,4.6,1.5,2.6,2.1,1.5,1.0
2016,1,2,3,-2.0,-1.7,-5.0,-3.0,-3.00,5.6,3.0,-2.5,4.0,-2.0,...,4.1,5.7,3.6,3.6,1.5,1.5,2.1,2.1,1.5,1.5
2016,1,2,4,-2.0,-1.5,-5.0,-2.5,-3.25,5.6,2.5,-2.5,4.0,-2.7,...,5.1,4.6,2.1,4.6,1.5,1.5,2.1,2.1,1.5,1.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,12,365,19,3.1,3.5,0.6,4.0,4.10,12.8,11.0,3.2,12.0,8.0,...,4.6,2.6,2.1,7.2,3.1,3.9,1.8,9.8,8.8,2.6
2018,12,365,20,3.2,4.5,0.6,5.0,3.80,13.1,12.0,4.1,12.0,8.5,...,4.6,4.6,1.5,5.7,2.1,2.1,1.5,6.7,6.2,2.1
2018,12,365,21,3.5,3.8,0.6,6.0,4.00,13.4,12.0,3.7,12.0,8.5,...,4.6,9.8,3.6,5.7,3.1,5.1,2.6,7.2,8.8,1.5
2018,12,365,22,4.0,3.5,1.1,7.0,4.00,13.5,12.5,3.7,12.0,8.5,...,5.1,7.7,5.1,8.2,3.6,5.1,2.1,7.7,4.6,4.1


### Renewable Data

In [127]:
target = pd.read_csv('/root/renewables.csv', parse_dates=['datetime_beginning_utc'])

In [128]:
target['hour'] = target['datetime_beginning_utc'].dt.hour
target['month'] = pd.to_datetime(target['datetime_beginning_utc']).dt.month
target['year'] = pd.to_datetime(target['datetime_beginning_utc']).dt.year
target['day'] = pd.to_datetime(target['datetime_beginning_utc']).dt.dayofyear

target = target.drop(['Unnamed: 0', 'total_mw', 'datetime_beginning_utc'], axis=1)

In [129]:
target = target.pivot_table(values=['mw'], index=['year', 'month', 'day', 'hour'], columns='fuel_type')
target.columns = target.columns.map('|'.join).str.strip('|')

In [130]:
# Drop hydro and other renewables for now
target = target.drop(['mw|Hydro', 'mw|Other Renewables'], axis=1)

# Start with solar data 
solar = target[['mw|Solar']]

In [375]:
solar

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mw|Solar
year,month,day,hour,Unnamed: 4_level_1
2016,1,1,5,0.0
2016,1,1,6,0.0
2016,1,1,7,0.0
2016,1,1,8,0.0
2016,1,1,9,0.0
...,...,...,...,...
2018,12,365,1,4.4
2018,12,365,2,4.4
2018,12,365,3,4.4
2018,12,365,4,4.4


### Split into a separate dataframe for each metric, standardize, apply PCA

In [222]:
metrics = ['air-temp', 'dew_point_temp', 'sea_level_pressure', 'sky_ceiling_height', 'wind_speed']

Metrics: 
- Air temp 
- Dew point temp 
- Sea level pressure
- Sky ceiling height 
- Wind speed 

In [340]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import copy 

def split_apply_pca(df, num_components):
    # 80-20 split 
    df = df.copy()
    
    train_pct_index = int(0.8 * len(df))
    X_train, X_test = df[:train_pct_index], df[train_pct_index:]
    
    # Standardize 
    scaler = StandardScaler()
    scaler.fit(X_train.values) # fit on training set 

    scaled_X_train = scaler.transform(X_train.values)
    scaled_X_test = scaler.transform(X_test.values)
    
    # Convert back to df after scaling 
    X_train = pd.DataFrame(scaled_X_train, index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaled_X_test, index=X_test.index, columns=X_test.columns)
    
    # PCA Analysis
    pca = PCA(n_components = num_components)
    pca.fit(X_train.values)

    X_train_pca = pca.transform(X_train.values)
    X_test_pca = pca.transform(X_test.values)
    
    # Convert back to df after pca 
    X_train = pd.DataFrame(X_train_pca, index=X_train.index)
    X_test = pd.DataFrame(X_test_pca, index=X_test.index)
    
    print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_.cumsum()))

    return X_train, X_test 

In [345]:
air_temp_df = df.loc[:, df.columns.str.startswith('air-temp')]
dew_point_temp_df = df.loc[:, df.columns.str.startswith('dew_point_temp')]
sea_level_pressure_df = df.loc[:, df.columns.str.startswith('sea_level_pressure')]
sky_ceiling_height_df = df.loc[:, df.columns.str.startswith('sky_ceiling_height')]
wind_speed_df = df.loc[:, df.columns.str.startswith('wind_speed')]


In [346]:
air_temp_df_train, air_temp_df_test = split_apply_pca(air_temp_df, 3)
air_temp_df_train.columns = ['air-temp|' + str(col) for col in air_temp_df_train.columns]
air_temp_df_test.columns = ['air-temp|' + str(col) for col in air_temp_df_test.columns]

Explained variation per principal component: [0.81477806 0.85268206 0.86793755]


In [347]:
dew_point_temp_df_train, dew_point_temp_df_test = split_apply_pca(dew_point_temp_df, 3)
dew_point_temp_df_train.columns = ['dew_point_temp|' + str(col) for col in dew_point_temp_df_train.columns]
dew_point_temp_df_test.columns = ['dew_point_temp|' + str(col) for col in dew_point_temp_df_test.columns]

Explained variation per principal component: [0.82475747 0.86972109 0.88697902]


In [348]:
sea_level_pressure_df_train, sea_level_pressure_df_test = split_apply_pca(sea_level_pressure_df, 3)
sea_level_pressure_df_train.columns = ['sea_level_pressure|' + str(col) for col in sea_level_pressure_df_train.columns]
sea_level_pressure_df_test.columns = ['sea_level_pressure|' + str(col) for col in sea_level_pressure_df_test.columns]

Explained variation per principal component: [0.51025887 0.61165454 0.642972  ]


In [349]:
sky_ceiling_height_df_train, sky_ceiling_height_df_test = split_apply_pca(sky_ceiling_height_df, 15)
sky_ceiling_height_df_train.columns = ['sky_ceiling_height|' + str(col) for col in sky_ceiling_height_df_train.columns]
sky_ceiling_height_df_test.columns = ['sky_ceiling_height|' + str(col) for col in sky_ceiling_height_df_test.columns]

Explained variation per principal component: [0.24172591 0.34241423 0.39770108 0.43840721 0.46849071 0.49082256
 0.50498128 0.51858437 0.52995401 0.53960922 0.54909071 0.55783273
 0.56504717 0.57147491 0.5777959 ]


In [350]:
wind_speed_df_train, wind_speed_df_test = split_apply_pca(wind_speed_df, 12)
wind_speed_df_train.columns = ['wind_speed|' + str(col) for col in wind_speed_df_train.columns]
wind_speed_df_test.columns = ['wind_speed|' + str(col) for col in wind_speed_df_test.columns]

Explained variation per principal component: [0.30509531 0.40341278 0.45113666 0.48068467 0.50890501 0.52569526
 0.53663785 0.54639451 0.55430623 0.56184552 0.56838367 0.57455663]


In [399]:
X_train = pd.concat([air_temp_df_train, dew_point_temp_df_train, sea_level_pressure_df_train, sky_ceiling_height_df_train, wind_speed_df_train], axis=1, join='inner')
X_test = pd.concat([air_temp_df_test, dew_point_temp_df_test, sea_level_pressure_df_test, sky_ceiling_height_df_test, wind_speed_df_test], axis=1, join='inner')

### Pivot so it's by hour

Features

In [400]:
X_train_hour = X_train.reset_index()
X_train_hour = X_train_hour.pivot_table(index = ['year','month', 'day'], columns="hour", values=X_train.columns)
X_train_hour.columns = X_train_hour.columns.map('{0[0]}|{0[1]}'.format)
X_train_hour = X_train_hour.dropna()
X_train_hour

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,air-temp|0|0,air-temp|0|1,air-temp|0|2,air-temp|0|3,air-temp|0|4,air-temp|0|5,air-temp|0|6,air-temp|0|7,air-temp|0|8,air-temp|0|9,...,wind_speed|9|14,wind_speed|9|15,wind_speed|9|16,wind_speed|9|17,wind_speed|9|18,wind_speed|9|19,wind_speed|9|20,wind_speed|9|21,wind_speed|9|22,wind_speed|9|23
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2016,1,2,18.971038,19.382255,19.714949,20.080902,20.412253,20.692588,21.021708,21.469716,21.852284,22.199963,...,-0.665188,0.060074,1.098244,2.033725,2.189790,2.498197,1.647959,1.325112,0.118023,1.545255
2016,1,3,17.947499,18.583861,19.009928,19.464845,19.802621,20.073149,20.540016,20.872521,21.236355,21.459822,...,-0.350451,0.430265,0.439137,0.749671,0.179354,-0.365870,0.451229,-0.367936,0.684132,-0.425031
2016,1,4,18.638197,19.228427,19.847195,20.235984,20.713479,21.190465,21.785354,22.269139,22.657690,23.046777,...,-2.523483,-3.070133,-2.689939,-2.368897,-1.624732,-1.271947,-1.865899,-1.232261,-1.907700,-0.022652
2016,1,5,28.032195,29.183302,30.042180,30.824233,31.462964,32.080079,32.624859,33.058700,33.510733,33.926744,...,0.418637,0.021998,-0.928691,-1.188340,-0.724107,-0.855444,-0.001291,-0.149648,-0.154697,-0.437900
2016,1,6,25.617212,26.408382,27.029468,27.600705,28.149289,28.679383,29.183132,29.633319,29.778595,29.946591,...,-2.472387,-2.717427,-2.925791,-2.140633,-2.800680,-2.563389,-2.424839,-1.794435,-1.364806,-1.352627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5,142,-12.724996,-11.316515,-10.513033,-9.861644,-9.410143,-9.041780,-8.710103,-8.387587,-8.139441,-8.011186,...,0.798491,0.961822,1.115100,1.552749,1.064386,1.814358,1.701332,1.742151,1.899142,1.139820
2018,5,143,-13.477000,-11.612440,-10.340575,-9.473005,-8.675126,-7.965338,-7.355711,-6.929964,-6.452828,-6.019194,...,-0.376047,0.144118,0.132134,-0.149532,0.361808,0.859793,-0.096472,1.541284,0.935081,1.662985
2018,5,144,-16.466992,-13.481470,-11.547531,-9.903925,-8.768059,-7.760643,-6.946340,-6.194404,-5.419946,-4.813750,...,3.081209,3.083719,2.086942,1.253733,0.181983,0.280379,0.448009,0.896595,0.878972,0.835564
2018,5,145,-18.443011,-15.266172,-13.268490,-11.734364,-10.607376,-9.568513,-8.653210,-7.972773,-7.472826,-6.872362,...,0.856143,0.871237,1.540518,0.812074,0.429410,1.332498,-0.415585,0.173460,0.538715,1.426491


In [401]:
X_test_hour = X_test.reset_index()
X_test_hour = X_test_hour.pivot_table(index = ['year','month', 'day'], columns="hour", values=X_test_hour.columns)
X_test_hour.columns = X_test_hour.columns.map('{0[0]}|{0[1]}'.format) 
X_test_hour = X_test_hour.dropna()
X_test_hour

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,air-temp|0|0,air-temp|0|1,air-temp|0|2,air-temp|0|3,air-temp|0|4,air-temp|0|5,air-temp|0|6,air-temp|0|7,air-temp|0|8,air-temp|0|9,...,wind_speed|9|14,wind_speed|9|15,wind_speed|9|16,wind_speed|9|17,wind_speed|9|18,wind_speed|9|19,wind_speed|9|20,wind_speed|9|21,wind_speed|9|22,wind_speed|9|23
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2018,5,148,-20.682376,-18.351019,-16.835212,-15.836468,-14.859033,-13.972847,-13.203542,-12.476595,-12.024839,-11.666475,...,0.528800,1.567150,1.782288,1.467761,1.516311,1.377991,1.179367,1.656511,1.124491,0.479914
2018,5,149,-20.805580,-18.639658,-17.328393,-16.327187,-15.646571,-14.993918,-14.585452,-14.138233,-13.789528,-13.539379,...,0.966165,1.211295,0.966564,1.326494,2.845035,2.278561,2.348818,2.179767,1.915881,2.091830
2018,5,150,-21.491189,-20.011833,-19.096454,-18.164302,-17.516135,-16.982154,-16.465167,-16.052652,-15.746008,-15.590045,...,1.409775,1.703258,1.470676,0.963643,1.244792,0.932763,-0.361274,0.400617,-1.191976,-0.266885
2018,5,151,-19.231191,-18.103421,-17.487493,-16.911365,-16.590031,-16.299501,-16.061854,-15.751705,-15.547257,-15.280518,...,-2.273969,-1.390199,-1.525431,-1.014967,1.142430,0.471794,2.046825,1.351592,1.577466,2.757989
2018,6,152,-20.338788,-18.761247,-17.817659,-16.965142,-16.256324,-15.537710,-14.899526,-14.319979,-13.871918,-13.485410,...,1.838706,1.994921,2.308146,2.561432,1.896292,1.506761,2.126986,2.581223,1.245487,1.875526
2018,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,12,360,17.890611,18.930931,19.574652,20.358549,20.867219,21.216213,21.599269,21.805001,22.058632,22.159494,...,0.187147,-0.848917,-1.062692,-1.240995,-1.311597,-1.142783,-0.769409,-0.476395,-0.309146,-0.716182
2018,12,361,15.595678,16.232561,16.720249,17.012687,17.196618,17.328797,17.455643,17.478150,17.503216,17.444268,...,-2.331407,-2.504595,-2.186752,-2.131737,-2.906947,-3.293878,-1.611867,-2.156884,-0.844015,-2.558824
2018,12,362,6.322988,5.887211,5.511471,5.228023,4.909650,4.629590,4.294304,4.145515,4.066979,4.155657,...,-0.260875,0.293658,1.116638,1.715267,2.211583,1.238384,1.957130,2.595996,2.691756,1.007235
2018,12,363,9.535058,10.196684,10.824704,11.356974,11.833417,12.301642,12.818921,13.459091,14.136390,14.661038,...,-1.700043,-0.762749,-0.944142,-0.435792,0.484212,1.521138,2.475921,2.073190,1.981188,1.914750


Target

In [402]:
solar_hour = solar.reset_index()
solar_hour = solar_hour.pivot_table(index = ['year','month', 'day'], columns="hour", values=solar_hour.columns)
solar_hour.columns = solar_hour.columns.map('{0[0]}|{0[1]}'.format)
solar_hour = solar_hour.dropna()
solar_hour

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mw|Solar|0,mw|Solar|1,mw|Solar|2,mw|Solar|3,mw|Solar|4,mw|Solar|5,mw|Solar|6,mw|Solar|7,mw|Solar|8,mw|Solar|9,...,mw|Solar|14,mw|Solar|15,mw|Solar|16,mw|Solar|17,mw|Solar|18,mw|Solar|19,mw|Solar|20,mw|Solar|21,mw|Solar|22,mw|Solar|23
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2016,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28.9,46.6,55.1,57.3,53.2,44.8,18.1,0.0,0.0,0.0
2016,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,31.5,47.8,53.7,52.9,46.1,31.3,18.1,0.0,0.0,0.0
2016,1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,29.0,54.3,56.1,56.6,52.5,39.0,15.6,0.0,0.0,0.0
2016,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,33.3,58.3,63.7,63.5,59.6,51.7,24.2,5.3,5.4,5.4
2016,1,6,5.3,5.3,5.4,5.4,5.6,5.6,5.4,5.6,5.6,5.6,...,29.2,46.1,58.6,56.8,53.9,42.5,18.5,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,12,360,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,...,653.9,753.6,811.2,806.1,780.7,714.3,506.6,55.9,1.3,1.3
2018,12,361,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,...,435.7,654.4,635.7,610.2,516.8,372.5,117.3,15.3,4.4,4.4
2018,12,362,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,...,67.3,104.4,75.7,93.0,94.5,60.4,28.4,7.9,4.4,4.4
2018,12,363,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,...,479.0,625.6,697.2,643.4,538.4,484.8,267.5,45.3,4.4,4.4


Train / Test split target

In [403]:
train_pct_index = int(0.8 * len(solar_hour))
Y_train, Y_test = solar_hour[:train_pct_index], solar_hour[train_pct_index:]

Merge so we can ensure we have matching dates between features and target

In [404]:
# Start by merging training set 
train_set_merged = X_train_hour.merge(Y_train, left_index=True, right_index=True, how='inner')
X_train = train_set_merged.iloc[:,:-24]
Y_train = train_set_merged[train_set_merged.columns[-24:]]

In [405]:
# Merge test sets 
test_set_merged = X_test_hour.merge(Y_test, left_index=True, right_index=True, how='inner')
X_test = test_set_merged.iloc[:,:-24]
Y_test = test_set_merged[test_set_merged.columns[-24:]]

### Circular encoding for time data

In [369]:
# Circular encoding of day of year and month 
def sin_cos_correction(df):
    df = df.reset_index()
    
    df['day_sin'] = np.sin((df['day']-1)*(2.*np.pi/365))
    df['day_cos'] = np.cos((df['day']-1)*(2.*np.pi/365))
    df['month_sin'] = np.sin((df['month']-1)*(2.*np.pi/12))
    df['month_cos'] = np.cos((df['month']-1)*(2.*np.pi/12))
    
    df = df.drop(['month', 'day'], axis=1)
    df = df.set_index(['year', 'month_sin', 'month_cos', 'day_sin', 'day_cos'])
    return df

In [413]:
X_train = sin_cos_correction(X_train)
X_test = sin_cos_correction(X_test)
Y_train = sin_cos_correction(Y_train)
Y_test = sin_cos_correction(Y_test)

### Save to csv

In [414]:
X_train.to_csv("X_Train.csv", header=None, index=None)
X_test.to_csv("X_test.csv", header=None, index=None)

In [415]:
Y_train.to_csv("y_Train.csv")
Y_test.to_csv("y_test.csv")