In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np
import dask.dataframe as dd
import glob

ModuleNotFoundError: No module named 'dask'

### Format Weather Data

In [2]:
# Weather data includes: air-temp, wind_speed, sea_level_pressure
def format_weather(df, value):
    """Reshape a wide per-station weather table into one column per station.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``station`` and ``commit_date`` (a string whose first
        space-separated token is the date). Every other column is melted as
        an hourly reading and must be named ``hour<N>``.
    value : str
        Name of the weather metric (e.g. ``'air_temp'``). Used as the melted
        value column and as the prefix of the output column labels.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(day, hour)`` with one ``'<value>|<station>'`` column per
        station. Repeated (day, hour, station) readings are combined by
        ``pivot_table``'s default aggregation (mean).
    """
    # Derive the date on a new frame so the caller's DataFrame is left untouched,
    # and expose it as `day`, the name the pivot below indexes on.
    day = pd.to_datetime(df['commit_date'].str.split(' ').str[0])
    tidy = df.drop(columns='commit_date').assign(day=day)

    tidy = tidy.melt(id_vars=["station", "day"], var_name='hour', value_name=value)
    tidy['hour'] = tidy['hour'].str.replace('hour', '', regex=False).astype(int)
    tidy['station'] = tidy['station'].astype(str)

    # Create a column for each weather station. Passing the metric as a
    # one-element list keeps a two-level (metric, station) column index, which
    # the format string below flattens to "<metric>|<station>".
    wide = tidy.pivot_table(index=['day', 'hour'], columns="station", values=[value])
    wide.columns = wide.columns.map('{0[0]}|{0[1]}'.format)

    return wide

In [25]:
# Metric to convert; the next cell reads "<weather_type>.csv" and the export
# cells write into a directory of the same name.
weather_type = 'dew_point_temp'

In [26]:
# Read only the header with pandas so the bookkeeping/summary columns can be
# excluded before dask loads the file.
excluded_cols = ['committer', 'commit_hash', 'data_points', "avg", "min", "max", "median"]
cols = list(pd.read_csv(weather_type + ".csv", nrows=1))
df = dd.read_csv(weather_type + ".csv", usecols=[c for c in cols if c not in excluded_cols])

# The date is the first space-separated token of commit_date.
df['day'] = df['commit_date'].str.split(' ').str[0]
# melt() treats every non-id column as a value column, so commit_date has to be
# removed first; otherwise it is melted into rows with hour == 'commit_date'
# whose "reading" is a date string.
df = df.drop(columns=['commit_date'])
df = df.melt(id_vars=["station", "day"], var_name='hour', value_name=weather_type)
df['hour'] = df['hour'].str.replace('hour', '')

# `day` holds date strings here; to_datetime's `unit` argument describes numeric
# epoch input, so it is not passed.
df['day'] = dd.to_datetime(df['day'])

df['year'] = df['day'].dt.year

In [29]:
# Select the rows of the melted frame whose `year` is 2017.
year_df = df.loc[df["year"] == 2017]

In [30]:
# Write the selected year to "<weather_type>/2017.csv" as one CSV file.
year_df.to_csv(weather_type+ '/2017.csv', single_file = True) 

  return func(*(_execute_task(a, cache) for a in args))


['/home/weather/noaa/dew_point_temp/2017.csv']

In [77]:
# Select the rows of the melted frame whose `year` is 2016 (rebinds year_df).
year_df = df.loc[df["year"] == 2016]

In [78]:
# Write the selected year to "<weather_type>/2016.csv" as one CSV file.
year_df.to_csv(weather_type+ '/2016.csv', single_file = True) 

  return func(*(_execute_task(a, cache) for a in args))


['/home/weather/noaa/dew_point_temp/2016.csv']

### Merge the separate dfs
The merge code currently lives in a separate Python file.

### Add noise to data
- Find the difference between each row and the previous row
- Average the absolute differences for each column
- Draw a noise vector whose standard deviation is that average
- Add the noise vector to the column

In [61]:
def add_noise(weather_type, seed=None):
    """Add zero-mean Gaussian noise to every column of one weather metric.

    Reads ``<weather_type>.csv``, drops every column that contains a missing
    value, and indexes the rest by ``(year, month, day)``. For each remaining
    column the noise standard deviation is the mean absolute difference
    between consecutive rows.

    Parameters
    ----------
    weather_type : str
        Metric name; selects the input file and names the two output files.
    seed : int, optional
        When given, noise is drawn from ``numpy.random.RandomState(seed)`` so
        the output is reproducible. When omitted, NumPy's global random state
        is used.

    Writes
    ------
    ``with_noise/noise_amt_<weather_type>.csv``
        The per-column noise standard deviations.
    ``with_noise/<weather_type>.csv``
        The noisy data. The ``with_noise`` directory must already exist.
    """
    weather_df = (
        pd.read_csv(weather_type + ".csv")
        .dropna(axis=1)
        .set_index(['year', 'month', 'day'])
    )

    noise_std_dev = np.abs(weather_df.diff()).mean(axis=0)
    noise_std_dev.to_csv("with_noise/noise_amt_" + weather_type + ".csv")

    # np.random (the module) and RandomState both expose .normal, so either can
    # serve as the generator; noise is still drawn one column at a time.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_rows = len(weather_df.index)
    for col, std_dev in noise_std_dev.items():
        weather_df[col] = weather_df[col] + rng.normal(0, std_dev, n_rows)

    weather_df.to_csv("with_noise/" + weather_type + ".csv")

In [62]:
# Metrics to process; add_noise reads "<metric>.csv" and writes into with_noise/.
weather_types = ['air_temp', 'wind_speed', 'sea_level_pressure', 'sky_ceiling_height', 'dew_point_temp']

# Note: the loop variable rebinds the notebook-level `weather_type`, which is
# left holding the last entry of the list once the loop finishes.
for weather_type in weather_types:
    print("Going through " + weather_type)
    add_noise(weather_type)

Going through air_temp
Going through wind_speed
Going through sea_level_pressure
Going through sky_ceiling_height
Going through dew_point_temp


### Split into a separate dataframe for each metric, standardize, apply PCA

Metrics: 
- Air temp 
- Dew point temp: measure of how much water vapor is in the air 
- Sea level pressure: atmospheric pressure at sea level at a given location.
- Sky ceiling height: height of the lowest layer of clouds above the surface that are either broken or overcast, but not thin
- Wind speed 

In [102]:
# NOTE(review): weather_types is not used by the load below.
weather_types = ['air_temp', 'wind_speed', 'sea_level_pressure', 'sky_ceiling_height', 'dew_point_temp']

# The metric is hard-coded here: run as written, this always loads the noisy
# dew_point_temp table, so every later use of weather_df_hour is dew-point data.
weather_df_hour = pd.read_csv("with_noise/" + "dew_point_temp" + ".csv")

In [103]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import copy 

def split_apply_pca(X_train, X_test, num_components):
    """Fit PCA on the training frame and project both frames onto it.

    Despite the name, no splitting happens here: the caller supplies the
    already-split frames.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables. Only ``X_train`` is used to fit the PCA.
    num_components : int
        Passed to ``PCA`` as ``n_components``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` projections. Each keeps the row index of its input;
        columns take the DataFrame default labels.
    """
    reducer = PCA(n_components=num_components)
    reducer.fit(X_train.values)

    # Project each frame with the train-fitted PCA and restore its row index.
    projected = {
        label: pd.DataFrame(reducer.transform(frame.values), index=frame.index)
        for label, frame in (('train', X_train), ('test', X_test))
    }

    # The printed values are the running (cumulative) sum of the per-component ratios.
    cumulative_ratio = reducer.explained_variance_ratio_.cumsum()
    print('Explained variation per principal component: {}'.format(cumulative_ratio))

    return projected['train'], projected['test']

In [104]:
# Move year/month/day into the row index, in place. Not idempotent: running this
# cell a second time fails because the three columns no longer exist.
weather_df_hour.set_index(['year', 'month', 'day'], inplace=True)

In [105]:
# Positional hold-out: the last `day_in_four_months` rows become the test set and
# everything before them the training set.
# NOTE(review): this assumes rows are in chronological order, one per day — confirm.
day_in_four_months = 122
X_train = weather_df_hour.head(len(weather_df_hour.index) - day_in_four_months)
X_test = weather_df_hour.tail(day_in_four_months)

In [106]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dew_point_temp|72011354829|0,dew_point_temp|72011354829|1,dew_point_temp|72011354829|2,dew_point_temp|72011354829|3,dew_point_temp|72011354829|4,dew_point_temp|72011354829|5,dew_point_temp|72011354829|6,dew_point_temp|72011354829|7,dew_point_temp|72011354829|8,dew_point_temp|72011354829|9,...,dew_point_temp|A0735900240|14,dew_point_temp|A0735900240|15,dew_point_temp|A0735900240|16,dew_point_temp|A0735900240|17,dew_point_temp|A0735900240|18,dew_point_temp|A0735900240|19,dew_point_temp|A0735900240|20,dew_point_temp|A0735900240|21,dew_point_temp|A0735900240|22,dew_point_temp|A0735900240|23
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2016,1,1,2.785295,-11.031692,-7.585751,4.074712,-4.828795,-11.783837,2.771928,-2.978121,-7.621156,-0.086747,...,-11.486529,-2.840669,-7.480224,-6.086219,0.798234,-6.457324,-8.401233,-5.672511,-5.352045,-5.623044
2016,1,2,-4.793950,-2.616926,-7.652630,-3.343662,-14.782730,-5.438909,-9.373972,-8.260880,-6.957278,-9.065052,...,-1.992586,-5.469894,-5.528797,-3.164783,-9.579635,-5.605023,-6.591124,-4.784729,-4.207549,-3.522513
2016,1,3,1.049710,-12.054713,-3.534869,3.551657,0.428264,-6.750574,-1.725010,-7.019402,-2.783997,-8.655217,...,-3.822501,-4.225859,-6.267626,-10.865823,-14.591161,-6.163143,-4.468190,-7.550445,-16.198009,-3.818474
2016,1,4,-10.027022,-8.594244,-11.269617,-10.316746,-16.082316,-12.537191,-12.757986,-17.745078,-14.895025,-11.853434,...,-9.716352,-13.372267,-11.542380,-10.612912,-9.340593,-10.508026,-11.543050,-15.058510,-8.885381,-7.465315
2016,1,5,-21.958236,-15.637671,-11.629449,-19.421015,-22.916439,-13.564816,-18.313709,-9.188582,-11.884143,-7.598177,...,-15.448050,-12.552819,-10.170983,-9.622977,-5.698588,-7.165180,-7.444727,-10.421403,-14.946598,-6.394089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,8,239,1.830444,0.268627,8.307793,9.031835,6.214482,1.269644,5.711309,6.807780,8.713897,13.386693,...,9.101800,13.267187,9.572265,15.014185,1.412256,9.956146,14.188528,8.790695,27.328889,21.880137
2017,8,240,18.324233,15.660260,14.418129,23.772828,12.370442,18.182054,8.939999,18.662675,7.544386,14.920589,...,15.181854,14.939459,20.278295,18.887723,16.633310,9.552260,14.972063,18.948003,13.574421,16.446786
2017,8,241,16.566169,22.465601,17.230447,22.898193,23.492700,8.636046,16.350639,19.103264,23.380780,14.495456,...,12.467416,13.570323,10.833121,16.044663,12.681439,10.229604,9.578618,13.042083,13.967637,21.204424
2017,8,242,27.072338,12.303581,17.985688,18.122251,15.977102,23.999633,17.612772,12.897649,14.327554,21.106765,...,13.684785,12.472459,15.414543,14.163193,15.039113,21.708169,13.100552,12.474592,7.272634,12.642781


In [86]:
# NOTE(review): X_train/X_test are whatever the split cell last produced. The
# loader cell as written reads dew_point_temp, so confirm the air_temp file was
# loaded and split before this cell ran.
air_temp_df_train, air_temp_df_test = split_apply_pca(X_train, X_test, 3)

Explained variation per principal component: [0.77840133 0.813706   0.82611254]


In [107]:
# Keep 3 components for dew point; X_train/X_test come from the split cell above.
dew_point_temp_df_train, dew_point_temp_df_test = split_apply_pca(X_train, X_test , 3)

Explained variation per principal component: [0.76487441 0.80323523 0.81753173]


In [96]:
# NOTE(review): X_train/X_test are whatever the split cell last produced. The
# loader cell as written reads dew_point_temp, so confirm the sea_level_pressure
# file was loaded and split before this cell ran.
sea_level_pressure_df_train, sea_level_pressure_df_test = split_apply_pca(X_train, X_test , 25)

Explained variation per principal component: [0.45728172 0.56665524 0.59535399 0.61529915 0.63091042 0.63950533
 0.64355086 0.64733932 0.65018958 0.65257917 0.65446234 0.65634121
 0.65792931 0.65935585 0.66061323 0.66184959 0.66300465 0.66409477
 0.66513663 0.66615267 0.66716346 0.66815765 0.66914095 0.67011727
 0.67108961]


In [101]:
# NOTE(review): X_train/X_test are whatever the split cell last produced. The
# loader cell as written reads dew_point_temp, so confirm the sky_ceiling_height
# file was loaded and split before this cell ran.
sky_ceiling_height_df_train, sky_ceiling_height_df_test = split_apply_pca(X_train, X_test , 25)

Explained variation per principal component: [0.13350116 0.1874962  0.21916649 0.24133819 0.25602117 0.26850256
 0.27849023 0.28684163 0.29450242 0.30128675 0.30761562 0.31350627
 0.3185929  0.32324738 0.32775596 0.33210736 0.33635488 0.34041331
 0.34399694 0.34737555 0.35064373 0.35373908 0.35682036 0.35981046
 0.36270511]


In [91]:
# NOTE(review): X_train/X_test are whatever the split cell last produced. The
# loader cell as written reads dew_point_temp, so confirm the wind_speed file
# was loaded and split before this cell ran.
wind_speed_df_train, wind_speed_df_test = split_apply_pca(X_train, X_test, 25)

Explained variation per principal component: [0.14788073 0.20102236 0.22464909 0.24381369 0.25934308 0.27226876
 0.28054203 0.28806269 0.2949415  0.3008464  0.30637314 0.311254
 0.31580927 0.32010935 0.32413688 0.32806712 0.33169026 0.33521276
 0.33870685 0.34205112 0.34527276 0.34838105 0.35146422 0.35442248
 0.35720739]


In [108]:
# Build the PCA features for every metric from that metric's own noisy CSV.
# The per-metric cells above all read the notebook-level X_train/X_test, and the
# loader cell is hard-coded to dew_point_temp, so on a fresh top-to-bottom run
# they would all be fitted on dew-point data. Doing the load/split/PCA here makes
# the combined frames independent of which metric was loaded last.
pca_components = [
    ('air_temp', 3),
    ('dew_point_temp', 3),
    ('sea_level_pressure', 25),
    ('sky_ceiling_height', 25),
    ('wind_speed', 25),
]
holdout_days = 122  # same hold-out length as the split cell above

train_parts, test_parts = [], []
for metric, n_components in pca_components:
    metric_df = pd.read_csv("with_noise/" + metric + ".csv").set_index(['year', 'month', 'day'])
    metric_train = metric_df.head(len(metric_df.index) - holdout_days)
    metric_test = metric_df.tail(holdout_days)
    metric_train_pca, metric_test_pca = split_apply_pca(metric_train, metric_test, n_components)
    train_parts.append(metric_train_pca)
    test_parts.append(metric_test_pca)

# Column blocks are concatenated in the order listed above; join='inner' keeps
# only the (year, month, day) rows present for every metric.
X_train = pd.concat(train_parts, axis=1, join='inner')
X_test = pd.concat(test_parts, axis=1, join='inner')

In [110]:
# Persist the combined PCA features; the merge section further down reads these
# two files back in.
X_train.to_csv("X_train_after_pca.csv")
X_test.to_csv("X_test_after_pca.csv")

### Load Forecasts 

In [39]:
def extract_date(df, date_col):
    """Return a copy of ``df`` with calendar fields derived from ``date_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    date_col : str
        Name of a datetime column in ``df``.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with added columns ``hour``, ``week`` (ISO week
        number), ``month``, ``year`` and ``day``. Note that ``day`` is the
        day of the year (1-366), not the day of the month.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy(deep=True)
    stamps = df[date_col].dt

    df['hour'] = stamps.hour
    if hasattr(stamps, 'isocalendar'):
        # Series.dt.week is gone in pandas 2; isocalendar().week is its
        # replacement. Cast to a plain NumPy dtype: int64 when every date is
        # present, float64 (NaN) when some are missing.
        iso_week = stamps.isocalendar().week
        df['week'] = iso_week.astype('int64') if iso_week.notna().all() else iso_week.astype('float64')
    else:
        # Older pandas without isocalendar().
        df['week'] = stamps.week
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['day'] = stamps.dayofyear
    return df

In [40]:
# PJM Load Forecasts: read the 2016 and 2017 files and stack them row-wise.
# The original per-file row labels are kept (no ignore_index).
pjm_df = pd.concat(map(pd.read_csv, ['load_forecasts/pjm/2016.csv', 'load_forecasts/pjm/2017.csv']))

Use only forecasts that were generated one day before the date being forecast.

In [41]:
# Drop the two *_ept columns, then rename the four remaining columns by position.
# NOTE(review): the rename depends on the column order in the CSV files — confirm
# that the order really is (evaluation time, forecast time, area, load).
pjm_df = pjm_df.drop(['evaluated_at_ept', 'forecast_hour_beginning_ept'], axis = 1)
pjm_df.columns = ['Eval At', 'Date', 'area', 'PJM']

# Rows whose area is 'RTO' represent the load forecast for all of PJM
pjm_df = pjm_df.loc[pjm_df['area'] == 'RTO']

pjm_df['Date'] = pd.to_datetime(pjm_df['Date'])
pjm_df['Eval At'] = pd.to_datetime(pjm_df['Eval At'])

# Number of calendar days between the date being forecast and the date the
# forecast was created. Stored as a string, so comparisons must use e.g. '1'.
pjm_df['diff'] = (pjm_df['Date'].dt.date - pjm_df['Eval At'].dt.date).dt.days.astype(str)

In [42]:
# keep only forecasts created one calendar day before the forecast date
# ('diff' is a string column, hence the comparison with '1')
pjm_df = pjm_df.loc[pjm_df['diff'] == '1']

# Keep one forecast per forecast timestamp. This is not random:
# groupby(...).first() takes, per column, the first non-null value in each group.
pjm_df = pjm_df.groupby( ['Date'] ).first().reset_index()

In [43]:
# Add hour/week/month/year/day columns derived from 'Date', then drop the helper
# columns so only the calendar fields and the 'PJM' forecast value remain.
pjm_df = extract_date(pjm_df, 'Date')
pjm_df = pjm_df.drop(['Eval At', 'area', 'diff', 'Date'], axis = 1)

In [44]:
# Pivot so each hour is a separate column: rows are keyed by
# (year, month, week, day) and the two-level column labels are flattened to
# "<value>|<hour>", e.g. "PJM|0" ... "PJM|23".
pjm_df = pjm_df.pivot_table(index = ['year','month', 'week', 'day'], columns="hour", values=pjm_df.columns)
pjm_df.columns = pjm_df.columns.map('{0[0]}|{0[1]}'.format)

In [64]:
pjm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PJM|0,PJM|1,PJM|2,PJM|3,PJM|4,PJM|5,PJM|6,PJM|7,PJM|8,PJM|9,...,PJM|14,PJM|15,PJM|16,PJM|17,PJM|18,PJM|19,PJM|20,PJM|21,PJM|22,PJM|23
year,month,week,day,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2016,1,1,4,98680.0,98233.0,95990.0,92388.0,88038.0,84511.0,82732.0,82389.0,83071.0,85360.0,...,104973.0,105039.0,104475.0,103596.0,102768.0,102112.0,102143.0,105126.0,111499.0,114260.0
2016,1,1,5,114730.0,113313.0,109936.0,104789.0,99408.0,96334.0,94637.0,94140.0,94710.0,97048.0,...,110909.0,108726.0,106343.0,104075.0,102208.0,100737.0,100321.0,102832.0,109274.0,112639.0
2016,1,1,6,115195.0,114570.0,111439.0,106018.0,100376.0,96540.0,94786.0,94203.0,94707.0,96745.0,...,108108.0,105436.0,102659.0,100082.0,98113.0,96553.0,96011.0,98214.0,103979.0,107487.0
2016,1,1,7,113929.0,113019.0,109552.0,104284.0,98685.0,90730.0,88718.0,88009.0,88190.0,90125.0,...,103851.0,101805.0,99590.0,97475.0,95889.0,94492.0,93889.0,95858.0,101363.0,104237.0
2016,1,1,8,110167.0,109116.0,105707.0,100465.0,94637.0,85954.0,83590.0,82568.0,82562.0,84252.0,...,98679.0,97446.0,95756.0,93988.0,92531.0,91204.0,90684.0,92559.0,97657.0,98945.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,12,52,362,122964.0,122308.0,119582.0,114523.0,109280.0,107491.0,106162.0,105848.0,106580.0,109008.0,...,123877.0,121115.0,118449.0,116076.0,114635.0,113652.0,113937.0,117096.0,123643.0,127109.0
2017,12,52,363,126934.0,125372.0,121994.0,116309.0,110704.0,106512.0,104559.0,103694.0,103822.0,105612.0,...,119130.0,116525.0,113653.0,111061.0,109318.0,108173.0,108056.0,110397.0,116076.0,118347.0
2017,12,52,364,117227.0,115747.0,113169.0,109097.0,104353.0,100326.0,98216.0,97217.0,96950.0,97636.0,...,109139.0,108670.0,107436.0,105431.0,103493.0,102321.0,102428.0,104856.0,110089.0,112356.0
2017,12,52,365,112903.0,111850.0,109979.0,106678.0,102780.0,101268.0,99817.0,99262.0,99481.0,100629.0,...,110827.0,109618.0,107977.0,106535.0,105332.0,104565.0,105307.0,108961.0,116035.0,119008.0


### Nuclear 

In [70]:
# Read the 2016 and 2017 generation-by-fuel-type files and stack them row-wise.
actual_gen = pd.concat(map(pd.read_csv, ['gen_by_fuel_type/2016.csv', 'gen_by_fuel_type/2017.csv']))

In [76]:
# NOTE(review): this narrows actual_gen to two columns and overwrites the loaded
# frame. No fuel-type filter is applied, so every row's mw is kept. Any later cell
# that needs the other columns (e.g. fuel_type) has to reload the raw data.
actual_gen = actual_gen[['datetime_beginning_utc', 'mw']]

In [77]:
# Positional rename; requires actual_gen to have exactly two columns.
# NOTE(review): after this, 'datetime_beginning_utc' and 'mw' no longer exist
# under those names in actual_gen.
actual_gen.columns = ['date', 'nuclear_gen']

In [78]:
actual_gen

Unnamed: 0,date,nuclear_gen
0,12/31/2016 5:00:00 AM,35829.6
1,12/31/2016 5:00:00 AM,12545.6
2,12/31/2016 5:00:00 AM,704.8
3,12/31/2016 5:00:00 AM,267.5
4,12/31/2016 5:00:00 AM,33888.5
...,...,...
95233,1/1/2017 5:00:00 AM,7.3
95234,1/1/2017 5:00:00 AM,678.6
95235,1/1/2017 5:00:00 AM,0.0
95236,1/1/2017 5:00:00 AM,0.0


In [46]:
# Reload the raw generation data. The preview cells above reduce actual_gen to two
# renamed columns, so on a top-to-bottom run 'datetime_beginning_utc' and
# 'fuel_type' would no longer be present by the time this cell executes.
actual_gen = pd.concat(map(pd.read_csv, ['gen_by_fuel_type/2016.csv', 'gen_by_fuel_type/2017.csv']))
actual_gen['datetime_beginning_utc'] = pd.to_datetime(actual_gen['datetime_beginning_utc'])

# Add hour/week/month/year/day columns derived from the UTC timestamp.
actual_gen = extract_date(actual_gen, 'datetime_beginning_utc')

# Keep only the nuclear rows and drop the descriptive columns that are not needed.
nuclear = actual_gen.loc[actual_gen['fuel_type'] == 'Nuclear']
nuclear = nuclear.drop(['fuel_type', 'datetime_beginning_ept', 'fuel_percentage_of_total', 'is_renewable'], axis=1)

In [47]:
# Average every remaining column within each (year, week) group.
nuclear_weekly = nuclear.groupby(['year', 'week']).mean().reset_index()

In [48]:
# Keep only the group keys and the averaged generation (mw).
nuclear_weekly = nuclear_weekly[['year', 'week', 'mw']]

In [49]:
# Take the values from the week before as the current week
# shift(1) moves every value down one row, so the first row becomes NaN.
# NOTE(review): this assumes the rows are in chronological order when sorted by
# (year, week). Week numbers at the turn of the year may not satisfy that (e.g.
# early-January dates carrying the previous year's last week number) — verify.
nuclear_weekly['mw'] = nuclear_weekly['mw'].shift(1)

In [50]:
# Positional rename: the (lagged) 'mw' column becomes 'nuclear'.
nuclear_weekly.columns = ['year', 'week', 'nuclear']

### Merge nuclear, load forecasts, and weather together

In [65]:
# Weather Data: reload the PCA features saved earlier. year/month/day come back
# as ordinary columns, and repeated component labels are de-duplicated by
# read_csv (0, 0.1, 0.2, ...), as the output below shows.
X_train = pd.read_csv("X_train_after_pca.csv")
X_test = pd.read_csv("X_test_after_pca.csv")

In [67]:
X_train

Unnamed: 0,year,month,day,0,1,2,0.1,1.1,2.1,0.2,...,15.2,16.2,17.2,18.2,19.2,20.2,21.2,22.2,23.2,24.2
0,2016,1,1,1186.133765,246.110226,-135.702606,1016.584538,283.051438,-28.590519,341.735073,...,6.018641,21.674299,0.703101,11.629154,30.816559,-26.851573,3.034115,-15.672511,-17.470511,23.375869
1,2016,1,2,1234.690011,-32.923771,-108.737031,1177.787831,-37.087207,-101.878537,237.667828,...,-2.156930,-16.715094,-13.753816,-3.007628,8.913964,5.052085,-4.865501,10.827681,-28.482907,0.295123
2,2016,1,3,1221.495520,-21.475024,-7.123256,1136.260062,8.537972,-85.694900,-71.265549,...,-2.818909,19.728749,17.741554,-8.962226,-8.829314,-7.105707,-1.326713,-4.181786,1.759561,6.032705
3,2016,1,4,1461.557551,60.565533,-26.514199,1454.506226,57.198628,-53.223330,394.783055,...,-14.418449,21.757132,-22.568148,7.288453,-1.008589,7.676192,-5.072129,-14.027923,20.493213,-1.517844
4,2016,1,5,1787.757826,-260.890903,-65.020558,1878.187159,-312.777671,-11.738459,1217.816466,...,10.653026,-11.585351,-7.542148,5.887752,-10.417489,-3.022965,9.584577,13.785165,17.964676,0.634902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,2017,8,239,-619.256953,-56.901129,-54.299198,-565.793430,-41.217686,-37.025474,338.033348,...,2.512923,8.307493,10.267223,-0.564225,-0.532988,1.294632,-2.822819,12.873906,-1.779926,0.294248
605,2017,8,240,-618.273705,-43.024962,-24.226846,-717.068469,-171.461491,-57.900321,189.612107,...,9.311822,19.879716,5.677285,-2.284726,6.289365,-9.405595,-0.539387,16.862644,-5.354348,-15.627848
606,2017,8,241,-583.238463,-56.607479,-84.309592,-833.989442,-97.629935,-72.925162,43.961493,...,12.236851,30.476439,11.005522,-5.446577,-6.182445,-7.124181,2.879962,18.053702,-9.963270,-17.382840
607,2017,8,242,-650.208427,-106.879202,-59.719641,-797.903823,-96.838832,-67.675220,-28.881393,...,9.033311,-3.205620,-8.633822,-2.803527,-12.325311,10.933678,-2.767050,13.678854,4.843415,-5.750486


In [52]:
# Reconstruct the datetime column: year * 1000 + day yields YYYYDDD, which is
# parsed with %Y%j, i.e. 'day' is interpreted as the day of the year.
X_train['date'] = pd.to_datetime(X_train['year'] * 1000 + X_train['day'], format='%Y%j')
# Drop the stored calendar columns and regenerate them (plus hour and week) from
# the rebuilt date so they match the keys produced by extract_date elsewhere.
X_train.drop(['year', 'month', 'day'], axis = 1, inplace = True)
X_train = extract_date(X_train, 'date')

In [53]:
# Attach the hourly PJM forecasts (matched per day) and the lagged weekly nuclear
# value (matched per week). Both are inner merges, so rows without a match in
# either table are dropped.
X_train = X_train.merge(pjm_df, how='inner', on=['year', 'month', 'week', 'day'])
X_train = X_train.merge(nuclear_weekly, how='inner', on=['year', 'week'])

In [54]:
# Same date reconstruction as for X_train: YYYYDDD parsed with %Y%j ('day' is the
# day of the year), then the calendar columns are regenerated by extract_date.
X_test['date'] = pd.to_datetime(X_test['year'] * 1000 + X_test['day'], format='%Y%j')
X_test.drop(['year', 'month', 'day'], axis = 1, inplace = True)
X_test = extract_date(X_test, 'date')

In [55]:
# Same inner merges as for X_train: PJM forecasts per day, nuclear value per week.
X_test = X_test.merge(pjm_df, how='inner', on=['year', 'month', 'week', 'day'])
X_test = X_test.merge(nuclear_weekly, how='inner', on=['year', 'week'])

In [56]:
# The helper 'date' column is no longer needed once the calendar fields exist.
X_train.drop('date', axis = 1, inplace=True)
X_test.drop('date', axis = 1, inplace=True)

# remove NaN: drop every row that has a missing value in any column
X_train.dropna(inplace=True)
X_test.dropna(inplace=True)


In [57]:
# Snapshot of the merged features while the plain calendar columns are still present.
X_train.to_csv("X_train_with_dates.csv")
X_test.to_csv("X_test_with_dates.csv")

In [61]:
X_train[['year', 'month', 'day', 'hour']]

Unnamed: 0,year,month,day,hour
1,2016,1,2,0
2,2016,1,3,0
10,2016,1,11,0
11,2016,1,12,0
12,2016,1,13,0
...,...,...,...,...
604,2017,8,239,0
605,2017,8,240,0
606,2017,8,241,0
607,2017,8,242,0


In [63]:
# NOTE(review): Y_test is not defined anywhere in the visible notebook and this
# cell raised NameError when it was run. Define it before this cell or remove
# the cell so the notebook can run top to bottom.
Y_test[['year', 'month', 'day', 'hour']]

NameError: name 'Y_test' is not defined

### Circular encoding for time data

In [31]:
# Circular encoding of day of year and month 
def sin_cos_correction(df):
    """Replace day, month and week with sine/cosine pairs and index by them.

    Each field is mapped to an angle ``(value - 1) * 2*pi / period`` using
    periods of 365 (day), 12 (month) and 52 (week).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``year``, ``month``, ``week`` and ``day`` columns. The
        input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``(year, month_sin, month_cos, week_sin,
        week_cos, day_sin, day_cos)``. The previous row index is kept as a
        regular column (named ``index`` for an unnamed index).
    """
    encoded = df.reset_index()

    # (field, period) pairs; each yields a <field>_sin and a <field>_cos column.
    for field, period in (('day', 365), ('month', 12), ('week', 52)):
        angle = (encoded[field] - 1) * (2. * np.pi / period)
        encoded[field + '_sin'] = np.sin(angle)
        encoded[field + '_cos'] = np.cos(angle)

    index_levels = ['year', 'month_sin', 'month_cos', 'week_sin', 'week_cos', 'day_sin', 'day_cos']
    return encoded.drop(['month', 'day', 'week'], axis=1).set_index(index_levels)

In [32]:
# Replace month/week/day with their sine/cosine encodings. The result is indexed
# by year plus the six encoded values.
X_train = sin_cos_correction(X_train)
X_test = sin_cos_correction(X_test)

In [33]:
# Remove the 'index' column that reset_index() inside sin_cos_correction created
# from the previous row labels.
X_train.drop('index', axis = 1, inplace=True)
X_test.drop('index', axis = 1, inplace=True)

### Standardize

In [35]:
from sklearn.preprocessing import StandardScaler

# Standardize features
# The scaler is fitted on the training values only and then applied to both sets,
# so the test set is transformed with the training set's statistics.
scaler = StandardScaler()
scaler.fit(X_train.values) # fit on training set 

scaled_X_train = scaler.transform(X_train.values)
scaled_X_test = scaler.transform(X_test.values)

# Convert back to df after scaling, restoring the original index and column labels.
# Only the column values are scaled; the index levels are left as they were.
X_train = pd.DataFrame(scaled_X_train, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaled_X_test, index=X_test.index, columns=X_test.columns)

In [36]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,0,1,2,0.1,1.1,2.1,0.2,1.2,2.2,3,...,PJM|15,PJM|16,PJM|17,PJM|18,PJM|19,PJM|20,PJM|21,PJM|22,PJM|23,nuclear
year,month_sin,month_cos,week_sin,week_cos,day_sin,day_cos,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2016,0.0,1.000000,-2.449294e-16,1.000000,0.017213,0.999852,1.394470,-0.171743,-0.954713,1.305931,-0.179915,-0.813239,0.594686,0.037287,-1.523669,0.182418,...,-0.264151,-0.395243,-0.519776,-0.618447,-0.668592,-0.682178,-0.592838,-0.337125,-0.242552,1.066946
2016,0.0,1.000000,-2.449294e-16,1.000000,0.034422,0.999407,1.379780,-0.111648,-0.068307,1.260496,0.043963,-0.684220,-0.164320,-0.713823,-1.008296,-1.517682,...,-0.576319,-0.647002,-0.682334,-0.720079,-0.735344,-0.728809,-0.616502,-0.347117,-0.203628,1.066946
2016,0.0,1.000000,1.205367e-01,0.992709,0.171293,0.985220,2.082393,1.005844,-1.818549,2.078390,0.524839,-0.689358,0.065767,-0.912861,-1.514951,2.615455,...,0.638266,0.424496,0.258874,0.163840,0.083438,0.063566,0.168775,0.537359,0.788443,1.661409
2016,0.0,1.000000,1.205367e-01,0.992709,0.188227,0.982126,1.830166,0.589983,1.536230,1.778961,0.301317,-0.204284,-0.683454,0.424513,-1.806997,-2.985543,...,0.683979,0.459452,0.282561,0.175585,0.107673,0.077649,0.183565,0.499792,0.758899,1.661409
2016,0.0,1.000000,1.205367e-01,0.992709,0.205104,0.978740,2.193189,0.426625,-1.441383,2.267769,-0.137687,-0.794518,0.448179,-0.467549,-1.064648,2.060579,...,1.168521,0.849931,0.611459,0.465648,0.366200,0.325920,0.426273,0.764929,1.098055,1.661409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,-0.5,-0.866025,-7.485107e-01,-0.663123,-0.816538,-0.577292,-0.669646,-0.297602,-0.479836,-0.601721,-0.200183,-0.296218,0.841269,0.328432,1.347634,-0.154613,...,-1.046939,-0.800314,-0.608025,-0.472644,-0.352796,-0.256923,-0.219664,-0.326792,-0.480568,0.916650
2017,-0.5,-0.866025,-8.229839e-01,-0.568065,-0.826354,-0.563151,-0.668551,-0.224765,-0.217506,-0.767231,-0.839276,-0.462637,0.476620,0.571819,1.147256,-0.396427,...,-0.015278,0.069038,0.141681,0.162351,0.167155,0.173491,0.149865,0.032034,-0.091511,0.899915
2017,-0.5,-0.866025,-8.229839e-01,-0.568065,-0.835925,-0.548843,-0.629544,-0.296061,-0.741625,-0.895154,-0.476992,-0.582418,0.118777,0.146622,1.552054,-0.990677,...,-0.144163,-0.098412,-0.048108,-0.039259,-0.032839,-0.023971,-0.055288,-0.174526,-0.297430,0.899915
2017,-0.5,-0.866025,-8.229839e-01,-0.568065,-0.845249,-0.534373,-0.704106,-0.559941,-0.527120,-0.855673,-0.473110,-0.540564,-0.060188,-0.282496,0.289726,0.089844,...,-0.056469,0.054706,0.170034,0.241539,0.301818,0.355787,0.351163,0.234940,0.099347,0.899915


In [69]:
# NOTE(review): y_Train is not defined anywhere in the visible notebook and this
# cell raised NameError when it was run. Define it before this cell or remove
# the cell so the notebook can run top to bottom.
y_Train

NameError: name 'y_Train' is not defined

### Save to csv

In [38]:
# Write the final standardized feature tables (index levels included).
X_train.to_csv("X_Train.csv")
X_test.to_csv("X_Test.csv")

### Get MEF Values from Simple Dispatch