Capstone 1 

All companies that participate in wholesale energy markets are required to submit a quarterly report detailing each transaction to the Federal Energy Regulatory Commission (FERC). This information is then made publicly available for download on FERC’s website at https://eqrreportviewer.ferc.gov/.

First I'll explore the simple answer. Specifically, I will build a function that will calculate the mean, standard deviation, and 95% confidence interval for the years we have data for. Then the function will "predict" the next years prices by averaging the mean and standard deviation of the preeceeding year, and sue those to develop a new 95% confidence interval.  

In [60]:
from scipy import stats
import numpy as np
import pandas as pd
from sklearn import *
from math import sqrt
import numpy as np

data = pd.read_csv(r'C:\Users\Pujanande\Desktop\Capstone 1\all_data_cap1.csv') #import data
data['TRADE_DATE'] = pd.to_datetime(data['TRADE_DATE']) #convert to DT
data['FERC_time'] = 0 #'off_peak' #make everthing off peak and then change it to peak if criteria are met
data['FERC_time'][(data['TRADE_DATE'].dt.weekday <= 5) & (data['TRADE_DATE'].dt.hour >= 6) & (data['TRADE_DATE'].dt.hour <= 21)]=1 #'peak' 

data['FERC_season'] = 0 #'shoulder' #make everthing shoulder and then change it to winter or summer if criteria are met
data['FERC_season'][(data['TRADE_DATE'].dt.month <= 2) | (data['TRADE_DATE'].dt.month >= 12)]= 1 # winter' 
data['FERC_season'][(data['TRADE_DATE'].dt.month <= 8) & (data['TRADE_DATE'].dt.month >= 6)]= 2

#create simple prediction function
def simple_price(season, time):
    season_name = data.loc[data['FERC_season'] == season] #subset data
    period = season_name.loc[season_name['FERC_time'] == time]

    year_list = data['TRADE_DATE'].dt.year #list unique years in data
    year_list = np.unique(year_list)
    
    df = pd.DataFrame(columns=('mean','std','con_int')) #empty df
    
    for y in year_list : #for every year
        year_df = period[period['TRADE_DATE'].dt.year == y] #split years
        mean = year_df.PRICEINDOLPERMWH.mean() #mean of prices
        std = year_df.PRICEINDOLPERMWH.std() #std of prices
        con = stats.norm.interval(0.95, loc=mean, scale=std) #95% confidence range
        df.loc[y] = [mean,std,con] #add to df
        
    pred_mean = df['mean'].mean() #average historical mean
    pred_std = df['std'].mean() #average historical std
    pred_con = stats.norm.interval(0.95, loc=pred_mean, scale=pred_std) #95% confidence range
        
    df.loc['pred'] = [pred_mean,pred_std,pred_con] #add to df 
        
    return print(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [61]:
# Legend:
    # FERC Season
        # 0 = off_peak
        # 1 = peak
    # FERC Time
        # 0 = Shoulder
        # 1 = Winter
        # 2 = Summer

simple_price(0,0)

           mean       std                         con_int
2014  25.735525  5.117542  (15.7053270736, 35.7657227646)
2015  17.450875  7.154513  (3.42828760952, 31.4734627899)
2016  16.172487  5.932291  (4.54541091099, 27.7995625716)
pred  19.786296  6.068115  (7.89300853135, 31.6795827087)


Exploring the posibiity of using a regression model to predict future prices is a more interesting, but flawed, approach.

In [3]:
data.dtypes #check types

DATE                              object
SELLER_COMPANY                    object
SELLER_COMPANY_OLD                object
C_BUYER_NAME                      object
C_BUYER_NAME_OLD                  object
Region                            object
Contract_Service_Agreement_id      int64
TR_CONTRACT_ID                     int64
loc                               object
TR_TIMEZONE                       object
TR_CLASS_NAME                     object
PRICEINDOLPERMWH                 float64
TR_DELV_SPEC_LOC                  object
TRADE_DATE                        object
HOUROFDAY                          int64
QUANTITYINMWH                    float64
HOURLYTRANSCHARGE                float64
HOUR_FREQ                          int64
weighted_pricemw                 float64
index_loc                        float64
index_bench                      float64
index_loc_seller                 float64
index_loc_oldseller              float64
transcharge                        int64
price_minus_inde

Everything looks good, with the exception of the 'TRADE_DATE' column. This column contains the date and hour that the trade occured and should be a date time, but during upload it was classified as an object. So my next step is to convert it to a data time and check the results.

In [4]:
data['TRADE_DATE'] = pd.to_datetime(data['TRADE_DATE']) #convert to DT
data['TRADE_DATE'].head()

0   2014-01-01 22:00:00
1   2014-01-01 18:00:00
2   2014-01-01 02:00:00
3   2014-01-01 03:00:00
4   2014-01-01 04:00:00
Name: TRADE_DATE, dtype: datetime64[ns]

Since the goal of this project was to predict prices for a FERC specific time period and season I used the following to create these variables in the data frame.

In [5]:
data['FERC_time'] = 0 #'off_peak' #make everthing off peak and then change it to peak if criteria are met
data['FERC_time'][(data['TRADE_DATE'].dt.weekday <= 5) & (data['TRADE_DATE'].dt.hour >= 6) & (data['TRADE_DATE'].dt.hour <= 21)]=1 #'peak' 

data['FERC_season'] = 0 #'shoulder' #make everthing shoulder and then change it to winter or summer if criteria are met
data['FERC_season'][(data['TRADE_DATE'].dt.month <= 2) | (data['TRADE_DATE'].dt.month >= 12)]= 1 # winter' 
data['FERC_season'][(data['TRADE_DATE'].dt.month <= 8) & (data['TRADE_DATE'].dt.month >= 6)]= 2 #'summer'
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,DATE,SELLER_COMPANY,SELLER_COMPANY_OLD,C_BUYER_NAME,C_BUYER_NAME_OLD,Region,Contract_Service_Agreement_id,TR_CONTRACT_ID,loc,TR_TIMEZONE,...,transcharge,price_minus_index,price_above_trans,price_above_trans_mwh,transaction_len,year,qtr,benchhub,FERC_time,FERC_season
0,1/1/2014,EXELON,"EXELON GENERATION COMPANY, LLC",THE ENERGY AUTHORITY,THE ENERGY AUTHORITY,Florida,12870,717562,FPL,EASTERNPREVAILING,...,8,-1.471054,0,0.0,Hourly,2014,1,FPL,0,1
1,1/1/2014,SOUTHERN COMPANY,"SOUTHERN COMPANY SERVICES, INC. (AS AGENT)",SEMINOLE ELECTRIC COOP,"SEMINOLE ELECTRIC COOPERATIVE, INC.",Florida,481,1888312,FPL,CENTRALSTANDARD,...,8,2.114532,0,0.0,Hourly,2014,1,FPL,1,1
2,1/1/2014,THE ENERGY AUTHORITY,"THE ENERGY AUTHORITY, INC.",J.P. MORGAN CHASE & COMPANY,JP MORGAN VENTURES ENERGY CORPORATION,Florida,30760,892022,FPL,EASTERNPREVAILING,...,8,0.893512,0,0.0,Hourly,2014,1,FPL,0,1
3,1/1/2014,THE ENERGY AUTHORITY,"THE ENERGY AUTHORITY, INC.",EXELON,"EXELON GENERATION COMPANY, LLC",Florida,30789,892011,FPL,EASTERNPREVAILING,...,8,1.272839,0,0.0,Hourly,2014,1,FPL,0,1
4,1/1/2014,THE ENERGY AUTHORITY,"THE ENERGY AUTHORITY, INC.",EXELON,"EXELON GENERATION COMPANY, LLC",Florida,30789,892011,FPL,EASTERNPREVAILING,...,8,1.407159,0,0.0,Hourly,2014,1,FPL,0,1


Based on the conclusions I reached when exploring the data (see data story) I need variables for all of the following:

1.	Time
    *	Hour
    *	Day
2.	FERC season
3.	FERC period
4.	Frequency of trading
5.	Location
6.	Entity
    *	Entity as seller
    *	Entity as purchaser 

In [6]:
data['is_sunday'] = data['TRADE_DATE'].dt.weekday == 6 #extract sunday
data['is_saturday'] = data['TRADE_DATE'].dt.weekday == 5 #extract saturday
data['is_weekday'] = data['TRADE_DATE'].dt.weekday <= 5 #extract weekday

data_y = data.copy() #copy the data

#subset data to predictive factors
data_x = data[['FERC_time','FERC_season','is_sunday','is_saturday','is_weekday','HOUR_FREQ',
               'HOUROFDAY','SELLER_COMPANY','C_BUYER_NAME','TR_DELV_SPEC_LOC']]

#get dummies for categorical data and check
data_x = pd.get_dummies(data_x)
data_x.head()

Unnamed: 0,FERC_time,FERC_season,is_sunday,is_saturday,is_weekday,HOUR_FREQ,HOUROFDAY,SELLER_COMPANY_CARGILL,SELLER_COMPANY_EXELON,SELLER_COMPANY_MORGAN STANLEY,...,TR_DELV_SPEC_LOC_POU,TR_DELV_SPEC_LOC_SC,TR_DELV_SPEC_LOC_SCEG,TR_DELV_SPEC_LOC_SOCO,TR_DELV_SPEC_LOC_SOCOLOAD,TR_DELV_SPEC_LOC_SSN,TR_DELV_SPEC_LOC_TAL,TR_DELV_SPEC_LOC_TEC,TR_DELV_SPEC_LOC_TEC/FPL,TR_DELV_SPEC_LOC_TREDWINDS SUBSTATION
0,0,1,False,False,True,1,22,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,False,False,True,1,18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,False,False,True,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,False,False,True,3,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,False,False,True,3,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now that I have my data the next step is to strip out the price as a seperate variable and split the data between a training and testing set.

In [7]:
#split data into traning and testing set
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(data_x, data_y.PRICEINDOLPERMWH, random_state = 1)

Now that we have our training and testing data we run the regression and train our model

In [8]:
#use SGDRegressor w/200 iterations and fit training data
regr= linear_model.SGDRegressor(n_iter=200, verbose=3)
regr = regr.fit(X_train,Y_train)

-- Epoch 1
Norm: 17.03, NNZs: 87, Bias: 13.652213, T: 59337, Avg. loss: 37.964727
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 19.04, NNZs: 87, Bias: 14.476777, T: 118674, Avg. loss: 36.156103
Total training time: 0.22 seconds.
-- Epoch 3
Norm: 20.44, NNZs: 87, Bias: 14.838677, T: 178011, Avg. loss: 35.334676
Total training time: 0.34 seconds.
-- Epoch 4
Norm: 21.52, NNZs: 87, Bias: 15.166336, T: 237348, Avg. loss: 34.821800
Total training time: 0.44 seconds.
-- Epoch 5
Norm: 22.65, NNZs: 87, Bias: 15.403474, T: 296685, Avg. loss: 34.477039
Total training time: 0.55 seconds.
-- Epoch 6
Norm: 23.65, NNZs: 87, Bias: 15.599384, T: 356022, Avg. loss: 34.215623
Total training time: 0.64 seconds.
-- Epoch 7
Norm: 24.46, NNZs: 87, Bias: 15.752540, T: 415359, Avg. loss: 34.003022
Total training time: 0.74 seconds.
-- Epoch 8
Norm: 25.23, NNZs: 87, Bias: 15.830075, T: 474696, Avg. loss: 33.832549
Total training time: 0.84 seconds.
-- Epoch 9
Norm: 25.91, NNZs: 87, Bias: 15.935297, T: 534

Next use our test set to see how well the model did by computing the root mean square error.

In [9]:
#do prediction with testing data and compute mean square error
Y_pred = regr.predict(X_test)
rmse = sqrt(metrics.mean_squared_error(Y_test, Y_pred))
rmse 

7.7855618488354095

We compute our 90% confidence interval.

In [10]:
#compute how much more or less the price will be
con_int = (1.645 * (Y_pred.std()/sqrt(len(Y_pred))))

print("+/-:",con_int)

+/-: 0.0775209134841


According to the above our the predicted price is +/- $.06 of the actual price.

We can check for overfitting by checking the mean squared error of both the training and testing set.

In [11]:
Y_pred_train = regr.predict(X_train) #use training set to predict

mse_train = metrics.mean_squared_error(Y_pred_train, Y_train) #training error
mse_test = metrics.mean_squared_error(Y_pred, Y_test) #testing error

print("training error: ",mse_train)
print("testing error: ",mse_test)

training error:  61.5180873223
testing error:  60.614973302


Our errors are very similar suggesting a good fit.

Next we will cross validate the data using the kfolds method.

In [12]:
kfold = cross_validation.KFold(len(data_x),n_folds=5, random_state=1)
r_pred_error = [] #list to record errors

for train, test in kfold:
    kx_train, kx_test, ky_train, ky_test = data_x.iloc[train], data_x.iloc[test], data_y.PRICEINDOLPERMWH[train], data_y.PRICEINDOLPERMWH[test]
    regr_k = linear_model.SGDRegressor()
    regr_k.fit(kx_train, ky_train)
    pred = regr_k.predict(kx_test)
    r_pred_error.append(sqrt(metrics.mean_squared_error(pred, ky_test)))

print(r_pred_error)
print(np.mean(r_pred_error))

[10.63474841899784, 10.028800073793162, 5.552597706347297, 8.030273868581208, 9.41490708242606]
8.73226543003


The mean of our errors seems to vary somewhat, however, the mean is somewhat higher. 

Finally, since the goal of this project was to predict the prices in each of the predifiened periods and seasons we will compute the actual averages with the predicted averages and compare results.

In [13]:
# Legend:
    # FERC Season
        # 0 = off_peak
        # 1 = peak
    # FERC Time
        # 0 = Shoulder
        # 1 = Winter
        # 2 = Summer

def compare_season_price(season, time):
    season_name = data_y.loc[data_y['FERC_season'] == season] #subset y data
    period = season_name.loc[season_name['FERC_time'] == time]
    
    real_prices = period.PRICEINDOLPERMWH #avg actual prices
    average_price = real_prices.mean()
    
    season_x = data_x.loc[data_x['FERC_season'] == season] #subset x data
    period_x = season_x.loc[season_x['FERC_time'] == time]
    
    csp_pred = regr.predict(period_x) #predict future prices
    
    pred_price = csp_pred.mean() #avg prdicted prices
    rmse = sqrt(metrics.mean_squared_error(csp_pred, real_prices)) #mean square error
    csp_con_int = (1.645 * (csp_pred.std()/sqrt(len(csp_pred)))) #confidence interval
    
    return print("RMSE:",rmse), print("confidence +/-",csp_con_int),print("actual price:",average_price),print("pred price:",pred_price)

In [14]:
#Shoulder off-peak
compare_season_price(0,0)

RMSE: 6.473245775946494
confidence +/- 0.0712951715218
actual price: 21.0122451050622
pred price: 20.4081076837


(None, None, None, None)

In [15]:
#Shoulder peak
compare_season_price(0,1)

RMSE: 8.096920128844886
confidence +/- 0.0730617200929
actual price: 25.02156445864577
pred price: 24.2325466621


(None, None, None, None)

In [16]:
#Winter off-peak
compare_season_price(1,0)

RMSE: 6.9730993840955975
confidence +/- 0.0963953206647
actual price: 19.88418845315916
pred price: 20.9370141605


(None, None, None, None)

In [17]:
#Winter peak
compare_season_price(1,1)

RMSE: 7.581418061350178
confidence +/- 0.0769510971499
actual price: 21.24755833581574
pred price: 23.5363858008


(None, None, None, None)

In [18]:
#Summer off-peak
compare_season_price(2,0)

RMSE: 6.338269225504091
confidence +/- 0.102961115332
actual price: 22.869034327879284
pred price: 22.2540975983


(None, None, None, None)

In [19]:
#Summer peak
compare_season_price(2,1)

RMSE: 10.507748136402151
confidence +/- 0.123103595401
actual price: 28.402529495887926
pred price: 26.6050389351


(None, None, None, None)