In [1]:
import pandas as pd
import numpy as np
import feather
import pvlib
import sys
sys.path.append('..')
from src.utils.download_data import timer

In [2]:
# Absolute directory prefix for data files. The path is machine-specific;
# adjust it when running the notebook on another host.
DATA_PATH = '/home/SHARED/SOLAR/data/'

In [3]:
# Read the minute data and location info from a pickle given by a relative
# path, i.e. resolved against the current working directory.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading;
# only use it on files from a trusted source.
# Alternative source under DATA_PATH, kept for reference:
#df = pd.read_pickle(DATA_PATH + 'oahu_min_cs.pkl')
df = pd.read_pickle('pvlib.pkl')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,Radiation,GH,GT,ClearSky,Ineichen,Haurwitz,Solis
Datetime,Location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-03-19 14:15:00-10:00,AP1,476.328,,973.62924,840.960548,871.749955,907.496424
2010-03-19 14:15:00-10:00,AP3,382.777,,973.625962,841.60095,871.733426,907.47759
2010-03-19 14:15:00-10:00,AP4,351.61,,973.624679,841.594,871.726958,907.47022
2010-03-19 14:15:00-10:00,AP5,390.092,,973.630994,841.628209,871.758798,907.506499
2010-03-19 14:15:00-10:00,AP6,353.928,343.313,973.628714,841.615852,871.747297,907.493395


Compute Haurwitz based on sun altitude:

`ghi_csm = 1098 * math.sin(math.radians(altitude)) * math.exp(-0.057 / (math.sin(math.radians(altitude))))`

In [5]:
# (rows, columns) of the full frame.
df.shape

(9058229, 6)

Replace negative `GH` values with 0

In [6]:
# Zero out negative GH readings. clip() leaves missing readings as NaN;
# a `>= 0` mask would also overwrite them with 0, since NaN >= 0 is False.
df['GH'] = df['GH'].clip(lower=0)

In [7]:
# Per-column summary statistics, displayed through the pandas Styler.
df.describe().style

Radiation,GH,GT,ClearSky,Ineichen,Haurwitz,Solis
count,9058230.0,1065670.0,9058230.0,9058230.0,9058230.0,9058230.0
mean,369.962,312.06,612.044,432.497,479.78,482.42
std,351.038,344.807,372.782,348.093,365.39,374.732
min,0.0,-0.40968,0.0,0.0,0.0,0.0
25%,42.3027,31.5347,240.128,34.2275,76.3822,72.1068
50%,269.047,170.254,802.767,442.5,509.373,498.184
75%,619.537,519.977,900.305,762.037,816.827,832.076
max,1700.35,1587.02,1022.68,976.033,1035.09,1061.95


In [8]:
# Keep only the observations taken between 07:30 and 17:30.
# between_time() filters on the time of day of the index, so the innermost
# index level is pivoted into columns first and stacked back afterwards.
df1 = (
    df
    .unstack()
    .between_time('7:30', '17:30')
    .stack()
)
df1.shape

(6041579, 6)

Statistics for the different clearsky models:
  * **ClearSky**: model from `pysolar` library
  * **Ineichen**: `pvlib`'s Ineichen model
  * **Haurwitz**: `pvlib`'s Haurwitz model
  * **Solis**: `pvlib`'s simplified Solis model

We compute the main descriptive statistics plus the number of times the GHI is greater than the clearsky model. We also check if the clearsky model returns a GHI of 0 when the sensor has a value greater than 0 (`#CS=0`)

In [9]:
# For every clear-sky model, summarise the ratio of the measured GHI ('GH')
# to the modelled clear-sky GHI over the daytime frame df1. Each ratio is
# also stored on df1 as a 'GHI_<model>' column.

# The measured-is-zero mask does not depend on the model: compute it once.
measured_is_zero = np.isclose(df1['GH'], 0)

res = []
for clearsky in ('ClearSky', 'Ineichen', 'Haurwitz', 'Solis'):
    name = 'GHI_{}'.format(clearsky)
    model_is_zero = np.isclose(df1[clearsky], 0)

    # Ratio measured / modelled; set to 1 wherever the model is ~0 so that
    # the division by zero never reaches the statistics.
    df1[name] = np.where(model_is_zero, 1, df1['GH'] / df1[clearsky])

    summ = df1[name].describe()
    q90, q99 = df1[name].quantile(q=[0.9, 0.99])
    # Share and count of samples where the measurement exceeds the model.
    # Aggregators are given by name: passing np.mean / np.sum callables is
    # deprecated in recent pandas releases.
    per_gt_1, num_gt_1 = df1[name].gt(1).agg(['mean', 'sum'])

    summ['90%'] = q90
    summ['99%'] = q99
    summ['#>1'] = num_gt_1
    summ['%>1'] = per_gt_1
    # Rows where the sensor reads a non-zero value but the model returns ~0.
    # Counting the mask avoids materialising the matching rows.
    summ['#CS=0'] = int((~measured_is_zero & model_is_zero).sum())
    summ.name = clearsky
    res.append(summ)

# One column per model, one row per statistic.
pd.concat(res, axis=1, sort=False).style

Unnamed: 0,ClearSky,Ineichen,Haurwitz,Solis
count,6041580.0,6041580.0,6041580.0,6041580.0
mean,0.609836,0.872554,0.771898,0.772333
std,0.339151,0.403347,0.34579,0.345167
min,0.0,0.0,0.0,0.0
25%,0.318779,0.478783,0.425978,0.426563
50%,0.561595,1.05537,0.933943,0.95859
75%,0.868854,1.17201,1.04484,1.03738
max,3.16413,29.1443,12.4256,12.4105
90%,1.10763,1.28658,1.14367,1.1355
99%,1.36761,1.63198,1.34406,1.34148


The `detect_clearsky()` function has to be run separately for each station and day, because it does not support an index with unequal time differences.

In [10]:
# Run pvlib's clear-sky detection for one station-day (AP1 on 2010-03-20),
# comparing the measured GHI against the Ineichen model, and time the run.
with timer():
    # Select the station-day once and reuse it, instead of slicing the full
    # frame separately for each argument.
    station_day = df.loc[('2010-03-20', 'AP1'), ['GH', 'Ineichen']]
    cs = pvlib.clearsky.detect_clearsky(
        station_day['GH'],
        station_day['Ineichen'],
        station_day.index.get_level_values('Datetime'),
        10,  # window_length argument of detect_clearsky
    )

	To accept the future behavior, pass 'dtype=object'.
	To keep the old behavior, pass 'dtype="datetime64[ns]"'.
  a = asanyarray(a)
  meas_slope_nstd = np.std(meas_slope, axis=0, ddof=1) / meas_mean
  c4 = meas_slope_nstd < var_diff


Elapsed time (s): 78.389561


In [11]:
# First rows of the 2010-03-20 / AP1 slice where the detection mask `cs` is True.
df.loc[('2010-03-20', 'AP1'), ['GH', 'Ineichen']].loc[cs.to_numpy()].head()

Unnamed: 0_level_0,Radiation,GH,Ineichen
Datetime,Location,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-03-20 06:58:00-10:00,AP1,44.0718,14.337
2010-03-20 06:59:00-10:00,AP1,47.3772,16.2033
2010-03-20 07:00:00-10:00,AP1,50.6826,18.179851
2010-03-20 07:01:00-10:00,AP1,54.3553,20.262922
2010-03-20 07:02:00-10:00,AP1,57.6608,22.448634
