# Introduction

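This notebook tests a simple pre-filter for irradiance data ahead of clear-sky detection.  For each time of day, the average and standard deviation of the measured irradiance are computed across the days in the sample, and readings that fall below avg - std are filtered out.  The filtered and unfiltered series are then passed through the model-free clear-sky detection methods and the detected clear periods are compared.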

# Imports, configs, etc

In [1]:
import numpy as np
import pandas as pd
import datetime
import pvlib
import clearsky_detect_model_free
import preprocess

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook

import os, sys
lib_path = os.path.abspath(os.path.join('..', 'rdtools'))
sys.path.append(lib_path)
import filtering

%load_ext autoreload
%autoreload 2

# Load irradiance data

Weather and performance data are taken at 1-minute intervals.  The data come from the Sandia National Laboratories Regional Test Center in Albuquerque, NM, and span roughly 2016 through April 2017.  The data were scraped from PVDAQ (http://bit.ly/2mKrOwG).  The meteorological data and the performance data are contained in two separate files (technically two separate systems: the PV system and the meteorological station).

In [2]:
file = os.path.expanduser('~/data_sets/snl_raw_data/1429_1405/raw_1405_weather_for_1429.csv')

In [3]:
cols = ['Global_Wm2', 'Date-Time']

In [4]:
data = pd.read_csv(file, parse_dates=['Date-Time'], usecols=cols, index_col=['Date-Time'])

In [5]:
data.index = data.index.tz_localize('Etc/GMT+7')

In [6]:
# Put the data on a gap-free 1-minute grid spanning the first to the last
# timestamp; any value that is missing after reindexing is filled with 0.
full_minute_index = pd.date_range(start=data.index[0], end=data.index[-1], freq='1min')
data = data.reindex(full_minute_index).fillna(0)

In [7]:
data = pd.Series(data['Global_Wm2'], index=data.index)

In [8]:
print(type(data))

<class 'pandas.core.series.Series'>


In [9]:
data[data < 0] = 0

# Set up PVLib system

In [10]:
# Site description used to build the pvlib objects in the next cell.
# NOTE(review): units are not stated in this notebook; pvlib documents angles
# and coordinates in degrees and altitude in meters -- confirm.
site_azimuth = 180       # passed to PVSystem as surface_azimuth
site_elevation = 1658    # passed to Location as altitude
site_tilt = 35           # passed to PVSystem as surface_tilt
site_lat = 35.0549       # passed to Location as latitude
site_lon = -106.5433     # passed to Location as longitude

In [11]:
# Combine the array geometry (tilt/azimuth) and the site location into a single
# object; `rtc` is used below to request clear-sky irradiance.
# NOTE(review): LocalizedPVSystem appears to have been deprecated and removed in
# newer pvlib releases -- confirm the pinned pvlib version before re-running.
rtc_no_loc = pvlib.pvsystem.PVSystem(surface_tilt=site_tilt, surface_azimuth=site_azimuth)
rtc_loc = pvlib.location.Location(site_lat, site_lon, altitude=site_elevation)
rtc = pvlib.pvsystem.LocalizedPVSystem(pvsystem=rtc_no_loc, location=rtc_loc)

# Test method

In [12]:
def metrics_plot(series, is_clear, result):
    """Draw a series as a line and circle the points flagged as clear.

    Parameters
    ----------
    series : pandas.Series (or similar object with an ``index``)
        Values plotted against ``series.index``.
    is_clear : boolean mask
        Selects the points of ``series`` that receive an unfilled green marker.
    result : object
        Not used by this function; accepted so existing calls keep working.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 2.5))
    ax.set_title('Clear periods')

    # Full series as a line.
    ax.plot(series.index, series)

    # Flagged points on top, as hollow markers.
    clear_times = series.index[is_clear]
    clear_values = series[is_clear]
    ax.scatter(clear_times, clear_values, facecolor='none', edgecolor='green')

    fig.tight_layout()

In [13]:
def pvlib_compare_plot(sample, is_clear, pvlib_is_clear):
    """Plot a series and mark where two clear-sky masks agree or disagree.

    Parameters
    ----------
    sample : pandas.Series (or similar object with an ``index``)
        Values plotted against ``sample.index``.
    is_clear : boolean mask
        First mask; points set only here are drawn in green ('unfiltered').
    pvlib_is_clear : boolean mask
        Second mask; points set only here are drawn in red ('filtered').
        Points set in both masks are drawn in yellow ('both').
    """
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 2.5))
    ax.plot(sample.index, sample)

    # Each category: (mask, marker edge colour, legend label).
    categories = (
        (is_clear & ~pvlib_is_clear, 'green', 'unfiltered'),
        (pvlib_is_clear & ~is_clear, 'red', 'filtered'),
        (pvlib_is_clear & is_clear, 'yellow', 'both'),
    )
    for mask, edge_colour, label in categories:
        ax.scatter(sample.index[mask], sample[mask],
                   facecolor='none', edgecolor=edge_colour, label=label)

    ax.legend()
    fig.tight_layout()

## Clear sky verification

In [14]:
# Keep only timestamps from 2016-10-01 (inclusive) up to 2016-11-01 (exclusive).
in_window = (data.index >= '2016-10-01') & (data.index < '2016-11-01')
sample = data[in_window]

# Open question: min/max scale each day?  Summer months are inherently favored
# over winter days if we remove days based on irradiance without any processing.

In [15]:
# Request clear-sky values at the sample timestamps and keep the 'ghi' column
# as a Series on the sample's index.
clearsky_frame = rtc.get_clearsky(sample.index)
clear_skies = pd.Series(clearsky_frame['ghi'], index=sample.index)

In [16]:
# Integrate each day's readings with the trapezoid rule (unit sample spacing).
# The result is indexed by the days actually present in `sample`, so the index
# cannot fall out of step with the data if the sample window changes or a day
# is missing (a hard-coded date range would raise on a length mismatch).
days = []
daily_totals = []
for day, group in sample.groupby(sample.index.date):
    days.append(day)
    daily_totals.append(np.trapz(group.values))
daily_irrad = pd.Series(daily_totals, index=pd.to_datetime(days))

In [17]:
# Build one time-of-day-indexed Series per calendar day (named after the day),
# drop the days whose readings are all zero, and place the rest side by side
# as columns of `df`.
day_profiles = (
    pd.Series(group.values, index=group.index.time, name=day)
    for day, group in sample.groupby(sample.index.date)
)
series_list = [profile for profile in day_profiles if not (profile == 0).all()]
df = pd.concat(series_list, axis=1)

In [18]:
# Per-time-of-day statistics across the day columns of `df`: mean, standard
# deviation, and the one-standard-deviation band around the mean.
row_avg = df.mean(axis=1)
row_std = df.std(axis=1)
df2 = pd.DataFrame(
    {
        'avg': row_avg,
        'std': row_std,
        'avg_plus_std': row_avg + row_std,
        'avg_less_std': row_avg - row_std,
    },
    # Explicit order so the CSV columns do not depend on dict ordering.
    columns=['avg', 'std', 'avg_plus_std', 'avg_less_std'],
)
df2.to_csv('tmp.csv')

In [19]:
# Overlay every day's profile with the across-day average (solid black) and the
# avg +/- std band (dashed black), then circle each point outside the band.
fig, ax = plt.subplots()
df.plot(legend=False, ax=ax)
df2['avg'].plot(ax=ax, c='black')
df2['avg_plus_std'].plot(ax=ax, c='black', linestyle='--')
df2['avg_less_std'].plot(ax=ax, c='black', linestyle='--')
ax.set_title('Daily profiles with avg +/- std band')
ax.set_ylabel('Global_Wm2')

for col in df:
    try:
        within_band = (df[col] <= df2['avg_plus_std']) & (df[col] >= df2['avg_less_std'])
        outliers = df[col][~within_band]
        ax.scatter(outliers.index, outliers, marker='o', edgecolor='black',
                   facecolor='none', zorder=10, alpha=.2)
    except Exception as err:
        # Marking outliers stays best-effort, but a failure is reported rather
        # than silently swallowed (and KeyboardInterrupt is no longer caught).
        print('Could not mark outliers for {}: {}'.format(col, err))

<IPython.core.display.Javascript object>

There are many 'reasonable-looking' periods early in the morning and late in the day that are just more than one standard deviation above the average.  Since this method mostly struggles with periods of low irradiance, we will only filter out irradiances below the avg - std threshold (for now).

## Investigation

In [20]:
# In-notebook version of the filter: every reading below the (avg - std)
# threshold for its time of day is replaced by -inf.
#
# Thresholds are looked up by each timestamp's time-of-day label instead of
# being compared position-by-position one day at a time.  A day whose number of
# samples differs from the threshold table is therefore filtered too, rather
# than being skipped by a blanket `except: pass`.  Timestamps whose time of day
# has no threshold get NaN, which compares False and leaves the value untouched.
sample2 = sample.copy()
low_threshold = df2['avg_less_std'].reindex(sample2.index.time).values
sample2[sample2.values < low_threshold] = -np.inf

In [32]:
filtered_sample, components = preprocess.zscore_time_filter(sample, verbose=True)

  zscores = pd.Series((group.values - stats['avg'].values) / stats['std'].values, index=group.index)


In [33]:
pd.unique(components['mask'])

array([False,  True], dtype=bool)

In [36]:
sample[components['mask']].index

DatetimeIndex(['2016-10-01 11:44:00-07:00', '2016-10-01 11:45:00-07:00',
               '2016-10-01 11:46:00-07:00', '2016-10-01 11:47:00-07:00',
               '2016-10-01 11:48:00-07:00', '2016-10-01 11:49:00-07:00',
               '2016-10-01 11:50:00-07:00', '2016-10-01 11:51:00-07:00',
               '2016-10-01 11:52:00-07:00', '2016-10-01 12:36:00-07:00',
               ...
               '2016-10-31 17:06:00-07:00', '2016-10-31 17:07:00-07:00',
               '2016-10-31 17:08:00-07:00', '2016-10-31 17:09:00-07:00',
               '2016-10-31 17:10:00-07:00', '2016-10-31 17:11:00-07:00',
               '2016-10-31 17:12:00-07:00', '2016-10-31 17:13:00-07:00',
               '2016-10-31 17:14:00-07:00', '2016-10-31 17:15:00-07:00'],
              dtype='datetime64[ns, Etc/GMT+7]', length=3402, freq=None)

In [45]:
components['mask']

2016-10-01 00:00:00-07:00    False
2016-10-01 00:01:00-07:00    False
2016-10-01 00:02:00-07:00    False
2016-10-01 00:03:00-07:00    False
2016-10-01 00:04:00-07:00    False
2016-10-01 00:05:00-07:00    False
2016-10-01 00:06:00-07:00    False
2016-10-01 00:07:00-07:00    False
2016-10-01 00:08:00-07:00    False
2016-10-01 00:09:00-07:00    False
2016-10-01 00:10:00-07:00    False
2016-10-01 00:11:00-07:00    False
2016-10-01 00:12:00-07:00    False
2016-10-01 00:13:00-07:00    False
2016-10-01 00:14:00-07:00    False
2016-10-01 00:15:00-07:00    False
2016-10-01 00:16:00-07:00    False
2016-10-01 00:17:00-07:00    False
2016-10-01 00:18:00-07:00    False
2016-10-01 00:19:00-07:00    False
2016-10-01 00:20:00-07:00    False
2016-10-01 00:21:00-07:00    False
2016-10-01 00:22:00-07:00    False
2016-10-01 00:23:00-07:00    False
2016-10-01 00:24:00-07:00    False
2016-10-01 00:25:00-07:00    False
2016-10-01 00:26:00-07:00    False
2016-10-01 00:27:00-07:00    False
2016-10-01 00:28:00-

In [44]:
fig, ax = plt.subplots()
sample.plot(ax=ax)
ax.scatter(sample[components['mask']].index, sample[components['mask']], facecolor='none', edgecolor='black')

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x121b9e198>

In [None]:
np.allclose(filtered_sample.replace(-np.inf, 0).dropna().values, sample2.replace(-np.inf, 0).dropna().values, equal_nan=True)

In [None]:
# Dump the output of preprocess.zscore_time_filter ('filtered'), the in-notebook
# filter result ('ipy') and their difference side by side for manual inspection.
# NOTE(review): this writes to 'tmp.csv', the same path the avg/std table is
# written to earlier in the notebook, so that file is overwritten here.
tmp = pd.DataFrame()
tmp['filtered'] = filtered_sample
tmp['ipy'] = sample2
tmp['diff'] = filtered_sample - sample2
tmp.to_csv('tmp.csv')

In [None]:
filtered_sample.replace([-np.inf, np.inf], 0).describe()

In [None]:
sample2.replace([-np.inf, np.inf], 0).describe()

In [None]:
diff = (filtered_sample - sample2)# .replace(-np.inf, 0)

In [None]:
diff

### Standard method

In [None]:
is_clear, result = clearsky_detect_model_free.model_free_detect(sample, verbose=True)

In [None]:
metrics_plot(sample, is_clear, result)

In [None]:
is_clear_filtered, result_filtered = clearsky_detect_model_free.model_free_detect(filtered_sample, verbose=True)

In [None]:
metrics_plot(sample, is_clear_filtered, result_filtered)

In [None]:
print(len(is_clear[is_clear]), len(is_clear_filtered[is_clear_filtered]))

In [None]:
pvlib_compare_plot(sample, is_clear, is_clear_filtered)

### Mean method

In [None]:
is_clear, result = clearsky_detect_model_free.model_free_detect_meanval(sample, verbose=True)

In [None]:
metrics_plot(sample, is_clear, result)

In [None]:
is_clear_filtered, result_filtered = clearsky_detect_model_free.model_free_detect_meanval(sample2, verbose=True)

In [None]:
metrics_plot(sample, is_clear_filtered, result_filtered)

In [None]:
print(len(is_clear[is_clear]), len(is_clear_filtered[is_clear_filtered]))

In [None]:
pvlib_compare_plot(sample, is_clear, is_clear_filtered)

### Democratic method

In [None]:
is_clear, result = clearsky_detect_model_free.model_free_detect_democratic(sample, verbose=True)

In [None]:
metrics_plot(sample, is_clear, result)

In [None]:
is_clear_filtered, result_filtered = clearsky_detect_model_free.model_free_detect_democratic(sample2, verbose=True)

In [None]:
metrics_plot(sample, is_clear_filtered, result_filtered)

In [None]:
print(len(is_clear[is_clear]), len(is_clear_filtered[is_clear_filtered]))

In [None]:
pvlib_compare_plot(sample, is_clear, is_clear_filtered)

# Conclusion

The filtering based on avg - std does seem to work well for this sample.  The filtering should be tested on more 'difficult' data, such as the SRRL data set, which is what motivated this work in the first place (this will be added to this notebook before it is implemented fully).

As it stands right now, filtering sets individual time stamps to zero (as these periods will then be labelled as obscured sky).  This has the side effect of actually generating clear sky times (which is seen by the red in the above plots).  In principle, filtering the data should not generate new periods of clear skies - the result should always be a subset of the original clear sky periods.  This issue will also be addressed.

# MISC