In [1]:
%matplotlib widget
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import datetime
import re
from mlxtend.preprocessing import minmax_scaling
import pickle
import math
from data_functions import *
from sklearn import preprocessing

In [2]:
solar_data_filename = '/Users/brad/Desktop/CS6620/Project/Data/Sample.csv' 
weather_data_filename = '/Users/brad/Desktop/CS6620/Project/Data/correct_weather.csv' 

In [3]:
solarData = pd.read_csv(solar_data_filename, sep=',')

In [4]:
# sum(solarData['Solar_power'].apply(lambda val: 1 if val <=0 else 0))
# len(solarData['Solar_power'])

1375826

In [5]:
# Seperate date and time using datetime from pandas 
# solarData['Date']= pd.to_datetime(solarData['TIME']).dt.date
solarData['date_time']= pd.to_datetime(solarData['TIME']).dt.floor('H')
solarData = solarData.drop(['TIME', 'Meter_value', 'Unnamed: 0'], axis=1)

In [6]:
solarData['date_time'][0].date() == solarData['date_time'][5].date()

True

In [7]:
# According to Shaju, from ASPIRE, negative values are corrupted
solarData['Solar_power'] = solarData['Solar_power'].apply(lambda val : val if val > 0 else math.nan)
# Maximum possible output by solar panels is 100,000 (3-29 Email)
solarData['Solar_power'] = solarData['Solar_power'].apply(lambda val : val if val < 100000 else math.nan)

In [8]:
plt.figure(figsize=(5,5))
plt.scatter(np.arange(len(solarData)), solarData['Solar_power'])
plt.xlabel("data point index")
plt.ylabel("Power generation (watts)")
plt.title("Scatter of Unprocessed Generation Data")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:
plt.figure()
plt.hist(solarData['Solar_power'], edgecolor='black', bins=30)
plt.xlabel("Generation output (watts)")
plt.ylabel("Number of occurences")
plt.title("Distribution of Generation output")
plt.show()


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
print(solarData['Solar_power'].min())
print(solarData['Solar_power'].max())
print(solarData['Solar_power'].median())
print()
print(solarData['Solar_power'].mean())
print(solarData['Solar_power'].var())
print(solarData['Solar_power'].std())

740.6687727956357
95620.47706058336
17417.057682456667

21697.783441652493
260095452.22527274
16127.475072846113


In [11]:
# <pre># Now need to accumulate power per hour 
hour = -1
# This will contain a list of tuples where 0th index is date, 1st is hour of day (24 - hour based), 2nd is total generation per hour
date_hour_generation = [] 

# This is how many hours had at least 1 corrupted/garbage value 
numAdjusted = 0

# loop through all the data, 
# NaN values in an hour will be replaced with the average for the non-NaN values in the hour 
i = 0
while i < len(solarData['Solar_power']):
    hourTotal = 0
    numNaN = 0
    numTotal = 0
    hour = solarData['date_time'][i].hour
    # get statistics for this single hour 
    while i < len(solarData['Solar_power']) and hour == solarData['date_time'][i].hour:
        numTotal += 1 
        if math.isnan(solarData['Solar_power'][i]):
            numNaN += 1 
        else: 
            hourTotal += solarData['Solar_power'][i]
        i += 1
    # end while inner 
    if numNaN == 0:
        date_hour_generation.append((solarData['date_time'][i -1], hourTotal, numTotal))
    elif numNaN == numTotal: # all values of an hour are garbage, skip 
        continue
    else:
        numAdjusted += 1 
        notNaN = numTotal - numNaN
        avg = hourTotal / notNaN
        hourTotal = avg * numTotal 
        date_hour_generation.append((solarData['date_time'][i-1], hourTotal, numTotal))

# end while outer 
print(f'Number of hours adjusted: {numAdjusted}')
print(f'Total number of hours computed: {len(date_hour_generation)}')
print(f'Percentage of hours with some degree of adjustment: {( (numAdjusted / len(date_hour_generation) )* 100 ) :.2f}%')
# </pre>


Number of hours adjusted: 2477
Total number of hours computed: 5601
Percentage of hours with some degree of adjustment: 44.22%


In [12]:
# compute average of each hour 
solarData.dropna(axis = 0, inplace=True)
solarData.reset_index(drop=True, inplace=True)
sDataProcessed = []
i = 0
date = solarData['date_time'][i].date()
hour = solarData['date_time'][i].hour
total = 0
count = 0
while i < len(solarData):
    if solarData['date_time'][i].date() == date and solarData['date_time'][i].hour == hour:
        # date and hour match
        count += 1
        total += solarData['Solar_power'][i]
    else:
        if count != 0:
            sDataProcessed.append((solarData['date_time'][i-1], total/count))
        total = 0
        count = 0
        date = solarData['date_time'][i].date()
        hour = solarData['date_time'][i].hour
    i += 1

In [13]:
# convert the 
sDataTotal = pd.DataFrame(date_hour_generation, columns=['date_time', 'Solar_total', 'Count'])
# sData is the cumulated generation data frame 
sData = pd.DataFrame(sDataProcessed, columns=['date_time', 'Solar_average'])

In [25]:
plt.figure()
plt.hist(sDataTotal['Generated_total'], edgecolor='black', bins=30)
plt.xlabel("Generation output (watts)")
plt.ylabel("Number of occurences")
plt.title("Distribution using Total Per Hour")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
x = np.arange(len(sData))
plt.figure()
plt.hist(sData['Generated'], bins=30, edgecolor='black', linewidth=1)
plt.xlabel("power output")
plt.ylabel("Number of occurences")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [14]:
weatherpf = pd.read_csv(weather_data_filename, sep=',')
weatherpf.columns

Index(['date_time', 'station_id', 'rh', 'eto', 'etr', 'rso', 'ea_avg',
       'precip', 'rh_max', 'rh_min', 'rh_tmn', 'rh_tmx', 'td_avg', 'td_max',
       'td_min', 'td_tmn', 'td_tmx', 'co2_avg', 'ppf_avg', 'solarmj',
       'wet_pct', 'airt_avg', 'airt_max', 'airt_min', 'airt_tmn', 'airt_tmx',
       'lwdn_avg', 'lwup_avg', 'pressure', 'swdn_avg', 'swup_avg', 'windd_sd',
       'windd_avg', 'winds_avg', 'winds_max', 'winds_tmx', 'netrad_avg',
       'fan_rmp_avg', 'visibilitymiles', 'batt_volt_min', 'surfacet1_avg',
       'surfacet2_avg', 'visibilitymiles_avg', 'pressurekpasealevel'],
      dtype='object')

In [15]:
# we can't feed the date strings into our models, lets just get rid of them
# the data is all from the same weather station. We don't need that either 
toDrop = ['station_id', 'rh_tmn', 'rh_tmx', 'td_tmn', 'td_tmx', 'airt_tmn', 'airt_tmx', 'winds_tmx', 'fan_rmp_avg', 'batt_volt_min']
weatherpf = weatherpf.drop(toDrop, axis=1)
weatherpf.head()

Unnamed: 0,date_time,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,td_avg,...,windd_sd,windd_avg,winds_avg,winds_max,netrad_avg,visibilitymiles,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel
0,2019-07-11 00:59:59,30,0.002,0.003,0.0,0.741,0.0,31,29,36.9,...,42.0,40.0,5.369,18.119,-63.37,19.884,60.8,60.3,19.822,101.6
1,2019-07-11 01:59:59,31,0.002,0.003,0.0,0.732,0.0,32,30,36.5,...,63.0,97.0,3.803,13.198,-61.05,19.884,58.5,58.3,19.822,101.6
2,2019-07-11 02:59:59,32,0.001,0.002,0.0,0.716,0.0,33,30,36.0,...,46.0,132.0,4.698,17.448,-61.61,19.884,58.3,58.1,19.822,101.6
3,2019-07-11 03:59:59,32,0.002,0.002,0.0,0.703,0.0,33,30,35.4,...,66.0,95.0,4.25,16.553,-60.99,19.635,57.6,57.4,19.822,101.6
4,2019-07-11 04:59:59,32,0.001,0.002,0.0,0.686,0.0,33,30,34.9,...,62.0,106.0,4.026,14.987,-60.79,19.884,56.8,56.8,19.884,101.7


In [16]:
# Seperate date and time using datetime from pandas 
# weatherpf['Date']= pd.to_datetime(weatherpf['date_time']).dt.date
# weatherpf['Hour']= pd.to_datetime(weatherpf['date_time'][:]).dt.hour
weatherpf['date_time']= pd.to_datetime(weatherpf['date_time']).dt.floor('H')

In [14]:
weatherpf.head()

Unnamed: 0,date_time,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,td_avg,...,windd_sd,windd_avg,winds_avg,winds_max,netrad_avg,visibilitymiles,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel
0,2019-07-11 01:00:00,30,0.002,0.003,0.0,0.741,0.0,31,29,36.9,...,42.0,40.0,5.369,18.119,-63.37,19.884,60.8,60.3,19.822,101.6
1,2019-07-11 02:00:00,31,0.002,0.003,0.0,0.732,0.0,32,30,36.5,...,63.0,97.0,3.803,13.198,-61.05,19.884,58.5,58.3,19.822,101.6
2,2019-07-11 03:00:00,32,0.001,0.002,0.0,0.716,0.0,33,30,36.0,...,46.0,132.0,4.698,17.448,-61.61,19.884,58.3,58.1,19.822,101.6
3,2019-07-11 04:00:00,32,0.002,0.002,0.0,0.703,0.0,33,30,35.4,...,66.0,95.0,4.25,16.553,-60.99,19.635,57.6,57.4,19.822,101.6
4,2019-07-11 05:00:00,32,0.001,0.002,0.0,0.686,0.0,33,30,34.9,...,62.0,106.0,4.026,14.987,-60.79,19.884,56.8,56.8,19.884,101.7


<p>we want to combine the generation data to the weather data at the same time

we are going to predict the previous hours' weather to predict the next hour 

i.e. weather[hour_i] will try to predict generation[hour_i]

A BETTER WAY TO DO THIS: set the date_hour as the index and then just join the two df's</p>
<pre>
Generated = np.full((len(weatherpf), 1), math.nan)
wi = 0 # weather index
genInd = 0 # generation index
# loop through all the generation
lastWI = 0
while genInd < len(sData):
    added = False
    wi = lastWI
    while wi < len(weatherpf) and not added:
        # hour / date matches! 
        if weatherpf['Date'][wi] == sData['Date'][genInd] and weatherpf['Hour'][wi] == sData['Hour'][genInd]:
            Generated[wi] = sData['Generated'][genInd]
            genInd += 1
            lastWI = wi
            added = True
        else:
            wi += 1   
    if not added:
        genInd += 1

weatherpf['Generated'] = Generated
    </pre>

In [18]:
combinedData = pd.merge_asof(sData, weatherpf, on='date_time')
combinedData = pd.merge_asof(combinedData, sDataTotal, on='date_time')
combinedData.head()

Unnamed: 0,date_time,Solar_average,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,...,winds_avg,winds_max,netrad_avg,visibilitymiles,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel,Solar_total,Count
0,2019-07-11 13:00:00,52211.95523,18,0.029,0.033,3.472,0.848,0.0,20,15,...,3.355,10.737,679.8,12.676,91.0,94.8,18.765,101.5,19005150.0,364
1,2019-07-11 14:00:00,52612.209136,17,0.029,0.034,3.223,0.845,0.0,20,14,...,3.355,14.093,612.8,19.884,88.7,94.8,19.2,101.5,25139280.0,478
2,2019-07-11 16:00:00,42182.975794,16,0.022,0.026,2.202,0.782,0.0,19,13,...,2.237,10.066,370.0,19.884,85.1,88.9,19.573,101.3,20209070.0,479
3,2019-07-11 17:00:00,33390.124136,23,0.017,0.02,1.534,0.826,0.0,32,14,...,1.566,9.171,234.1,19.884,83.5,78.1,19.014,101.3,15964920.0,478
4,2019-07-11 18:00:00,22358.862645,18,0.011,0.013,0.828,0.857,0.0,22,14,...,0.895,4.25,185.5,19.635,77.5,70.3,19.2,101.3,10714620.0,479


## Add astral data to input


In [16]:
# datetime.strptime(weatherpf['date_time'][12], "%Y-%m-%d %H:%M:%S")

In [19]:
# use the class I wrote 
loganAstral = LoganAstral()
# list for elevation, azimuth, zenith 
ElAzZe = []
# type(combinedData['date_time'][0].to_pydatetime())
for timeStamp in combinedData['date_time']:
    dt = timeStamp.to_pydatetime()
    ElAzZe.append(loganAstral.computeElAzZe(dt))


In [20]:
elevationAzimuthZenith = np.array(ElAzZe)

In [21]:
combinedData['Elevation'] = elevationAzimuthZenith[:, 0]
combinedData['Azimuth'] = elevationAzimuthZenith[:, 1]
combinedData['Zenith'] = elevationAzimuthZenith[:, 2] 

In [22]:
len(combinedData)

5575

In [23]:
# we will drop the date, but lets keep the month, hour of day, and day of month
combinedData['Month'] = combinedData['date_time'].apply(lambda dt: dt.month)
combinedData['Day'] = combinedData['date_time'].apply(lambda dt: dt.day)
combinedData['Hour'] = combinedData['date_time'].apply(lambda dt: dt.hour)
combinedData.fillna(0, inplace=True)

In [24]:
# save as csv for convienience/accessibility 
fname = '/Users/brad/Desktop/CS6620/Project/Data/combined_processed_data.csv'
combinedData.to_csv(fname)

<pre> # Save data without scaling NOn
# truth = MainDf['Generated']
# MainDf.drop('Generated', axis=1, inplace=True)
MainDf.dropna(axis=1, inplace=True)
dataNonScaled = MainDf.to_numpy()
# truthNonScaled = truth.to_numpy()

data_filename = '/Users/brad/Desktop/CS6620/Project/Data/nonAdjustedData.npy' 
# truth_filename = '/Users/brad/Desktop/CS6620/Project/Data/nonScaledTruthV2.npy' 
np.save(data_filename, dataNonScaled)
# np.save(truth_filename, truthNonScaled)
</pre>