In [1]:
!python --version
!pip list | grep tensorflow

Python 3.7.0
tensorflow            1.14.0
tensorflow-estimator  1.14.0


In [2]:
import pandas as pd
import numpy as np
import pytz

In [3]:
data_src = "./solar_data.csv"
data_df = pd.read_csv(data_src)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#Slice The Header Information Into A Seperate Data Frame
df_headers = data_df.iloc[:1]
data_df = data_df.iloc[1:]
print(df_headers.head())

  Source Location ID City State Country Latitude Longitude Time Zone  \
0  NSRDB     1951980    -     -       -    39.74   -104.99         0   

  Elevation Local Time Zone  ... Cloud Type 11 Cloud Type 12 Fill Flag 0  \
0      1608              -7  ...          Dust         Smoke         NaN   

     Fill Flag 1     Fill Flag 2       Fill Flag 3               Fill Flag 4  \
0  Missing Image  Low Irradiance  Exceeds Clearsky  Missing CLoud Properties   

          Fill Flag 5 Surface Albedo Units Version  
0  Rayleigh Violation                  NaN   4.0.0  

[1 rows x 46 columns]


In [5]:
data_df.columns = data_df.iloc[0]  # Use the first row of data as column headers
data_df = data_df[1:]  # Drop the first row after setting it as header
data_df.reset_index(drop=True, inplace=True) #Reset the Index
print(data_df.head())

1  Year Month Day Hour Minute          Temperature Alpha    AOD Asymmetry  \
0  2022     1   1    0      0  -3.4000000000000004  1.25  0.088      0.62   
1  2022     1   1    0     10                 -3.6  1.25  0.088      0.62   
2  2022     1   1    0     20  -3.8000000000000003  1.25  0.088      0.62   
3  2022     1   1    0     30                   -4  1.25  0.088      0.62   
4  2022     1   1    0     40                 -4.3  1.21  0.085      0.62   

1 Clearsky DHI  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
0            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
1            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
2            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
3            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
4            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  

[5 rows x 46 columns]


In [6]:
#Drop Columns With All Values NaN
df_cleaned = data_df.dropna(axis=1, how="all")

In [7]:
# Convert all columns to float, replacing non-numeric values with NaN
df_cleaned = df_cleaned.apply(pd.to_numeric, errors='coerce')
print(df_cleaned.dtypes)

1
Year                    int64
Month                   int64
Day                     int64
Hour                    int64
Minute                  int64
Temperature           float64
Alpha                 float64
AOD                   float64
Asymmetry             float64
Clearsky DHI            int64
Clearsky DNI            int64
Clearsky GHI            int64
Cloud Fill Flag         int64
Cloud Type              int64
Dew Point             float64
DHI                     int64
DNI                     int64
Fill Flag               int64
GHI                     int64
Ozone                 float64
Relative Humidity     float64
Solar Zenith Angle    float64
SSA                   float64
Surface Albedo        float64
Pressure                int64
Precipitable Water    float64
Wind Direction          int64
Wind Speed            float64
dtype: object


In [8]:
#Round Values Of All Columns Upto 3 Decimal Points
df_cleaned = df_cleaned.round(3)
print(df_cleaned.head())

1  Year  Month  Day  Hour  Minute  Temperature  Alpha    AOD  Asymmetry  \
0  2022      1    1     0       0         -3.4   1.25  0.088       0.62   
1  2022      1    1     0      10         -3.6   1.25  0.088       0.62   
2  2022      1    1     0      20         -3.8   1.25  0.088       0.62   
3  2022      1    1     0      30         -4.0   1.25  0.088       0.62   
4  2022      1    1     0      40         -4.3   1.21  0.085       0.62   

1  Clearsky DHI  ...  GHI  Ozone  Relative Humidity  Solar Zenith Angle   SSA  \
0             0  ...    0  0.274              80.14               93.28  0.95   
1             0  ...    0  0.274              81.45               94.99  0.95   
2             0  ...    0  0.274              82.68               96.73  0.95   
3             0  ...    0  0.274              83.93               98.49  0.95   
4             0  ...    0  0.274              79.32              100.27  0.95   

1  Surface Albedo  Pressure  Precipitable Water  Wind Directio

In [9]:
#Join The Date Columns Into A Single Column Called DateTime
df_cleaned['DateTime'] = pd.to_datetime(df_cleaned[['Year', 'Month', 'Day', 'Hour', 'Minute']])
#print(df_cleaned['DateTime'])

In [10]:
local_time_zone = 'America/Denver'
# Convert UTC to local time
df_cleaned['local_datetime'] = df_cleaned['DateTime'].dt.tz_localize('UTC').dt.tz_convert(local_time_zone)
df_cleaned['local_datetime'] = df_cleaned['local_datetime'].dt.tz_localize(None)
print(df_cleaned)

1      Year  Month  Day  Hour  Minute  Temperature  Alpha    AOD  Asymmetry  \
0      2022      1    1     0       0         -3.4   1.25  0.088       0.62   
1      2022      1    1     0      10         -3.6   1.25  0.088       0.62   
2      2022      1    1     0      20         -3.8   1.25  0.088       0.62   
3      2022      1    1     0      30         -4.0   1.25  0.088       0.62   
4      2022      1    1     0      40         -4.3   1.21  0.085       0.62   
...     ...    ...  ...   ...     ...          ...    ...    ...        ...   
52555  2022     12   31    23      10          3.7   1.54  0.010       0.62   
52556  2022     12   31    23      20          3.7   1.54  0.010       0.62   
52557  2022     12   31    23      30          3.7   1.54  0.010       0.62   
52558  2022     12   31    23      40          3.7   1.54  0.010       0.62   
52559  2022     12   31    23      50          3.7   1.54  0.010       0.62   

1      Clearsky DHI  ...  Relative Humidity  Solar 

In [11]:
# Drop the original columns and reorder the dataframe
df_cleaned = df_cleaned.drop(columns=['Year', 'Month', 'Day', 'Hour', 'Minute'])

# Reorder the columns to make 'datetime' the first column
# Create a list of the columns you want to keep in the specific order
columns_order = ['DateTime', 'local_datetime'] + [col for col in df_cleaned.columns if col not in ['DateTime', 'local_datetime']]

# Reorder the DataFrame columns based on the list
df_cleaned = df_cleaned[columns_order]

In [14]:
#Testing
print("Number Of NaN Measurements: " + str(df_cleaned.isnull().sum()))

Number Of NaN Measurements: 1
DateTime              0
local_datetime        0
Temperature           0
Alpha                 0
AOD                   0
Asymmetry             0
Clearsky DHI          0
Clearsky DNI          0
Clearsky GHI          0
Cloud Fill Flag       0
Cloud Type            0
Dew Point             0
DHI                   0
DNI                   0
Fill Flag             0
GHI                   0
Ozone                 0
Relative Humidity     0
Solar Zenith Angle    0
SSA                   0
Surface Albedo        0
Pressure              0
Precipitable Water    0
Wind Direction        0
Wind Speed            0
dtype: int64


In [17]:
#Removal Of Rows With Fill Flag != 0
print("Number Of Rows With Fill Flag Error: ", (df_cleaned['Fill Flag'] != 0).sum())
df_cleaned = df_cleaned[df_cleaned["Fill Flag"] == 0]

Number Of Rows With Fill Flag Error:  0


In [18]:
#The Cloud Type, Cloud Fill Flag and Fill Flags are dropped
df_cleaned = df_cleaned.drop(columns=['Cloud Type', 'Cloud Fill Flag', 'Fill Flag'])

In [19]:
#Identification Of Outliers
GHI_max = 1000
GHI_min = 0

print("Number Of Outliers (Above Max): " + str((df_cleaned['GHI'] > GHI_max).sum()))
#print(df_cleaned.loc[df_cleaned['GHI'] > GHI_max, 'GHI'])
print("Number Of Outliers (Below Min): " + str((df_cleaned['GHI'] < GHI_min).sum()))
#print(df_cleaned.loc[df_cleaned['GHI'] < GHI_min, 'GHI'])

Number Of Outliers (Above Max): 321
Number Of Outliers (Below Min): 0


In [20]:
#Removal Of Outliers
df_cleaned = df_cleaned[(df_cleaned["GHI"] < GHI_max)]
print("Number Of Outliers (Above Max): " + str((df_cleaned['GHI'] > GHI_max).sum()))
print("Updated Number Of Rows: ", df_cleaned.shape[0])

Number Of Outliers (Above Max): 0
Updated Number Of Rows:  49642


In [21]:
#Number Of Night Measurements
nb_night_measurements = int(100*df_cleaned[df_cleaned["Clearsky GHI"] < 30].shape[0] / df_cleaned.shape[0])
print("Percentage Of Data As Night Measurements: " + str(nb_night_measurements) + "%")

#Removal Of Night Measurements
print("Clear Sky GHI Minimums And Maximums Before Removal Of Night Measurements", df_cleaned["Clearsky GHI"].min(), df_cleaned["Clearsky GHI"].max())
df_cleaned = df_cleaned[df_cleaned["Clearsky GHI"] > 30]

Percentage Of Data As Night Measurements: 53%
Clear Sky GHI Minimums And Maximums Before Removal Of Night Measurements 0 1064


In [22]:
#Testing
print("Clear Sky GHI Minimums And Maximums After Removal Of Night Measurements", df_cleaned["Clearsky GHI"].min(), df_cleaned["Clearsky GHI"].max())
print("Updated Number Of Rows: ", df_cleaned.shape[0])

Clear Sky GHI Minimums And Maximums After Removal Of Night Measurements 31 1064
Updated Number Of Rows:  23187


In [23]:
#Introduction Of Clear Sky Index
df_cleaned['Kcs'] = df_cleaned['GHI'] / df_cleaned["Clearsky GHI"]
#print(df_cleaned['Kcs'])
df_cleaned.replace([np.inf, -np.inf], 0, inplace = True)
print("Kcs Minimums And Maximums", df_cleaned["Kcs"].min(), df_cleaned["Kcs"].max())
df_cleaned['Kcs'] = df_cleaned['Kcs'].round(3)

Kcs Minimums And Maximums 0.01444043321299639 1.0


In [24]:
df_cleaned.reset_index(drop=True, inplace=True)

In [None]:
df_cleaned.to_csv("solar_data_cleaned.csv", index=True)

In [25]:
df_cleaned.to_csv("solar_data_cleaned_updated.csv", index=True)

**Testing Code**

In [12]:
print(data_df.head())
print(data_df.shape)
print(data_df.columns)

1  Year Month Day Hour Minute          Temperature Alpha    AOD Asymmetry  \
0  2022     1   1    0      0  -3.4000000000000004  1.25  0.088      0.62   
1  2022     1   1    0     10                 -3.6  1.25  0.088      0.62   
2  2022     1   1    0     20  -3.8000000000000003  1.25  0.088      0.62   
3  2022     1   1    0     30                   -4  1.25  0.088      0.62   
4  2022     1   1    0     40                 -4.3  1.21  0.085      0.62   

1 Clearsky DHI  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
0            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
1            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
2            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
3            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  
4            0  ...  NaN  NaN NaN  NaN  NaN  NaN  NaN  NaN NaN  NaN  

[5 rows x 46 columns]
(52560, 46)
Index([              'Year',              'Month',                'Day',
         

In [50]:
print(df_cleaned.shape)

(23187, 23)


In [13]:
print(df_cleaned)

1                DateTime      local_datetime  Temperature  Alpha    AOD  \
0     2022-01-01 00:00:00 2021-12-31 17:00:00         -3.4   1.25  0.088   
1     2022-01-01 00:10:00 2021-12-31 17:10:00         -3.6   1.25  0.088   
2     2022-01-01 00:20:00 2021-12-31 17:20:00         -3.8   1.25  0.088   
3     2022-01-01 00:30:00 2021-12-31 17:30:00         -4.0   1.25  0.088   
4     2022-01-01 00:40:00 2021-12-31 17:40:00         -4.3   1.21  0.085   
...                   ...                 ...          ...    ...    ...   
52555 2022-12-31 23:10:00 2022-12-31 16:10:00          3.7   1.54  0.010   
52556 2022-12-31 23:20:00 2022-12-31 16:20:00          3.7   1.54  0.010   
52557 2022-12-31 23:30:00 2022-12-31 16:30:00          3.7   1.54  0.010   
52558 2022-12-31 23:40:00 2022-12-31 16:40:00          3.7   1.54  0.010   
52559 2022-12-31 23:50:00 2022-12-31 16:50:00          3.7   1.54  0.010   

1      Asymmetry  Clearsky DHI  Clearsky DNI  Clearsky GHI  Cloud Fill Flag  \
0       