In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt

import numpy as np 

import pvlib
from pvlib import clearsky
from pvlib import clearsky, atmosphere, solarposition
from pvlib.location import Location

import warnings
warnings.filterwarnings("ignore")


## Step 0 - Importing data

In [2]:
# Load one raw CSV per year and stack them into a single frame.
# The yearly frames are collected first and concatenated once, so the
# accumulated rows are not re-copied on every loop iteration.
years = np.arange(2016, 2022)  # 2016 .. 2021 inclusive
file_path = './GHI_dataset/raw_data/GHI_raw_'

yearly_frames = [pd.read_csv(file_path + str(year) + '.csv') for year in years]
GHI_df = pd.concat(yearly_frames, axis=0)

# The 'datetime' column holds "YYYY-MM-DD HH:MM:SS" strings; parse it and
# use it as the index for the steps that follow.
GHI_df['datetime'] = pd.to_datetime(GHI_df['datetime'], format="%Y-%m-%d %H:%M:%S")

GHI_df = GHI_df.set_index('datetime')

In [3]:
# Quick sanity check of the loaded data: row count, missing values, first rows.
n_rows = len(GHI_df)
n_missing = GHI_df['GHI'].isna().sum()

print(f"Number of GHI_df measurements: {n_rows}")

print(f"Number of NA: {n_missing}")

GHI_df.head(10)

Number of GHI_df measurements: 17297280
Number of NA: 717723


Unnamed: 0_level_0,GHI
datetime,Unnamed: 1_level_1
2016-01-08 00:00:00,2.923
2016-01-08 00:00:10,2.957
2016-01-08 00:00:20,2.945
2016-01-08 00:00:30,2.938
2016-01-08 00:00:40,2.936
2016-01-08 00:00:50,2.941
2016-01-08 00:01:00,2.934
2016-01-08 00:01:10,2.941
2016-01-08 00:01:20,2.961
2016-01-08 00:01:30,2.95


## Step 1 - Removal of missing values

In [4]:
threshold = 1

def remove_consecutive_nan(df, column, threshold):
    """Drop rows belonging to long runs of consecutive missing values.

    A "run" is a maximal stretch of adjacent rows in which ``df[column]`` is
    missing. Rows of a run are removed only when the run is longer than
    ``threshold``; shorter runs are kept (still missing) so they can be
    filled afterwards. Rows with a value present are always kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : str
        Name of the column whose missing values are inspected.
    threshold : int
        Longest run of consecutive missing values that is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and the long runs removed.
    """
    is_missing = df[column].isnull()

    # A new run starts wherever the missing/present flag flips; the first row
    # yields NaN from diff(), which also compares unequal to 0 and opens run 1.
    run_id = is_missing.astype('int').diff().ne(0).cumsum()

    # Length of the run each row belongs to, aligned on the original index.
    run_length = run_id.groupby(run_id).transform('size')

    in_long_gap = is_missing & (run_length > threshold)

    # .copy() hands back an independent frame that callers can assign into.
    return df[~in_long_gap].copy()

# Drop every run of more than `threshold` consecutive missing GHI values;
# whatever is still missing afterwards is what remains to be interpolated.
GHI_df = remove_consecutive_nan(GHI_df, 'GHI', threshold)

n_to_interpolate = GHI_df['GHI'].isna().sum()
print(f'Number of GHI measurements to interpolate: {n_to_interpolate}')

Number of GHI measurements to interpolate: 11683


In [5]:
# Fill the remaining gaps with pandas' default interpolation.
ghi_filled = GHI_df['GHI'].interpolate()
GHI_df['GHI'] = ghi_filled

print(f'Number of NaNs: {GHI_df["GHI"].isna().sum()}')
print(f"Number of GHI measurements: {len(GHI_df)}")

Number of NaNs: 0
Number of GHI measurements: 16591240


## Step 2 - Identification and removal of outliers

In [6]:
GHI_max = 1000
GHI_min = 0

# Keep only readings strictly inside the open interval (GHI_min, GHI_max).
ghi_values = GHI_df["GHI"]
within_bounds = ghi_values.lt(GHI_max) & ghi_values.gt(GHI_min)

GHI_df = GHI_df[within_bounds]
print(f"Number of GHI measurements: {len(GHI_df)}")

Number of GHI measurements: 16241644


## Step 3 - Clear sky global horizontal irradiance (GHIcs)

In [None]:
# Clear-sky GHI calculation

# Site description handed to pvlib's Location.
latitude = 46.518
longitude = 6.565
time_zone = 'Europe/Zurich'
altitude = 400
place = 'Ecublens'
frequency = '10S'  # not used in this cell

tus = Location(
    latitude=latitude,
    longitude=longitude,
    tz=time_zone,
    altitude=altitude,
    name=place,
)

# NOTE(review): GHI_df.index was parsed without timezone information. pvlib's
# solar-position routines document that naive timestamps are assumed to be
# UTC - confirm the raw data is recorded in UTC, otherwise localize the index.
cs = tus.get_clearsky(GHI_df.index)
GHI_df['GHIcs'] = cs['ghi']


## Step 4 - Removal of night measurements

In [9]:

# Rows whose clear-sky irradiance is not above this value are treated as night.
night_threshold = 30

# Build the daytime mask once, so the reported share and the rows actually
# removed are derived from the same condition.
is_day = GHI_df["GHIcs"] > night_threshold

total_rows = GHI_df.shape[0]
removed_rows = int((~is_day).sum())

# Whole-number percentage of rows being dropped (truncated, not rounded);
# 0 when the frame is empty to avoid dividing by zero.
nb_night_measurements = int(100 * removed_rows / total_rows) if total_rows else 0

GHI_df = GHI_df[is_day]

print("Percentage of GHI night measurements: " + str(nb_night_measurements) + "%")
print("Number of GHI measurements: " + str(GHI_df.shape[0]))

Percentage of GHI night measurements: 53%
Number of GHI measurements: 7490668


## Step 5 - Clear sky index (kcs)

In [10]:
# Clear-sky index: measured GHI divided by the clear-sky GHI.
clear_sky_index = GHI_df["GHI"] / GHI_df["GHIcs"]

# A zero GHIcs would make the ratio +/-inf; such values are set to 0.
# Rows with GHIcs <= 30 were already removed in Step 4, so this is only a
# safeguard. It is applied to the new column alone instead of rewriting
# every column of the frame in place.
GHI_df['k'] = clear_sky_index.replace([np.inf, -np.inf], 0)

GHI_df.head(5)

Unnamed: 0_level_0,GHI,GHIcs,k
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-08 07:59:00,15.394,30.209756,0.50957
2016-01-08 07:59:10,15.447,30.471824,0.506927
2016-01-08 07:59:20,15.472,30.734662,0.503406
2016-01-08 07:59:30,15.542,30.998266,0.501383
2016-01-08 07:59:40,15.539,31.262629,0.497047


## Step 6 - Sampling

In [11]:
import os

# Resampling windows as pandas offset aliases. The "min"/"h" spellings are
# used because the "T"/"H" aliases are deprecated in recent pandas releases.
sampling_frequencies = ["15min", "30min", "45min", "1h"]

# Label used in the output file name for each window, in the same order.
sampling_frequencies_eng = ["15_minutes", "30_minutes", "45_minutes", "1_hour"]

root = "./GHI_dataset/cleaned_sampled_data/"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(root, exist_ok=True)

for frequency, frequency_label in zip(sampling_frequencies, sampling_frequencies_eng):

    # Average every column over each window, drop windows containing NaN
    # (e.g. windows left without any sample), and turn the datetime index
    # back into a regular column.
    df_sampled_temp = GHI_df.resample(frequency).mean().dropna().reset_index()

    # The year is written to the CSV as an extra column.
    df_sampled_temp['year'] = df_sampled_temp['datetime'].dt.year

    # root already ends with a separator; join avoids doubling it.
    file_path = os.path.join(root, 'GHI_sampled_' + frequency_label + '.csv')
    df_sampled_temp.to_csv(file_path, index=False)
    