In [4]:
import os

from google.colab import drive

# Colab-specific locations: where Drive is mounted and where the project lives inside it.
DRIVE_MOUNT_POINT = '/content/drive/'
PROJECT_DIR = '/content/drive/MyDrive/Times-Series-Library/'

# Mount Google Drive, then make the project folder the working directory so the
# relative paths used by later cells resolve against it.
drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)
os.getcwd()

Mounted at /content/drive/


'/content/drive/MyDrive/Times-Series-Library'

In [None]:
!pip install -r requirements.txt

In [5]:
import json


import numpy as np
import pandas as pd
import plotly.express as px
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Target data (power output)

In [6]:
# Read the generation JSON (an object whose keys become the row index) and
# turn it into a DataFrame indexed by parsed timestamps.
with open('./dataset/generation.json') as generation_file:
    generation_records = json.load(generation_file)

target = pd.DataFrame.from_dict(generation_records, orient='index')
target.index = pd.to_datetime(target.index)

In [8]:
# Summary statistics of the power-output column (count, mean, std, min, quartiles, max).
target.describe()

Unnamed: 0,0
count,154752.0
mean,33.477192
std,36.019302
min,10.0
25%,11.682
50%,17.94849
75%,35.922384
max,175.0


In [None]:
# Count how many readings fall into each 30-minute bin.
n_samples_30min = target.resample('30min').size().values

# Print whether every bin holds the same number of readings, followed by that number.
first_bin_count = n_samples_30min[0]
all_bins_equal = np.all(n_samples_30min == first_bin_count)
print(all_bins_equal, '\n', first_bin_count)

True 
 2


## Data Features:
1. Exhibits a yearly pattern.
2. March 2020 is a turning point.
3. The intervals between consecutive observation timestamps are not fixed, but every 30-minute window always contains exactly 2 records.
4. Fixed upper and lower thresholds.

In [None]:

# Plot the power output and mark 2020-03-01 with a coloured vertical line.
fig = px.line(target, template='plotly_white', title='power output')
fig.add_vline(x='2020-03-01', line_width=2, line_color="#EF553B")

# Presentation only: hide the legend, centre the title, blank both axis titles.
fig.update_layout(showlegend=False, title_x=.5)
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='')
fig.show()

Output hidden; open in https://colab.research.google.com to view.

# Temperature data

In [None]:
# Column labels for the temperature series; also reused when naming the columns of the combined table.
names = ['A', 'B', 'C']

The XML file is read and subsequently parsed using BeautifulSoup.
Each forecast is then transformed into a column within the resulting DataFrame, with the 'spot_time' column as the index.
Finally, the DataFrame is saved as a CSV file.

This function has already been run, and the resulting CSV file was saved to `./dataset/temperature_all.csv`.

In [None]:
def to_temp_csv(xml_path='./dataset/temperatures.xml', output_path='./dataset/temperature_all.csv'):
    """Convert the temperature-forecast XML file into one wide CSV table.

    Every ``<forecast>`` element becomes one column, labelled from the
    notebook-level ``names`` list in document order. Each column holds the
    ``temp_celsius`` value of every ``<point>`` child, indexed by that point's
    ``spot_time``. ``time_forecast_made`` is read from each point but is not
    part of the output.

    Args:
        xml_path: Path of the XML file to read.
        output_path: Path the resulting CSV is written to.

    Returns:
        pandas.DataFrame indexed by ``spot_time`` (converted with
        ``pd.to_datetime``) with one temperature column per forecast. The same
        frame is written to ``output_path``.

    Raises:
        IndexError: If the file contains more ``<forecast>`` elements than
            ``names`` has labels.
    """
    with open(xml_path, 'r') as xml_file:
        soup = BeautifulSoup(xml_file.read(), 'xml')

    forecast_frames = []
    for forecast in soup.find_all('forecast'):
        # Collect plain dicts and build the frame once per forecast. Appending
        # rows with pd.concat inside the loop re-copies the whole frame for
        # every point, which is quadratic in the number of points.
        records = [
            {
                'time_forecast_made': point.find('time_forecast_made').text,
                'spot_time': point.find('spot_time').text,
                'temp_celsius': float(point.find('temp_celsius').text),
            }
            for point in tqdm(forecast.find_all('point'))
        ]
        forecast_frames.append(
            pd.DataFrame(records, columns=['time_forecast_made', 'spot_time', 'temp_celsius'])
        )

    # One named Series per forecast, keyed by spot_time.
    temperature_columns = [
        frame.set_index('spot_time')['temp_celsius'].rename(names[i])
        for i, frame in enumerate(forecast_frames)
    ]

    # pd.concat rejects an empty list, so a file without forecasts yields an empty table.
    if temperature_columns:
        temp_df = pd.concat(temperature_columns, axis='columns')
    else:
        temp_df = pd.DataFrame()

    temp_df.index = pd.to_datetime(temp_df.index)
    temp_df.to_csv(output_path)

    return temp_df

Read `temperature_all.csv` from file, setting the first column (date) as index

In [None]:
# Load the pre-computed temperature table. The first CSV column holds the
# timestamps, so use it as the index and parse it into datetimes; without
# parse_dates the index would be read back as plain strings.
temp_df = pd.read_csv('./dataset/temperature_all.csv', index_col=0, parse_dates=True)
temp_df

Unnamed: 0_level_0,A,B,C
spot_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01 00:00:00,10.068520,9.043016,11.041315
2018-01-01 00:15:00,7.283149,8.069365,9.853585
2018-01-01 00:30:00,8.149209,7.951525,10.753473
2018-01-01 00:45:00,6.826715,8.583614,10.306547
2018-01-01 01:00:00,9.949895,9.238085,10.521315
...,...,...,...
2022-05-31 22:45:00,21.946381,21.308607,23.271850
2022-05-31 23:00:00,20.849525,20.597889,23.586474
2022-05-31 23:15:00,17.472524,17.821313,20.203671
2022-05-31 23:30:00,17.059294,17.942776,20.549625




In [None]:
# Line chart of every temperature series; the chained update_* calls blank the
# axis titles and centre the chart title.
fig = (
    px.line(temp_df, template='plotly_white', title='Temperature')
    .update_yaxes(title_text='')
    .update_xaxes(title_text='')
    .update_layout(title_x=.5)
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

- N (number of records): 154752

- total hours: 38688 (4 records/h)

- total days: 1612

- range: 2018-01-01 00:00 -> 2022-05-31 23:45

- approximately 4.41 years

In [None]:
# Re-label the temperature rows with the power-output timestamps. This is a
# positional alignment: row i of temp_df takes the timestamp of row i of target,
# and pandas raises if the two lengths differ.
temp_df.index = target.index

# Put temperatures and power output side by side under explicit column names.
df_all = pd.concat([temp_df, target], axis='columns')
df_all.columns = [*names, 'target']

# Sanity check: report whether the combined table is free of NaN values.
contains_nan = np.isnan(df_all.values).any()
print(f'no nan values: {not contains_nan}')

no nan values: True


Save the concatenated temperature and power-output data to `'dataset/power.csv'`; this file will be used for training:

In [None]:
# Persist the combined temperature + target table to disk.
df_all.to_csv('dataset/power.csv')

In [None]:
# Reload the saved table, using its first column as the index.
# NOTE(review): without parse_dates the index is presumably read back as plain
# strings rather than datetimes — confirm downstream code converts it if needed.
power_df = pd.read_csv('dataset/power.csv', index_col=0)

* The forecast should estimate output from *1 hour ahead to 4 hours ahead*. This means that the forecast cannot use data that arrives less than 1 hour before the time being forecast.
    * For example, if your forecast needs to begin from 1pm, data available at 12:15pm cannot be used for that particular prediction.
* We need to forecast in 30 minute intervals.
    * For example, a forecast from one hour ahead to four hours ahead would contain six values, each representing the average forecasted output for each successive 30-minute interval.