# Loading and combining the data sources

In [1]:
from typing import List

import os

import pytz
import numpy as np
import pandas as pd
from pathlib import Path

from dateutil.parser import parse

import json
import zipfile
from tqdm import tqdm
tqdm.pandas()

from src.assignment1.data.make_dataset import DataProcessor

#### Define functions for loading and combining separate data sources

#### Exploration and loading of weather data

In [3]:
# Exploration of one sample day from the 2021 climate archive.
# The archive is addressed relative to the notebook instead of through a
# machine-specific absolute path, so the cell also runs on other machines.
# NOTE(review): assumes the notebook is executed from its own folder, which the
# relative data paths used elsewhere in this notebook also require - confirm.
climate_zip_path = Path('../../../data/assignment1/raw/Climate data_2021.zip')

# Context managers close both the archive and the member file once parsing is
# done; the handle names deliberately do not shadow the built-in `zip`.
with zipfile.ZipFile(climate_zip_path) as climate_zip:
    with climate_zip.open('2021-01-01.txt', 'r') as sample_file:
        # Every non-empty line of the sample file is parsed as one JSON record;
        # blank lines are skipped because json.loads would raise on them.
        information_list = [json.loads(line) for line in sample_file if line.strip()]

# Unique values of the 'parameterId' property across all records of the sample file
weather_attributes = pd.Series([information['properties']['parameterId'] for information in information_list]).unique()
print(f"Unique weather attributes: \n{weather_attributes}")

# Unique values of the 'municipalityName' property across all records of the sample file
municipalities = pd.Series([information['properties']['municipalityName'] for information in information_list]).unique()
print(f"\nUnique municipalities: \n{municipalities}")

Unique weather attributes: 
['no_ice_days' 'temp_grass' 'leaf_moisture' 'mean_temp' 'mean_wind_speed'
 'max_temp_w_date' 'mean_cloud_cover' 'temp_soil_30' 'no_summer_days'
 'temp_soil_10' 'mean_daily_max_temp' 'no_lightning_strikes'
 'max_wind_speed_10min' 'bright_sunshine' 'no_tropical_nights'
 'no_cold_days' 'no_days_acc_precip_1' 'min_temp' 'drought_index'
 'mean_radiation' 'no_days_acc_precip_01' 'acc_heating_degree_days_17'
 'max_wind_speed_3sec' 'vapour_pressure_deficit_mean'
 'pot_evaporation_makkink' 'mean_pressure' 'mean_daily_min_temp'
 'mean_relative_hum' 'acc_precip' 'mean_wind_dir' 'no_days_acc_precip_10'
 'no_frost_days' 'max_precip_30m' 'snow_depth']

Unique municipalities: 
['Furesø' 'Struer' 'Egedal' 'Faxe' 'Fredericia' 'Frederikssund' 'Hvidovre'
 'Lemvig' 'Kalundborg' 'Vesthimmerlands' 'Lejre' 'Haderslev' 'Syddjurs'
 'Thisted' 'Allerød' 'Odense' 'Solrød' 'Svendborg' 'Hørsholm'
 'Lyngby-Taarbæk' 'Gribskov' 'Ikast-Brande' 'Vordingborg' 'Stevns' 'Samsø'
 'Sønderborg' 'Ro

#### Load and save data files

In [4]:
# Folder holding the assignment data, and the folder passed on as save location
DATA_DIR = Path('../../../data/assignment1')
SAVE_DIR = Path(r'../../../data/assignment1/processed')

# Build the dataset processor from both folders
processor = DataProcessor(
    DATA_DIR=DATA_DIR,
    SAVE_DIR=SAVE_DIR,
)

# Run the combine-and-save step for price areas DK1 and DK2,
# with Roskilde as the municipality for the weather data
processor.combine_and_save_data_sources(
    priceareas=['DK1', 'DK2'],
    weather_municipality='Roskilde',
)

In [1]:
from pathlib import Path
import pandas as pd

# Folder from which the processed CSV files are read
SAVE_DIR = Path(r'../../../data/assignment1/processed')

# Read one CSV per price area; the first CSV column becomes the index
DK1_dataset = pd.read_csv(SAVE_DIR.joinpath('DK1.csv'), index_col=0)
DK2_dataset = pd.read_csv(SAVE_DIR.joinpath('DK2.csv'), index_col=0)

# Reduce the per-cell NaN mask first over rows, then over columns,
# giving a single flag per dataset that is True if any value is missing
dk1_has_nan = DK1_dataset.isna().any().any()
dk2_has_nan = DK2_dataset.isna().any().any()

print(f"NaN values occuring in DK1 dataset? {dk1_has_nan}")
print(f"NaN values occuring in DK2 dataset? {dk2_has_nan}")

NaN values occuring in DK1 dataset? False
NaN values occuring in DK2 dataset? False


In [2]:
# Show the DK1 dataset; as the last expression of the cell it is rendered as the cell output
DK1_dataset

Unnamed: 0,StartTimeUTC,EndTimeUTC,PriceArea,Actual,SpotPriceDKK,SpotPriceEUR,BalancingMarketPrice_DownReg,BalancingMarketPrice_UpReg,max_wind_speed_10min,max_wind_speed_3sec,mean_wind_dir,mean_wind_speed
1,2021-01-01 00:00:00+00:00,2021-01-01 01:00:00+00:00,DK1,0.234708,358.579987,48.189999,24.35,24.35,4.6,7.7,160.0,3.2
8,2021-01-01 01:00:00+00:00,2021-01-01 02:00:00+00:00,DK1,0.223329,332.459991,44.680000,23.98,23.98,3.6,5.7,154.0,3.1
15,2021-01-01 02:00:00+00:00,2021-01-01 03:00:00+00:00,DK1,0.172119,319.369995,42.919998,23.72,23.72,4.7,6.4,155.0,3.5
22,2021-01-01 03:00:00+00:00,2021-01-01 04:00:00+00:00,DK1,0.160740,300.540009,40.389999,23.73,23.73,4.6,6.2,160.0,3.5
29,2021-01-01 04:00:00+00:00,2021-01-01 05:00:00+00:00,DK1,0.129445,299.130005,40.200001,24.06,24.06,4.6,6.7,157.0,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...
122585,2022-12-31 17:00:00+00:00,2022-12-31 18:00:00+00:00,DK1,0.453770,506.440002,68.099998,26.17,26.17,6.9,10.5,243.0,5.4
122592,2022-12-31 18:00:00+00:00,2022-12-31 19:00:00+00:00,DK1,0.354196,498.329987,67.010002,15.08,15.08,6.7,9.8,252.0,5.5
122599,2022-12-31 19:00:00+00:00,2022-12-31 20:00:00+00:00,DK1,0.342817,301.190002,40.500000,1.34,11.57,5.1,8.3,249.0,4.3
122606,2022-12-31 20:00:00+00:00,2022-12-31 21:00:00+00:00,DK1,0.389758,110.730003,14.890000,-7.00,14.89,4.2,7.0,243.0,3.7
