In [1]:
import os
import re
import pandas as pd
import requests
import pickle
from tqdm import tqdm

from bs4 import BeautifulSoup

import warnings
# Blanket filter: silences every warning category for the rest of the session
# (including parser and deprecation warnings raised by the libraries used below).
warnings.simplefilter('ignore') # Comment out this line to see warnings again

In [2]:
# Parent directory URL of the GenOutputbyFuelHourly reports
url= "http://reports.ieso.ca/public/GenOutputbyFuelHourly/"
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server; raise_for_status() stops here on an HTTP error instead of letting
# the later cells parse an error page and end up with an empty file list.
page = requests.get(url, allow_redirects=True, timeout=60)
page.raise_for_status()

In [3]:
# Google Cloud Storage authentication: build a client from the service-account
# key file and keep a handle on the bucket the CSV files are uploaded to.
from google.cloud import storage

path_to_private_key = './ieso-dashboard-c639f1a39298.json'
client = storage.Client.from_service_account_json(path_to_private_key)
bucket = client.bucket('amol_javahire')

In [4]:
# Retrieve the report file names from the parent directory listing.
# The parser is named explicitly so the result does not depend on which
# parsers happen to be installed; 'lxml' is the parser this notebook already
# uses for the report files themselves.
soup = BeautifulSoup(page.text, 'lxml')
xml_cap = soup.find_all("a", href=True)

# Keep only links whose text ends in ".xml" and does not contain "_v"
# (presumably versioned copies of a report -- TODO confirm).
files = []
for element in xml_cap:
    file_name = element.text
    if re.search(r"\.xml$", file_name) and "_v" not in file_name:
        files.append(str(file_name))

In [5]:
# Helper Function to extract fuel values from each row
def get_fuel_values(element):
    """Extract the per-fuel output values from one parsed hourly record.

    Parameters
    ----------
    element : tag-like object
        Must provide ``find_all('fueltotal')``. Every entry it returns must
        provide ``find(name)`` that yields either ``None`` or an object with
        a ``.text`` attribute.

    Returns
    -------
    tuple
        ``(nuclear, gas, hydro, wind, solar, biofuel)``. Each item is the
        ``.text`` of the ``output`` child of the matching ``fueltotal`` entry,
        or ``None`` when that fuel is absent or has no ``output`` child.
        If a fuel appears more than once, the last entry with an ``output``
        child wins.

    Notes
    -----
    Entries without a ``fuel`` child are skipped instead of raising
    ``AttributeError``; entries for any other fuel name are ignored.
    """
    fuel_order = ('NUCLEAR', 'GAS', 'HYDRO', 'WIND', 'SOLAR', 'BIOFUEL')
    values = dict.fromkeys(fuel_order)  # every fuel starts out as None

    for fuel in element.find_all('fueltotal'):
        # Look each child up once rather than once per branch.
        fuel_tag = fuel.find('fuel')
        output_tag = fuel.find('output')
        if fuel_tag is None or output_tag is None:
            continue
        fuel_name = fuel_tag.text
        if fuel_name in values:
            values[fuel_name] = output_tag.text

    return tuple(values[name] for name in fuel_order)

In [6]:
# Helper Function to parse xml data into a DataFrame (one row per hourly record)
def xml_to_df(soup,col_names):
    """Flatten a parsed report document into a DataFrame.

    Parameters
    ----------
    soup : parsed document
        Must provide ``find('createdat')`` and ``find_all('dailydata')``.
        Each daily entry must provide ``find('day')`` and
        ``find_all('hourlydata')``; each hourly entry must provide
        ``find('hour')`` and is handed to ``get_fuel_values``.
    col_names : sequence of str
        At least ten column names, used in this order: created-at, date,
        hour, nuclear, gas, hydro, wind, solar, biofuel, total. Names beyond
        the tenth are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per hourly entry, with the first ten ``col_names`` as columns.
        The fuel columns hold the values returned by ``get_fuel_values``
        unchanged (``None`` where missing); the total column is the integer
        sum of the non-missing fuel values (0 when all are missing).
        A document without hourly entries yields an empty frame that still
        carries the ten columns.

    Raises
    ------
    IndexError
        If ``col_names`` has fewer than ten entries.
    AttributeError
        If a required ``createdat``/``day``/``hour`` lookup returns ``None``.
    ValueError
        If a fuel value cannot be converted with ``int``.
    """
    # Resolve the ten column names up front so the frame has the same shape
    # whether or not any rows are produced.
    columns = [col_names[i] for i in range(10)]
    created_at = soup.find('createdat').text

    records = []
    for day in tqdm(soup.find_all('dailydata'), desc='Processing :: '):
        date_val = day.find('day').text
        for hour in day.find_all('hourlydata'):
            hour_val = hour.find('hour').text
            fuel_vals = tuple(get_fuel_values(hour))

            # Total fuel output: missing fuels contribute nothing.
            total_val = sum(int(val) for val in fuel_vals if val is not None)

            row = (created_at, date_val, hour_val) + fuel_vals + (total_val,)
            records.append(dict(zip(columns, row)))

    return pd.DataFrame(records, columns=columns)

In [7]:
### LOCAL DATA CURATION

# # Define the output columns, parse each report into a DataFrame, and write one CSV per report to ./[outdir]
# col_names=['created_at','mkt_date','mkt_he','nuclear','gas','hydro','wind','solar','biofuel','total']
# outdir = './GenOutputbyFuelHourly'
# if not os.path.exists(outdir):
#     os.mkdir(outdir)
# for file in files:
#     print('Downloading ' + file, end=" ")
#     curr_url=str(url)+str(file)
#     page = requests.get(curr_url,allow_redirects=True)
#     soup = BeautifulSoup(page.text, 'lxml')
#     print(' -- Done')
#     report_df=xml_to_df(soup,col_names)
#     fullname = os.path.join(outdir, (str(file[:-4])))
#     # Write the DataFrame as a CSV file into the output directory
#     report_df.to_csv(fullname+'.csv',index=False)
# print('Dumped all files in ./GenOutputbyFuelHourly')

In [8]:
### GCP DATA CURATION

# Download every report listed in `files`, flatten it with xml_to_df and upload
# the result as a CSV object to the Cloud Storage bucket. Nothing is written to
# the local disk by this cell.
col_names=['created_at','mkt_date','mkt_he','nuclear','gas','hydro','wind','solar','biofuel','total']
for file in files:
    print('Loading ' + file, end=" ")
    curr_url = str(url) + str(file)
    # Distinct names are used here so the directory listing held in
    # `page`/`soup` by the earlier cells is not overwritten; those cells can
    # then be re-run after this one without parsing a report by mistake.
    # The timeout and status check make a stalled or failed download stop the
    # loop instead of hanging or uploading data parsed from an error page.
    report_page = requests.get(curr_url, allow_redirects=True, timeout=120)
    report_page.raise_for_status()
    report_soup = BeautifulSoup(report_page.text, 'lxml')
    print(' -- Done')
    report_df = xml_to_df(report_soup, col_names)

    # Object name: same base name as the report, with the extension swapped to .csv
    blob_name = 'GenOutputbyFuelHourly/' + os.path.splitext(str(file))[0] + '.csv'
    blob = bucket.blob(blob_name)
    blob.upload_from_string(report_df.to_csv(index=False), 'text/csv')
print('Uploaded all files to gs://' + bucket.name + '/GenOutputbyFuelHourly/')

Loading PUB_GenOutputbyFuelHourly.xml  -- Done


Processing :: 100%|███████████████████████████| 234/234 [00:04<00:00, 57.26it/s]


Loading PUB_GenOutputbyFuelHourly_2015.xml  -- Done


Processing :: 100%|███████████████████████████| 365/365 [00:06<00:00, 57.53it/s]


Loading PUB_GenOutputbyFuelHourly_2016.xml  -- Done


Processing :: 100%|███████████████████████████| 366/366 [00:06<00:00, 56.02it/s]


Loading PUB_GenOutputbyFuelHourly_2017.xml  -- Done


Processing :: 100%|███████████████████████████| 365/365 [00:06<00:00, 60.05it/s]


Loading PUB_GenOutputbyFuelHourly_2018.xml  -- Done


Processing :: 100%|███████████████████████████| 365/365 [00:06<00:00, 59.50it/s]


Loading PUB_GenOutputbyFuelHourly_2019.xml  -- Done


Processing :: 100%|███████████████████████████| 365/365 [00:06<00:00, 53.91it/s]


Loading PUB_GenOutputbyFuelHourly_2020.xml  -- Done


Processing :: 100%|███████████████████████████| 366/366 [00:05<00:00, 61.05it/s]


Loading PUB_GenOutputbyFuelHourly_2021.xml  -- Done


Processing :: 100%|███████████████████████████| 365/365 [00:07<00:00, 49.41it/s]


Loading PUB_GenOutputbyFuelHourly_2022.xml  -- Done


Processing :: 100%|███████████████████████████| 234/234 [00:04<00:00, 49.48it/s]


Dumped all files in ./GenOutputbyFuelHourly


In [2]:
from google.cloud import bigquery

# Source objects and destination table.
# The wildcard only matches object names that start with
# "PUB_GenOutputbyFuelHourly_"; because of the trailing underscore, an object
# named exactly "PUB_GenOutputbyFuelHourly.csv" is not picked up.
uri = "gs://amol_javahire/GenOutputbyFuelHourly/PUB_GenOutputbyFuelHourly_*"
table_id = "ieso-dashboard.GenOutputbyFuelHourly.test"

# BigQuery client authenticated with the service-account key file.
path_to_private_key = './ieso-dashboard-c639f1a39298.json'
client = bigquery.Client.from_service_account_json(path_to_private_key)

# CSV load with schema auto-detection; WRITE_TRUNCATE replaces whatever the
# destination table currently holds.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,  # CSV is also the default
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Submit the load job (this is the API request) ...
load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)

# ... and block until it has finished.
load_job.result()

LoadJob<project=ieso-dashboard, location=US, id=59da8a99-347b-417a-bc55-d4608031321a>