In [1]:
import os
import re
import pandas as pd
import requests
import pickle
from tqdm import tqdm

from bs4 import BeautifulSoup

import warnings
warnings.simplefilter('ignore') # Comment out to see warnings

In [2]:
# Parent directory URL whose listing is scraped for report file names
url= "http://reports.ieso.ca/public/RealtimeMktPrice/"
# The timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url,allow_redirects=True,timeout=30)
# Stop here on an HTTP error instead of parsing an error page as the listing.
page.raise_for_status()

In [3]:
# Setting up GCloud Storage Authentication
from google.cloud import storage
# Service-account key file, given relative to the working directory.
path_to_private_key = './ieso-dashboard-c639f1a39298.json'
storage_client = storage.Client.from_service_account_json(json_credentials_path=path_to_private_key)
# Bucket handle used by the upload loop later in the notebook.
bucket = storage_client.bucket('amol_javahire')

In [4]:
# Retrieve file names from the parent directory listing
# The parser is named explicitly (the same 'lxml' used for the report files)
# so the result does not depend on which parser BeautifulSoup picks by default.
soup = BeautifulSoup(page.text, 'lxml')
url_arr=soup.find_all("a",href=True)
xml_files=[]
csv_files=[]  # CSV names are currently not collected; the list stays empty
for element in url_arr:
    file_name=element.text
    # Keep names ending in ".xml"; names containing "_v" are skipped
    # (presumably versioned re-issues of a report -- confirm).
    if re.search(r"\.xml$", file_name) and "_v" not in file_name:
        xml_files.append(str(file_name))

In [5]:
# Helper function to fetch the per-interval price values
def fill_prices_list(element):
    """Collect the ``mcp`` text of every ``intervalprice`` under ``element``.

    Parameters
    ----------
    element : object exposing ``find_all`` (e.g. a BeautifulSoup tag)
        Node whose ``intervalprice`` children are read.

    Returns
    -------
    list
        The ``mcp`` text of each ``intervalprice``, in the order ``find_all``
        returns them, padded with ``None`` up to 12 entries. A longer list is
        returned as-is, neither padded nor truncated.
    """
    prices = [
        interval.find("mcp").text
        for interval in element.find_all("intervalprice")
    ]
    # [None] * k is empty for k <= 0, so lists of 12 or more stay untouched.
    prices.extend([None] * (12 - len(prices)))
    return prices

def fill_prices_avg(element):
    """Average the ``mcp`` values of every ``intervalprice`` under ``element``.

    Parameters
    ----------
    element : object exposing ``find_all`` (e.g. a BeautifulSoup tag)
        Node whose ``intervalprice`` children are read.

    Returns
    -------
    list or None
        A one-element list holding the mean of the ``mcp`` values rounded to
        two decimals, or ``None`` when there is no ``intervalprice`` at all.
    """
    intervals = element.find_all("intervalprice")
    if not intervals:
        return None

    running_sum = 0
    for interval in intervals:
        running_sum += float(interval.find("mcp").text)

    return [round(running_sum / len(intervals), 2)]

In [6]:
# Helper function that turns one parsed report into a DataFrame of zone prices
def xml_to_df(soup,col_names):
    """Build a DataFrame from the first ``intertiezonalprices`` node of a report.

    Only the first ``intertiezonalprices`` node is read (presumably the Ontario
    zone -- confirm against the report layout).

    Parameters
    ----------
    soup : parsed report exposing ``find`` (e.g. a BeautifulSoup document)
    col_names : list of str
        Columns of the returned frame, in order.

    Returns
    -------
    pandas.DataFrame
        Price columns filled from ``fill_prices_avg`` for the price types
        'Energy', '10S', '10N' and '30R', plus ``created_at``, ``mkt_date`` and
        ``mkt_he`` taken from the report's ``createdat``, ``deliverydate`` and
        ``deliveryhour`` text. If no price column receives a value, the frame
        has zero rows.
    """
    column_for_type = {
        'Energy': 'ont_ene',
        '10S': 'ont_10s',
        '10N': 'ont_10n',
        '30R': 'ont_30r',
    }
    frame = pd.DataFrame(columns=col_names)

    zone = soup.find('intertiezonalprices')
    for prices in zone.find_all('prices'):
        target = column_for_type.get(prices.find("pricetype").text)
        if target is None:
            # Price types outside the table are ignored.
            continue
        frame[target] = fill_prices_avg(prices)

    # Report-level metadata: (output column, source tag), assigned in this order.
    metadata = (
        ('created_at', "createdat"),
        ('mkt_date', "deliverydate"),
        ('mkt_he', "deliveryhour"),
    )
    for column, tag in metadata:
        frame[column] = soup.find(tag).text

    return frame

In [7]:
### LOCAL DATA CURATION

# # Defining Output Data Structure and extracting and filling values correspondingly and then dump final data to ./[outdir]

# # col_names=['ont_ene','ont_10s','ont_10n','ont_30r','man_ene','man_10n','man_30r','mic_ene','mic_10n','mic_30r','min_ene','min_10n','min_30r','new_ene','new_10n','new_30r','que_at_ene','que_at_10n','que_at_30r','que_b5d_ene','que_b5d_10n','que_b5d_30r','que_d5a_ene','que_d5a_10n','que_d5a_30r','que_d4z_ene','que_d4z_10n','que_d4z_30r','que_h9a_ene','que_h9a_10n','que_h9a_30r','que_h4z_ene','que_h4z_10n','que_h4z_30r','que_p33_ene','que_p33_10n','que_p33_30r','que_q4c_ene','que_q4c_10n','que_q4c_30r','que_x2y_ene','que_x2y_10n','que_x2y_30r','man_sk_ene','man_sk_10n','man_sk_30r']
# col_names=['ont_ene','ont_10s','ont_10n','ont_30r']
# outdir = './RealtimeMktPrice'
# if not os.path.exists(outdir):
#     os.mkdir(outdir)
# for file in tqdm(xml_files,desc='Processing :: '):
#     # print('Processing ' + file, end=" ")
#     curr_url=str(url)+str(file)
#     page = requests.get(curr_url,allow_redirects=True)
#     soup = BeautifulSoup(page.text, 'lxml')
#     report_df=xml_to_df(soup,col_names)
#     fullname = os.path.join(outdir, (str(file[:-4])))
#     # Write the report as CSV into outdir
#     report_df.to_csv(fullname+'.csv',index=False)
# print('Dumped all files in ./RealtimeMktPrice')

In [10]:
### GCP DATA CURATION

# Define the output columns, convert every listed XML report to a DataFrame and
# upload it as one CSV per report to the GCS bucket under RealtimeMktPrice/.

# col_names=['ont_ene','ont_10s','ont_10n','ont_30r','man_ene','man_10n','man_30r','mic_ene','mic_10n','mic_30r','min_ene','min_10n','min_30r','new_ene','new_10n','new_30r','que_at_ene','que_at_10n','que_at_30r','que_b5d_ene','que_b5d_10n','que_b5d_30r','que_d5a_ene','que_d5a_10n','que_d5a_30r','que_d4z_ene','que_d4z_10n','que_d4z_30r','que_h9a_ene','que_h9a_10n','que_h9a_30r','que_h4z_ene','que_h4z_10n','que_h4z_30r','que_p33_ene','que_p33_10n','que_p33_30r','que_q4c_ene','que_q4c_10n','que_q4c_30r','que_x2y_ene','que_x2y_10n','que_x2y_30r','man_sk_ene','man_sk_10n','man_sk_30r']
col_names=['created_at','mkt_date','mkt_he','ont_ene','ont_10s','ont_10n','ont_30r']

# Seconds to wait on a stalled connection before giving up on a report.
REQUEST_TIMEOUT_S=30

# One Session reuses the HTTP connection across all report downloads.
with requests.Session() as session:
    for file in tqdm(xml_files,desc='Processing :: '):
        curr_url=str(url)+str(file)
        # Distinct names leave the directory-listing `page`/`soup` from the
        # earlier cells intact, so those cells can be re-run after this one.
        report_page = session.get(curr_url,allow_redirects=True,timeout=REQUEST_TIMEOUT_S)
        # Fail on an HTTP error here rather than parsing an error page as a report.
        report_page.raise_for_status()
        report_soup = BeautifulSoup(report_page.text, 'lxml')
        report_df=xml_to_df(report_soup,col_names)

        # Create a blob instance to upload the data into; the object name is
        # the report file name with its extension replaced by .csv
        blob = bucket.blob('RealtimeMktPrice/'+os.path.splitext(str(file))[0]+'.csv')
        blob.upload_from_string(report_df.to_csv(index=False), 'text/csv')
print('Dumped all files in ./RealtimeMktPrice')

Processing :: 100%|███████████████████████████| 770/770 [21:48<00:00,  1.70s/it]

Dumped all files in ./RealtimeMktPrice





In [13]:
# !pip install --upgrade google-cloud-bigquery

Collecting google-cloud-bigquery
  Downloading google_cloud_bigquery-3.3.2-py2.py3-none-any.whl (211 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.9/211.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Collecting proto-plus<2.0.0dev,>=1.22.0
  Downloading proto_plus-1.22.0-py3-none-any.whl (47 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting google-cloud-bigquery-storage<3.0.0dev,>=2.0.0
  Downloading google_cloud_bigquery_storage-2.14.2-py2.py3-none-any.whl (182 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.1/182.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting grpcio<2.0dev,>=1.47.0
  Downloading grpcio-1.47.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m 

In [20]:
from google.cloud import bigquery

# Destination table and the GCS objects to load into it.
# TODO(developer): Set table_id to the ID of the table to create.
table_id = "ieso-dashboard.RealtimeMktPrice.test"
uri = "gs://amol_javahire/RealtimeMktPrice/PUB_RealtimeMktPrice*"

# Construct a BigQuery client object from the service-account key file.
path_to_private_key = './ieso-dashboard-c639f1a39298.json'
client = bigquery.Client.from_service_account_json(json_credentials_path=path_to_private_key)

# The schema is auto-detected and WRITE_TRUNCATE overwrites existing table data.
job_config = bigquery.LoadJobConfig(
    # The source format defaults to CSV, so the line below is optional.
    source_format=bigquery.SourceFormat.CSV,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    autodetect=True,
)

load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)  # Make an API request.

load_job.result()  # Waits for the job to complete.

LoadJob<project=ieso-dashboard, location=US, id=3f858682-2d8a-4a34-ab08-582f75d0290c>