In [115]:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle
import datetime 
import yagmail
from cryptography.fernet import Fernet
import traceback
import logging
import bp_sql as bp
import zipfile
import time
import html5lib
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()

#example: prefs = {"download.default_directory" : "C:\Tutorial\down"};


options.headless = True

from pyvirtualdisplay import Display
display = Display(visible=0, size=(800, 800))  
display.start()

SyntaxError: EOL while scanning string literal (2023052294.py, line 20)

In [119]:
script_start_time = time.time()

today = datetime.date.today().strftime("%Y-%m-%d")

#DB path
spp_db = 'SPP.db'

zip_fldr = 'SPP_Zips'

cwd = os.getcwd()

#zip folder path
zip_fldr_path = os.path.join(cwd,  zip_fldr)

prefs = {"download.default_directory" : zip_fldr_path}
options.add_experimental_option("prefs",prefs)

#contents of zip folder path
zip_list = os.listdir(os.path.join(cwd, zip_fldr))

#used for saving files
save_folder = 'SPP_Zips/'

#Website and file type to download
Domain = 'http://ercot.com'
url = 'https://www.ercot.com/mp/data-products/data-product-details?id=NP6-785-ER'
filetype = 'zip'

mon_dict = {1:'JAN', 2:'FEB', 3:'MAR', 4:'APR', 5:'MAY', 6:'JUN', 7:'JUL', 8:'AUG', 9:'SEP', 10:'OCT', 11:'NOV', 12:'DEC'}
revs_mon_dict = dict([(value, key) for key, value in mon_dict.items()])

In [4]:
def get_credentials():
    key_path = os.path.join(os.path.expanduser('~'), '.fernet')
    key = pickle.load(open(key_path, 'rb'))
    cipher_suite = Fernet(key)
    encrypted_credentials_df = pd.read_csv('encrypted_credentials.csv')

    gmail = 'Gmail'
    gmail_ec_row = encrypted_credentials_df.loc[encrypted_credentials_df.login_account == gmail]
    gmail_user = gmail_ec_row.iloc[0]['username']
    gmail_pwd_encrypt = gmail_ec_row.iloc[0]['encrypted_password']
    gmail_pwd = cipher_suite.decrypt(str.encode(gmail_pwd_encrypt)).decode('utf-8')
    
    return gmail_user, gmail_pwd


def send_email(email_subject, email_contents):
    gmail_user, gmail_pwd = get_credentials()
    
    yag = yagmail.SMTP(user=gmail_user, password=gmail_pwd)
    
    yag.send(to=gmail_user, subject=email_subject, contents=email_contents)

def check_max_date(db_file):
    conn = bp.create_connection(db_file)
    max_date_df = pd.read_sql_query('''Select max(DELIVERY_DATE) as MAX_DELIVERY_DATE,  max(DELIVERY_HOUR) as MAX_DELIVERY_HOUR 
                                       from ercot_hist_spp 
                                       where delivery_date = (select max(delivery_date) from ercot_hist_spp where settlement_point_price is not null)''', conn)
    
    max_date =  pd.to_datetime(max_date_df['MAX_DELIVERY_DATE'][0])
    max_hour = max_date_df['MAX_DELIVERY_HOUR'][0]
    conn.close()
    return max_date, max_hour

def get_sheet_list(file:str, year:int):
    with zipfile.ZipFile(file) as zipped_file:
        summary = zipped_file.open(r'xl/workbook.xml').read()
    soup = BeautifulSoup(summary, 'xml')
    sheet_list = [sheet.get("name") for sheet in soup.find_all("sheet")]

    if year == min_file_year:
        sheets_list = [x for x in sheet_list if revs_mon_dict.get(str(x[0:3]).upper())>= min_file_mon]

    return sheet_list 
    

In [5]:
sql_create_spp_table = ''' Create Table if not exists ercot_hist_spp (
                                    DELIVERY_DATE text,
                                    DELIVERY_HOUR integer,
                                    DELIVERY_INTERVAL integer,
                                    REPEATED_HOUR_FLAG text,
                                    SETTLEMENT_POINT_NAME text,
                                    SETTLEMENT_POINT_TYPE text,
                                    SETTLEMENT_POINT_PRICE real);
                                '''

sql_create_spp_tbl_index = '''Create index IF NOT EXISTS index_dd_ercot_hist_spp on ercot_hist_spp (DELIVERY_DATE)'''

sql_create_spp_view = ''' Create View if not exists ercot_avg_spp as
                                Select DELIVERY_DATE, DELIVERY_HOUR, SETTLEMENT_POINT_NAME, SETTLEMENT_POINT_TYPE, AVG(SETTLEMENT_POINT_PRICE) as SETTLEMENT_POINT_PRICE 
                                from ercot_hist_spp 
                                group by DELIVERY_DATE, DELIVERY_HOUR, SETTLEMENT_POINT_NAME, SETTLEMENT_POINT_TYPE
                                ;
                                '''

In [6]:
#create tbl and view if they dont exist
create_list = [sql_create_spp_table, sql_create_spp_tbl_index, sql_create_spp_view]
for c in create_list:
    bp.create_table(spp_db, c)

In [7]:

#get max date of data in db
max_date, max_hour = check_max_date(spp_db)

if None not in (max_date, max_hour):
    max_year = max_date.year
    max_mon = max_date.month
    max_day = max_date.day
else:
    max_year, max_mon, max_day = 2000, 1 , 1
    max_date = date(max_year, max_mon, max_day).strftime("%m/%d/%y")
    

max_mon_abrv = mon_dict.get(max_mon)

if max_mon == 12 and max_day == 31 and max_hour == 24:
    min_file_year = max_year + 1
    min_file_mon = 1
else:
    min_file_year = max_year
    min_file_mon = max_mon

In [120]:
#Get websites HTML, get all the filename and associated links
driver = webdriver.Chrome(chrome_options=options)
driver.get(url)

html = driver.page_source
soup = BeautifulSoup(html)

file_list = soup.find_all(class_='name')
friendly_list = [f.next_element for f in file_list]
long_name_list = [f['title'] for f in file_list]
link_list = soup.findAll('a', attrs={'href': re.compile("/misdownload/")}) 
link_list = [f['href'] for f in link_list]

website_file_df = pd.DataFrame(zip(friendly_list, long_name_list, link_list), columns=['web_friendly_name','web_long_name','web_link'])  
#driver.quit()


  driver = webdriver.Chrome(chrome_options=options)


In [94]:
website_file_df['web_file_yr'] = website_file_df.web_friendly_name.apply(lambda x: int(x[-4:]))
website_file_df['web_file_date'] = website_file_df.web_long_name.apply(lambda x: pd.to_datetime(x.split('.')[3]))


zip_list_df = pd.DataFrame(zip_list, columns=['fldr_filename'])
zip_list_df['fdr_friendly_name'] = zip_list_df.fldr_filename.apply(lambda x: x.split('.')[3])

merge_df = website_file_df.merge(zip_list_df,how='outer',left_on='web_friendly_name', right_on='fdr_friendly_name')
merge_df

Unnamed: 0,web_friendly_name,web_long_name,web_link,web_file_yr,web_file_date,fldr_filename,fdr_friendly_name
0,RTMLZHBSPP_2022,rpt.00013061.0000000000000000.20221204.0835546...,https://www.ercot.com/misdownload/servlets/mir...,2022.0,2022-12-04,rpt.00013061.0000000000000000.RTMLZHBSPP_2022....,RTMLZHBSPP_2022
1,RTMLZHBSPP_2021,rpt.00013061.0000000000000000.20220101.0838204...,https://www.ercot.com/misdownload/servlets/mir...,2021.0,2022-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2021....,RTMLZHBSPP_2021
2,RTMLZHBSPP_2020,rpt.00013061.0000000000000000.20210101.0841274...,https://www.ercot.com/misdownload/servlets/mir...,2020.0,2021-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2020....,RTMLZHBSPP_2020
3,RTMLZHBSPP_2019,rpt.00013061.0000000000000000.20200101.0825169...,https://www.ercot.com/misdownload/servlets/mir...,2019.0,2020-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2019....,RTMLZHBSPP_2019
4,RTMLZHBSPP_2018,rpt.00013061.0000000000000000.20190101.0825270...,https://www.ercot.com/misdownload/servlets/mir...,2018.0,2019-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2018....,RTMLZHBSPP_2018
5,RTMLZHBSPP_2017,rpt.00013061.0000000000000000.20180101.0824277...,https://www.ercot.com/misdownload/servlets/mir...,2017.0,2018-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2017....,RTMLZHBSPP_2017
6,RTMLZHBSPP_2016,rpt.00013061.0000000000000000.20170101.0833023...,https://www.ercot.com/misdownload/servlets/mir...,2016.0,2017-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2016....,RTMLZHBSPP_2016
7,RTMLZHBSPP_2015,rpt.00013061.0000000000000000.20160101.0821419...,https://www.ercot.com/misdownload/servlets/mir...,2015.0,2016-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2015....,RTMLZHBSPP_2015
8,RTMLZHBSPP_2014,rpt.00013061.0000000000000000.20150101.0822487...,https://www.ercot.com/misdownload/servlets/mir...,2014.0,2015-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2014....,RTMLZHBSPP_2014
9,RTMLZHBSPP_2013,rpt.00013061.0000000000000000.20140101.0833150...,https://www.ercot.com/misdownload/servlets/mir...,2013.0,2014-01-01,rpt.00013061.0000000000000000.RTMLZHBSPP_2013....,RTMLZHBSPP_2013


In [125]:
#Download necessary zips
for i, r in merge_df[~merge_df.web_long_name.isna()].iterrows():

    # zip_path = os.path.join(cwd,  zip_fldr, r['web_long_name'])

    #download new zip
    if (r['web_file_date']>max_date and r['web_file_yr']>= min_file_year):
        with open(save_folder + r['web_long_name'], 'wb') as file:
            response = requests.get(r['web_link'])
            file.write(response.content)

#extracts the xlsx from each zip and places in same directory
z_list = [j for j in os.listdir(zip_path) if '.zip' in j]

for file in z_list:
    with zipfile.ZipFile(zip_path + '/' + file, 'r') as zipObj:
        zipObj.extractall(path=zip_path)
        os.remove(os.path.join(zip_path, file))
