In [1]:
import requests
import os
import xml
from bs4 import BeautifulSoup
import pandas as pd
import re
from zipfile import ZipFile
import sqlite3
import openpyxl as xl
# Show full cell contents (file names, URLs) instead of truncating wide columns
pd.options.display.max_colwidth = None
# Optional: also lift the row limit with pd.set_option("display.max_rows", None)

In [2]:
# Path to the SQLite database holding the ERCOT settlement point prices.
# Setting the LMP_DB_PATH environment variable overrides the default below, so the
# notebook can run on another machine without editing this cell.
lmp_db = os.environ.get('LMP_DB_PATH', r'C:\Users\BPassini\Databases_Py\Ercot_SPP\LMP_DB.db')

In [3]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file

    A connection failure is printed rather than raised, so callers must be
    prepared to receive None.

    :param db_file: database file
    :return: Connection object or None
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # The handler previously named a bare ``Error`` that is never imported,
        # so a failed connect surfaced as NameError instead of reaching here.
        print(e)
        return None

In [4]:
def query_db(db_file, sql):
    """ Run a query against a SQLite database and return the result
    :param db_file: database file
    :param sql: query text to execute
    :return: pandas DataFrame produced by pd.read_sql_query
    """
    conn = create_connection(db_file)
    try:
        return pd.read_sql_query(sql, conn)
    finally:
        # Close even when the query raises, so a failed cell does not leave the
        # connection open; conn is None when the connection could not be opened.
        if conn is not None:
            conn.close()

In [5]:
def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement

    Failures are printed instead of raised, so the notebook keeps running.

    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return: None
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except (sqlite3.Error, TypeError, NameError) as e:
        # sqlite3.Error covers what the database itself reports (bad SQL,
        # table already exists, locked file); previously those escaped the handler.
        print(e)

In [6]:
def delete_rows(conn, delete_sql):
    ''' Delete rows from a sqlite db, commit, and close the connection

    The connection is closed before returning, whether or not the statement
    succeeded, so the caller must open a new one for any further work.

    :param conn: Connection object (closed by this call)
    :param delete_sql: a delete statement
    :raises sqlite3.Error: when the statement or the commit fails'''
    try:
        c = conn.cursor()
        c.execute(delete_sql)
        conn.commit()
    finally:
        # Previously a failing statement skipped the close and leaked the connection
        conn.close()

In [7]:
def check_max_date(db_file=None):
    """ Find the most recent delivery date/hour stored in ercot_hist_spp

    The date is the largest DELIVERY_DATE among rows with a non-null
    SETTLEMENT_POINT_PRICE; the hour is the largest DELIVERY_HOUR stored for
    that date.

    :param db_file: database file; defaults to the notebook-level lmp_db
    :return: (max_date, max_hour) where max_date has been passed through
             pd.to_datetime and max_hour is the raw value from the query
    """
    if db_file is None:
        # Use the notebook's configured database instead of a second hard-coded copy of the path
        db_file = lmp_db

    conn = create_connection(db_file)
    try:
        max_date_df = pd.read_sql_query('''Select max(DELIVERY_DATE) as MAX_DELIVERY_DATE,  max(DELIVERY_HOUR) as MAX_DELIVERY_HOUR 
                                       from ercot_hist_spp 
                                       where delivery_date = (select max(delivery_date) from ercot_hist_spp where settlement_point_price is not null)''', conn)
    finally:
        # Close even if the query fails (e.g. the table does not exist yet)
        if conn is not None:
            conn.close()

    # NOTE(review): on an empty table both values come back null — confirm callers handle that
    max_date = pd.to_datetime(max_date_df['MAX_DELIVERY_DATE'][0])
    max_hour = max_date_df['MAX_DELIVERY_HOUR'][0]
    return max_date, max_hour

In [None]:
# Latest delivery date/hour already in the database, with the date split into its parts
max_date, max_hour = check_max_date()
max_year, max_mon, max_day = max_date.year, max_date.month, max_date.day

In [None]:
# Decide which yearly file to start from: when the stored data already reaches
# hour 24 of Dec 31, move on to the following year's file
year_fully_loaded = max_mon == 12 and max_day == 31 and max_hour == 24
min_file_year = max_year + 1 if year_fully_loaded else max_year

In [None]:
# Host serving the downloads, the report listing page (reportTypeId 13061,
# "Historical RTM Load Zone and Hub Prices"), and the file type to download
Domain = 'http://mis.ercot.com'
url = Domain + '/misapp/GetReports.do?reportTypeId=13061&reportTitle=Historical%20RTM%20Load%20Zone%20and%20Hub%20Prices&showHTMLView=&mimicKey'
filetype = 'zip'

In [None]:
#Get the website's HTML, then collect the file-name labels and the download links

# The timeout stops a stalled server from hanging the notebook indefinitely, and
# raise_for_status stops an HTTP error page from being parsed as an empty listing.
listing_response = requests.get(url, timeout=60)
listing_response.raise_for_status()

soup = BeautifulSoup(listing_response.text, 'html.parser')
file_list = soup.find_all(class_='labelOptional_ind')
# find_all is the current name of the deprecated findAll alias
link_list = soup.find_all('a', attrs={'href': re.compile("/misdownload/")})

In [None]:
# Reduce the parsed tags to plain strings: the href of every download anchor,
# and the text node that follows each file-name label
link_name_list = [anchor.get('href') for anchor in link_list]
folder_name_list = [str(label.next_element) for label in file_list]

In [None]:
# Pair every zip name with its download link, then derive the year and date
# from the dot-separated pieces of the zip name

data_dict = dict(ZipFolderName=folder_name_list, DownLoadLink=link_name_list)
download_df = pd.DataFrame(data_dict)

name_parts = download_df['ZipFolderName'].str.split('.')
download_df['ZipFolderYear'] = name_parts.str[5].str[-4:]  # last four characters of the sixth piece
download_df['ZipFolderDate'] = name_parts.str[3]           # fourth piece

In [None]:
# Preview the first rows to confirm the names and links were parsed as expected
download_df.head(n=5)

In [None]:
#Download each zip listed in download_df that isn't already in the save folder

save_folder = 'LMP_Zips/'
# Create the folder on a first run instead of failing in os.listdir
os.makedirs(save_folder, exist_ok=True)
zip_list = os.listdir(os.path.join(os.getcwd(),  'LMP_Zips'))

for zip_name, link_name, zip_year in zip(download_df['ZipFolderName'],
                                         download_df['DownLoadLink'],
                                         download_df['ZipFolderYear']):
    if int(zip_year) >= min_file_year and zip_name not in zip_list:
        # Fetch before opening the target file: a failed or stalled request then
        # raises here instead of leaving an empty .zip that breaks extraction.
        response = requests.get(Domain + link_name, timeout=120)
        response.raise_for_status()
        with open(save_folder + zip_name, 'wb') as zip_file:
            zip_file.write(response.content)


In [None]:
# Unpack every downloaded archive; its contents land next to the zip itself

zip_path = os.path.join(os.getcwd(),  'LMP_Zips')

for zip_entry in os.listdir(zip_path):
    if '.zip' not in zip_entry:
        continue
    with ZipFile(f'{zip_path}/{zip_entry}', 'r') as archive:
        archive.extractall(path=zip_path)

In [None]:
# Remove the zip archives from the LMP_Zips folder
zip_dir = os.path.join(os.getcwd(),  'LMP_Zips')
for leftover in os.listdir(zip_dir):
    if '.zip' in leftover:
        os.remove(os.path.join(zip_dir, leftover))
    

In [None]:
# Definition of the price table; "if not exists" lets this cell be re-run
# against a database that already has the table
sql_create_lmp_table = ''' Create Table if not exists ercot_hist_spp (
                                    DELIVERY_DATE text,
                                    DELIVERY_HOUR integer,
                                    DELIVERY_INTERVAL integer,
                                    REPEATED_HOUR_FLAG text,
                                    SETTLEMENT_POINT_NAME text,
                                    SETTLEMENT_POINT_TYPE text,
                                    SETTLEMENT_POINT_PRICE real);
                                '''

# conn stays open here: the delete_rows call in the next cell uses it and closes it
conn = create_connection(lmp_db)
create_table(conn, sql_create_lmp_table)


In [None]:
%%time
# Drop rows that carry no settlement point price; delete_rows also closes conn
null_price_delete_sql = '''delete from ercot_hist_spp where settlement_point_price is null'''
delete_rows(conn, delete_sql=null_price_delete_sql)

In [None]:
%%time
# List any rows still lacking a price (none are expected right after the delete)
query_db(db_file=lmp_db, sql="Select *  from ercot_hist_spp where settlement_point_price isnull")

In [None]:
%%time
# Load every extracted workbook from min_file_year onward and append the rows
# dated after max_date to ercot_hist_spp.

# Spreadsheet headers -> database column names
column_dict = {'Delivery Date':'DELIVERY_DATE'
               , 'Delivery Hour':'DELIVERY_HOUR'
               ,'Delivery Interval':'DELIVERY_INTERVAL'
               ,'Repeated Hour Flag':'REPEATED_HOUR_FLAG'
               ,'Settlement Point Name':'SETTLEMENT_POINT_NAME'
               ,'Settlement Point Type':'SETTLEMENT_POINT_TYPE'
               ,'Settlement Point Price':'SETTLEMENT_POINT_PRICE'}

conn = create_connection(lmp_db)
xlsx_dir = os.path.join(os.getcwd(),  'LMP_Zips')

try:
    for file in os.listdir(xlsx_dir):
        # file[-9:-5] is the four characters in front of ".xlsx"
        # NOTE(review): assumes every workbook name ends in a 4-digit year — confirm
        if '.xlsx' in file and int(file[-9:-5]) >= min_file_year:
            print(file)
            file_path = os.path.join(xlsx_dir, file)
            # ExcelFile opens the workbook once for all sheets and releases the
            # file handle on exit (the read-only workbook was never closed before)
            with pd.ExcelFile(file_path, engine='openpyxl') as workbook:
                for sheet in workbook.sheet_names:
                    upload_sheet = workbook.parse(sheet_name=sheet)
                    upload_sheet['Delivery Date'] = pd.to_datetime(upload_sheet['Delivery Date'])
                    # NOTE(review): rows dated exactly max_date are skipped even when
                    # max_hour < 24, so a partially loaded day is never completed — confirm intent
                    upload_sheet = upload_sheet[upload_sheet['Delivery Date']>max_date]

                    # Nothing new in this sheet. The previous check was len != 1, which let
                    # empty sheets through (min/max then fail) and skipped one-row sheets.
                    if upload_sheet.empty:
                        continue

                    # These previously read from an undefined name (test_sheet)
                    min_del_date = upload_sheet['Delivery Date'].min()
                    max_del_date = upload_sheet['Delivery Date'].max()

                    upload_sheet = upload_sheet.rename(columns=column_dict)
                    upload_sheet.to_sql(name='ercot_hist_spp', con=conn, if_exists='append', index=False)
                    print(sheet)
                    print(min_del_date, max_del_date)
finally:
    # Release the database even when a workbook fails to load
    if conn is not None:
        conn.close()