**Date Created: 10/19/2025**

In [1]:
import os
import json
import requests
import pandas as pd
from datetime import datetime
from mysql import connector
from dotenv import load_dotenv

In [2]:
load_dotenv() # this must be run in order to grant getenv function permission to access objects in .env file. 

True

In [3]:
# senseitive information stored in .env file
API_KEY = os.getenv("API_KEY")
API_HOST = os.getenv("API_HOST")

## GET Request (Extract)

In [4]:
# API Request
url = f"https://api.eia.gov/v2/electricity/retail-sales/data/?api_key={API_KEY}&frequency=monthly&data[0]=customers&data[1]=price&data[2]=revenue&data[3]=sales&facets[stateid][]=CA&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000"
# Don't forget to add "api_key =" and "&" after you insert API KEY variable
# Full documentation here: https://www.eia.gov/opendata/documentation/APIv2.1.0.pdf

In [5]:
# Storing the API response into a variable
response = requests.get(url)
# Printing response would return the 200 code
json_data = response.json() # accessing the json formatted data
# json_data examines what the data hierarchy looks like within the "response" key, where the data we want is imbedded

# there are a total of 1770 records

In [6]:
 # json_data contains a lot of nested data, we need to specify what we want to access
records = json_data["response"]["data"] 
records # stored as a list of dictionaries, with each dictionary being one row of data

[{'period': '2025-08',
  'stateid': 'CA',
  'stateDescription': 'California',
  'sectorid': 'ALL',
  'sectorName': 'all sectors',
  'customers': '16206105',
  'price': '29.31',
  'revenue': '7030.66884',
  'sales': '23990.05275',
  'customers-units': 'number of customers',
  'price-units': 'cents per kilowatt-hour',
  'revenue-units': 'million dollars',
  'sales-units': 'million kilowatt hours'},
 {'period': '2025-08',
  'stateid': 'CA',
  'stateDescription': 'California',
  'sectorid': 'COM',
  'sectorName': 'commercial',
  'customers': '1777748',
  'price': '29.31',
  'revenue': '3211.75215',
  'sales': '10956.13653',
  'customers-units': 'number of customers',
  'price-units': 'cents per kilowatt-hour',
  'revenue-units': 'million dollars',
  'sales-units': 'million kilowatt hours'},
 {'period': '2025-08',
  'stateid': 'CA',
  'stateDescription': 'California',
  'sectorid': 'IND',
  'sectorName': 'industrial',
  'customers': '147263',
  'price': '24.69',
  'revenue': '1035.93428',
 

## Transform

In [7]:
rows = [] # creating an empty list that will hold a row for each item
column_names = ['period', 'stateid', 'sectorid', 'customers', 'price', 'revenue', 'sales']

In [8]:
# loop through each record in the API response for the fields that are relevant, excluded state and unit measure descriptions
for record in records:
        period = record["period"]
        stateid = record["stateid"]
        sectorid = record["sectorid"]
        customers = record["customers"]
        price = record["price"]
        revenue = record["revenue"]
        sales = record["sales"]

        # putting each row into a tuple (instead of list, for its immutability)
        data_tuple = (period, stateid, sectorid, customers, price, revenue, sales)

        # append each row tuple into the rows list
        rows.append(data_tuple)

In [9]:
# create pandas dataframe
df = pd.DataFrame(rows, columns=column_names)
df.head(7)
# null values are represented as None or NaN (not a number), with the latter having a mathematical significance to it. Source: https://www.geeksforgeeks.org/python/difference-between-nan-and-none-in-python/


# how to force data type and skip over rows with missing data?

Unnamed: 0,period,stateid,sectorid,customers,price,revenue,sales
0,2025-08,CA,ALL,16206105.0,29.31,7030.66884,23990.05275
1,2025-08,CA,COM,1777748.0,29.31,3211.75215,10956.13653
2,2025-08,CA,IND,147263.0,24.69,1035.93428,4196.41325
3,2025-08,CA,OTH,,,,
4,2025-08,CA,RES,14281081.0,31.58,2772.68749,8778.62439
5,2025-08,CA,TRA,13.0,17.49,10.29492,58.87859
6,2025-07,CA,ALL,16397241.0,30.04,6795.81309,22623.72628


In [10]:
df.info()
# why is Dtype all objects? Suspect there are misisng data causing python reader to cast entire column to object type. 
# Non-Null Count indicates that there are lots of missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1776 entries, 0 to 1775
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   period     1776 non-null   object
 1   stateid    1776 non-null   object
 2   sectorid   1776 non-null   object
 3   customers  1060 non-null   object
 4   price      1480 non-null   object
 5   revenue    1480 non-null   object
 6   sales      1480 non-null   object
dtypes: object(7)
memory usage: 97.2+ KB


### Detecting & Handling Nulls in Pandas Dataframe

In [11]:
# Pandas provides 2 functions to detect null values (NaN and None)
# isnull() and isna()

# using isnull(), where True represents missing data
nulls = df.isnull()
nulls.head(3)

# using isna() to see if there are more/less rows
nas = df.isna()
nas.head(3)

Unnamed: 0,period,stateid,sectorid,customers,price,revenue,sales
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False


In [12]:
# Dropping rows with at least one null value
df = df.dropna()


## Load into SQL

In [13]:
# loading in our variables from dotenv file for connecting to SQL  
MYSQL_HOST = os.getenv("MYSQL_HOST")
MYSQL_PORT = os.getenv("MYSQL_PORT")
MYSQL_USER = os.getenv("MYSQL_USER")
MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD")
MYSQL_DATABASE = os.getenv("MYSQL_DATABASE")

In [14]:
# making the connection
server_connection = connector.connect(
    host = MYSQL_HOST,
    port = MYSQL_PORT,
    user = MYSQL_USER,
    password = MYSQL_PASSWORD,
    connection_timeout = 10, # we want this process to stop executing after 10 seconds in the case of an error.
    autocommit = False,
    raise_on_warnings = True
)

# a cursor is an structure allowing us to traverse a result set. Allows python to communicate with the database 
server_cursor = server_connection.cursor()

In [15]:
# not sure why we need to close the connections. To simply test if we can connect to server? 
server_cursor.close()
server_connection.close()

In [16]:
# connecting to database
db_connection = connector.connect(
    host = MYSQL_HOST,
    port = MYSQL_PORT,
    user = MYSQL_USER,
    password = MYSQL_PASSWORD,
    database = MYSQL_DATABASE
)

cur = db_connection.cursor(buffered=True)

In [17]:
# checking if our table actually exists in mysql
sql_table = "e_sales"
# this method executes the given database query
a = cur.execute(f"SELECT * FROM {sql_table}") 
b = cur.fetchall()
print(b)
# printing a returns none, seems that cur.execute() will always return that

[]


In [18]:
if cur.fetchall() is None:
    raise SystemExit(f"This table {sql_table} is NOT found")
else:
    print(f"Success")

# seems to be giving None when I run fetchone because there is no existing data at first, then inserted some dummy data and ran again. Returned Success.

Success


In [None]:
# UPSERT = update and insert

table_cols = ['period', 'stateid', 'sectorid', 'customers', 'price', 'revenue', 'sales']

sales_df = df[table_cols]
print(sales_df)
# what is the purpose of this step? Didn't we already create a df earlier? 
# seems like it is a way to select custom columns we want from our original data frame

       period stateid sectorid customers  price     revenue        sales
0     2025-08      CA      ALL  16206105  29.31  7030.66884  23990.05275
1     2025-08      CA      COM   1777748  29.31  3211.75215  10956.13653
2     2025-08      CA      IND    147263  24.69  1035.93428   4196.41325
4     2025-08      CA      RES  14281081  31.58  2772.68749   8778.62439
5     2025-08      CA      TRA        13  17.49    10.29492     58.87859
...       ...     ...      ...       ...    ...         ...          ...
1266  2008-01      CA      ALL  15256818  11.55     2612.35  22619.92118
1267  2008-01      CA      COM   1868985  10.71  1075.50296     10043.05
1268  2008-01      CA      IND     76967   9.01   353.72745   3925.52432
1270  2008-01      CA      RES  13310410  13.73  1177.33679   8576.32181
1271  2008-01      CA      TRA       456   7.71      5.7828     75.02505

[1060 rows x 7 columns]


In [None]:
# extract rows as tuples from the df
sales_tuples = sales_df.itertuples(index=False, name=None)

list_of_sales_tuples = list(sales_tuples)
# not sure what this step does, tuple format makes it easier for MySQL to read into database table?


In [23]:
UPSERT_SQL = f"""
INSERT INTO {sql_table}
(period, stateid, sectorid, customers, price, revenue, sales)
VALUES(%s, %s, %s, %s, %s, %s, %s) AS src
ON DUPLICATE KEY UPDATE
period = src.period,
stateid = src.stateid,
sectorid = src.sectorid,
customers = src.customers,
price = src.price,
revenue = src.revenue,
sales = src.sales;
"""
# why did he put triple quotes for f string? 
# ON DUPLICATE KEY UPDATE helps prevent duplicate rows, just update existing row.


In [25]:
# return how many rows we uploaded to MySQL
rows_uploaded = len(list_of_sales_tuples)

try: 
    cur.executemany(UPSERT_SQL, list_of_sales_tuples)
    db_connection.commit()
    print(f"Success! {rows_uploaded} uploaded.")
except Exception as e:
    db_connection.rollback()
    print(f"Error! Rollback due to {e}")
finally:
    cur.close()
    db_connection.close()
    print("All database connects closed. Clean up completed.")


Success! 1060 uploaded.
All database connects closed. Clean up completed.


### Above code chunk initially gave: Error! Rollback due to 1048 (23000): Column 'customers' cannot be null All database connects closed. Clean up completed."

potential reasons/solutions:
- make our SQL table to accept nulls
- see what our data frame currently looks like
- make sure data types are changed in our initial df table, understand how this affects later steps (what variables are using df)
- look at other articles on how to ETL, consult more sources, understand the process


Article: 
- https://www.getgalaxy.io/learn/common-errors/mysql-error-1048-column-cannot-be-null--causes-fixes
- https://levelup.gitconnected.com/rest-api-to-mysql-database-using-python-eb7b4606d6c3



## Questions:
- Did all the non null rows get loaded in? Do a before and after check.
- What if we wanted to load null values into mysql? What to do after defining schema to accept nulls? Do we need to tag null values with "\n" like in LOAD DATA INFILE()? 

## Next steps:
- read more documentations on how to get pandas DF to Mysql. Common ETL problems and approaches. 