In [41]:
# Fetch the API key for EIA energy data
import os
import pandas as pd
from dotenv import load_dotenv
import requests
# Let python-dotenv populate the process environment, then read the EIA key
# and stop immediately if it is missing or empty.
load_dotenv()
EIA_API_KEY = os.environ.get("EIA_API_KEY")
if EIA_API_KEY is None or EIA_API_KEY == "":
    raise ValueError("EIA_API_KEY is not set in the environment variables.")


In [42]:
import requests
import pandas as pd
import time

# Download every hourly solar (SUN) and wind (WND) record for respondent TEX
# from the EIA v2 fuel-type-data endpoint, paging through the result set
# LIMIT rows at a time, and collect all rows into the DataFrame `df`.
api_url = f"https://api.eia.gov/v2/electricity/rto/fuel-type-data/data/?api_key={EIA_API_KEY}"

LIMIT = 5000  # rows requested per page ("length" in the payload)
# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection and the cell never finishes.
REQUEST_TIMEOUT = (10, 120)
offset = 0

all_data = []

while True:
    payload = {
        "frequency": "hourly",
        "data": ["value"],
        "facets": {
            "fueltype": ["SUN", "WND"],
            "respondent": ["TEX"]
        },
        "sort": [{"column": "period", "direction": "desc"}],
        "offset": offset,
        "length": LIMIT
    }

    response = requests.post(api_url, json=payload, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    data = response.json()

    records = data.get("response", {}).get("data", [])
    if not records:
        break  # nothing returned at this offset: the result set is exhausted

    all_data.extend(records)
    print(f"Fetched {len(records)} records at offset {offset}")

    if len(records) < LIMIT:
        break  # last page

    offset += LIMIT
    time.sleep(5)  # be polite to the API

# Build the frame once from the accumulated list of record dicts.
df = pd.DataFrame(all_data)
print(f"Total records fetched: {len(df)}")
print(df.head())


Fetched 5000 records at offset 0
Fetched 5000 records at offset 5000
Fetched 5000 records at offset 10000
Fetched 5000 records at offset 15000
Fetched 5000 records at offset 20000
Fetched 5000 records at offset 25000
Fetched 5000 records at offset 30000
Fetched 5000 records at offset 35000
Fetched 5000 records at offset 40000
Fetched 5000 records at offset 45000
Fetched 5000 records at offset 50000
Fetched 5000 records at offset 55000
Fetched 5000 records at offset 60000
Fetched 5000 records at offset 65000
Fetched 5000 records at offset 70000
Fetched 5000 records at offset 75000
Fetched 5000 records at offset 80000
Fetched 5000 records at offset 85000
Fetched 5000 records at offset 90000
Fetched 5000 records at offset 95000
Fetched 5000 records at offset 100000
Fetched 5000 records at offset 105000
Fetched 4250 records at offset 110000
Total records fetched: 114250
          period respondent respondent-name fueltype type-name  value  \
0  2025-07-08T04        TEX           Texas     

In [50]:
# Summarise the freshly fetched frame: column names, non-null counts, dtypes
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114250 entries, 0 to 114249
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   period           114250 non-null  object
 1   respondent       114250 non-null  object
 2   respondent-name  114250 non-null  object
 3   fueltype         114250 non-null  object
 4   type-name        114250 non-null  object
 5   value            114250 non-null  object
 6   value-units      114250 non-null  object
dtypes: object(7)
memory usage: 6.1+ MB


In [51]:
# Save the unmodified API result as <root>/data/raw/eia_hourly_texas_raw.csv,
# where <root> is two directory levels above the current working directory.
# The path is assembled with os.path rather than by splitting on "\\", so the
# cell resolves the same folder on Windows and also works on other platforms.
raw_data_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "data", "raw")
os.makedirs(raw_data_dir, exist_ok=True)  # to_csv does not create missing folders
df.to_csv(os.path.join(raw_data_dir, "eia_hourly_texas_raw.csv"), index=False)

In [52]:
# Work on an independent deep copy so the fetched frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [53]:
# Order the rows newest-first by their 'period' value, then preview the top.
df_copy.sort_values('period', ascending=False, inplace=True)
df_copy.head(5)

Unnamed: 0,period,respondent,respondent-name,fueltype,type-name,value,value-units
0,2025-07-08T04,TEX,Texas,SUN,Solar,0,megawatthours
1,2025-07-08T04,TEX,Texas,WND,Wind,11946,megawatthours
2,2025-07-08T03,TEX,Texas,SUN,Solar,0,megawatthours
3,2025-07-08T03,TEX,Texas,WND,Wind,10473,megawatthours
4,2025-07-08T02,TEX,Texas,SUN,Solar,941,megawatthours


In [54]:
# Remove repeated rows, keeping the first occurrence. Two rows count as
# duplicates when all four key columns below match.
dedup_keys = ['period', 'fueltype', 'respondent', 'value']
df_copy.drop_duplicates(subset=dedup_keys, keep='first', inplace=True)

In [55]:
# Re-inspect the working copy (row count, dtypes) after sorting and
# de-duplication.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114250 entries, 0 to 114249
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   period           114250 non-null  object
 1   respondent       114250 non-null  object
 2   respondent-name  114250 non-null  object
 3   fueltype         114250 non-null  object
 4   type-name        114250 non-null  object
 5   value            114250 non-null  object
 6   value-units      114250 non-null  object
dtypes: object(7)
memory usage: 7.0+ MB


In [None]:
# Add 'operationalStatus', derived from 'value': a reading of zero is labelled
# 'offline', anything else 'online'.
# The comparison is numeric so the labels come out the same whether 'value'
# holds strings such as '0' (what the original string comparison assumed) or
# real numbers (e.g. after the CSV has been read back). Entries that cannot be
# parsed become NaN and are labelled 'online', as before.
value_numeric = pd.to_numeric(df_copy['value'], errors='coerce')
df_copy['operationalStatus'] = value_numeric.eq(0).map({True: 'offline', False: 'online'})

In [59]:
# Preview the first rows to check the 'operationalStatus' column.
df_copy.head()

Unnamed: 0,period,respondent,respondent-name,fueltype,type-name,value,value-units,operationalStatus
0,2025-07-08T04,TEX,Texas,SUN,Solar,0,megawatthours,offline
1,2025-07-08T04,TEX,Texas,WND,Wind,11946,megawatthours,online
2,2025-07-08T03,TEX,Texas,SUN,Solar,0,megawatthours,offline
3,2025-07-08T03,TEX,Texas,WND,Wind,10473,megawatthours,online
4,2025-07-08T02,TEX,Texas,SUN,Solar,941,megawatthours,online


In [61]:
# Parse 'period' into real timestamps, then break it out into separate
# calendar columns (year, month, day, hour) in that order.
df_copy['period'] = pd.to_datetime(df_copy['period'], format='mixed')
period_parts = df_copy['period'].dt
for part in ('year', 'month', 'day', 'hour'):
    df_copy[part] = getattr(period_parts, part)
df_copy.head()

Unnamed: 0,period,respondent,respondent-name,fueltype,type-name,value,value-units,operationalStatus,year,month,day,hour
0,2025-07-08 04:00:00,TEX,Texas,SUN,Solar,0,megawatthours,offline,2025,7,8,4
1,2025-07-08 04:00:00,TEX,Texas,WND,Wind,11946,megawatthours,online,2025,7,8,4
2,2025-07-08 03:00:00,TEX,Texas,SUN,Solar,0,megawatthours,offline,2025,7,8,3
3,2025-07-08 03:00:00,TEX,Texas,WND,Wind,10473,megawatthours,online,2025,7,8,3
4,2025-07-08 02:00:00,TEX,Texas,SUN,Solar,941,megawatthours,online,2025,7,8,2


In [62]:
# Discard the descriptive label columns. errors='ignore' lets the cell be
# re-run after the columns are already gone.
columns_to_drop = ['respondent-name', 'type-name']
df_copy.drop(labels=columns_to_drop, axis='columns', errors='ignore', inplace=True)

In [65]:
# Swap the respondent code 'TEX' for 'TX' (exact-value match only), then
# preview the result.
respondent_codes = df_copy['respondent']
df_copy['respondent'] = respondent_codes.replace('TEX', 'TX')
df_copy.head()

Unnamed: 0,period,respondent,fueltype,value,value-units,operationalStatus,year,month,day,hour
0,2025-07-08 04:00:00,TX,SUN,0,megawatthours,offline,2025,7,8,4
1,2025-07-08 04:00:00,TX,WND,11946,megawatthours,online,2025,7,8,4
2,2025-07-08 03:00:00,TX,SUN,0,megawatthours,offline,2025,7,8,3
3,2025-07-08 03:00:00,TX,WND,10473,megawatthours,online,2025,7,8,3
4,2025-07-08 02:00:00,TX,SUN,941,megawatthours,online,2025,7,8,2


In [67]:
# Save the cleaned frame as <root>/data/raw/eia_hourly_texas_filtered.csv,
# where <root> is two directory levels above the current working directory.
# The path is assembled with os.path rather than by splitting on "\\", so the
# cell resolves the same folder on Windows and also works on other platforms.
# NOTE(review): the filtered file is written into the "raw" folder, exactly as
# the original cell did; confirm that this is the intended location.
filtered_data_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "data", "raw")
os.makedirs(filtered_data_dir, exist_ok=True)  # to_csv does not create missing folders
df_copy.to_csv(os.path.join(filtered_data_dir, "eia_hourly_texas_filtered.csv"), index=False)