In [1]:
# import useful libraries
import pandas as pd
import numpy as np
import reverse_geocoder as rg
import os

# enable copy on write (default in pandas 3.0)
pd.options.mode.copy_on_write = True

### Read weather function

In [2]:
def read_weather(year, event_type):
    files = os.listdir('../weather_data/'+event_type)
    file_name = [fname for fname in files if str(year) in fname]
    if len(file_name) == 0:
        print("No file in that year for that event type")
        return 
    if len(file_name) > 1:
        print("Multiple files with that year in their name")
        return
    if event_type == 'lightning':
        return pd.read_csv(r'../weather_data/'+event_type+'/'
                  + file_name[0], skiprows=2, parse_dates=['#ZDAY'])
    return pd.read_csv(r'../weather_data/'+event_type+'/'
                  + file_name[0], skiprows=2, parse_dates=['#ZTIME'])

In [19]:
power_2023.sample(2)

Unnamed: 0,Event Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
88,March,03/14/2023,08:00:00,03/16/2023,08:20:00,Connecticut: Massachusetts: Vermont: Rhode Isl...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,83000
36,February,02/02/2023,08:15:00,02/02/2023,19:45:00,Arkansas: Mississippi: Texas:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,74426


In [26]:
# Read the files
power_2023 = pd.read_excel('../power_data/2023_Annual_Summary.xls', skiprows=1)
weather_2023 = pd.read_csv('../weather_data/cleaned/tvs-2023.csv', parse_dates=['DATE'])
weather_2023.sample(5)



Unnamed: 0.1,Unnamed: 0,DATE,WSR_ID,CELL_ID,LAT_mean,LAT_min,LAT_max,LON_mean,LON_min,LON_max,AVGDV_max,LLDV_max,MXDV_max,MXDV_HEIGHT_max,DEPTH_max,MAX_SHEAR_max,MAX_SHEAR_HEIGHT_max
23401,23401,2023-07-21,KCMH,T5,39.45847,39.45847,39.45847,-82.22124,-82.22124,-82.22124,34,80,80,5,25.6,33,30.8
15651,15651,2023-06-16,KDFW,B3,33.76506,33.76506,33.76506,-96.45274,-96.45274,-96.45274,49,64,69,10,6.8,23,10.4
34857,34857,2023-10-05,KIAH,G0,30.747696,30.69505,30.78501,-95.214258,-95.2573,-95.15984,70,94,96,10,7.7,33,9.6
21049,21049,2023-07-06,KMEM,Y4,34.25779,34.22448,34.2911,-90.366485,-90.38023,-90.35274,49,80,84,21,28.3,33,31.0
31102,31102,2023-08-29,KRDU,F0,35.715976,35.62758,35.8212,-79.313397,-79.38802,-79.25859,67,91,94,25,23.5,48,18.0


### Finding the county of the lightning event using reverse_geocoder

In [27]:
locations = list(zip(weather_2023['LAT_mean'], weather_2023['LON_mean']))
weather_2023['location'] = locations

# then get the county (admin2) into a new column
address = rg.search(locations)
weather_2023['county'] = [x['admin2'] for x in address]
weather_2023['state'] = [x['admin1'] for x in address]

In [28]:
# Filter power outages by severe weather only
sev_weather_power_2023 = power_2023[power_2023['Event Type'].str.contains(r'Severe Weather', regex=True)]
# convert to string datatype
sev_weather_power_2023['Date of Restoration'] = sev_weather_power_2023['Date of Restoration'].astype(str)
sev_weather_power_2023['Area Affected'] = sev_weather_power_2023['Area Affected'].astype(str)
weather_2023['county'] = weather_2023['county'].astype(str)
weather_2023['state'] = weather_2023['state'].astype(str)

In [15]:
# drop rows when 'county' column is empty, and lat lon is outside US range
# first we create a mask for nonempty county and continental US range lat 24 to 50 and lon -125 to -66
lightning_mask = (weather_2023['county']!='') & (weather_2023['LAT_mean']>=24) & (weather_2023['LAT_mean']<=50) & (weather_2023['LON_mean']>=-125) &(weather_2023['LON_mean']<=-66)
weather_2023 = weather_2023[lightning_mask]
weather_2023.sample(5)


Unnamed: 0.1,Unnamed: 0,DATE,WSR_ID,CELL_ID,LAT_mean,LAT_min,LAT_max,LON_mean,LON_min,LON_max,AVGDV_max,LLDV_max,MXDV_max,MXDV_HEIGHT_max,DEPTH_max,MAX_SHEAR_max,MAX_SHEAR_HEIGHT_max,location,county,state
33091,33091,2023-09-19,KPBI,X1,26.06658,26.06658,26.06658,-80.59374,-80.59374,-80.59374,35,60,70,10,24.4,28,10.0,"(26.06658, -80.59374)",Broward County,Florida
6479,6479,2023-04-01,KIND,V7,39.16935,39.09853,39.20159,-86.638008,-86.7839,-86.39542,47,77,85,6,7.8,43,6.3,"(39.16935, -86.638008)",Monroe County,Indiana
36562,36562,2023-12-24,KIAH,Y0,30.454105,29.68605,30.6514,-95.845188,-96.1438,-94.75594,67,93,104,9,7.5,39,8.6,"(30.454105, -95.84518833333334)",Grimes County,Texas
4031,4031,2023-03-04,KPBZ,U8,40.66198,40.66198,40.66198,-80.25742,-80.25742,-80.25742,36,50,57,7,5.1,95,1.7,"(40.66198, -80.25742)",Beaver County,Pennsylvania
3426,3426,2023-03-03,KATL,U1,33.55005,33.55005,33.55005,-85.17265,-85.17265,-85.17265,34,54,92,26,23.5,31,26.3,"(33.55005, -85.17265)",Carroll County,Georgia


In [34]:
# when Date of Restoration is Unknown, copy Date of Event
sev_weather_power_2023.loc[sev_weather_power_2023['Date of Restoration'] == 'Unknown', 'Date of Restoration'] = sev_weather_power_2023.loc[sev_weather_power_2023['Date of Restoration'] == 'Unknown', 'Date Event Began']
power_2023 = sev_weather_power_2023 
power_2023['Date Event Began'] = pd.to_datetime(power_2023['Date Event Began'])
power_2023['Date of Restoration'] = pd.to_datetime(power_2023['Date of Restoration'])
power_2023_filtered = power_2023[power_2023['Date Event Began'].isin(weather_2023['DATE'])]
power_2023_filtered.sample(5)

Unnamed: 0,Event Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
61,February,2023-02-25,00:08:00,2023-02-26,18:08:00,California: Los Angeles County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,153555
89,March,2023-03-14,09:25:00,2023-03-15,15:00:00,New York:,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,Unknown
5,January,2023-01-02,18:22:00,2023-01-02,22:27:00,Arkansas: Union County; Louisiana: Ouachita Pa...,SERC,"Unexpected Transmission loss within its area, ...",Severe Weather,20,6849
146,May,2023-05-22,15:39:00,2023-05-22,18:00:00,Florida: Putnam County;,SERC,"Unexpected Transmission loss within its area, ...",Severe Weather,0,0
133,May,2023-05-01,05:16:00,2023-05-01,09:31:00,Connecticut: Massachusetts: Rhode Island: Main...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,54000


In [43]:
county = pd.read_csv("../extras/uscounties.csv", index_col=0)
county['county'] = county['county'].astype(str)
power_2023_filtered["county_info_area_affected"] = power_2023_filtered.apply(lambda row: any(counti in row['Area Affected'] for counti in county['county']) , axis=1)
left_merged = pd.merge(weather_2023, power_2023_filtered, how='left', left_on='DATE', right_on='Date Event Began', indicator=True)
filtered_merge = left_merged[left_merged.apply(lambda row: (row['_merge'] == 'both') and(str(row['county']) in str(row['Area Affected']) and str(row['state']) in str(row['Area Affected'])) or ((str(row['state']) in str(row['Area Affected']))and (not row['county_info_area_affected'])), axis=1)]
left_merged['power_outage'] = left_merged.apply(lambda row: (row['_merge'] == 'both') and(str(row['county']) in str(row['Area Affected']) and str(row['state']) in str(row['Area Affected'])) or ((str(row['state']) in str(row['Area Affected']))and (not row['county_info_area_affected'])), axis=1)
columns_to_drop = ['Unnamed: 0', 'WSR_ID', 'CELL_ID', 'LAT_mean', 'LAT_min', 'LAT_max', 'LON_mean', 'LON_min', 'LON_max', 'Date Event Began', 'Time Event Began', 'Date of Restoration', 'Time of Restoration', 'Area Affected',
 'NERC Region','Alert Criteria', 'Event Type', 'Demand Loss (MW)', 'Number of Customers Affected', 'county_info_area_affected', '_merge']

weather_outage = left_merged.copy().drop(columns = columns_to_drop)



In [45]:
weather_outage['power_outage'].value_counts(normalize=True)

False    0.975966
True     0.024034
Name: power_outage, dtype: float64

In [44]:
weather_outage

Unnamed: 0,DATE,AVGDV_max,LLDV_max,MXDV_max,MXDV_HEIGHT_max,DEPTH_max,MAX_SHEAR_max,MAX_SHEAR_HEIGHT_max,location,county,state,Event Month,power_outage
0,2023-01-01,36,49,56,7,6.5,29,6.6,"(32.39305, -110.68147)",Pima County,Arizona,,False
1,2023-01-01,52,96,96,2,5.9,86,1.8,"(32.18141, -110.52664)",Cochise County,Arizona,,False
2,2023-01-01,31,50,50,4,9.1,17,4.2,"(34.34357, -117.82177)",San Bernardino County,California,,False
3,2023-01-01,35,62,62,6,8.3,21,6.4,"(34.29869, -117.62836)",San Bernardino County,California,,False
4,2023-01-01,39,52,52,4,5.0,24,2.6,"(35.0269, -118.24596)",Kern County,California,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40147,2023-12-30,46,57,67,5,5.0,60,4.8,"(39.736655, -121.76278)",Butte County,California,,False
40148,2023-12-30,51,83,83,3,12.6,108,3.2,"(39.48597166666666, -121.55395833333334)",Butte County,California,,False
40149,2023-12-30,84,123,126,11,9.1,64,11.4,"(35.842000000000006, -90.09149333333332)",Mississippi County,Arkansas,,False
40150,2023-12-30,41,61,73,7,8.2,39,6.5,"(34.76809, -119.80920333333331)",Santa Barbara County,California,,False
