In [1]:
# install reverse-geocoder https://pypi.org/project/reverse_geocoder/
!pip install reverse-geocoder



In [2]:
import pandas as pd
import numpy as np
import reverse_geocoder as rg

In [3]:
# Load the 2019 annual outage summary. The first spreadsheet row is skipped
# so that the following row supplies the column names.
power_2019 = pd.read_excel(
    '../power_data/2019_Annual_Summary.xls',
    skiprows=1,
)



In [4]:
# Preview the first five rows of the raw outage table.
power_2019.head(n=5)

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
0,January,01/05/2019,13:19:00,01/05/2019,15:07:00,Washington:,WECC,Complete loss of Interpersonal Communication a...,System Operations,0,Unknown
1,January,01/06/2019,03:00:00,01/09/2019,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
2,January,01/06/2019,17:56:00,01/06/2019,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,January,01/06/2019,01:00:00,01/06/2019,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
4,January,01/07/2019,20:57:00,01/07/2019,21:32:00,Michigan:,RF,Complete loss of monitoring or control capabil...,System Operations,0,0


In [5]:
# Load the 2019 lightning tiles. The first two lines of the file are skipped
# so that the '#ZDAY,...' line is read as the header.
lightning_2019 = pd.read_csv(
    '../weather_data/lightning/tvs-tiles-2019.csv',
    skiprows=2,
)

In [6]:
# Preview the first five lightning tiles (day, tile centre, strike count).
lightning_2019.head(n=5)

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT
0,20190101,-97.6,27.0,1
1,20190101,-152.3,60.9,3
2,20190101,-73.2,43.0,1
3,20190101,-135.7,57.0,1
4,20190101,-86.2,34.9,1


## Reverse Geocoding

Reverse geocoding converts a latitude/longitude pair into a place description (here: a county and a state).

We use the [reverse_geocoder](https://github.com/thampiman/reverse-geocoder) library,
which returns the county (`admin2`) and the state (`admin1`) for each coordinate.


In [7]:
# Pair every tile centre as (lat, lon) -- the order rg.search is given --
# and keep the pairs on the frame as a 'location' column.
locations = list(zip(lightning_2019['CENTERLAT'], lightning_2019['CENTERLON']))
lightning_2019['location'] = locations

# One reverse-geocoder lookup for all tiles; each result is a mapping from
# which we take 'admin2' (county) and 'admin1' (state).
address = rg.search(locations)

counties, states = [], []
for match in address:
    counties.append(match['admin2'])
    states.append(match['admin1'])

lightning_2019['county'] = counties
lightning_2019['state'] = states

Loading formatted geocoded file...


In [8]:
# Preview the tiles with their new location / county / state columns.
lightning_2019.head(n=5)

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state
0,20190101,-97.6,27.0,1,"(27.0, -97.6)",Kenedy County,Texas
1,20190101,-152.3,60.9,3,"(60.9, -152.3)",Kenai Peninsula Borough,Alaska
2,20190101,-73.2,43.0,1,"(43.0, -73.2)",Bennington County,Vermont
3,20190101,-135.7,57.0,1,"(57.0, -135.7)",Sitka City and Borough,Alaska
4,20190101,-86.2,34.9,1,"(34.9, -86.2)",Madison County,Alabama


## Filtering only Severe Weather

In [9]:
# Keep every outage whose Event Type mentions "Severe Weather" (this also
# keeps combined types such as "Severe Weather/Transmission Interruption").
# regex=False: the pattern is a plain literal, no regex engine needed.
# na=False: a missing Event Type counts as "no match" instead of putting NaN
# into the mask, which boolean indexing would reject.
is_severe_weather = power_2019['Event Type'].str.contains(
    'Severe Weather', regex=False, na=False
)
sev_weather_power_2019 = power_2019[is_severe_weather]

In [10]:
# Preview the first five severe-weather outages.
sev_weather_power_2019.head(n=5)

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
1,January,01/06/2019,03:00:00,01/09/2019,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
2,January,01/06/2019,17:56:00,01/06/2019,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,January,01/06/2019,01:00:00,01/06/2019,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
8,January,01/12/2019,11:30:00,01/13/2019,22:00:00,Missouri: Jackson County; Kansas: Johnson County;,SPP RE,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,112530
9,January,01/12/2019,11:30:00,Unknown,Unknown,Missouri: Nebraska:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,116600


## Clean the Unknown Entries from Dates


In [11]:
# Cast the columns used for text comparison to str.
# `astype` returns a new object and never modifies its input, so the result
# must be assigned back -- calling it as a bare expression has no effect.
# Building new frames (instead of assigning into columns) also avoids
# writing into `sev_weather_power_2019`, which is a slice of `power_2019`.
sev_weather_power_2019 = sev_weather_power_2019.astype(
    {'Date of Restoration': 'str', 'Area Affected': 'str'}
)
lightning_2019 = lightning_2019.astype({'county': 'str', 'state': 'str'})

# Displayed as the cell result.
lightning_2019['state']

0             Texas
1            Alaska
2           Vermont
3            Alaska
4           Alabama
            ...    
56890    Washington
56891    Washington
56892    Washington
56893    Washington
56894        Oregon
Name: state, Length: 56895, dtype: object

In [12]:
# Work on an independent copy so the filtered frame above stays untouched.
sev_weather_power_2019_copy = sev_weather_power_2019.copy()

# Where the restoration date is the literal 'Unknown', fall back to the date
# the event began. Only the date column is replaced here; 'Time of
# Restoration' is left as it is.
restoration_unknown = sev_weather_power_2019_copy['Date of Restoration'] == 'Unknown'
sev_weather_power_2019_copy.loc[restoration_unknown, 'Date of Restoration'] = (
    sev_weather_power_2019_copy.loc[restoration_unknown, 'Date Event Began']
)

In [13]:
# Preview the outages after the 'Unknown' restoration dates were replaced.
sev_weather_power_2019_copy.head(n=5)

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
1,January,01/06/2019,03:00:00,01/09/2019,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
2,January,01/06/2019,17:56:00,01/06/2019,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,January,01/06/2019,01:00:00,01/06/2019,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
8,January,01/12/2019,11:30:00,01/13/2019,22:00:00,Missouri: Jackson County; Kansas: Johnson County;,SPP RE,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,112530
9,January,01/12/2019,11:30:00,01/12/2019,Unknown,Missouri: Nebraska:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,116600


## Rename for easy naming

In [14]:
# Short aliases on independent copies:
#   df1 -> geocoded lightning tiles
#   df2 -> severe-weather outages with restoration dates filled in
df1, df2 = lightning_2019.copy(), sev_weather_power_2019_copy.copy()

In [15]:
# Parse the date columns into datetimes so lightning days and outage dates
# can be compared with each other.
# Lightning days are written as YYYYMMDD.
df1['#ZDAY'] = pd.to_datetime(df1['#ZDAY'], format='%Y%m%d')

# Both outage date columns are written as MM/DD/YYYY.
for date_col in ('Date Event Began', 'Date of Restoration'):
    df2[date_col] = pd.to_datetime(df2[date_col], format='%m/%d/%Y')

In [16]:
# Keep an outage only when BOTH of its dates appear among the lightning days:
# the start day and the restoration day must each equal some '#ZDAY' value.
# This is an exact-day membership test; days strictly between the two dates
# are not examined.
lightning_days = df1['#ZDAY']
began_on_lightning_day = df2['Date Event Began'].isin(lightning_days)
restored_on_lightning_day = df2['Date of Restoration'].isin(lightning_days)
df2_filtered = df2[began_on_lightning_day & restored_on_lightning_day]

In [17]:
# Show the outages that passed the date filter (pandas elides the middle rows of a long frame).
df2_filtered

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
1,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
2,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,January,2019-01-06,01:00:00,2019-01-06,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
8,January,2019-01-12,11:30:00,2019-01-13,22:00:00,Missouri: Jackson County; Kansas: Johnson County;,SPP RE,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,112530
9,January,2019-01-12,11:30:00,2019-01-12,Unknown,Missouri: Nebraska:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,116600
...,...,...,...,...,...,...,...,...,...,...,...
246,November,2019-11-01,01:15:00,2019-11-02,21:30:00,Connecticut: Maine: Massachusetts: Rhode Islan...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,80066
247,November,2019-11-01,02:41:00,2019-11-01,Unknown,New York: Broome County;,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,66325
255,November,2019-11-20,09:49:00,2019-11-20,15:20:00,"California: Colusa County, Lake County, Mendoc...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather/Transmission Interruption,178,54000
259,November,2019-11-26,18:07:00,2019-11-27,12:27:00,California:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,93000


In [18]:
df2_filtered_copy = df2_filtered.copy()

# Distinct, non-empty county and state names that the geocoder assigned to
# the lightning tiles.
#  - Deduplicated once up front, so each outage is compared against the set
#    of distinct names instead of one value per lightning row.
#  - Blank names are dropped: '' is a substring of every string, so a single
#    blank name would flag every outage as a match.
#  - State names are included so that outages whose 'Area Affected' lists
#    only states (e.g. "Washington:") are still carried into the merge, where
#    the later county-or-state filter decides on them.
lightning_places = {
    place
    for place in pd.concat([df1['county'], df1['state']]).unique()
    if isinstance(place, str) and place
}


def _mentions_lightning_place(area):
    """Return True when `area` is text containing at least one known county or state name."""
    return isinstance(area, str) and any(place in area for place in lightning_places)


# The column keeps its original name because the merge cell selects on it and
# then drops it.
df2_filtered_copy['is_county_in_area'] = (
    df2_filtered_copy['Area Affected'].map(_mentions_lightning_place).astype(bool)
)

In [19]:
# Take only the outages flagged by 'is_county_in_area', then pair every
# lightning tile with every such outage that began on the tile's day
# (inner join on '#ZDAY' == 'Date Event Began'). The helper flag has done its
# job after the selection, so it is removed from the result.
flagged_outages = df2_filtered_copy[df2_filtered_copy['is_county_in_area']]
merged_df = df1.merge(
    flagged_outages,
    how='inner',
    left_on='#ZDAY',
    right_on='Date Event Began',
).drop(columns='is_county_in_area')


In [21]:
# Preview the first five tile/outage pairs produced by the date join.
merged_df.head(n=5)

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
0,2019-01-06,-120.0,35.0,1,"(35.0, -120.0)",Santa Barbara County,California,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
1,2019-01-06,-120.0,35.0,1,"(35.0, -120.0)",Santa Barbara County,California,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
2,2019-01-06,-120.0,35.0,1,"(35.0, -120.0)",Santa Barbara County,California,January,2019-01-06,01:00:00,2019-01-06,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
3,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
4,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382


## The above merging is not perfectly accurate.

Therefore, we further filter the merged dataframe to keep only the rows where the lightning tile's county, or its state, appears in `Area Affected`.

I do not know of a better way: matching on the state alone can succeed even when the counties do not match, but requiring both ("and" instead of "or") fails because some `Area Affected` entries contain no county information at all.

In [22]:
def _place_in_area(area, county, state):
    """Return True when the tile's county or state name occurs in the outage's area text.

    Blank or non-string names never match: '' is a substring of every string,
    so a tile without a geocoded county would otherwise match every outage.
    A non-string `area` (e.g. a missing value) matches nothing.
    """
    if not isinstance(area, str):
        return False
    return any(
        isinstance(name, str) and bool(name) and name in area
        for name in (county, state)
    )


# Same county-OR-state rule as before, evaluated column-wise instead of with a
# row-wise apply.
# NOTE(review): a bare county name can also occur under a different state in
# 'Area Affected' (several states share county names) -- such rows would
# still pass this filter; verify before relying on county-level matches.
keep_row = pd.Series(
    [
        _place_in_area(area, county, state)
        for area, county, state in zip(
            merged_df['Area Affected'], merged_df['county'], merged_df['state']
        )
    ],
    index=merged_df.index,
    dtype=bool,
)
filtered_df = merged_df[keep_row]

In [23]:
# Show the tile/outage pairs that passed the county-or-state filter (pandas elides the middle rows of a long frame).
filtered_df

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
1,2019-01-06,-120.0,35.0,1,"(35.0, -120.0)",Santa Barbara County,California,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
5,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,January,2019-01-06,01:00:00,2019-01-06,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
6,2019-01-06,-118.2,46.0,1,"(46.0, -118.2)",Walla Walla County,Washington,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
8,2019-01-06,-118.2,46.0,1,"(46.0, -118.2)",Walla Walla County,Washington,January,2019-01-06,01:00:00,2019-01-06,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24898,2019-11-27,-90.1,38.3,1,"(38.3, -90.1)",Monroe County,Illinois,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000
24905,2019-11-27,-90.2,38.4,1,"(38.4, -90.2)",Monroe County,Illinois,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000
24907,2019-11-27,-88.7,38.4,1,"(38.4, -88.7)",Wayne County,Illinois,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000
24943,2019-11-27,-86.9,42.1,1,"(42.1, -86.9)",Berrien County,Michigan,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000


In [24]:
# Write the matched tile/outage pairs to 'merged_2019.csv' (path is relative
# to the notebook's working directory).
filtered_df.to_csv(path_or_buf='merged_2019.csv')