In [4]:
# install reverse-geocoder https://pypi.org/project/reverse_geocoder/
!pip install reverse-geocoder



In [5]:
# import useful libraries
import pandas as pd
import numpy as np
import reverse_geocoder as rg

# Opt in to copy-on-write semantics (the default behaviour from pandas 3.0 onwards)
pd.set_option("mode.copy_on_write", True)

In [7]:
# Power outage records for 2019.
# skiprows=1 skips the first row of the sheet, so the column names are taken
# from the following row.
power_2019 = pd.read_excel(
    '../power_data/2019_Annual_Summary.xls',
    skiprows=1,
)



In [8]:
# Lightning tile counts for 2019.
# skiprows=2 skips the first two lines of the file, so the column names are
# taken from the third line.
lightning_2019 = pd.read_csv(
    '../weather_data/lightning/tvs-tiles-2019.csv',
    skiprows=2,
)

In [10]:
# preview the first five lightning records
lightning_2019.head(5)

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT
0,20190101,-97.6,27.0,1
1,20190101,-152.3,60.9,3
2,20190101,-73.2,43.0,1
3,20190101,-135.7,57.0,1
4,20190101,-86.2,34.9,1


## Reverse Geocoding

Reverse geocoding means recovering an address (here: the county and state) from a latitude/longitude pair.

We use the [reverse_geocoder](https://github.com/thampiman/reverse-geocoder) library.
It also gives us the county (admin 2) and state (admin 1) of each point.


In [11]:
# Pair each tile's latitude with its longitude. The (lat, lon) tuples are stored in a
# new 'location' column and are also what is handed to the reverse geocoder.
locations = list(
    zip(lightning_2019['CENTERLAT'], lightning_2019['CENTERLON'])
)
lightning_2019['location'] = locations

# A single lookup call for all points; from every result we keep
# 'admin2' (county) and 'admin1' (state) as new columns.
address = rg.search(locations)
lightning_2019['county'] = [hit['admin2'] for hit in address]
lightning_2019['state'] = [hit['admin1'] for hit in address]

Loading formatted geocoded file...


In [12]:
# Inspect the lightning data now that the 'location', 'county' and 'state'
# columns have been added
lightning_2019

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state
0,20190101,-97.6,27.0,1,"(27.0, -97.6)",Kenedy County,Texas
1,20190101,-152.3,60.9,3,"(60.9, -152.3)",Kenai Peninsula Borough,Alaska
2,20190101,-73.2,43.0,1,"(43.0, -73.2)",Bennington County,Vermont
3,20190101,-135.7,57.0,1,"(57.0, -135.7)",Sitka City and Borough,Alaska
4,20190101,-86.2,34.9,1,"(34.9, -86.2)",Madison County,Alabama
...,...,...,...,...,...,...,...
56890,20191231,-123.6,47.4,1,"(47.4, -123.6)",Grays Harbor County,Washington
56891,20191231,-123.7,47.5,1,"(47.5, -123.7)",Grays Harbor County,Washington
56892,20191231,-123.0,47.8,1,"(47.8, -123.0)",Kitsap County,Washington
56893,20191231,-123.7,47.8,4,"(47.8, -123.7)",Clallam County,Washington


## Filtering only Severe Weather

In [13]:
# Keep only outages whose event type mentions "Severe Weather" (this also keeps
# combined types such as "Severe Weather/Transmission Interruption").
# - regex=False: the pattern is a plain phrase, so a literal substring search suffices.
# - na=False: a missing event type counts as "no match"; without it a NaN ends up
#   in the mask and the boolean indexing below raises.
is_severe_weather = power_2019['Event Type'].str.contains(
    'Severe Weather', regex=False, na=False
)
sev_weather_power_2019 = power_2019[is_severe_weather]

In [14]:
# Display the outage records kept by the severe-weather filter
sev_weather_power_2019

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
1,January,01/06/2019,03:00:00,01/09/2019,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
2,January,01/06/2019,17:56:00,01/06/2019,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,January,01/06/2019,01:00:00,01/06/2019,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
8,January,01/12/2019,11:30:00,01/13/2019,22:00:00,Missouri: Jackson County; Kansas: Johnson County;,SPP RE,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,112530
9,January,01/12/2019,11:30:00,Unknown,Unknown,Missouri: Nebraska:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,116600
...,...,...,...,...,...,...,...,...,...,...,...
246,November,11/01/2019,01:15:00,11/02/2019,21:30:00,Connecticut: Maine: Massachusetts: Rhode Islan...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,80066
247,November,11/01/2019,02:41:00,Unknown,Unknown,New York: Broome County;,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,66325
255,November,11/20/2019,09:49:00,11/20/2019,15:20:00,"California: Colusa County, Lake County, Mendoc...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather/Transmission Interruption,178,54000
259,November,11/26/2019,18:07:00,11/27/2019,12:27:00,California:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,93000


## Clean the Unknown Entries from Dates


In [15]:
# Cast the columns that are later compared as text to str:
# two on the outage frame, two on the lightning frame.
for outage_col in ('Date of Restoration', 'Area Affected'):
    sev_weather_power_2019[outage_col] = sev_weather_power_2019[outage_col].astype(str)

for lightning_col in ('county', 'state'):
    lightning_2019[lightning_col] = lightning_2019[lightning_col].astype(str)

In [16]:
# Keep only lightning tiles that (a) were resolved to a non-empty county name and
# (b) lie inside the bounding box used here for the continental US:
# latitude 24..50 and longitude -125..-66 (all bounds inclusive).
has_county = lightning_2019['county'].ne('')
lat_in_box = lightning_2019['CENTERLAT'].between(24, 50)
lon_in_box = lightning_2019['CENTERLON'].between(-125, -66)
lightning_mask = has_county & lat_in_box & lon_in_box

lightning_2019 = lightning_2019.loc[lightning_mask]

lightning_2019

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state
0,20190101,-97.6,27.0,1,"(27.0, -97.6)",Kenedy County,Texas
2,20190101,-73.2,43.0,1,"(43.0, -73.2)",Bennington County,Vermont
4,20190101,-86.2,34.9,1,"(34.9, -86.2)",Madison County,Alabama
5,20190101,-83.7,35.6,1,"(35.6, -83.7)",Sevier County,Tennessee
6,20190101,-83.6,35.6,1,"(35.6, -83.6)",Sevier County,Tennessee
...,...,...,...,...,...,...,...
56890,20191231,-123.6,47.4,1,"(47.4, -123.6)",Grays Harbor County,Washington
56891,20191231,-123.7,47.5,1,"(47.5, -123.7)",Grays Harbor County,Washington
56892,20191231,-123.0,47.8,1,"(47.8, -123.0)",Kitsap County,Washington
56893,20191231,-123.7,47.8,4,"(47.8, -123.7)",Clallam County,Washington


In [17]:
# Work on an independent copy so the filtered outage frame itself is not modified
sev_weather_power_2019_copy = sev_weather_power_2019.copy()

# Where the restoration date is reported as 'Unknown', fall back to the date the event began
restoration_unknown = sev_weather_power_2019_copy['Date of Restoration'] == 'Unknown'
sev_weather_power_2019_copy.loc[restoration_unknown, 'Date of Restoration'] = (
    sev_weather_power_2019_copy.loc[restoration_unknown, 'Date Event Began']
)

In [18]:
# Display the outage records after the 'Unknown' restoration dates were replaced
sev_weather_power_2019_copy

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
1,January,01/06/2019,03:00:00,01/09/2019,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
2,January,01/06/2019,17:56:00,01/06/2019,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,January,01/06/2019,01:00:00,01/06/2019,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
8,January,01/12/2019,11:30:00,01/13/2019,22:00:00,Missouri: Jackson County; Kansas: Johnson County;,SPP RE,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,112530
9,January,01/12/2019,11:30:00,01/12/2019,Unknown,Missouri: Nebraska:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,116600
...,...,...,...,...,...,...,...,...,...,...,...
246,November,11/01/2019,01:15:00,11/02/2019,21:30:00,Connecticut: Maine: Massachusetts: Rhode Islan...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,80066
247,November,11/01/2019,02:41:00,11/01/2019,Unknown,New York: Broome County;,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,66325
255,November,11/20/2019,09:49:00,11/20/2019,15:20:00,"California: Colusa County, Lake County, Mendoc...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather/Transmission Interruption,178,54000
259,November,11/26/2019,18:07:00,11/27/2019,12:27:00,California:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,93000


## Rename the dataframes for brevity

In [19]:
# Short working names: df1 is a copy of the lightning data, df2 a copy of the
# cleaned severe-weather outage data
df1, df2 = lightning_2019.copy(), sev_weather_power_2019_copy.copy()

In [20]:
# Parse the date columns into datetimes so that both frames can be compared on dates.
# Lightning days are written as YYYYMMDD, outage dates as MM/DD/YYYY.
df1['#ZDAY'] = pd.to_datetime(df1['#ZDAY'], format='%Y%m%d')

for date_col in ('Date Event Began', 'Date of Restoration'):
    df2[date_col] = pd.to_datetime(df2[date_col], format='%m/%d/%Y')

## Inspecting df1 and df2

In [21]:
# make sure the lightning place-name columns are held as str
for place_col in ('county', 'state'):
    df1[place_col] = df1[place_col].astype(str)

In [22]:
# make sure the outage text columns are held as str
for text_col in ('Area Affected', 'Month'):
    df2[text_col] = df2[text_col].astype(str)

## Lightning date to Power cut date

Here we keep only those rows of the power outage data whose start date has at least one lightning observation. In most cases this returns the same data, since the lightning data is vast and covers all the dates in the power outage data.

In [23]:
# Keep the outages whose start date ('Date Event Began') is a day on which at least
# one lightning record exists. Note this compares the start date only; the
# restoration date is not taken into account here.
began_on_lightning_day = df2['Date Event Began'].isin(df1['#ZDAY'])
df2_filtered = df2.loc[began_on_lightning_day]

In [24]:
# Display the outages that began on a day with lightning records
df2_filtered

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected
1,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000
2,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382
3,January,2019-01-06,01:00:00,2019-01-06,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000
8,January,2019-01-12,11:30:00,2019-01-13,22:00:00,Missouri: Jackson County; Kansas: Johnson County;,SPP RE,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,112530
9,January,2019-01-12,11:30:00,2019-01-12,Unknown,Missouri: Nebraska:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,116600
...,...,...,...,...,...,...,...,...,...,...,...
246,November,2019-11-01,01:15:00,2019-11-02,21:30:00,Connecticut: Maine: Massachusetts: Rhode Islan...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,80066
247,November,2019-11-01,02:41:00,2019-11-01,Unknown,New York: Broome County;,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,66325
255,November,2019-11-20,09:49:00,2019-11-20,15:20:00,"California: Colusa County, Lake County, Mendoc...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather/Transmission Interruption,178,54000
259,November,2019-11-26,18:07:00,2019-11-27,12:27:00,California:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,93000


We check whether each row of the power outage dataset has county information in its Area Affected column.

First we find the rows that do not have any county information; for those rows we check the state instead. We keep boolean columns that say whether the county matched (accurate) or only the state matched.

In [25]:
# get all US states
county = pd.read_csv("../extras/uscounties.csv", index_col=0)
county['county'] = county['county'].astype(str)
county

Unnamed: 0,state,lat,lng,population,location,county
0,California,34.3219,-118.2247,9936690,"(34.3219, -118.2247)",Los Angeles County
1,Illinois,41.8401,-87.8168,5225367,"(41.8401, -87.8168)",Cook County
2,Texas,29.8578,-95.3938,4726177,"(29.8578, -95.3938)",Harris County
3,Arizona,33.3490,-112.4915,4430871,"(33.349, -112.4915)",Maricopa County
4,California,33.0343,-116.7351,3289701,"(33.0343, -116.7351)",San Diego County
...,...,...,...,...,...,...
3139,Nebraska,41.9128,-99.9768,384,"(41.9128, -99.9768)",Blaine County
3140,Texas,33.6165,-100.2558,216,"(33.6165, -100.2558)",King County
3141,Texas,26.9285,-97.7017,116,"(26.9285, -97.7017)",Kenedy County
3142,Texas,31.8493,-103.5800,96,"(31.8493, -103.58)",Loving County


In [26]:
df2_filtered_copy = df2_filtered.copy()

# Flag the outages whose 'Area Affected' text names at least one known county.
# The county names are materialised once, and the check runs on the single
# 'Area Affected' column rather than building a full row object per outage with
# DataFrame.apply(axis=1); this also stays well-defined when the frame is empty.
county_names = county['county'].tolist()
df2_filtered_copy["county_info_area_affected"] = df2_filtered_copy['Area Affected'].map(
    lambda area: any(name in area for name in county_names)
)

In [27]:
# Display the outages together with the new 'county_info_area_affected' flag
df2_filtered_copy

Unnamed: 0,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected,county_info_area_affected
1,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False
2,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382,True
3,January,2019-01-06,01:00:00,2019-01-06,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000,True
8,January,2019-01-12,11:30:00,2019-01-13,22:00:00,Missouri: Jackson County; Kansas: Johnson County;,SPP RE,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,112530,True
9,January,2019-01-12,11:30:00,2019-01-12,Unknown,Missouri: Nebraska:,SERC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,116600,False
...,...,...,...,...,...,...,...,...,...,...,...,...
246,November,2019-11-01,01:15:00,2019-11-02,21:30:00,Connecticut: Maine: Massachusetts: Rhode Islan...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,80066,False
247,November,2019-11-01,02:41:00,2019-11-01,Unknown,New York: Broome County;,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,66325,True
255,November,2019-11-20,09:49:00,2019-11-20,15:20:00,"California: Colusa County, Lake County, Mendoc...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather/Transmission Interruption,178,54000,True
259,November,2019-11-26,18:07:00,2019-11-27,12:27:00,California:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,93000,False


Next, we check which rows of the two dataframes match on date.

In [28]:

# Attach the outage records to every lightning row by calendar date.
# A left join keeps all lightning rows; indicator=True adds a '_merge' column that
# is 'both' when an outage began on the lightning date and 'left_only' otherwise.
# This is a match on dates only; matching on area/county still remains to be done.
left_merged_df = df1.merge(
    df2_filtered_copy,
    how='left',
    left_on='#ZDAY',
    right_on='Date Event Began',
    indicator=True,
)

In [30]:
# Rows for which the lightning date coincides with the start date of an outage
# (everything except the 'left_only' rows of the left join)
matched_on_date = left_merged_df['_merge'] != 'left_only'
inner_merged_df = left_merged_df[matched_on_date]

In [31]:
# Display the lightning rows that were paired with an outage by date
inner_merged_df

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state,Month,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected,county_info_area_affected,_merge
124,2019-01-06,-120.0,35.0,1,"(35.0, -120.0)",Santa Barbara County,California,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False,both
125,2019-01-06,-120.0,35.0,1,"(35.0, -120.0)",Santa Barbara County,California,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382,True,both
126,2019-01-06,-120.0,35.0,1,"(35.0, -120.0)",Santa Barbara County,California,January,2019-01-06,01:00:00,2019-01-06,12:00:00,"Washington: King County, Thurston County, Pier...",WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,230000,True,both
127,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,January,2019-01-06,03:00:00,2019-01-09,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False,both
128,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,January,2019-01-06,17:56:00,2019-01-06,21:52:00,California: Sacramento County;,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,300,90382,True,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61010,2019-11-27,-71.7,42.2,1,"(42.2, -71.7)",Worcester County,Massachusetts,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000,True,both
61011,2019-11-27,-79.2,42.4,3,"(42.4, -79.2)",Chautauqua County,New York,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000,True,both
61012,2019-11-27,-79.2,42.5,1,"(42.5, -79.2)",Chautauqua County,New York,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000,True,both
61013,2019-11-27,-71.9,42.5,1,"(42.5, -71.9)",Worcester County,Massachusetts,November,2019-11-27,12:00:00,2019-11-30,02:00:00,"Michigan: Tuscola County, Sanilac County, Huro...",RF,"Loss of electric service to more than 50,000 c...",Severe Weather,30,107000,True,both


## Now we match county

and we match state only when no county info present.

Caveat: some counties have the same name in different states, so we need to check the state too.

In [32]:
# For every state name, the longer state names that contain it (for example
# 'Kansas' is contained in 'Arkansas', 'Virginia' in 'West Virginia'). A plain
# substring test would otherwise report Kansas lightning as a match for an outage
# whose area reads "Arkansas:".
_state_names = sorted(set(county['state'].astype(str)))
_containing_states = {
    name: [other for other in _state_names if other != name and name in other]
    for name in _state_names
}


def _state_named_in_area(state, area):
    """Return True when `state` occurs in `area` other than as part of a longer state name."""
    if not state:
        # '' is a substring of every string, so an empty state must never match
        return False
    for longer_name in _containing_states.get(state, ()):
        area = area.replace(longer_name, '')
    return state in area


def _lightning_matches_outage(row):
    """Decide whether a merged lightning/outage row is a match on location.

    A row matches when an outage began on the lightning date (`_merge == 'both'`),
    the lightning state is named in 'Area Affected', and either the lightning
    county is named there too or the outage carries no county information at all.
    Returns a plain bool.
    """
    if row['_merge'] != 'both':
        # No outage began on this date. The guard is explicit here; in the
        # previous one-line expression `and`/`or` precedence left the
        # state-only clause outside of it.
        return False
    area = str(row['Area Affected'])
    if not _state_named_in_area(str(row['state']), area):
        return False
    if str(row['county']) in area:
        return True
    # a state-level match is accepted only when the outage lists no county
    return not row['county_info_area_affected']


filtered_df = left_merged_df[left_merged_df.apply(_lightning_matches_outage, axis=1)]

## Now we add the power outage yes/no to lightning

Once we have the filtered_df, we can use it to go back to the lightning data, and add the column which says whether power outage happened or not

As a rough estimate for the year 2019: out of about 55000 rows of lightning data,
we get about 3000 rows flagged as power outages.

One might object that there are only 92 power outages in total, so how can we get 3000? The answer is that these are not distinct outages, but distinct lightning events.

So, out of about 55000 lightning events, about 3000 gave rise to power outages, and several lightning events may be responsible for a single outage.


## Now we do the check whether a row in lightning gave outage

We do this via the following:

The left_merged_df has all rows in lightning whether power outage or not

The filtered_df has only the rows that gave rise to a power outage.

In [33]:
# For every state name, the longer state names that contain it (for example
# 'Kansas' is contained in 'Arkansas', 'Virginia' in 'West Virginia'). A plain
# substring test would otherwise label Kansas lightning as an outage when the
# outage area reads "Arkansas:".
_state_names = sorted(set(county['state'].astype(str)))
_containing_states = {
    name: [other for other in _state_names if other != name and name in other]
    for name in _state_names
}


def _state_named_in_area(state, area):
    """Return True when `state` occurs in `area` other than as part of a longer state name."""
    if not state:
        # '' is a substring of every string, so an empty state must never match
        return False
    for longer_name in _containing_states.get(state, ()):
        area = area.replace(longer_name, '')
    return state in area


def _lightning_matches_outage(row):
    """Decide whether a merged lightning/outage row is a match on location.

    A row matches when an outage began on the lightning date (`_merge == 'both'`),
    the lightning state is named in 'Area Affected', and either the lightning
    county is named there too or the outage carries no county information at all.
    Returns a plain bool.
    """
    if row['_merge'] != 'both':
        # No outage began on this date. The guard is explicit here; in the
        # previous one-line expression `and`/`or` precedence left the
        # state-only clause outside of it.
        return False
    area = str(row['Area Affected'])
    if not _state_named_in_area(str(row['state']), area):
        return False
    if str(row['county']) in area:
        return True
    # a state-level match is accepted only when the outage lists no county
    return not row['county_info_area_affected']


# add a boolean column: True where the lightning row matches an outage on date and location
left_merged_df['power_outage'] = left_merged_df.apply(_lightning_matches_outage, axis=1)

In [34]:
# show only the merged rows that were flagged as a power outage
left_merged_df.loc[left_merged_df['power_outage']]

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state,Month,Date Event Began,Time Event Began,...,Time of Restoration,Area Affected,NERC Region,Alert Criteria,Event Type,Demand Loss (MW),Number of Customers Affected,county_info_area_affected,_merge,power_outage
127,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,January,2019-01-06,03:00:00,...,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False,both,True
130,2019-01-06,-118.2,46.0,1,"(46.0, -118.2)",Walla Walla County,Washington,January,2019-01-06,03:00:00,...,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False,both,True
133,2019-01-06,-123.2,47.0,2,"(47.0, -123.2)",Grays Harbor County,Washington,January,2019-01-06,03:00:00,...,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False,both,True
136,2019-01-06,-123.1,47.0,1,"(47.0, -123.1)",Grays Harbor County,Washington,January,2019-01-06,03:00:00,...,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False,both,True
196,2019-01-06,-118.1,45.9,1,"(45.9, -118.1)",Walla Walla County,Washington,January,2019-01-06,03:00:00,...,07:00:00,Washington:,WECC,"Loss of electric service to more than 50,000 c...",Severe Weather,230,230000,False,both,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60322,2019-11-01,-67.8,46.1,1,"(46.1, -67.8)",Aroostook County,Maine,November,2019-11-01,01:15:00,...,21:30:00,Connecticut: Maine: Massachusetts: Rhode Islan...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,80066,False,both,True
60360,2019-11-01,-73.8,41.0,2,"(41.0, -73.8)",Westchester County,New York,November,2019-11-01,01:00:00,...,13:00:00,New York:,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,8000,False,both,True
60363,2019-11-01,-73.7,41.0,1,"(41.0, -73.7)",Westchester County,New York,November,2019-11-01,01:00:00,...,13:00:00,New York:,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,8000,False,both,True
60460,2019-11-01,-67.9,46.0,1,"(46.0, -67.9)",Aroostook County,Maine,November,2019-11-01,01:15:00,...,21:30:00,Connecticut: Maine: Massachusetts: Rhode Islan...,NPCC,"Loss of electric service to more than 50,000 c...",Severe Weather,Unknown,80066,False,both,True


## We polish the dataframe by dropping unimportant columns

In [35]:
# list every column of the merged frame before deciding which ones to drop
list(left_merged_df.columns)

['#ZDAY',
 'CENTERLON',
 'CENTERLAT',
 'TOTAL_COUNT',
 'location',
 'county',
 'state',
 'Month',
 'Date Event Began',
 'Time Event Began',
 'Date of Restoration',
 'Time of Restoration',
 'Area Affected',
 'NERC Region',
 'Alert Criteria',
 'Event Type',
 'Demand Loss (MW)',
 'Number of Customers Affected',
 'county_info_area_affected',
 '_merge',
 'power_outage']

In [38]:
# Columns that came from the outage side of the merge (plus the '_merge' indicator);
# only the lightning columns and the 'power_outage' flag are kept.
columns_to_drop = [
    'Month',
    'Date Event Began',
    'Time Event Began',
    'Date of Restoration',
    'Time of Restoration',
    'Area Affected',
    'NERC Region',
    'Alert Criteria',
    'Event Type',
    'Demand Loss (MW)',
    'Number of Customers Affected',
    'county_info_area_affected',
    '_merge',
]

# drop() returns a new frame, so left_merged_df itself is left unchanged.
# NOTE(review): the left merge repeats a lightning row once per outage that began on
# the same date, so the same lightning record can appear several times here,
# possibly with different 'power_outage' values - confirm this is intended.
lightning_outage = left_merged_df.drop(columns=columns_to_drop)

lightning_outage[lightning_outage["power_outage"]]

Unnamed: 0,#ZDAY,CENTERLON,CENTERLAT,TOTAL_COUNT,location,county,state,power_outage
127,2019-01-06,-123.4,47.3,1,"(47.3, -123.4)",Grays Harbor County,Washington,True
130,2019-01-06,-118.2,46.0,1,"(46.0, -118.2)",Walla Walla County,Washington,True
133,2019-01-06,-123.2,47.0,2,"(47.0, -123.2)",Grays Harbor County,Washington,True
136,2019-01-06,-123.1,47.0,1,"(47.0, -123.1)",Grays Harbor County,Washington,True
196,2019-01-06,-118.1,45.9,1,"(45.9, -118.1)",Walla Walla County,Washington,True
...,...,...,...,...,...,...,...,...
60322,2019-11-01,-67.8,46.1,1,"(46.1, -67.8)",Aroostook County,Maine,True
60360,2019-11-01,-73.8,41.0,2,"(41.0, -73.8)",Westchester County,New York,True
60363,2019-11-01,-73.7,41.0,1,"(41.0, -73.7)",Westchester County,New York,True
60460,2019-11-01,-67.9,46.0,1,"(46.0, -67.9)",Aroostook County,Maine,True


In [None]:
# Write the labelled lightning table to disk; the DataFrame index is not written
lightning_outage.to_csv(
    '../merged/lightning_outage_2019.csv',
    index=False,
)