In [1]:
import reverse_geocoder as rg
import pandas as pd
import numpy as np
import ftplib
import requests
import matplotlib.pyplot as plt
import os
import re

In [2]:
def clean_lightning(lightning):
    """
    Prepare a raw lightning table for merging with the power data.

    input:

    lightning: DataFrame with at least the columns '#ZDAY', 'CENTERLAT' and
               'CENTERLON'

    output: a NEW DataFrame (the input is not modified) in which
            - '#ZDAY' is converted to datetime,
            - 'county' and 'state' columns are added from the reverse-geocoder
              lookup of each (CENTERLAT, CENTERLON) pair ('admin2' and 'admin1'
              of the result, respectively),
            - rows whose county is the empty string are removed.
    """
    # One reverse-geocoding call for all coordinates, (lat, lon) order.
    coordinates = list(zip(lightning['CENTERLAT'], lightning['CENTERLON']))
    addresses = rg.search(coordinates)

    # .assign builds a new frame, so the caller's DataFrame is left untouched
    # (the original implementation wrote these columns into its argument).
    located = lightning.assign(**{
        '#ZDAY': pd.to_datetime(lightning['#ZDAY'], format='%Y%m%d'),
        'county': [address['admin2'] for address in addresses],
        'state': [address['admin1'] for address in addresses],
    })

    # Drop rows with no county (per the original author's note this happens
    # for points outside the US).
    return located[located['county'] != '']

In [3]:
def clean_power(power):
    """
    Reduce a power-disturbance table to its severe-weather events.

    input:

    power: DataFrame with the columns listed in the drop() call below plus
           'Date Event Began' (strings formatted as month/day/year) and any
           columns that should be kept (e.g. 'Area Affected')

    output: a NEW DataFrame (the input is not modified) holding only the rows
            whose 'Event Type' contains 'Severe Weather', with
            'Date Event Began' converted to datetime and the bookkeeping
            columns removed.
    """
    # na=False: rows with a missing 'Event Type' are treated as "not severe
    # weather". Without it the mask contains NaN and boolean indexing raises.
    is_severe_weather = power['Event Type'].str.contains('Severe Weather', regex=False, na=False)

    weather_events = power.loc[is_severe_weather].drop(
        columns=['Month', 'Time Event Began', 'Date of Restoration', 'Time of Restoration',
                 'NERC Region', 'Alert Criteria', 'Event Type', 'Demand Loss (MW)',
                 'Number of Customers Affected'])

    # Parse dates only for the rows that are kept, and on the new frame rather
    # than on the caller's DataFrame.
    return weather_events.assign(**{
        'Date Event Began': pd.to_datetime(weather_events['Date Event Began'], format='%m/%d/%Y')
    })

In [4]:
# Load the US county reference table (first CSV column is the index) and make
# sure county names are plain strings; in_area() looks counties up per state here.
counties = pd.read_csv("../extras/uscounties.csv", index_col=0).astype({'county': str})

In [5]:
def in_area(county,state,area_affected):
    """
    Decide whether a weather event's location lies in a power outage's affected area.

    input:

    county, state: the county and state of the weather event
    area_affected: the area affected by the power outage (a string listing
                   states and possibly counties); a state only counts as
                   listed when it is directly followed by a colon

    output: True if the state is listed in area_affected and either
              - the county also appears in area_affected, or
              - area_affected names none of that state's counties (looked up
                in the module-level `counties` table), i.e. the outage is
                only described at state level;
            False otherwise.

    raises: ValueError if county, state or area_affected is empty/None.

    NOTE(review): all checks are plain substring tests, so a short county
    name can match inside a longer one (e.g. 'Lee' inside 'Greenlee').
    """
    if not county or not state or not area_affected:
        raise ValueError(f"Invalid (null) input. county: {county}, state: {state}, area_affected: {area_affected}")

    # adding a colon to state ensures that it's matched exactly to a state in area_affected
    # (rather than a county whose name is a state)
    state_token = state + ':'
    if state_token not in area_affected:
        return False

    if county in area_affected:
        return True

    # Only now is the county table needed: the event is in a listed state but
    # its county is not named, so it counts only when the outage description
    # carries no county information for that state at all. Deferring this scan
    # avoids filtering the whole table on calls that are already decided.
    state_counties = counties.loc[counties['state'] == state, 'county']
    has_county_info = any(name in area_affected for name in state_counties)
    return not has_county_info

In [6]:
def merge_lightning(lightning,power):
    """
    Merge the lightning data and power data for a given year.

    input:

    lightning: raw lightning DataFrame (cleaned here with clean_lightning)
    power: raw power-disturbance DataFrame (cleaned here with clean_power)

    output: the cleaned lightning rows left-joined to the power events on the
            event date, with a boolean 'power_outage' column that is True when
            a power event began on the same day AND in_area() places the
            lightning row's county/state inside that event's 'Area Affected'.
            The helper columns 'Date Event Began', 'Area Affected' and
            '_merge' are removed.

    NOTE(review): because this is a left join on the date alone, a lightning
    row appears once per power event that began on the same day.
    """
    lightning = clean_lightning(lightning)
    power = clean_power(power)
    merged = pd.merge(lightning, power, how='left', left_on='#ZDAY', right_on='Date Event Began', indicator=True)

    # Only rows that actually matched a power event can be outages, so
    # in_area() is evaluated for those rows alone instead of running a
    # row-wise DataFrame.apply over the whole (multi-million row) table.
    matched = merged.loc[merged['_merge'] == 'both', ['county', 'state', 'Area Affected']]
    flags = [
        in_area(str(county), str(state), str(area))
        for county, state, area in zip(matched['county'], matched['state'], matched['Area Affected'])
    ]
    outage = pd.Series(flags, index=matched.index, dtype=bool)

    # Every unmatched row is False; this also works when nothing matched or
    # the merged frame is empty.
    merged['power_outage'] = outage.reindex(merged.index, fill_value=False)
    return merged.drop(columns=['Date Event Began', 'Area Affected', '_merge'])

In [7]:
# Keys are the local folder names under ../weather_data; values are the
# dataset prefixes that download_file() looks for in the remote file names.
event_types = {'hail':'hail', 'storm_structure':'structure', 'tornados':'tvs', 'lightning':'nldn-tiles', 'mesocyclone':'mda'}
for event in event_types:
    # makedirs also creates ../weather_data itself when it is missing (plain
    # os.mkdir fails in that case) and is a no-op for folders that already exist.
    os.makedirs('../weather_data/'+event, exist_ok=True)

In [8]:
# Base URL used by download_file() to fetch the files over HTTPS.
httpurl = 'https://www.ncei.noaa.gov/pub/data/swdi/database-csv/v2/'

# The FTP server is only needed to list the available files, so the
# connection is opened in a context manager and closed as soon as the
# listing has been read (it was previously left open).
with ftplib.FTP('ftp.ncdc.noaa.gov', timeout=30) as ftp: # pass the url without protocol
    ftp.login() # pass credentials if anonymous access is not allowed

    # switch to the directory containing the data
    ftp.cwd('/pub/data/swdi/database-csv/v2/')

    # get the list of files in this ftp dir
    all_files = ftp.nlst()

In [9]:
def download_file(year, event_type):
    """
    Download the data file of one event type and year into ../weather_data/<event_type>/.

    input:

    year: year to look for in the remote file names
    event_type: a key of `event_types` (also the name of the local folder)

    output: a message string when nothing is downloaded (no matching remote
            file, file already present locally, or the HTTP request failed);
            None after a successful download.

    The first entry of `all_files` whose name contains '<prefix>-<year>' is
    used, where <prefix> is event_types[event_type].
    """
    event_name = event_types[event_type]
    pattern = event_name+"-"+str(year)
    candidates = [fname for fname in all_files if pattern in fname]
    if not candidates:
        return "No file in that year for that event type"
    file_name = candidates[0]
    print("Considering file ", file_name)

    destination = '../weather_data/{}/{}'.format(event_type, file_name)
    if os.path.exists(destination):
        return "file already exists"

    query_parameters = {"downloadformat": "csv"}
    print("Getting the response from the URL .....")
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(httpurl+file_name, params=query_parameters, timeout=60)
    if not response.ok:
        # Do not save the body of an error response: it would be stored as the
        # data file and every later call would report "file already exists".
        return "Download failed with HTTP status {}".format(response.status_code)
    print("Downloaded succesfully")

    with open(destination, "wb") as f:
        f.write(response.content)
    print('Saved in folder')

In [10]:
def read_weather(year, event_type):
    """
    Load the locally stored weather file of one event type and year.

    input:

    year: year that must appear in the file name
    event_type: name of the sub-folder of ../weather_data to search

    output: a DataFrame read from the single matching CSV (the first two lines
            of the file are skipped and the date column is parsed), or a
            message string when no file or more than one file matches.
    """
    folder = '../weather_data/'+event_type
    matches = [name for name in os.listdir(folder) if str(year) in name]
    if not matches:
        return "No file in that year for that event type"
    if len(matches) > 1:
        return "Multiple files with that year in their name"

    # NOTE(review): the folder/key used elsewhere in this notebook is
    # 'tornados', so the 'tornado' comparison does not match it - confirm
    # which date column the tornado files actually carry before changing this.
    if event_type in ('lightning', 'tornado'):
        date_column = '#ZDAY'
    else:
        date_column = '#ZTIME'
    return pd.read_csv(folder + '/' + matches[0], skiprows=2, parse_dates=[date_column])

In [11]:
def merge(year, event_type):
    """
    Load the weather and power data of one year and merge them.

    input:

    year: year of both the weather file and the power summary
          ('../power_data/<year>_Annual_Summary.xls')
    event_type: weather event type; only 'lightning' is merged so far

    output: the merged DataFrame for 'lightning'; None for every other event
            type (not implemented yet).

    raises: ValueError if read_weather() could not provide a single file for
            that year and event type.
    """
    weather = read_weather(year,event_type)
    if isinstance(weather, str):
        # read_weather reports problems by returning a message string; fail
        # with that message instead of an AttributeError on .info() below.
        raise ValueError(weather)

    # info() prints its summary itself and returns None, so it is not wrapped
    # in print() (which only added a stray "None" line to the output).
    weather.info()
    power = pd.read_excel('../power_data/' + str(year) + '_Annual_Summary.xls', skiprows=1)
    if event_type == 'lightning':
        return merge_lightning(weather,power)
    # TODO: implement merging for the other event types
    return None

In [None]:
# Build the merged lightning / power table for 2019 via merge().
merged_lightning_2019 = merge(2019, 'lightning')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3073235 entries, 0 to 3073234
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   #ZDAY        datetime64[ns]
 1   CENTERLON    float64       
 2   CENTERLAT    float64       
 3   TOTAL_COUNT  int64         
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 93.8 MB
None
Loading formatted geocoded file...


In [None]:
# Show the merged 2019 table as this cell's output.
merged_lightning_2019