In [None]:
# final version to run code created in src/d02_intermediate/parition_hurdat.py



In [None]:
# need code to convert HURDAT2 data into a more usable format

## Preparing the data

The data for this project is in the HURDAT2 format from NHC. This format contains rows of storm positions interspersed with header rows denoting which storm the subsequent position data corresponds to.

Because of this, the raw data table has a few problems:
- No column names are provided.
- Columns contain a mix of data types.
- Many rows are full of missing data for most columns.

As a result, our first goal will be to convert the data into a more usable format. We begin by importing the raw data, as is, into a Pandas DataFrame, `hurdat`. In doing so, we also assign column names which correspond to the information in the storm position data rows. We will later separate the header rows into a new DataFrame and assign them their own column names.

In [1]:
import os
import sys

# make the directory one level above the current working directory importable
root_dir = os.path.join(os.getcwd(), '..')
sys.path.append(root_dir)

import pandas as pd
import numpy as np

# raw HURDAT2 file to partition (e.g. 'Pacific.csv' or 'Atlantic.csv'); the name
# without its extension is reused later to name the output files
fn = 'Pacific.csv'
fn_no_ext = os.path.splitext(fn)[0]

# column names, matching the layout of the position data rows

header = [
    'date', 'time', 'recordID', 'status', 'lat', 'long',
    'maxSustWind', 'minPressure',
    'extNE34', 'extSE34', 'extSW34', 'extNW34',
    'extNE50', 'extSE50', 'extSW50', 'extNW50',
    'extNE64', 'extSE64', 'extSW64', 'extNW64',
]


# read the raw file from the raw data folder using our column names, discard the
# temporary header held in the first row, renumber the rows from 0, and verify
# the new DataFrame.

hurdat = (
    pd.read_csv(f'../data/01_raw/{fn}', names=header)
    .drop(index=0)
    .reset_index(drop=True)
)
hurdat.head()

Unnamed: 0,date,time,recordID,status,lat,long,maxSustWind,minPressure,extNE34,extSE34,extSW34,extNW34,extNE50,extSE50,extSW50,extNW50,extNE64,extSE64,extSW64,extNW64
0,EP011949,UNNAMED,7.0,,,,,,,,,,,,,,,,,
1,19490611,0000,,TS,20.2N,106.3W,45.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,19490611,0600,,TS,20.2N,106.4W,45.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,19490611,1200,,TS,20.2N,106.7W,45.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,19490611,1800,,TS,20.3N,107.7W,45.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


From the NHC documentation for the HURDAT2 database, column names are as follows:

- `date`: timecode for the position entry, format `YYYYMMDD`
- `time`: timecode for the position entry, format `HHMM` in 24-Hr UTC
- `recordID`: special designation for significant position entries. Can be empty or contain values:
 - **C**: closest approach to coast when not followed by landfall
 - **G**: genesis
 - **I**: intensity peak in both pressure and wind
 - **L**: landfall
 - **P**: minimum pressure
 - **R**: additional intensity detail during rapid changes
 - **S**: change of status
 - **T**: additional track/position detail
 - **W**: maximum wind speed
- `status`: tropical depression, tropical storm, hurricane, extratropical cyclone, subtropical depression, subtropical storm, low pressure system, tropical wave, or disturbance
- `lat`: latitude of center of storm
- `long`: longitude of center of storm
- `maxSustWind`: maximum sustained wind
- `minPressure`: minimum central pressure
- `extDDXX`: extent of `XX` nautical mile per hour (knots) winds in the `DD` cardinal direction quadrant

Next, we want to separate the rows of `hurdat` into two new DataFrames, `storms` for header rows and `positions` for position data.

In [2]:
# first, create a new column to denote whether rows are header rows (True) or data rows (False)
#
# Header rows hold the storm ID in the 'date' column, and every storm ID contains one
# of the basin codes 'AL', 'EP' or 'CP'. Position rows hold an all-digit YYYYMMDD date
# there, so they can never match.
#
# The match is vectorized, so the flags are computed directly on hurdat's own index
# (no reliance on a separately built Series lining up), and the name `header`, which
# already holds the list of column names, is not re-used for a different kind of value.

hurdat['header'] = hurdat['date'].astype(str).str.contains('AL|EP|CP')


# create dataframes of only storm names and only position data so we can edit the columns and dtypes

storms = hurdat[hurdat['header']].copy()      # all header rows of hurdat copied into new dataframe storms
positions = hurdat[~hurdat['header']].copy()  # all data rows of hurdat copied into new dataframe positions


Now that we have our DataFrames, `storms` and `positions`, each one needs a bit more preparation.

In [3]:
# The storms dataframe still carries every column of the raw table. Only the first
#     three hold header information, so the rest are removed here; the survivors are
#     renamed and the rows renumbered. A year column and proper dtypes follow later.


# columns that are only meaningful for position rows, plus the helper flag

position_only_columns = [
    'status', 'lat', 'long', 'maxSustWind', 'minPressure',
    'extNE34', 'extSE34', 'extSW34', 'extNW34',
    'extNE50', 'extSE50', 'extSW50', 'extNW50',
    'extNE64', 'extSE64', 'extSW64', 'extNW64',
    'header',
]

storms = storms.drop(columns=position_only_columns)
storms.columns = ['stormID', 'name', 'numPositions']
storms = storms.reset_index(drop=True)
storms.head()

Unnamed: 0,stormID,name,numPositions
0,EP011949,UNNAMED,7
1,EP021949,UNNAMED,25
2,EP031949,UNNAMED,25
3,EP041949,UNNAMED,9
4,EP051949,UNNAMED,9


In [4]:
# column names are as follows:
#
# 'stormID': an individual identifier for each storm in the form BBXXYYYY, where BB is the basin code
#     ('AL', 'EP' or 'CP'; e.g. AL for (A)t(L)antic), denoting the storm was the XXth storm of that basin's
#     Hurricane Season YYYY. Useful when storms in different years share the same name, and for unnamed storms.
# 'name': name of storm.
# 'numPositions': the number of position entries in positions dataFrame corresponding to this storm

In [5]:
# convert the storms columns to the correct dtypes and derive a numeric year column

# stormID and name are text: cast to string dtype and strip surrounding whitespace
storms['stormID'] = storms['stormID'].astype('str').str.strip()
storms['name'] = storms['name'].astype('str').str.strip()

# number of position rows belonging to each storm: integer dtype
storms['numPositions'] = storms['numPositions'].astype('int')

# The year is the last four characters of the 8-character stormID, i.e. the slice [4:8].
# Note that this year may not necessarily correspond to the calendar dates during which
#     the storm existed, but rather the Hurricane Season to which it belonged.
# The slice is taken with the vectorized .str accessor, so the result stays on the
#     storms index instead of relying on a separately built Series lining up with it.
storms['year'] = storms['stormID'].str[4:8].astype('int')


# this completes work on the storms dataFrame. We can verify it now.

storms.head()

Unnamed: 0,stormID,name,numPositions,year
0,EP011949,UNNAMED,7,1949
1,EP021949,UNNAMED,25,1949
2,EP031949,UNNAMED,25,1949
3,EP041949,UNNAMED,9,1949
4,EP051949,UNNAMED,9,1949


In [6]:
# Remaining work on the positions dataFrame:
#   - clean up the indices (this cell)
#   - reformat the latitude and longitude columns to make them usable by geopandas
#   - create new columns for the storm name and stormID to make the dataframe
#     searchable by these criteria


# renumber the rows from 0, discarding the row labels inherited from the raw table

positions = positions.reset_index(drop=True)
positions.head()

Unnamed: 0,date,time,recordID,status,lat,long,maxSustWind,minPressure,extNE34,extSE34,...,extNW34,extNE50,extSE50,extSW50,extNW50,extNE64,extSE64,extSW64,extNW64,header
0,19490611,0,,TS,20.2N,106.3W,45.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,False
1,19490611,600,,TS,20.2N,106.4W,45.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,False
2,19490611,1200,,TS,20.2N,106.7W,45.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,False
3,19490611,1800,,TS,20.3N,107.7W,45.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,False
4,19490612,0,,TS,20.4N,108.6W,45.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,False


In [None]:
# we can convert the latitude and longitude information into signed floats by removing
# the cardinal direction:
#     XX.XN ->  XX.X        XX.XS -> -XX.X
#     XX.XE ->  XX.X        XX.XW -> -XX.X


def _signed_degrees(coords, positive, negative):
    """Convert hemisphere-suffixed coordinate text into signed float degrees.

    Parameters
    ----------
    coords : pandas.Series
        Coordinate values such as ' 20.2N' or '106.3W'.
    positive : str
        Hemisphere letter whose values stay positive ('N' or 'E').
    negative : str
        Hemisphere letter whose values become negative ('S' or 'W').

    Returns
    -------
    pandas.Series
        Float degrees on the same index as `coords`. Values that carry no
        hemisphere letter (e.g. already-converted numbers) are passed through
        unchanged, which makes the conversion safe to run more than once.
    """
    text = coords.astype(str).str.strip()
    # drop the trailing hemisphere letter, leaving only the numeric part
    degrees = text.str.rstrip(positive + negative).astype('float')
    # keep the value as is unless it was marked with the negative hemisphere
    return degrees.where(~text.str.endswith(negative), -degrees)


# replace the existing longitude and latitude columns with the new ones

positions['lat'] = _signed_degrees(positions['lat'], 'N', 'S')
positions['long'] = _signed_degrees(positions['long'], 'E', 'W')

In [None]:
# use the number of position updates for each storm to create a column for the positions
# dataframe containing the appropriate names
#
# Position rows appear in the same order as their storms, so repeating each storm's name
# numPositions times yields exactly one name per position row.

stormNames = np.repeat(storms['name'].to_numpy(), storms['numPositions'].to_numpy())

# a count mismatch would attach names to the wrong rows, so fail loudly instead
if len(stormNames) != len(positions):
    raise ValueError(
        f"storms['numPositions'] sums to {len(stormNames)} "
        f"but positions has {len(positions)} rows"
    )


# add the new array containing a name for every row of positions into positions dataFrame

positions['name'] = stormNames

In [None]:
# attach the stormID of the owning storm to every position row
#
# Position rows appear in the same order as their storms, so repeating each stormID
# numPositions times yields exactly one ID per position row.

stormIDs = np.repeat(storms['stormID'].to_numpy(), storms['numPositions'].to_numpy())

# a count mismatch would attach IDs to the wrong rows, so fail loudly instead
if len(stormIDs) != len(positions):
    raise ValueError(
        f"storms['numPositions'] sums to {len(stormIDs)} "
        f"but positions has {len(positions)} rows"
    )


positions['stormID'] = stormIDs

# the header flag is False for every position row, so it carries no information here
positions.drop(columns="header", inplace = True)
positions.head()

In [None]:
# write "positions" and "storms" out as csv files in data/02_intermediate, naming each
# file after the raw input file it was derived from

out_dir = "../data/02_intermediate"

positions_fn = f"{fn_no_ext}_positions.csv"
storms_fn = f"{fn_no_ext}_storms.csv"

positions.to_csv(f"{out_dir}/{positions_fn}")
storms.to_csv(f"{out_dir}/{storms_fn}")

print(f"Partitioned {fn} into:\n /data/02_intermediate/{positions_fn}\n /data/02_intermediate/{storms_fn}")