# Hurricane Path Predictor
The HURDAT2 dataset is from the US National Hurricane Center. It is a collection of Atlantic hurricanes that occurred from 1851 to 2016.

### Importing the Data

In [586]:
# Import python modules
import pandas as pd
import numpy as np

In [587]:
# Load the raw text file: strip each line, then split it on commas
with open('hurdat2-1851-2016-041117.csv', 'r') as f:
    file_data = [fields.split(',') for fields in map(str.strip, f)]
# Wrap the parsed rows in a data frame (columns get positional integer labels)
df = pd.DataFrame(file_data)

### Cleaning the Data

In [588]:
# Delete columns with missing or incomplete data (positional labels 2 and 7-20)
df.drop([2] + list(range(7, 21)), axis='columns', inplace=True)
# Renaming columns: take an independent copy of the six remaining columns and
# label them. This replaces the former np.matrix round-trip (a class NumPy no
# longer recommends) while producing the same frame.
data = df.copy()
data.columns = ['date', 'time', 'status', 'lat', 'lon', 'max_wind']

In [589]:
# Encoding new events
def time_convert(time_str):
    """Parse a whitespace-padded string as an integer.

    Returns the integer value, or ``None`` when ``time_str`` has no usable
    ``strip`` method or its stripped text is not a valid integer literal.
    """
    try:
        return int(time_str.strip())
    except (AttributeError, TypeError, ValueError):
        # Best-effort conversion: unparseable input maps to None instead of
        # raising, but KeyboardInterrupt / SystemExit are no longer swallowed.
        return None
# Encode new-event rows with sentinel values; strip padding everywhere else.
# A date field starting with 'A' becomes '99999999', an empty status becomes
# 'NE', and missing lat / lon / max_wind become '00.0N' / '00.0W' / 0.
data['date'] = data['date'].apply(lambda d: d.strip() if d[0] != 'A' else '99999999')
data['time'] = data['time'].apply(lambda t: t.strip())
data['status'] = data['status'].apply(lambda s: s.strip() if s != '' else 'NE')
data['lat'] = data['lat'].apply(lambda v: v.strip() if v is not None else '00.0N')
data['lon'] = data['lon'].apply(lambda v: v.strip() if v is not None else '00.0W')
data['max_wind'] = data['max_wind'].apply(lambda w: int(w.strip()) if w is not None else 0)


In [590]:
# Only 'North' latitudes in the data set:
# gather the distinct final characters of the latitude strings and show them
lst = {lat[-1] for lat in data.lat if lat is not None}
print(lst)

{'N'}


In [591]:
# 'West' and 'East' longitudes in the data set:
# gather the distinct final characters of the longitude strings and show them
lst = {lon[-1] for lon in data.lon if lon is not None}
print(lst)

{'W', 'E'}


In [592]:
# Converting latitude to float data type: strip padding, drop the trailing
# hemisphere letter, and parse what remains as a number
data['lat'] = data['lat'].apply(lambda raw: float(raw.strip()[:-1]))

In [593]:
# Converting longitude to float data type
def lon_convert(lon):
    """Convert a longitude string such as ``'94.8W'`` to a signed float.

    Surrounding whitespace is ignored. The final character is treated as the
    hemisphere letter: a value ending in ``'W'`` is returned negative, any
    other suffix leaves the number positive.
    """
    text = lon.strip()
    # Read the hemisphere letter first so an empty string fails on indexing
    westward = text[-1] == 'W'
    magnitude = float(text[:-1])
    return -magnitude if westward else magnitude

# Replace every longitude string with its signed float value
data['lon'] = data['lon'].apply(lon_convert)

In [594]:
# Identifying noise in the time column
def time_filter(t):
    """Return ``t`` unchanged when ``int(t)`` succeeds, otherwise ``'9999'``.

    The value is only validated, not converted: a parseable input is handed
    back exactly as received. Anything ``int()`` rejects is replaced by the
    sentinel string ``'9999'``.
    """
    try:
        int(t)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as noise; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        return '9999'
    return t

# Replace every non-numeric time entry with the '9999' sentinel
data['time'] = data['time'].apply(time_filter)

### Engineering Features

In [595]:
# Split the 8-character date string into integer year (chars 0-3),
# month (chars 4-5) and day (chars 6-7) columns
data.loc[:, 'year'] = data['date'].apply(lambda d: int(d[:4]))
data.loc[:, 'month'] = data['date'].apply(lambda d: int(d[4:6]))
data.loc[:, 'day'] = data['date'].apply(lambda d: int(d[6:8]))
# The raw date string is no longer needed once its parts are extracted
data.drop('date', axis=1, inplace=True)

In [596]:
# Split the 4-character time string into integer hour (chars 0-1)
# and minute (chars 2-3) columns
data.loc[:, 'hour'] = data['time'].apply(lambda t: int(t[:2]))
data.loc[:, 'minute'] = data['time'].apply(lambda t: int(t[2:4]))
# The raw time string is no longer needed once its parts are extracted
data.drop('time', axis=1, inplace=True)

In [597]:
# Convert current time to an hour decimal (hour + minute / 60)
time_hour_decimal = np.array(data.hour) + np.array(data.minute) / 60
# Time delta in hours between each row and the row before it.
# np.roll(..., 1) shifts the values down by one position and wraps around, so
# the first row is compared against the last row of the table.
time_delta = time_hour_decimal - np.roll(time_hour_decimal, 1)
# Correcting time deltas that occur across midnight: a negative difference has
# 24 hours added. Only hour and minute are used here, not the date.
# NOTE(review): rows whose time was replaced by the '9999' sentinel become
# hour=99 / minute=99, so deltas next to those rows are not real elapsed
# times -- confirm this is acceptable downstream.
time_delta = [24 + x if x < 0 else x for x in time_delta]
# Store the deltas as a column; the plain list is also reused by the
# velocity / acceleration computations that follow
data.loc[:, 'time_delta'] = time_delta

In [598]:
# Create latitude and longitude rate-of-change features.
# Velocity: change in position from the previous row divided by time_delta.
# Acceleration: change in velocity from the previous row divided by
# time_delta, rounded to 8 decimal places.
# np.roll(..., 1) wraps around, so the first row is differenced against the
# last row of the table.
data.loc[:, 'lat_vel'] = (np.array(data.lat) - np.roll(np.array(data.lat), 1)) / time_delta
data.loc[:, 'lat_acc'] = np.round((np.array(data.lat_vel) - 
                          np.roll(np.array(data.lat_vel), 1)) / time_delta, 8)
data.loc[:, 'lon_vel'] = (np.array(data.lon) - np.roll(np.array(data.lon), 1)) / time_delta
data.loc[:, 'lon_acc'] = np.round((np.array(data.lon_vel) - 
                          np.roll(np.array(data.lon_vel), 1)) / time_delta, 8)
# Filter out bogus results: any value whose magnitude exceeds 3 is set to 0.
# The accelerations above were already computed from the unfiltered velocities.
# NOTE(review): a NaN (e.g. from 0 / 0 where time_delta is 0) does not satisfy
# abs(x) > 3 and would be left in place -- confirm whether that can occur.
data.lat_vel = data.lat_vel.apply(lambda x: 0 if abs(x) > 3 else x)
data.lat_acc = data.lat_acc.apply(lambda x: 0 if abs(x) > 3 else x)
data.lon_vel = data.lon_vel.apply(lambda x: 0 if abs(x) > 3 else x)
data.lon_acc = data.lon_acc.apply(lambda x: 0 if abs(x) > 3 else x)

In [599]:
# Capture transitions in storm's status: pair each row with the status of the
# row before it (np.roll wraps, so the first row receives the last row's status)
data.loc[:, 'prev_status'] = np.roll(np.array(data['status']), shift=1)

In [600]:
# Create binary categorical columns: one indicator column per distinct value
# of status, prev_status, month, day and hour, appended to the right of data.
# A prefix of None keeps the bare category value as the column name.
# NOTE(review): status and prev_status are both encoded without a prefix, so
# their indicator columns share names -- confirm this is intended.
data = pd.concat(
    [data] + [
        pd.get_dummies(data[column], prefix=prefix)
        for column, prefix in [('status', None),
                               ('prev_status', None),
                               ('month', 'month'),
                               ('day', 'day'),
                               ('hour', 'hour')]
    ],
    axis='columns')
# Deleting redundant columns now that their indicator columns exist
data = data.drop(['status', 'prev_status', 'month', 'day', 'hour'], axis='columns')

In [617]:
# Next position (lat, lon): row i holds the position recorded in row i + 1.
# Rolling by -1 along axis 0 shifts whole rows up by one; the shift wraps, so
# the last row receives the first row's position.
# The second column is named 'lon_pred' (it was a duplicate 'lat_pred').
label = pd.DataFrame(np.roll(np.array(data[['lat', 'lon']]), -1, axis=0),
                     columns=['lat_pred', 'lon_pred'])

In [618]:
# Display the next-position label frame
label

Unnamed: 0,lat_pred,lat_pred.1
0,28.0,-94.8
1,28.0,-95.4
2,28.0,-96.0
3,28.1,-96.5
4,28.2,-96.8
5,28.2,-97.0
6,28.3,-97.6
7,28.4,-98.3
8,28.6,-98.9
9,29.0,-99.4
