# Hurricane Path Predictor
The HURDAT2 dataset is from the US National Hurricane Center. It is a collection of Atlantic hurricanes that occured from 1851 to 2016.

### Importing the Data

In [673]:
# Import python modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [647]:
# Read .csv file line-by-line
with open('hurdat2-1851-2016-041117.csv', 'r') as f: 
       file_data = [row.strip().split(',') for row in f]
# Convert data file to data frame
df = pd.DataFrame(file_data)

### Cleaning the Data

In [648]:
# Delete columns with missing or incomplete data
df.drop([2] + list(range(7,21)), axis='columns', inplace=True)
# Renaming columns
data = pd.DataFrame(np.matrix(df),
                    columns=['date', 'time', 'status', 'lat', 'lon', 'max_wind'])

In [649]:
# Encoding new events
def time_convert(time_str):
    try:
        return(int(time_str.strip()))
    except:
        pass
data.date = data.date.apply(lambda x: '99999999' if x[0] == 'A' else x.strip())
data.time = data.time.apply(lambda x: x.strip())
data.status = data.status.apply(lambda x: 'NE' if x == '' else x.strip())
data.lat = data.lat.apply(lambda x: '00.0N' if x is None else x.strip())
data.lon = data.lon.apply(lambda x: '00.0W' if x is None else x.strip())
data.max_wind = data.max_wind.apply(lambda x: 0 if x is None else int(x.strip()))


In [650]:
# Only 'North' latitudes in the data set
lst = []
for x in data.lat:
    if x is not None:
        lst.append(x[-1])
lst = set(lst)
print(lst)

{'N'}


In [651]:
# 'West' and 'East' longitudes in the data set
lst = []
for x in data.lon:
    if x is not None:
        lst.append(x[-1])
lst = set(lst)
print(lst)

{'W', 'E'}


In [652]:
# Converting latitude to float data type
data.lat = data.lat.apply(lambda x: float(x.strip()[:-1]))

In [653]:
# Converting longitude to float data type
def lon_convert(lon):
    lon = lon.strip()
    lon_sign = 1
    if lon[-1] == 'W':
        lon_sign = -1
    return(lon_sign * float(lon[:-1]))

data.lon = data.lon.apply(lambda x: lon_convert(x))

In [654]:
# Identifying noise in the time column
def time_filter(t):
    try:
        int(t)
        return(t)
    except:
        return('9999')

data.time = data.time.apply(lambda x: time_filter(x))

### Engineering Features

In [655]:
# Extract date information
data.loc[:, 'year'] = data.date.apply(lambda x: int(x[0:4]))
data.loc[:, 'month'] = data.date.apply(lambda x: int(x[4:6]))
data.loc[:, 'day'] = data.date.apply(lambda x: int(x[6:8]))
data.drop('date', axis='columns', inplace=True)

In [656]:
# Extract time information
data.loc[:, 'hour'] = data.time.apply(lambda x: int(x[0:2]))
data.loc[:, 'minute'] = data.time.apply(lambda x: int(x[2:4]))
data.drop('time', axis='columns', inplace=True)

In [657]:
# Convert current time to an hour decimal
time_hour_decimal = np.array(data.hour) + np.array(data.minute) / 60
# Time delta in hours
time_delta = time_hour_decimal - np.roll(time_hour_decimal, 1)
# Correcting time deltas that occur across midnight
time_delta = [24 + x if x < 0 else x for x in time_delta]
data.loc[:, 'time_delta'] = time_delta

In [658]:
# Created latitude and longitude rate of change features
data.loc[:, 'lat_vel'] = (np.array(data.lat) - np.roll(np.array(data.lat), 1)) / time_delta
# data.loc[:, 'lat_acc'] = np.round((np.array(data.lat_vel) - 
#                           np.roll(np.array(data.lat_vel), 1)) / time_delta, 8)
data.loc[:, 'lon_vel'] = (np.array(data.lon) - np.roll(np.array(data.lon), 1)) / time_delta
# data.loc[:, 'lon_acc'] = np.round((np.array(data.lon_vel) - 
#                           np.roll(np.array(data.lon_vel), 1)) / time_delta, 8)
# Filter out bogus results
data.lat_vel = data.lat_vel.apply(lambda x: 0 if abs(x) > 0.1 else x)
# data.lat_acc = data.lat_acc.apply(lambda x: 0 if abs(x) > 3 else x)
data.lon_vel = data.lon_vel.apply(lambda x: 0 if abs(x) > 0.1 else x)
# data.lon_acc = data.lon_acc.apply(lambda x: 0 if abs(x) > 3 else x)

In [660]:
# Capture transitions in storm's status
data.loc[:, 'prev_status'] = np.roll(np.array(data.status), 1)

In [661]:
# Create binary categorical columns
data = pd.concat([data,
                  pd.get_dummies(data.status),
                  pd.get_dummies(data.prev_status),
                  pd.get_dummies(data.month, prefix='month'),
                  pd.get_dummies(data.day, prefix='day'),
                  pd.get_dummies(data.hour, prefix='hour')],
                axis='columns')
# Deleting redundant columns
data.drop(['status', 'prev_status', 'month', 'day', 'hour'], axis='columns', inplace=True)

In [662]:
# Correct first entry values
data.time_delta = data.time_delta.apply(lambda x: 0 if x < 0 else x)

In [664]:
data.columns

Index(['lat', 'lon', 'max_wind', 'year', 'minute', 'time_delta', 'lat_vel',
       'lon_vel', 'DB', 'EX', 'HU', 'LO', 'NE', 'SD', 'SS', 'TD', 'TS', 'WV',
       'DB', 'EX', 'HU', 'LO', 'NE', 'SD', 'SS', 'TD', 'TS', 'WV', 'month_1',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'month_99',
       'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8',
       'day_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15',
       'day_16', 'day_17', 'day_18', 'day_19', 'day_20', 'day_21', 'day_22',
       'day_23', 'day_24', 'day_25', 'day_26', 'day_27', 'day_28', 'day_29',
       'day_30', 'day_31', 'day_99', 'hour_0', 'hour_1', 'hour_2', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       '

In [665]:
# Next position (lat, lon)
label = pd.DataFrame(np.roll(np.array(data[['lat', 'lon']]), [-1,-1]),
                     columns=['lat_pred', 'lat_pred'])

### Modeling

In [671]:
# Split data into a test set and a training set
X_train, X_test, y_train, y_test = train_test_split(data, label, shuffle=True, test_size=0.2)

In [674]:
# Standardizing the inputs
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [675]:
# Neural network
model = Sequential()
n_input = X_train.shape[1]
n_hidden = n_input
# Topology
model.add(Dense(n_hidden, input_dim=n_input, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
istory = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
              epochs=100, batch_size=None, verbose=0)
train_loss = history.history['loss']
test_loss = history.history['val_loss']
plt.plot(train_loss, label='Training loss')
plt.plot(test_loss, label='Testing loss')
