# North Atlantic Hurricane Basin Predictor

![alt text](USA_zones.png)
<center>(Figure 1) Hurricane Zones</center>

Import modules

In [104]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler

Import data

In [28]:
df = pd.read_csv('./data_preprocessed.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

Set independent and dependent variables

In [127]:
df = df.sample(n=10000)
ros = RandomOverSampler(random_state=42)
X = df.drop(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'], axis=1)
y = df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']]

Split data into training and test sets

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    shuffle=True)

### Zone "G"

Balancing classes and standardize independent variables

Balancing classes greatly improved F1 score

In [122]:
# Oversamping 'hits' to balance classes
X_trainG, y_trainG = ros.fit_sample(X_train, y_train.G)    
scaler = StandardScaler()
X_trainG = scaler.fit_transform(X_trainG)
X_test = scaler.transform(X_test)

Baseline for a storm striking zone "G" (Southern Florida)

In [81]:
1 - y.G.sum() / y.G.count()

0.9041

The classes aren't balanced with 90.2% of storm events missing zone "G"

Predictive model

In [82]:
clf = LogisticRegression()
print(cross_val_score(clf, X_trainG, y_trainG, cv=3, n_jobs=-1).mean())
print(cross_val_score(clf, X_trainG, y_trainG, scoring='f1', cv=3, n_jobs=-1).mean())

0.8628449166297772
0.8705360141671176


In [102]:
clf = RandomForestClassifier()
print(cross_val_score(clf, X_trainG, y_trainG, cv=3, n_jobs=-1).mean())
print(cross_val_score(clf, X_trainG, y_trainG, scoring='f1', cv=5, n_jobs=-1).mean())

0.9856131031429837
0.9908617169368563


In [84]:
clf = SVC()
print(cross_val_score(clf, X_trainG, y_trainG, cv=3, n_jobs=-1).mean())
print(cross_val_score(clf, X_trainG, y_trainG, scoring='f1', cv=3, n_jobs=-1).mean())

0.9454035709015788
0.9478700642555117


In [85]:
clf = KNeighborsClassifier()
print(cross_val_score(clf, X_trainG, y_trainG, cv=3, n_jobs=-1).mean())
print(cross_val_score(clf, X_trainG, y_trainG, scoring='f1', cv=3, n_jobs=-1).mean())

0.9182529142688506
0.923994035873223


In [87]:
clf = AdaBoostClassifier()
print(cross_val_score(clf, X_trainG, y_trainG, cv=3, n_jobs=-1).mean())
print(cross_val_score(clf, X_trainG, y_trainG, scoring='f1', cv=3, n_jobs=-1).mean())

0.7871477054743986
0.7891001209641132


After at glance at the different classifiers, Random Forest jumps out as a strong candidate

In [129]:
parms = {
    'n_estimators': [46],
    'max_depth': [97],
}
model = RandomForestClassifier()
clf = GridSearchCV(model, parms, cv=5, scoring='f1', n_jobs=-1, verbose=1, error_score=0)
clf.fit(X_trainG, y_trainG)
print(clf.best_score_, clf.best_estimator_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.9s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished


0.9920738783985297 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=97, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=46, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [130]:
metrics.f1_score(y_test.G, clf.predict(X_test))

  'precision', 'predicted', average, warn_for)


0.0