In [1]:
# Packages
import pandas as pd
import torch
from torch.autograd import Variable
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
%matplotlib inline
import pylab
import math
pylab.rcParams['figure.figsize'] = (8.25, 6)

In [2]:
# Load the training set into a DataFrame
# NOTE(review): hard-coded absolute path, and it uses a different root
# directory than the cross-validation / test set paths -- confirm it is intended
train_csv_path = "/data/raw/tor_train_set.csv"
tor_df = pd.read_csv(train_csv_path)

In [3]:
# Separating variables
# The outcome: select the target by column label rather than by position, so a
# reordered or extended CSV cannot silently swap in a different column.
# The list-style selector keeps the result a one-column DataFrame.
tornado_outcome = tor_df.loc[:, ['DAMAGE_PROPERTY']]


# Make sure variable is right (displays the selected column names)
list(tornado_outcome)

['DAMAGE_PROPERTY']

In [4]:
# Candidate predictors: every column from position 3 onward
candidate_predictors = tor_df.iloc[:, 3:]

# Columns that are irrelevant to this model type
irrelevant_columns = [
    'BEGIN_LAT', 'BEGIN_LON',
    'OPEN_WATER_PROP',
    'DEV_OPEN_PROP', 'DEV_LOW_PROP', 'DEV_MED_PROP', 'DEV_HIGH_PROP',
    'BARREN_LAND_PROP',
    'DECID_FOREST_PROP', 'EVERGR_FOREST_PROP', 'MIXED_FOREST_PROP',
    'SHRUB_SCRUB_PROP', 'GRASS_LAND_PROP', 'PASTURE_HAY_PROP',
    'CULT_CROPS_PROP',
    'WOOD_WETLAND_PROP', 'HERB_WETLAND_PROP',
    'MOB_HOME_DENS', 'POP_DENS', 'INCOME',
    'TOT_DEV_INT', 'TOT_WOOD_AREA', 'WOOD_DEV_INT',
    'STATE_RANK', 'EXP_INC_AREA',
]

# Keep everything except the irrelevant columns
tornado_predictors = candidate_predictors.drop(columns=irrelevant_columns)


# Display the remaining column names as a sanity check
list(tornado_predictors)

['DURATION_SECONDS',
 'TOR_LENGTH',
 'TOR_WIDTH',
 'YEAR',
 'MULTI_VORT_IND',
 'TOR_AREA',
 'TIME_SPLINE_1',
 'TIME_SPLINE_2',
 'TIME_SPLINE_3',
 'TIME_SPLINE_4',
 'TIME_SPLINE_5',
 'TIME_SPLINE_6',
 'TIME_SPLINE_7',
 'TIME_SPLINE_8',
 'JULIAN_SPLINE_1',
 'JULIAN_SPLINE_2',
 'JULIAN_SPLINE_3',
 'JULIAN_SPLINE_4',
 'JULIAN_SPLINE_5',
 'JULIAN_SPLINE_6',
 'JULIAN_SPLINE_7',
 'JULIAN_SPLINE_8',
 'JULIAN_SPLINE_9',
 'JULIAN_SPLINE_10',
 'JULIAN_SPLINE_11',
 'JULIAN_SPLINE_12']

In [5]:
# Pull the underlying numpy arrays out of the training DataFrames
outcome_array = tornado_outcome.values
predictors_array = tornado_predictors.values


# Wrap each array in a torch Tensor and cast it to a FloatTensor in one step
# (the original comments describe the un-cast tensors as DoubleTensors)
outcome_Tensor = torch.from_numpy(outcome_array).float()
predictors_Tensor = torch.from_numpy(predictors_array).float()

In [6]:
# Import cross-validation data
cv_df = pd.read_csv("/home/jeremy/github_tornadoesr/data/raw/tor_cv_set.csv")


# Get the outcome by column label rather than by position, so a reordered or
# extended CSV cannot silently swap in a different column.
# The list-style selector keeps the result a one-column DataFrame.
cv_outcome = cv_df.loc[:, ['DAMAGE_PROPERTY']]

# Convert the DataFrame to a ndarray and then into a FloatTensor
cv_outcome_Tensor = torch.from_numpy(cv_outcome.values).float()


# Make sure the variable is right (displays the selected column names)
list(cv_outcome)

['DAMAGE_PROPERTY']

In [7]:
# Names of the predictor columns to take from the validation set
cv_predictor_names = [
    'DURATION_SECONDS', 'TOR_LENGTH', 'TOR_WIDTH',
    'YEAR', 'MULTI_VORT_IND', 'TOR_AREA',
    'TIME_SPLINE_1', 'TIME_SPLINE_2', 'TIME_SPLINE_3', 'TIME_SPLINE_4',
    'TIME_SPLINE_5', 'TIME_SPLINE_6', 'TIME_SPLINE_7', 'TIME_SPLINE_8',
    'JULIAN_SPLINE_1', 'JULIAN_SPLINE_2', 'JULIAN_SPLINE_3',
    'JULIAN_SPLINE_4', 'JULIAN_SPLINE_5', 'JULIAN_SPLINE_6',
    'JULIAN_SPLINE_7', 'JULIAN_SPLINE_8', 'JULIAN_SPLINE_9',
    'JULIAN_SPLINE_10', 'JULIAN_SPLINE_11', 'JULIAN_SPLINE_12',
]

# Select those columns, in that order, by label
cv_predictors = cv_df.loc[:, cv_predictor_names]

# DataFrame -> numpy array -> torch FloatTensor
cv_predictors_Tensor = torch.from_numpy(cv_predictors.values).float()


# Display the selected column names as a sanity check
list(cv_predictors)

['DURATION_SECONDS',
 'TOR_LENGTH',
 'TOR_WIDTH',
 'YEAR',
 'MULTI_VORT_IND',
 'TOR_AREA',
 'TIME_SPLINE_1',
 'TIME_SPLINE_2',
 'TIME_SPLINE_3',
 'TIME_SPLINE_4',
 'TIME_SPLINE_5',
 'TIME_SPLINE_6',
 'TIME_SPLINE_7',
 'TIME_SPLINE_8',
 'JULIAN_SPLINE_1',
 'JULIAN_SPLINE_2',
 'JULIAN_SPLINE_3',
 'JULIAN_SPLINE_4',
 'JULIAN_SPLINE_5',
 'JULIAN_SPLINE_6',
 'JULIAN_SPLINE_7',
 'JULIAN_SPLINE_8',
 'JULIAN_SPLINE_9',
 'JULIAN_SPLINE_10',
 'JULIAN_SPLINE_11',
 'JULIAN_SPLINE_12']

In [8]:
# Import the test set data
test_df = pd.read_csv("/home/jeremy/github_tornadoesr/data/raw/tor_test_set.csv")


# Get the outcome by column label rather than by position, so a reordered or
# extended CSV cannot silently swap in a different column.
# The list-style selector keeps the result a one-column DataFrame.
test_outcome = test_df.loc[:, ['DAMAGE_PROPERTY']]

# Convert the DataFrame to a ndarray and then into a FloatTensor
test_outcome_Tensor = torch.from_numpy(test_outcome.values).float()


# Make sure the variable is right (displays the selected column names)
list(test_outcome)

['DAMAGE_PROPERTY']

In [9]:
# Names of the predictor columns to take from the test set
test_predictor_names = [
    'DURATION_SECONDS', 'TOR_LENGTH', 'TOR_WIDTH',
    'YEAR', 'MULTI_VORT_IND', 'TOR_AREA',
    'TIME_SPLINE_1', 'TIME_SPLINE_2', 'TIME_SPLINE_3', 'TIME_SPLINE_4',
    'TIME_SPLINE_5', 'TIME_SPLINE_6', 'TIME_SPLINE_7', 'TIME_SPLINE_8',
    'JULIAN_SPLINE_1', 'JULIAN_SPLINE_2', 'JULIAN_SPLINE_3',
    'JULIAN_SPLINE_4', 'JULIAN_SPLINE_5', 'JULIAN_SPLINE_6',
    'JULIAN_SPLINE_7', 'JULIAN_SPLINE_8', 'JULIAN_SPLINE_9',
    'JULIAN_SPLINE_10', 'JULIAN_SPLINE_11', 'JULIAN_SPLINE_12',
]

# Select those columns, in that order, by label
test_predictors = test_df.loc[:, test_predictor_names]

# DataFrame -> numpy array -> torch FloatTensor
test_predictors_Tensor = torch.from_numpy(test_predictors.values).float()


# Display the selected column names as a sanity check
list(test_predictors)

['DURATION_SECONDS',
 'TOR_LENGTH',
 'TOR_WIDTH',
 'YEAR',
 'MULTI_VORT_IND',
 'TOR_AREA',
 'TIME_SPLINE_1',
 'TIME_SPLINE_2',
 'TIME_SPLINE_3',
 'TIME_SPLINE_4',
 'TIME_SPLINE_5',
 'TIME_SPLINE_6',
 'TIME_SPLINE_7',
 'TIME_SPLINE_8',
 'JULIAN_SPLINE_1',
 'JULIAN_SPLINE_2',
 'JULIAN_SPLINE_3',
 'JULIAN_SPLINE_4',
 'JULIAN_SPLINE_5',
 'JULIAN_SPLINE_6',
 'JULIAN_SPLINE_7',
 'JULIAN_SPLINE_8',
 'JULIAN_SPLINE_9',
 'JULIAN_SPLINE_10',
 'JULIAN_SPLINE_11',
 'JULIAN_SPLINE_12']

In [62]:
# Write the stored predictions/outcomes and the metrics out as .csv files.
# The file names are relative, so they land in the current working directory.
# The three DataFrames are defined outside this excerpt.

# Cross-validation set predictions
stored_cv_pred_df.to_csv(path_or_buf="12_cv_perf.csv")

# Test set predictions
stored_test_pred_df.to_csv(path_or_buf="12_test_perf.csv")

# Metrics table
metrics_df.to_csv(path_or_buf="12_metrics.csv")