In [None]:
# Plot the classifier's performance (presumably on the training data).
# NOTE(review): plot_train_perf is defined outside this section, so the meaning
# of each positional argument has to be confirmed against its definition.
plot_train_perf(
    predictors_Tensor,
    outcome_Tensor_binary,
    classifier,
    0.0025,
    True,
    -0.05,
    1.05,
    'Observed Label',
    'Probability of Label 1',
    False,
)

In [None]:
# Turn the classifier's predicted probabilities into hard 0/1 labels by rounding
classifier_pred = classifier(Variable(predictors_Tensor, requires_grad = False)).data.numpy().round()

# Element-wise flag: does each hard label match the observed label?
number_correct_preds = (classifier_pred == outcome_Tensor_binary.numpy())

# Accuracy is the share of matches down the rows;
# [0] picks the first column of the column-wise result
train_accuracy = number_correct_preds.mean(axis = 0)[0]


# Report it in words
'Training set accuracy is: ' + str(train_accuracy)

In [None]:
# Hard 0/1 labels for the cross-validation set: round the predicted probabilities
cv_classifier_pred = classifier(Variable(cv_predictors_Tensor, requires_grad = False)).data.numpy().round()

# Element-wise flag: does each hard label match the observed label?
number_correct_cv_preds = (cv_classifier_pred == cv_outcome_Tensor_binary.numpy())

# Accuracy is the share of matches down the rows;
# [0] picks the first column of the column-wise result
cv_accuracy = number_correct_cv_preds.mean(axis = 0)[0]


# Report it in words
'Cross-validation set accuracy is: ' + str(cv_accuracy)

In [None]:
# Hard 0/1 labels for the test set: round the predicted probabilities
test_classifier_pred = classifier(Variable(test_predictors_Tensor, requires_grad = False)).data.numpy().round()

# Element-wise flag: does each hard label match the observed label?
number_correct_test_preds = (test_classifier_pred == test_outcome_Tensor_binary.numpy())

# Accuracy is the share of matches down the rows;
# [0] picks the first column of the column-wise result
test_accuracy = number_correct_test_preds.mean(axis = 0)[0]


# Report it in words
'Test set accuracy is: ' + str(test_accuracy)

In [None]:
# False-positive and true-positive rates on the test set (scikit-learn),
# scoring each observation with the classifier's predicted probability
fpr_a, tpr_a, _ = roc_curve(
    y_true = test_outcome_Tensor_binary.numpy(),
    y_score = classifier(Variable(test_predictors_Tensor, requires_grad = False)).data.numpy()
)

# Area under the ROC curve
area_under = auc(x = fpr_a, y = tpr_a)


# Draw the ROC curve, plus the diagonal (x = y) as a reference line
axes = plt.gca()
axes.plot(fpr_a, tpr_a, 'deeppink')
axes.plot(fpr_a, fpr_a, 'black')
axes.set_xlabel('False Positive Rate', size = 14)
axes.set_ylabel('True Positive Rate', size = 14)
axes.set_title('ROC/AUC Test Set Performance', size = 16)
axes.text(0.26, 0.05, 'Area under Curve = %s'%(area_under), size = 13)
axes.set_xlim([0, 1])
axes.set_ylim([0, 1]);

In [None]:
# Predicted probability against the true binary outcome for every test
# observation; the very low alpha keeps heavily over-plotted regions readable
plt.scatter(x = test_outcome_Tensor_binary.numpy(),
            y = classifier(Variable(test_predictors_Tensor, requires_grad = False)).data.numpy(),
            facecolor = 'k',
            alpha = 0.0075)
axes = plt.gca()

# Only the two outcome classes get tick marks on the x axis
axes.set_xticks([0, 1])
axes.set_xticklabels(['\nNo damage', '\nDamage'])
axes.set_xlim(-0.05, 1.05)
axes.set_ylim(0, 1)
axes.grid(False)
axes.set_xlabel('True Outcome', size = 14)
axes.set_ylabel('Predicted Probability of Damage', size = 14)

# Dashed line at the 0.5 probability level
axes.axhline(y = 0.5, color = 'red', linestyle = '--')

## 3. Get and evaluate the test set expected values

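Import the unprocessed data to recover the mean and standard deviation of the log-transformed damages, which are needed to undo the standardization applied for modelling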

In [None]:
# Unprocessed tornado records.
# NOTE(review): absolute, machine-specific path - this cell only runs where
# that location exists; consider a configurable data directory.
unproc_tor_df = pd.read_csv("/home/jeremydiaz/tornadoesr/data/raw/tor_data_with_derived.csv")

# log(damage + 1), computed once and reused for both summary statistics
unproc_log_dam = np.log(unproc_tor_df['DAMAGE_PROPERTY'] + 1)

# Mean and standard deviation of the log damages; later cells use them to
# un-standardise model-scale values
mean_log_dam = np.mean(unproc_log_dam)
stand_dev_log_dam = np.std(unproc_log_dam)

Computations

In [None]:
# Conditional predictions
# Model-scale output for the test predictors
test_conditional_predictions_raw = model(Variable(test_predictors_Tensor, requires_grad = False))

# Back-transform in a single expression, innermost step first:
#   tensor -> numpy
#   -> un-standardise (natural-log scale)
#   -> exponentiate (natural scale)
#   -> log-10 scale
test_conditional_predictions = np.log10(
    np.exp(
        (test_conditional_predictions_raw.data.numpy() * stand_dev_log_dam) + mean_log_dam
    )
)


# Probabilities
# Classifier output for the same test predictors, converted to numpy
test_probabilities_raw = classifier(Variable(test_predictors_Tensor, requires_grad = False))
test_probabilities = test_probabilities_raw.data.numpy()


# Expected values
# Each conditional prediction weighted by its predicted probability
test_expected_values = test_conditional_predictions * test_probabilities

Save them

In [None]:
# Put the observed damages on the same log-10 scale as the expected values:
# un-standardise (natural-log scale) -> exponentiate (natural scale) -> log-10.
# NOTE: this overwrites test_df['DAMAGE_PROPERTY'] in place and the concat below
# prepends a column, so run this cell exactly once per kernel session;
# re-running it rescales the column again and adds a duplicate EXPECTED_VALUE.
test_df['DAMAGE_PROPERTY'] = np.log10(
    np.exp((test_df['DAMAGE_PROPERTY'] * stand_dev_log_dam) + mean_log_dam)
)



# Get those expected values into the DataFrame and save it.
# The expected values are placed on test_df's own index: pd.concat(axis = 1)
# aligns rows by index label, so a fresh 0..n-1 index would misalign the rows
# (and introduce NaNs) whenever test_df does not carry a default index.
# A length mismatch now raises immediately instead of silently padding with NaN.
test_ev_df = pd.DataFrame(test_expected_values,
                          index = test_df.index,
                          columns = ['EXPECTED_VALUE'])
test_df = pd.concat([test_ev_df, test_df], axis = 1)

# File name kept exactly as spelled, in case other code reads it by this name
test_df.to_csv('test_with_expectated_values.csv')

In [None]:
# Draw one uniform jitter value per row
# (unseeded, so the draws differ between runs)
test_df['JITTER'] = np.random.uniform(low = 0.05, high = 0.75, size = len(test_df['DAMAGE_PROPERTY']))

# Keep the jitter only for rows sitting at the minimum observed value;
# every other row gets a jitter of exactly zero
test_df['JITTER'] = test_df['JITTER'].where(
    test_df['DAMAGE_PROPERTY'] == test_df['DAMAGE_PROPERTY'].min(),
    0.0
)

In [None]:
# Store each observation's predicted probability; it colours the points below
test_df['PROB'] = test_probabilities

# Expected value against the (jittered) observed value
plt.scatter(x = test_df['DAMAGE_PROPERTY'] + test_df['JITTER'],
            y = test_df['EXPECTED_VALUE'],
            c = test_df['PROB'])
axes = plt.gca()
axes.set_xlabel('Observed Value', size = 14)
axes.set_ylabel('Expected Value', size = 14)
axes.grid(False)
axes.set_xlim([-0.1, 9.5])
axes.set_ylim([-0.1, 9.5])

# Dashed 1:1 line: points on it have expected value equal to observed value
axes.plot([-0.5, 9.5], [-0.5, 9.5], c = 'grey', linestyle = '--');

In [None]:
# Expected and observed values (both in log-10 scale) as float tensors.
# The explicit .float() on the predictions mirrors the observed values, so both
# MSELoss arguments share a dtype however pandas stored EXPECTED_VALUE.
log10_predictions = torch.from_numpy(test_df['EXPECTED_VALUE'].values).float()
log10_observed = torch.from_numpy(test_df['DAMAGE_PROPERTY'].values).float()

# Baseline for the total sum of squares: the mean observed value, repeated once
# per test observation (vectorised mean instead of a Python-level sum)
test_observed_mean_log10 = test_df['DAMAGE_PROPERTY'].values.mean()

test_outcomes_mean_log10 = np.repeat(test_observed_mean_log10, len(test_df['DAMAGE_PROPERTY']))

test_outcomes_mean_log10 = Variable(torch.from_numpy(test_outcomes_mean_log10).float())


# size_average = False makes the loss a sum of squared errors, not a mean
loss_fn = torch.nn.MSELoss(size_average = False)
# Total sum of squares: observed values against their own mean
TSS = loss_fn(test_outcomes_mean_log10, Variable(log10_observed))
# Residual sum of squares: observed values against the expected values
RSS = loss_fn(Variable(log10_predictions), Variable(log10_observed))

R_squared = 1 - (RSS / TSS)

In [None]:
# Mean squared error: the residual sum of squares divided by the number of
# test observations
log10_MSE = RSS.data.numpy()[0] / test_df['DAMAGE_PROPERTY'].shape[0]
print("MSE in log-10 scale is %0.6f" % (log10_MSE,))

In [None]:
# Report R-squared (1 - RSS / TSS); fixes the "R-sqaured" typo in the message
print("R-squared in log-10 scale is %0.6f" % R_squared.data.numpy())

In [None]:
# Expected value against the conditional prediction for each test observation;
# hollow, semi-transparent markers keep dense regions readable
plt.scatter(x = test_conditional_predictions,
            y = test_df['EXPECTED_VALUE'],
            alpha = 0.2,
            edgecolor = 'k',
            facecolor = 'none')
axes = plt.gca()
axes.set_xlabel("Conditional Predictions", size = 15)
axes.set_ylabel("Expected Value", size = 15)
axes.set_xlim([0, 8])
axes.set_ylim([0, 8])

# 1:1 reference line: points on it have equal expected value and conditional prediction
axes.plot([-1, 9.5], [-1, 9.5]);

In [None]:
# Predicted probability against the conditional prediction for each test
# observation; hollow, semi-transparent markers keep dense regions readable
plt.scatter(x = test_conditional_predictions,
            y = test_probabilities,
            alpha = 0.2,
            edgecolor = 'k',
            facecolor = 'none')
axes = plt.gca()
axes.set_xlabel("Conditional Prediction", size = 15)
axes.set_ylabel("Probability of Damage", size = 15)
axes.set_xlim([0, 8])
axes.set_ylim([0, 1.05]);

In [None]:
# Explain the low alpha values used in the plots above. The count is an integer,
# so it is formatted with %d; the previous %0.6f printed it with six spurious
# decimal places (e.g. "12345.000000 data points").
print("Having to use very low alpha values because there are %d data points"
      % len(test_conditional_predictions))

## 4. Get the expected values for 2018

Get the gridded and cities DataFrames of assumed storm characteristics, true geographic values, and potential dates/times