In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import calendar
import seaborn as sns
import sqlite3

In [2]:
# import dataset
cnx = sqlite3.connect("FPA_FOD_20170508.sqlite")
kaggle = pd.read_sql_query("SELECT * FROM Fires LIMIT 10000", cnx)
print("Original Kaggle shape: ", np.shape(kaggle))
# print("Dimensions: ", kaggle.columns.values.tolist())

Original Kaggle shape:  (10000, 39)


In [3]:
# removing some columns
kaggle = kaggle.drop(['Shape', 'FPA_ID'], axis=1)


le = LabelEncoder()
kaggle['SOURCE_SYSTEM_TYPE'] = kaggle[['SOURCE_SYSTEM_TYPE']].apply(le.fit_transform)
kaggle[['SOURCE_SYSTEM']] = kaggle[['SOURCE_SYSTEM']].apply(le.fit_transform)
kaggle[['NWCG_REPORTING_AGENCY']] = kaggle[['NWCG_REPORTING_AGENCY']].apply(le.fit_transform)
kaggle[['NWCG_REPORTING_UNIT_ID']] = kaggle[['NWCG_REPORTING_UNIT_ID']].apply(le.fit_transform)
kaggle[['NWCG_REPORTING_UNIT_NAME']] = kaggle[['NWCG_REPORTING_UNIT_NAME']].apply(le.fit_transform)
kaggle[['SOURCE_REPORTING_UNIT_NAME']] = kaggle[['SOURCE_REPORTING_UNIT_NAME']].apply(le.fit_transform)
kaggle[['FIRE_SIZE_CLASS']] = kaggle[['FIRE_SIZE_CLASS']].apply(le.fit_transform)
kaggle[['OWNER_DESCR']] = kaggle[['OWNER_DESCR']].apply(le.fit_transform)
kaggle[['STATE']] = kaggle[['STATE']].apply(le.fit_transform)
kaggle[['FIPS_NAME']] = kaggle[['FIPS_NAME']].fillna(value="Unknown").apply(le.fit_transform)



kaggle = kaggle.drop(['LOCAL_FIRE_REPORT_ID', 'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME', 'ICS_209_INCIDENT_NUMBER', 'ICS_209_NAME', 'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME', 'STAT_CAUSE_CODE'], axis=1)

kaggle = kaggle.drop(['SOURCE_REPORTING_UNIT'], axis=1)
kaggle['COUNTY'] = pd.to_numeric(kaggle['COUNTY'])
kaggle['FIPS_CODE'] = pd.to_numeric(kaggle['FIPS_CODE'])
kaggle['DISCOVERY_TIME'] = pd.to_numeric(kaggle['DISCOVERY_TIME'])
kaggle['CONT_TIME'] = pd.to_numeric(kaggle['CONT_TIME'])


# Could not find 'STATE_CAUSE_DESCR' to drop
print("New Kaggle shape", np.shape(kaggle))

New Kaggle shape (10000, 26)


In [4]:
# print(kaggle['STAT_CAUSE_DESCR'])
# kaggle.head(5)
# convert the cause to numbers
unique = np.unique(kaggle['STAT_CAUSE_DESCR'])
desc2index = {v: k for k, v in enumerate(unique)}
index2desc = {k: v for k, v in enumerate(desc2index)}
kaggle['STAT_CAUSE_DESCR'] = kaggle['STAT_CAUSE_DESCR'].map(desc2index)
#Remove any Nan or numbers with too large of a magnitude
kaggle = kaggle.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
#Get the labels
labels = np.array(kaggle['STAT_CAUSE_DESCR'])
#drop the labels from the data
kaggle= kaggle.drop('STAT_CAUSE_DESCR', axis = 1)
# print("Dimensions: ", kaggle.columns.values.tolist())
feature_list = list(kaggle.columns)
kaggle = np.array(kaggle)
print("New Kaggle shape", np.shape(kaggle))
print("New labels shape", np.shape(labels))


New Kaggle shape (10000, 20)
New labels shape (10000,)


In [5]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(kaggle, labels, test_size = 0.25)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (7500, 20)
Training Labels Shape: (7500,)
Testing Features Shape: (2500, 20)
Testing Labels Shape: (2500,)


In [6]:
# The baseline predictions uses the mode of the labels
counts = np.bincount(train_labels)
mode = np.argmax(counts)
baseline_preds = np.ones_like(test_labels) * mode
# print(np.shape(baseline_preds), baseline_preds)
# print(np.shape(test_labels), test_labels)
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
correct = np.sum(baseline_preds == test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))
print("correct: ", correct, "wrong: ", np.size(test_labels) - correct)
print("---------")
print("Accuracy: ", correct / np.size(test_labels))

Average baseline error:  1.5
correct:  1205 wrong:  1295
---------
Accuracy:  0.482


In [7]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
maxAccuracy = 0.0
maxAccuracyDepth = 0.0
maxAccuracyEstimators = 0.0
maxAccuracyFeatures = 0.0
for i in range(0, 1001, 5): 
    if (i == 0):
        continue
    for j in range (1, 51, 1):
        for k in range(1, 10, 1):
            rf = RandomForestRegressor(n_estimators = j, max_depth=i, max_features=0.1 * k)
            # Train the model on training data
            rf.fit(train_features, train_labels)
            # Use the forest's predict method on the test data
            predictions = rf.predict(test_features)
            predictions = np.rint(predictions)
            # print(predictions[: 20], "...")
            # print("---------")
            # print(test_labels[: 20], "...")
            # Calculate the absolute errors
            correct = np.sum(predictions == test_labels)
            accuracy = correct / np.size(test_labels)
            # print("max depth ", i, "max estimators", j)
            # print("correct: ", correct, "wrong: ", np.size(test_labels) - correct)
            # print("Accuracy: ", accuracy)
            # print("---------")
            if (accuracy > maxAccuracy):
                maxAccuracy = accuracy
                maxAccuracyDepth = i
                maxAccuracyEstimators = j
                maxAccuracyFeatures = k
            # Print out the mean absolute error (mae)
            # print('Mean Absolute Error:', np.mean(errors))
    print ("Max accuracy of ", maxAccuracy, " given by max_depth of", maxAccuracyDepth, ", n_estimators of ", maxAccuracyEstimators, ", and max_features of ", maxAccuracyFeatures)
print("-----------")
print ("Max accuracy of ", maxAccuracy, " given by max_depth of", maxAccuracyDepth, ", n_estimators of ", maxAccuracyEstimators, ", and max_features of ", maxAccuracyFeatures)

In [8]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with n decision trees
rf = RandomForestRegressor(n_estimators = 2, max_depth=20, max_features=0.7)
# Train the model on training data
rf.fit(train_features, train_labels)

RandomForestRegressor(max_depth=20, max_features=0.7, n_estimators=2)

In [9]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
predictions = np.rint(predictions)
print(predictions[: 20], "...")
print("---------")
print(test_labels[: 20], "...")
# Calculate the absolute errors
correct = np.sum(predictions == test_labels)
print("correct: ", correct, "wrong: ", np.size(test_labels) - correct)
print("---------")
print("Accuracy: ", correct / np.size(test_labels))
# Print out the mean absolute error (mae)
# print('Mean Absolute Error:', np.mean(errors))

[5. 5. 6. 4. 6. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 2. 3. 6. 5. 3.] ...
---------
[5 5 4 6 5 5 5 5 5 1 5 1 5 5 5 6 4 1 5 0] ...
correct:  1226 wrong:  1274
---------
Accuracy:  0.4904


In [10]:
# # Use the forest's predict method on the test data
# predictions = rf.predict(test_features)
# predictions = np.rint(predictions)
# print(predictions[: 20], "...")
# print("---------")
# print(test_labels[: 20], "...")
# # Calculate the absolute errors
# errors = abs(predictions - test_labels)
# print(errors)
# # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', np.mean(errors))

In [11]:
# # Calculate mean absolute percentage error (MAPE)
# mape = 100 * ((errors+1)/ (test_labels + 1))
# print(mape)
# print(np.mean(mape))
# # Calculate and display accuracy
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', accuracy, '%.')