In [None]:
import sqlite3
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

In [None]:
# Loading the dataset.
# NOTE(review): this is an absolute, machine-specific path. Point DB_PATH at
# your local copy of the SQLite file (ideally relative to a data directory).
DB_PATH = '/Users/bhuvanagopalakrishnabasapur/PycharmProjects/Practise/Assignments/Wildfire_Project/Wildfire_project/FPA_FOD_20170508.sqlite'

# Keep the connection open only for the query and always close it afterwards,
# so the database handle is not held for the rest of the kernel session.
con = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query("SELECT * FROM Fires", con)
finally:
    con.close()

# Display every column instead of truncating wide frames.
pd.set_option('display.max_columns', None)
df.head()

In [None]:
def missing_percentage(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, sorted by missing count (descending), with
        two columns: 'Total' (number of nulls) and 'Percent' (nulls as a
        percentage of the row count, rounded to 2 decimals). For a frame
        with zero rows 'Percent' is 0.0 rather than NaN.
    """
    # Scan for nulls and sort once; the percentage is derived from the same
    # Series so both columns share an identical index order.
    total = df.isnull().sum().sort_values(ascending=False)
    n_rows = len(df)
    if n_rows:
        percent = (total / n_rows * 100).round(2)
    else:
        # Avoid 0/0 -> NaN when there are no rows at all.
        percent = total.astype(float)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [None]:
# Display the per-column missing-value counts and percentages for the loaded frame.
missing_percentage(df)

In [None]:
#Feature Reduction

In [None]:
# Drop the columns whose missing-value share is greater than 70%.
mostly_missing_cols = [
    'COMPLEX_NAME', 'MTBS_ID', 'MTBS_FIRE_NAME',
    'ICS_209_INCIDENT_NUMBER', 'ICS_209_NAME',
    'FIRE_CODE', 'LOCAL_FIRE_REPORT_ID',
]
df = df.drop(columns=mostly_missing_cols)

In [None]:
# Remove the containment / discovery-time columns below; similar attributes
# with no missing values remain in the frame.
# (FIRE_NAME is not dropped here; it is removed in a later cell.)
redundant_time_cols = ['CONT_TIME', 'CONT_DOY', 'CONT_DATE', 'DISCOVERY_TIME']
df = df.drop(columns=redundant_time_cols)

In [None]:
# Remove naming / identifier / reporting-unit columns that are not expected
# to affect the model.
unneeded_cols = [
    'FIRE_NAME', 'LOCAL_INCIDENT_ID',
    'FIPS_NAME',
    'FIPS_CODE', 'NWCG_REPORTING_UNIT_NAME',
    'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_AGENCY',
    'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME',
    'SOURCE_SYSTEM', 'SOURCE_SYSTEM_TYPE',
    'FPA_ID', 'FOD_ID', 'OWNER_CODE', 'OWNER_DESCR',
    'COUNTY',
]
df = df.drop(columns=unneeded_cols)

In [None]:
# The values in the 'Shape' attribute are very long and not human-readable.
# As the shape is not expected to affect the model, remove it as well.
df = df.drop(columns=['Shape'])

In [None]:
# STAT_CAUSE_DESCR is only the text description of STAT_CAUSE_CODE, so the
# code column alone is kept.
df = df.drop(columns=['STAT_CAUSE_DESCR'])

In [None]:
# Preview the frame after the column drops above.
df.head()

In [None]:
# Create the target vector y and the feature matrix X as NumPy arrays.

# Columns used as model inputs, in the order they appear in X.
feature_columns = ['DISCOVERY_DATE', 'DISCOVERY_DOY',
                   'STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE']

y = df['FIRE_SIZE'].values
# Stack each 1-D column side by side into an (n_rows, n_features) array.
X = np.column_stack([df[col].values for col in feature_columns])

print(y[:3])
print(X.shape)

In [None]:
# Standardise every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix before the
# train/test split performed later, so test rows contribute to the scaling
# statistics. Consider fitting on the training split only.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X = sc.transform(X)

In [None]:
# Importing the necessary modules.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error as MAE

In [None]:
#case 1
#Training data size = 90%
#Test data size = 10%

In [None]:
# Hold out 10% of the rows for testing. A fixed seed makes the split, and
# every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42)
print("# Train: {} , #Test: {}".format(X_train.shape[0], X_test.shape[0]))
print("# inputs: {}".format(X_train.shape[1]))
# Number of input features. Note that later cells rebind `n` to a row count.
n = X_train.shape[1]

In [None]:
# Histogram of fire sizes on log-log axes, using 50 log-spaced bin edges
# between 1e-4 and 1e6.
ax = plt.gca()
size_bins = np.logspace(-4, 6, 50)
ax.hist(y, bins=size_bins)
ax.set_ylabel('Number of Fires')
ax.set_xlabel('Fire Size')
ax.set_yscale("log")
ax.set_xscale("log")
# Save the figure to disk before showing it.
plt.savefig("kaggle_fires_dist_log.png")
plt.show()

# Summary statistics of the target.
fire_size_mean = np.mean(y)
fire_size_median = np.median(y)
fire_size_var = np.var(y)
print("Mean fire size: {}".format(fire_size_mean))
print("Median fire size: {}".format(fire_size_median))
print("Var in fire size: {}".format(fire_size_var))

In [None]:
#Data Modeling

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression model; fit() returns the fitted estimator itself, so
# construction and fitting are chained.
lm = LinearRegression().fit(X_train, y_train)

# Predictions on the training rows and on the held-out rows.
y_train_predict, y_test_predict = lm.predict(X_train), lm.predict(X_test)

# Root mean squared error for each split.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predict))
rmse = np.sqrt(mean_squared_error(y_test, y_test_predict))

# R2 score for each split.
r2_train = r2_score(y_train, y_train_predict)
r2 = r2_score(y_test, y_test_predict)

print('Root mean squared error on Training Set', rmse_train)
print('R2 score on Training Set: ', r2_train)

print('Root mean squared error on Testing Set', rmse)
print('R2 score on Testing Set: ', r2)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean Absolute Percentage Error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` computed over the
        entries whose actual value is non-zero. Entries with
        ``y_true == 0`` are skipped because the percentage error is
        undefined there; if no entry is non-zero the result is NaN.
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float))
    # Dividing by a zero actual would yield inf/NaN and poison the mean.
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

In [None]:
# Error metrics for the linear regression model on both splits.

# Mean absolute error (MAE).
mae_train_lm = MAE(y_train, y_train_predict)
print('The MAE of the training set is ', mae_train_lm)
mae_test_lm = MAE(y_test, y_test_predict)
print('The MAE of the testing set is ', mae_test_lm)

# Mean absolute percentage error (MAPE).
mape_train_lm = mean_absolute_percentage_error(y_train, y_train_predict)
print('The MAPE of the training set is ', mape_train_lm)
mape_test_lm = mean_absolute_percentage_error(y_test, y_test_predict)
print('The MAPE of the testing set is ', mape_test_lm)

In [None]:
# Predicted vs Actual scatterplot for the linear regression model (log-log axes).

ax = plt.gca()
ax.scatter(y_test, y_test_predict, s=0.3, marker='.', c='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Linear Regression')
ax.set_xscale("log")
ax.set_yscale("log")
# Dashed black y = x reference line; points on it are exact predictions.
diagonal = [.1, 20, 1000]
ax.plot(diagonal, diagonal, c='k', linestyle='--')

In [None]:
# NOTE(review): this silences every warning for the rest of the session
# (e.g. deprecation notices from the libraries used below). Consider
# narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): OrderedDict is not referenced in the visible cells.
from collections import OrderedDict
from sklearn.ensemble import RandomForestRegressor

In [None]:
#Random Forest Regressor
# 100 bootstrapped trees, each split considering sqrt(n_features) candidates.
# warm_start is left at its default (False): the forest is fitted exactly
# once, and with warm_start=True a repeated fit() on the same object would
# reuse the existing trees instead of refitting them.
# n_jobs=-1 builds trees on all cores; with a fixed random_state the fitted
# forest does not depend on the number of workers.
clf = RandomForestRegressor(n_estimators=100,
                            oob_score=True,
                            max_features="sqrt", bootstrap=True,
                            random_state=42,
                            n_jobs=-1)
clf.fit(X_train, y_train)

In [None]:
# Random forest predictions for the training and test sets.
y_train_predicted_rf, y_test_predicted_rf = clf.predict(X_train), clf.predict(X_test)

# Root mean squared error per split.
# (rmse_train is rebound here; it previously held the linear-model value.)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted_rf))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted_rf))

# R2 score per split.
r2_train_rf = r2_score(y_train, y_train_predicted_rf)
r2_test_rf = r2_score(y_test, y_test_predicted_rf)

print("Root Mean squared error for train set: ", rmse_train)
print("Root Mean squared error for test set: ", rmse_test)
print('R2 score for train set: ', r2_train_rf)
print('R2 score for test set: ', r2_test_rf)

In [None]:
# Error metrics for the Random Forest Regressor model on both splits.

# Mean absolute error (MAE).
mae_train_rf = MAE(y_train, y_train_predicted_rf)
print('The MAE of the training set is ', mae_train_rf)
mae_test_rf = MAE(y_test, y_test_predicted_rf)
print('The MAE of the testing set is ', mae_test_rf)

# Mean absolute percentage error (MAPE).
mape_train_rf = mean_absolute_percentage_error(y_train, y_train_predicted_rf)
print('The MAPE of the training set is ', mape_train_rf)
mape_test_rf = mean_absolute_percentage_error(y_test, y_test_predicted_rf)
print('The MAPE of the testing set is ', mape_test_rf)

In [None]:
# Predicted vs Actual scatterplot for the random forest (log-log axes).

ax = plt.gca()
ax.scatter(y_test, y_test_predicted_rf, s=0.3, marker='.', c='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Random Forest')
ax.set_xscale("log")
ax.set_yscale("log")
# Dashed black y = x reference line; points on it are exact predictions.
diagonal = [.1, 20, 1000]
ax.plot(diagonal, diagonal, c='k', linestyle='--')

In [None]:
# SVM
from sklearn import svm

# Only the first n training rows are used for the SVR
# (presumably to keep fitting time manageable -- TODO confirm).
n = 10000
svm_reg = svm.SVR(kernel='rbf', degree=3, gamma='auto')

svm_reg.fit(X_train[:n], y_train[:n].ravel())

In [None]:
# SVR predictions on the first n rows of each split.
svm_train_rows, svm_test_rows = X_train[:n], X_test[:n]
y_train_predict_svm = svm_reg.predict(svm_train_rows)
y_test_predict_svm = svm_reg.predict(svm_test_rows)

In [None]:
# Targets matching the first-n-row subsets the SVR predictions were made on.
y_train_svm, y_test_svm = y_train[:n], y_test[:n]

# Root mean squared error per split.
rmse_train_svm = np.sqrt(mean_squared_error(y_train_svm, y_train_predict_svm))
rmse_svm = np.sqrt(mean_squared_error(y_test_svm, y_test_predict_svm))

# R2 score per split.
r2_train_svm = r2_score(y_train_svm, y_train_predict_svm)
r2_svm = r2_score(y_test_svm, y_test_predict_svm)

print('Root mean squared error on Training Set', rmse_train_svm)
print('R2 score on Training Set: ', r2_train_svm)

print('Root mean squared error on Testing Set', rmse_svm)
print('R2 score on Testing Set: ', r2_svm)
print('----------------------------------------------------------')

In [None]:
# Error metrics for the SVR model on the first n rows of each split.

# Mean absolute error (MAE).
mae_train_svm = MAE(y_train[:n], y_train_predict_svm)
print('The MAE of the training set is ', mae_train_svm)
mae_test_svm = MAE(y_test[:n], y_test_predict_svm)
print('The MAE of the testing set is ', mae_test_svm)

# Mean absolute percentage error (MAPE).
mape_train_svm = mean_absolute_percentage_error(y_train[:n], y_train_predict_svm)
print('The MAPE of the training set is ', mape_train_svm)
mape_test_svm = mean_absolute_percentage_error(y_test[:n], y_test_predict_svm)
print('The MAPE of the testing set is ', mape_test_svm)

In [None]:
# Predicted vs Actual scatterplot for the SVR (first n test rows, log-log axes).

ax = plt.gca()
ax.scatter(y_test[:n], y_test_predict_svm, s=0.3, marker='.', c='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('SVM')
ax.set_xscale("log")
ax.set_yscale("log")
# Dashed black y = x reference line; points on it are exact predictions.
diagonal = [.1, 20, 1000]
ax.plot(diagonal, diagonal, c='k', linestyle='--')

In [None]:
#Knn
# K Nearest Neighbours
from sklearn.neighbors import KNeighborsRegressor

n = 50000

# Try every (neighbour count, weighting) pair, neighbour count varying slowest.
# After the loop, K_nn and the *_predict_knn arrays hold the last pair tried.
knn_grid = [(k, w) for k in [3,5,10,15,20] for w in ['distance','uniform']]

for nbs, wts in knn_grid:
    K_nn = KNeighborsRegressor(n_neighbors=nbs, weights=wts)

    # Fit and predict on the first n rows of each split.
    K_nn.fit(X_train[:n], y_train[:n].ravel())
    y_train_predict_knn = K_nn.predict(X_train[:n])
    y_test_predict_knn = K_nn.predict(X_test[:n])

    print("nbs: {}, wts: {}".format(nbs, wts))
    print("MAE Train: {}".format(MAE(y_train[:n], y_train_predict_knn)))
    print("MAE Test: {}".format(MAE(y_test[:n], y_test_predict_knn)))
    print("r2 Train: {}".format(r2_score(y_train[:n], y_train_predict_knn)))
    print("r2 Test: {}".format(r2_score(y_test[:n], y_test_predict_knn)))
    print("---------------------------------------")

In [None]:
# Predicted vs Actual scatterplot for KNN (first n test rows, log-log axes).
# y_test_predict_knn holds the predictions of the last configuration fitted
# in the KNN loop.

ax = plt.gca()
ax.scatter(y_test[:n], y_test_predict_knn, s=0.3, marker='.', c='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Knn')
ax.set_xscale("log")
ax.set_yscale("log")
# Dashed black y = x reference line; points on it are exact predictions.
diagonal = [.1, 20, 1000]
ax.plot(diagonal, diagonal, c='k', linestyle='--')

In [None]:
# Decision Tree
import sklearn
from sklearn import tree

# scikit-learn 1.0 renamed the 'mae' criterion to 'absolute_error' and 1.2
# removed the old spelling; use whichever the installed version accepts.
mae_criterion = ('absolute_error'
                 if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae')

# A fixed seed keeps the fitted tree reproducible between runs.
tree_clf = tree.DecisionTreeRegressor(criterion=mae_criterion, random_state=42)

# Uses the first n rows of each split; `n` is whatever an earlier cell last
# assigned to it.
tree_clf.fit(X_train[0:n,:],y_train[0:n].reshape(-1))

y_train_predict_dt = tree_clf.predict(X_train[0:n,:])
y_test_predict_dt = tree_clf.predict(X_test[0:n,:])

print("MAE Train: {}".format(MAE(y_train[0:n],y_train_predict_dt)))
print("MAE Test: {}".format(MAE(y_test[0:n],y_test_predict_dt)))
print("r2 Train: {}".format(r2_score(y_train[0:n],y_train_predict_dt)))
print("r2 Test: {}".format(r2_score(y_test[0:n],y_test_predict_dt)))
print("---------------------------------------")

In [None]:
# Predicted vs Actual scatterplot for the decision tree (first n test rows, log-log axes).

ax = plt.gca()
ax.scatter(y_test[:n], y_test_predict_dt, s=0.3, marker='.', c='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Decision Tree')
ax.set_xscale("log")
ax.set_yscale("log")
# Dashed black y = x reference line; points on it are exact predictions.
diagonal = [.1, 20, 1000]
ax.plot(diagonal, diagonal, c='k', linestyle='--')

In [None]:
# Stacked regressors
from mlxtend.regressor import StackingRegressor

n = 100000

# Base models are the four estimators created in earlier cells; the linear
# model is also passed as the meta-regressor.
estimators = [lm, tree_clf, K_nn, svm_reg]
regStack = StackingRegressor(regressors=estimators, meta_regressor=lm)

# Fit and predict on the first n rows of each split.
regStack.fit(X_train[:n], y_train[:n].ravel())
y_train_predict_regStack = regStack.predict(X_train[:n])
y_test_predict_regStack = regStack.predict(X_test[:n])

print("MAE Train: {}".format(MAE(y_train[:n], y_train_predict_regStack)))
print("MAE Test: {}".format(MAE(y_test[:n], y_test_predict_regStack)))
print("r2 Train: {}".format(r2_score(y_train[:n], y_train_predict_regStack)))
print("r2 Test: {}".format(r2_score(y_test[:n], y_test_predict_regStack)))
print("---------------------------------------")

In [None]:
# Predicted vs Actual scatterplot for the stacked regressor (first n test rows, log-log axes).

ax = plt.gca()
ax.scatter(y_test[:n], y_test_predict_regStack, s=0.3, marker='.', c='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Stacked Regressor')
ax.set_xscale("log")
ax.set_yscale("log")
# Dashed black y = x reference line; points on it are exact predictions.
diagonal = [.1, 20, 1000]
ax.plot(diagonal, diagonal, c='k', linestyle='--')