In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Reading the combined CSV files
df = pd.read_csv('Wildfire.csv')
df = df.drop(columns=['fire_name','Unnamed: 0', 'Unnamed: 0.1','disc_date_final','cont_date_final','cont_clean_date','putout_time', 'fire_size_class', 'fire_mag', 'weather_file', 'state'])
df = df.drop(columns=['disc_clean_date', 'discovery_month', 'disc_date_pre', 'disc_pre_year', 'disc_pre_month']) # drop date columns
df = df.drop(columns=['wstation_usaf', 'dstation_m', 'wstation_wban', 'wstation_byear', 'wstation_eyear']) # drop station columns

#Get rid of outliers - fires of size larger than 5000 acres, and there are large number of small fires and other very less number are having the high 
# area of fires, because of which the deviation is very high
df = df.loc[df['fire_size'] < 5000]
df.columns
df.dtypes

fire_size           float64
stat_cause_descr     object
latitude            float64
longitude           float64
Vegetation            int64
Temp_pre_30         float64
Temp_pre_15         float64
Temp_pre_7          float64
Temp_cont           float64
Wind_pre_30         float64
Wind_pre_15         float64
Wind_pre_7          float64
Wind_cont           float64
Hum_pre_30          float64
Hum_pre_15          float64
Hum_pre_7           float64
Hum_cont            float64
Prec_pre_30         float64
Prec_pre_15         float64
Prec_pre_7          float64
Prec_cont           float64
remoteness          float64
dtype: object

In [None]:
profile = pandas_profiling.ProfileReport(df)
profile.to_file("Combined_Profile.html")

In [None]:
df['stat_cause_descr'].unique()

In [3]:
from utils import reduce_cause_labels
# Group campfire, 
df = reduce_cause_labels(df)
df['stat_cause_descr'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stat_cause_descr'] = df['stat_cause_descr'].apply(lambda x: 'Other' if (x in reduced_labels) else x)


array(['Arson', 'Debris Burning', 'Miscellaneous', 'Other', 'Lightning'],
      dtype=object)

In [4]:
cause_encoded_dist = {
    'Missing/Undefined':0, 
    'Arson':1, 
    'Debris Burning':2, 
    'Miscellaneous':3,
    'Campfire':4, 
    'Fireworks':5, 
    'Children':6, 
    'Lightning':7, 
    'Equipment Use':8,
    'Smoking':9, 
    'Railroad':10, 
    'Structure':11, 
    'Powerline':12,
    'Other':13
}

# Encode categorical values to numeric
df['stat_cause_descr'] = df['stat_cause_descr'].apply(lambda x: cause_encoded_dist[x]).astype('int')
df.head()

Unnamed: 0,fire_size,stat_cause_descr,latitude,longitude,Vegetation,Temp_pre_30,Temp_pre_15,Temp_pre_7,Temp_cont,Wind_pre_30,...,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness
1,3.0,1,35.03833,-87.61,15,7.553433,7.01,0.343529,10.448298,2.709764,...,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355
2,60.0,1,34.9478,-88.7225,16,4.97193,5.782766,5.55875,13.6966,3.364499,...,3.36905,75.531629,75.868613,76.812834,65.0638,168.8,42.2,18.1,124.5,0.194544
3,1.0,2,39.6414,-119.3083,0,16.275967,18.996181,18.142564,0.0,4.054982,...,0.0,44.778429,37.140811,35.353846,0.0,10.4,7.2,0.0,0.0,0.487447
4,2.0,3,30.7006,-90.5914,12,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.214633
5,1.0,2,32.0639,-82.4178,12,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.139643


In [None]:
df['fire_size'] = df['fire_size'].apply(lambda x: np.log(x * 10))

In [None]:
df.head()

In [4]:
# Use subset df with cont columns
df_cause = df[['stat_cause_descr', 'latitude', 'longitude', 'Vegetation', 'Temp_cont', 'Wind_cont', 'Hum_cont', 'Prec_cont', 'remoteness']]
df_cont = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_cont', 'Wind_cont', 'Hum_cont', 'Prec_cont', 'remoteness']]
df_pre_7 = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_pre_7', 'Wind_pre_7', 'Hum_pre_7', 'Prec_pre_7', 'remoteness']]
df_pre_15 = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_pre_15', 'Wind_pre_15', 'Hum_pre_15', 'Hum_pre_15', 'remoteness']]
df_pre_30 = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_pre_30', 'Wind_pre_30', 'Hum_pre_30', 'Hum_pre_30', 'remoteness']]


In [6]:
df_dummies_X = df_cause

In [7]:
# df_dummies_X = df_pre_7.drop(columns=['stat_cause_descr'])
df_dummies_X = pd.get_dummies(df_dummies_X)
X = df_dummies_X
Y = df['stat_cause_descr']

In [6]:
X = df_dummies_X.drop(columns=['fire_size'])
Y = df['fire_size']

NameError: name 'df_dummies_X' is not defined

In [27]:
X = df_dummies_X.drop(columns=['stat_cause_descr'])
Y = df['stat_cause_descr']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=50)

In [None]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
from sklearn.decomposition import PCA
pca = PCA()
#
# Determine transformed features
#
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

In [None]:
X_train_pca

In [None]:
X_test_pca

In [None]:
import plotly.express as px
pca = PCA()
pca.fit(df)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

In [None]:
# Feature importances using RandomForest
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(X_train, y_train)

print(f'model score on training data: {model.score(X_train, y_train)}')
print(f'model score on testing data: {model.score(X_test, y_test)}')

# training_accuracy = accuracy_score(y_train, model.predict(X_train))
# print(training_accuracy)

pred = model.predict(X_test)

print(pred)

# test_accuracy = accuracy_score(y_test, pred)
# print(test_accuracy)


In [None]:
# print(y_test)
test_accuracy = accuracy_score(y_test.values, pred)
print(test_accuracy)

In [None]:
importances = model.feature_importances_
indices = np.argsort(importances)

fig, ax = plt.subplots()
ax.barh(range(len(importances)), importances[indices])
ax.set_yticks(range(len(importances)))
_ = ax.set_yticklabels(np.array(X_train.columns)[indices])

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Gradient Boosting
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
#
# Hyperparameters for GradientBoostingRegressor
#
gbr_params = {'n_estimators': 1000,
          'max_depth': 3,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'ls'}
#
# Create an instance of gradient boosting regressor
#
gbr = GradientBoostingRegressor(**gbr_params)
#
# Fit the model
#
gbr.fit(X_train_std, y_train)
#
# Print Coefficient of determination R^2
#
print("Model Accuracy: %.3f" % gbr.score(X_test_std, y_test))
#
# Create the mean squared error
#
mse = mean_squared_error(y_test, gbr.predict(X_test_std))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

In [None]:
# Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=500, max_depth=50, min_samples_leaf=50, random_state=123)

classifier.fit(X_train, y_train)
training_accuracy = accuracy_score(y_train, classifier.predict(X_train))
print(training_accuracy)

pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

In [17]:
%load_ext autoreload
%autoreload 2
from utils import run_all_regressors

run_all_regressors(X_train, y_train, X_test, y_test)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Running RandomForestRegressor
Score on training data: 0.9809916856157387
Score on testing data: 0.8747771134914175
Mean Absolute Error:  70.87635904272949
R Squared:  0.8747771134914175
Adjusted R Squared:  0.8747608695016643

Running GradientBoostingRegressor
Score on training data: 0.8489945268650858
Score on testing data: 0.8593170712562382
Mean Absolute Error:  76.11121227159751
R Squared:  0.8593170712562382
Adjusted R Squared:  0.8592988217803104

Running DecisionTreeRegressor
Score on training data: 0.999936184756783
Score on testing data: 0.7267981245450101
Mean Absolute Error:  82.76235076147114
R Squared:  0.7267981245450101
Adjusted R Squared:  0.7267626846299181

Running ExtraTreesRegressor
Score on training data: 0.9999361713289749
Score on testing data: 0.876027232133217
Mean Absolute Error:  71.38689486701446
R Squared:  0.876027232133217
Adjusted R Squared:  0.8760111503096218



In [9]:
%load_ext autoreload
%autoreload 2
from utils import run_all_classifiers

run_all_classifiers(X_train, y_train, X_test, y_test)

Running RandomForestClassifier
['Debris Burning' 'Debris Burning' 'Other' 'Other' 'Debris Burning']
48521    Debris Burning
21980    Debris Burning
20536             Other
30335             Other
31232    Debris Burning
Name: stat_cause_descr, dtype: object
Accuracy Score:  1.0
Running GradientBoostingClassifier
['Debris Burning' 'Debris Burning' 'Other' 'Other' 'Debris Burning']
48521    Debris Burning
21980    Debris Burning
20536             Other
30335             Other
31232    Debris Burning
Name: stat_cause_descr, dtype: object
Accuracy Score:  1.0
Running DecisionTreeClassifier
['Debris Burning' 'Debris Burning' 'Other' 'Other' 'Debris Burning']
48521    Debris Burning
21980    Debris Burning
20536             Other
30335             Other
31232    Debris Burning
Name: stat_cause_descr, dtype: object
Accuracy Score:  1.0
Running ExtraTreesClassifier
['Debris Burning' 'Debris Burning' 'Other' 'Other' 'Debris Burning']
48521    Debris Burning
21980    Debris Burning
20536        

In [None]:
print()