In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# Load the combined wildfire CSV and discard the columns this analysis does not use.
df = (
    pd.read_csv('Wildfire.csv')
    # miscellaneous identifier / bookkeeping columns
    .drop(columns=[
        'fire_name', 'Unnamed: 0', 'Unnamed: 0.1', 'disc_date_final', 'cont_date_final',
        'cont_clean_date', 'putout_time', 'fire_size_class', 'fire_mag', 'weather_file', 'state',
    ])
    # date columns
    .drop(columns=['disc_clean_date', 'discovery_month', 'disc_date_pre', 'disc_pre_year', 'disc_pre_month'])
    # station columns
    .drop(columns=['wstation_usaf', 'dstation_m', 'wstation_wban', 'wstation_byear', 'wstation_eyear'])
)

# Remove outliers: keep only fires smaller than 5000 acres. Most fires are small and
# only a few are very large, which makes the deviation of fire_size very high.
df = df.loc[df['fire_size'] < 5000]

# Only the last expression of a cell is rendered; dtypes already lists every
# remaining column name next to its type.
df.dtypes

In [None]:
# Build a pandas-profiling ProfileReport for df and write it to Combined_Profile.html.
profile = pandas_profiling.ProfileReport(df)
profile.to_file("Combined_Profile.html")

In [None]:
# Show the distinct values present in the stat_cause_descr column.
df['stat_cause_descr'].unique()

In [None]:
# Integer code assigned to each fire-cause label found in stat_cause_descr.
cause_encoded_dist = {
    'Missing/Undefined': 0,
    'Arson': 1,
    'Debris Burning': 2,
    'Miscellaneous': 3,
    'Campfire': 4,
    'Fireworks': 5,
    'Children': 6,
    'Lightning': 7,
    'Equipment Use': 8,
    'Smoking': 9,
    'Railroad': 10,
    'Structure': 11,
    'Powerline': 12,
}

# Encode categorical values to numeric.
# Series.map with a dict is the vectorised form of the per-row dictionary lookup;
# labels missing from the dict come back as NaN, so they are reported explicitly
# with the same exception type (KeyError) the plain lookup would have raised.
cause_codes = df['stat_cause_descr'].map(cause_encoded_dist)
unmapped = df.loc[cause_codes.isna(), 'stat_cause_descr'].unique()
if len(unmapped) > 0:
    raise KeyError(f'stat_cause_descr contains labels with no encoding: {list(unmapped)}')

# np.number is an abstract scalar base class, not a concrete dtype, and NumPy has
# deprecated converting such classes to a dtype. The codes are integer class ids,
# so cast to a concrete integer dtype instead.
df['stat_cause_descr'] = cause_codes.astype('int64')
df.head()

In [None]:
# df = pd.get_dummies(df, prefix=['Cause'], columns = ['stat_cause_descr'], drop_first=True)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Use subset df with cont columns
df_cont = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_cont', 'Wind_cont', 'Hum_cont', 'Prec_cont', 'remoteness']]
df_pre_7 = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_pre_7', 'Wind_pre_7', 'Hum_pre_7', 'Prec_pre_7', 'remoteness']]
df_pre_15 = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_pre_15', 'Wind_pre_15', 'Hum_pre_15', 'Hum_pre_15', 'remoteness']]
df_pre_30 = df[['fire_size', 'latitude', 'longitude', 'Vegetation', 'Temp_pre_30', 'Wind_pre_30', 'Hum_pre_30', 'Hum_pre_30', 'remoteness']]


In [None]:
# Use the `*_pre_7` column subset as the working feature frame.
df_dummies_X = df_pre_7

In [None]:
# df_dummies_X = df_pre_7.drop(columns=['stat_cause_descr'])
# Expand any categorical columns into indicator columns, then take the whole
# frame as the features and the encoded fire cause as the target.
df_dummies_X = pd.get_dummies(df_dummies_X)
X, Y = df_dummies_X, df['stat_cause_descr']

In [None]:
# Regression setup: the target is fire_size and the features are every other column.
Y = df['fire_size']
X = df_dummies_X.drop(columns='fire_size')

In [None]:
# Classification setup: the target is the encoded fire cause.
# The feature frame is built from a column subset that does not contain
# stat_cause_descr, so a plain drop raises KeyError; errors='ignore' removes the
# column when it is present and leaves the frame unchanged otherwise.
X = df_dummies_X.drop(columns=['stat_cause_descr'], errors='ignore')
Y = df['stat_cause_descr']

In [None]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=50,
)

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = (sc.transform(split) for split in (X_train, X_test))

In [None]:
from sklearn.decomposition import PCA
pca = PCA()
# Fit PCA on the standardized training features and project them, then project
# the standardized test features with the same fitted components.
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

In [None]:
# Display the PCA-transformed training features.
X_train_pca

In [None]:
# Display the PCA-transformed test features.
X_test_pca

In [None]:
import plotly.express as px

# Cumulative explained variance as a function of the number of principal components.
# The PCA is fit on the standardized training features rather than on the raw `df`:
# `df` still contains the target columns and unscaled features, so its variance
# profile would be dominated by whichever columns have the largest numeric range.
pca = PCA()
pca.fit(X_train_std)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

In [None]:
# Feature importances using RandomForest
from sklearn.ensemble import RandomForestRegressor

# Fit a default random forest regressor on the unscaled training split.
model = RandomForestRegressor().fit(X_train, y_train)

# For a regressor, .score() is the coefficient of determination (R^2).
print('model score on training data: {}'.format(model.score(X_train, y_train)))
print('model score on testing data: {}'.format(model.score(X_test, y_test)))

# Predictions on the held-out split, printed for inspection.
pred = model.predict(X_test)
print(pred)


In [None]:
# print(y_test)
# `pred` is the continuous output of the RandomForestRegressor fitted above.
# accuracy_score only accepts discrete class labels and raises ValueError for
# continuous predictions, so the fit is reported with regression metrics instead.
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('R Squared:', r2_score(y_test, pred))

In [None]:
# Horizontal bar chart of the fitted model's feature importances, smallest first.
importances = model.feature_importances_
indices = importances.argsort()
positions = range(len(importances))

fig, ax = plt.subplots()
ax.barh(positions, importances[indices])
ax.set_yticks(positions)
# Label each bar with its feature name, in the same sorted order as the bars.
_ = ax.set_yticklabels(X_train.columns.to_numpy()[indices])

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Gradient Boosting on standardized features.
# The scaler is fit on the training split only and then applied to both splits.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Hyperparameters for GradientBoostingRegressor.
# 'loss' is intentionally not set: the former value 'ls' was deprecated in
# scikit-learn 1.0 and removed in 1.2 (renamed 'squared_error'). Least squares is
# the default loss under either name, so omitting the key selects the same
# objective on both old and new scikit-learn releases.
gbr_params = {'n_estimators': 1000,
              'max_depth': 3,
              'min_samples_split': 5,
              'learning_rate': 0.01}

# Create and fit the gradient boosting regressor.
gbr = GradientBoostingRegressor(**gbr_params)
gbr.fit(X_train_std, y_train)

# For a regressor, .score() is the coefficient of determination R^2 on the given data.
print("Model Accuracy: %.3f" % gbr.score(X_test_std, y_test))

# Mean squared error on the held-out split.
mse = mean_squared_error(y_test, gbr.predict(X_test_std))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

In [None]:
# Random Forest Classifier
# Build and fit in one expression; fit() returns the fitted estimator itself.
classifier = RandomForestClassifier(
    n_estimators=500,
    max_depth=50,
    min_samples_leaf=50,
    random_state=123,
).fit(X_train, y_train)

# Accuracy on the data the model was trained on.
training_accuracy = accuracy_score(y_train, classifier.predict(X_train))
print(training_accuracy)

# Accuracy on the held-out split.
pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

In [None]:
# Default decision tree regressor, scored on the held-out split.
dectr = DecisionTreeRegressor().fit(X_train, y_train)
predictions = dectr.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('R Squared:', r2_score(y_test, predictions))

In [None]:
# Default gradient boosting regressor, scored on the held-out split.
gr_boost = GradientBoostingRegressor().fit(X_train, y_train)
predictions = gr_boost.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('R Squared:', r2_score(y_test, predictions))

In [None]:
# Default random forest regressor, scored on the held-out split.
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Predict on X_test so the predictions line up with y_test in the metrics below
# (the previous argument referenced a name that is never defined in this notebook).
predictions = rf.predict(X_test)
print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('R Squared:', r2_score(y_test, predictions))

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (such as the local `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from utils import run_all_regressors

# Project helper defined in utils; its behaviour is not visible from this notebook.
# NOTE(review): presumably fits and scores several regressors on the given splits - confirm in utils.
run_all_regressors(X_train, y_train, X_test, y_test)


In [None]:
# Prints an empty line.
print()