In [14]:
import numpy as np
import pandas as pd
from scipy import stats

import geopandas as gpd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import os,sys,glob,warnings,joblib

In [2]:
# Suppress every warning of category UserWarning from here on.
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
# Switch matplotlib to its "bmh" style sheet for figures drawn after this cell.
plt.style.use("bmh")

# Data Preparation

In [4]:
# Read the feature-engineered training table, then display its column index.
data = pd.read_csv(
    filepath_or_buffer='../Data/results/Train[FeatEng].csv',
)
data.columns

Index(['lat', 'lon', 'date_month', 'climate_aet', 'climate_def',
       'climate_pdsi', 'climate_pet', 'climate_pr', 'climate_ro',
       'climate_soil', 'climate_srad', 'climate_tmmn', 'climate_tmmx',
       'climate_vap', 'climate_vpd', 'climate_vs', 'precipitation',
       'climate_aet_lag_1', 'climate_def_lag_1', 'climate_pdsi_lag_1',
       'climate_pet_lag_1', 'climate_pr_lag_1', 'climate_ro_lag_1',
       'climate_soil_lag_1', 'climate_srad_lag_1', 'climate_tmmn_lag_1',
       'climate_tmmx_lag_1', 'climate_vap_lag_1', 'climate_vpd_lag_1',
       'climate_vs_lag_1', 'precipitation_lag_1', 'climate_aet_lag_2',
       'climate_def_lag_2', 'climate_pdsi_lag_2', 'climate_pet_lag_2',
       'climate_pr_lag_2', 'climate_ro_lag_2', 'climate_soil_lag_2',
       'climate_srad_lag_2', 'climate_tmmn_lag_2', 'climate_tmmx_lag_2',
       'climate_vap_lag_2', 'climate_vpd_lag_2', 'climate_vs_lag_2',
       'precipitation_lag_2', 'burn_area', 'elevation', 'landcover_0',
       'landcover_1', '

In [5]:
# Target is the "burn_area" column; the features are every remaining
# column except "date".
y = data["burn_area"]
X = data.drop(columns=["burn_area", "date"])

In [8]:
# Hold out 33% of the rows for validation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)
# Show the shapes of the two feature matrices produced by the split.
print(X_train.shape, X_valid.shape)

(46424, 55) (22866, 55)


# Baseline Model

In [9]:
# Baseline: a plain linear regression fitted on the training split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_train)

# In-sample scores, one printed line per metric.
for metric_label, metric_fn in (
    ("R²", r2_score),
    ("MAE", mean_absolute_error),
    ("RMSE", root_mean_squared_error),
):
    print(f"The Train {metric_label} for linear model is: {metric_fn(y_train, y_pred):.3f}")

The Train R² for linear model is: 0.212
The Train MAE for linear model is: 0.012
The Train RMSE for linear model is: 0.028


In [10]:
# Score the linear baseline on the hold-out split (X_valid / y_valid).
# NOTE(review): the printed label says "Test", but the data is the validation split.
y_pred = lr.predict(X_valid)

for metric_label, metric_fn in (
    ("R²", r2_score),
    ("MAE", mean_absolute_error),
    ("RMSE", root_mean_squared_error),
):
    print(f"The Test {metric_label} for linear model is: {metric_fn(y_valid, y_pred):.3f}")

The Test R² for linear model is: 0.208
The Test MAE for linear model is: 0.013
The Test RMSE for linear model is: 0.029


# Random Forest

In [11]:
# Random forest with library-default hyperparameters (seeded, all cores),
# fitted on the training split.
rf = RandomForestRegressor(random_state=143, n_jobs=-1).fit(X_train, y_train)
y_pred = rf.predict(X_train)

# In-sample scores, one printed line per metric.
for metric_label, metric_fn in (
    ("R²", r2_score),
    ("MAE", mean_absolute_error),
    ("RMSE", root_mean_squared_error),
):
    print(f"The Train {metric_label} for RF model is: {metric_fn(y_train, y_pred):.3f}")

The Train R² for RF model is: 0.879
The Train MAE for RF model is: 0.003
The Train RMSE for RF model is: 0.011


In [12]:
# Score the default random forest on the hold-out split (X_valid / y_valid).
# NOTE(review): the printed label says "Test", but the data is the validation split.
y_pred = rf.predict(X_valid)

for metric_label, metric_fn in (
    ("R²", r2_score),
    ("MAE", mean_absolute_error),
    ("RMSE", root_mean_squared_error),
):
    print(f"The Test {metric_label} for RF model is: {metric_fn(y_valid, y_pred):.3f}")

The Test R² for RF model is: 0.349
The Test MAE for RF model is: 0.008
The Test RMSE for RF model is: 0.027


In [15]:
# Base estimator whose hyperparameters are tuned below.
# Note: this rebinds `rf`, replacing the forest fitted in the earlier cell.
rf = RandomForestRegressor(random_state=569, n_jobs=-1)

# Search space: scipy distributions are sampled, the tuple is chosen from.
param_grid = {
    "n_estimators": stats.randint(10, 120),
    "min_samples_split": stats.uniform(0, 1),
    "max_depth": stats.randint(1, 12),
    "max_features": ('sqrt', 'log2', None),
}

# Randomized search: 60 sampled candidates, 3-fold cross-validation each,
# ranked by negated RMSE. With refit=True the best candidate is refitted,
# so `final_model` can be used directly for prediction afterwards.
final_model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=60,
    scoring='neg_root_mean_squared_error',
    cv=3,
    refit=True,
    n_jobs=15,
    random_state=10,
)

# Run the search on the training split.
final_model.fit(X_train, y_train)

In [16]:
# Display the hyperparameter combination the randomized search selected.
final_model.best_params_

{'max_depth': 10,
 'max_features': 'log2',
 'min_samples_split': 0.0207519493594015,
 'n_estimators': 74}

In [17]:
# Predict on the training split with the refitted best estimator from the search.
y_pred = final_model.predict(X_train)

# In-sample scores, one printed line per metric.
for metric_label, metric_fn in (
    ("R²", r2_score),
    ("MAE", mean_absolute_error),
    ("RMSE", root_mean_squared_error),
):
    print(f"The Train {metric_label} for RF model is: {metric_fn(y_train, y_pred):.3f}")

The Train R² for RF model is: 0.314
The Train MAE for RF model is: 0.009
The Train RMSE for RF model is: 0.026


In [18]:
# Score the tuned model on the hold-out split (X_valid / y_valid).
# NOTE(review): the printed label says "Test", but the data is the validation split.
y_pred = final_model.predict(X_valid)

for metric_label, metric_fn in (
    ("R²", r2_score),
    ("MAE", mean_absolute_error),
    ("RMSE", root_mean_squared_error),
):
    print(f"The Test {metric_label} for RF model is: {metric_fn(y_valid, y_pred):.3f}")

The Test R² for RF model is: 0.290
The Test MAE for RF model is: 0.010
The Test RMSE for RF model is: 0.028
