In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

import os,sys,glob,warnings
from tqdm import tqdm

# Move up to the project root so that `src` is importable and the relative
# "./results/..." paths resolve. The guard keeps the cell idempotent: if `src`
# is already visible from the working directory (e.g. the cell is re-run in the
# same kernel), the directory is left unchanged instead of climbing further.
if not os.path.isdir("src"):
    os.chdir("../")
from src.utils import temporalKFold
from spatialkfold.blocks import spatial_blocks
import geopandas as gpd

In [2]:
# Silence every UserWarning raised from this point on in the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Data Preparation

In [3]:
# Read the generated training features; the "date" column is parsed as datetime.
data = pd.read_csv(
    "./results/CSVs/02_Train_generated_features.csv",
    parse_dates=["date"],
)

In [4]:
# Discard every row that contains a NaN (these are produced by the lag features).
data = data.dropna()

In [5]:
# Create spatial folds
# Each row becomes a point built from its lng/lat columns, tagged as EPSG:4326.
gdf = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(data.lng, data.lat, crs=4326),
)

# Grid of 0.05 x 0.05 blocks (in CRS units, presumably degrees) split into 10 folds;
# the fixed random_state keeps the fold assignment reproducible.
blocks = spatial_blocks(
    gdf=gdf,
    width=0.05,
    height=0.05,
    method='continuous',
    orientation='tb-lr',
    nfolds=10,
    random_state=175,
)

# Overlay the points with the blocks so each row picks up its block's attributes.
stn_block = gpd.overlay(gdf, blocks)

In [6]:
# Create temporal folds
dataTpSp = temporalKFold(
    stn_block,
    num_folds=10,
    random_state=175,
)

# Preview the five right-most columns of the first rows.
dataTpSp.iloc[:, -5:].head()

Unnamed: 0,quarter_sin,quarter_cos,folds,geometry,temp_fold
4,1.0,6.123234000000001e-17,8,POINT (9.25751 45.49678),1
5,1.0,6.123234000000001e-17,8,POINT (9.24874 45.49163),1
8,1.0,6.123234000000001e-17,8,POINT (9.25751 45.49678),1
9,1.0,6.123234000000001e-17,8,POINT (9.24874 45.49163),1
16,1.0,6.123234000000001e-17,8,POINT (9.25751 45.49678),1


In [7]:
# Model input columns. The order is kept as-is because it defines the column
# order of the feature matrix built from this list.
selected_FTs = [
    # Un-lagged measurements
    'temperature', 'global_radiation', 'pm25_aqi',
    # Lagged temperature
    'temperature_lag_1', 'temperature_lag_3', 'temperature_lag_7', 'temperature_lag_15',
    # Lagged ozone
    'o3_lag_30',
    # Sin/cos calendar encodings
    'day_of_year_sin', 'day_of_year_cos',
    'day_of_week_sin', 'day_of_week_cos',
]

In [8]:
# Feature matrix (selected columns only) and the pm25 target.
X = dataTpSp[selected_FTs]
y = dataTpSp["pm25"]

# Per-row fold labels, flattened to 1-D arrays, for the spatial and temporal splits.
sp_folds = dataTpSp["folds"].values.ravel()
tmp_folds = dataTpSp["temp_fold"].values.ravel()

In [9]:
# Show the shapes after NaN removal, after the spatial overlay and after adding temporal folds.
data.shape, stn_block.shape, dataTpSp.shape

((90051, 105), (90051, 107), (90051, 108))

# Baseline Models

## Linear Model

In [10]:
# Fit the linear model on all rows and score it on those same rows (in-sample metrics).
lr_full_data = LinearRegression().fit(X, y)
y_pred = lr_full_data.predict(X)

lr_train_r2 = r2_score(y, y_pred)
lr_train_mae = mean_absolute_error(y, y_pred)
lr_train_rmse = root_mean_squared_error(y, y_pred)

print(f"The Train R² for linear model is: {lr_train_r2:.2f}")
print(f"The Train MAE for linear model is: {lr_train_mae:.2f}")
print(f"The Train RMSE for linear model is: {lr_train_rmse:.2f}")

The Train R² for linear model is: 0.86
The Train MAE for linear model is: 4.27
The Train RMSE for linear model is: 5.65


### Validation

In [11]:
# Unfitted linear model handed to cross_validate in the cells below.
lr = LinearRegression()
# Splitter that holds out one group (fold label) per split.
group_cvs =  LeaveOneGroupOut()

### Spatial Cross-validation

In [12]:
# Cross-validate the linear model, holding out one spatial fold per split.
lr_cvsp_results = cross_validate(
    lr,
    X,
    y,
    cv=group_cvs.split(X, y, groups=sp_folds),
    scoring=["r2", "neg_root_mean_squared_error", "neg_mean_absolute_error"],
    error_score='raise',
)

# Average across splits; the error scorers are negated, so take magnitudes first.
lr_spatial_r2 = np.mean(lr_cvsp_results["test_r2"])
lr_spatial_mae = np.mean(np.abs(lr_cvsp_results["test_neg_mean_absolute_error"]))
lr_spatial_rmse = np.mean(np.abs(lr_cvsp_results["test_neg_root_mean_squared_error"]))

print(f"The Spatial CV R² for linear model is: {lr_spatial_r2:.2f}")
print(f"The Spatial CV MAE for linear model is: {lr_spatial_mae:.2f}")
print(f"The Spatial CV RMSE for linear model is: {lr_spatial_rmse:.2f}")

### Temporal Cross-validation

In [14]:
# Cross-validate the linear model, holding out one temporal fold per split.
lr_cvtmp_results = cross_validate(
    lr,
    X,
    y,
    cv=group_cvs.split(X, y, groups=tmp_folds),
    scoring=["r2", "neg_root_mean_squared_error", "neg_mean_absolute_error"],
    error_score='raise',
)

# Average across splits; the error scorers are negated, so take magnitudes first.
lr_temporal_r2 = np.mean(lr_cvtmp_results["test_r2"])
lr_temporal_mae = np.mean(np.abs(lr_cvtmp_results["test_neg_mean_absolute_error"]))
lr_temporal_rmse = np.mean(np.abs(lr_cvtmp_results["test_neg_root_mean_squared_error"]))

print(f"The Temporal CV R² for linear model is: {lr_temporal_r2:.2f}")
print(f"The Temporal CV MAE for linear model is: {lr_temporal_mae:.2f}")
print(f"The Temporal CV RMSE for linear model is: {lr_temporal_rmse:.2f}")

## Random forest

In [16]:
# Fit the random forest on all rows and score it on those same rows (in-sample metrics).
rf_full_data = RandomForestRegressor(n_jobs=8, random_state=143).fit(X, y)
y_pred = rf_full_data.predict(X)

rf_train_r2 = r2_score(y, y_pred)
rf_train_mae = mean_absolute_error(y, y_pred)
rf_train_rmse = root_mean_squared_error(y, y_pred)

print(f"The Train R² for RF model is: {rf_train_r2:.2f}")
print(f"The Train MAE for RF model is: {rf_train_mae:.2f}")
print(f"The Train RMSE for RF model is: {rf_train_rmse:.2f}")

The Train R² for RF model is: 1.00
The Train MAE for RF model is: 0.17
The Train RMSE for RF model is: 0.35


### Validation

In [None]:
# Unfitted random forest handed to cross_validate in the cells below; the fixed
# random_state keeps the fitted forests reproducible.
rf = RandomForestRegressor(n_jobs=8,random_state=143)
# Splitter that holds out one group (fold label) per split.
group_cvs =  LeaveOneGroupOut()

### Spatial Cross-validation

In [None]:
# Cross-validate the random forest, holding out one spatial fold per split.
rf_cvsp_results = cross_validate(
    rf,
    X,
    y,
    cv=group_cvs.split(X, y, groups=sp_folds),
    scoring=["r2", "neg_root_mean_squared_error", "neg_mean_absolute_error"],
    error_score='raise',
    n_jobs=-1,
)

# Summarise the RF results. These lines previously read `cvsp_results`, a name
# that is never defined, so the cell raised NameError on a fresh kernel.
# The error scorers are negated, so take magnitudes before averaging.
rf_spatial_r2 = rf_cvsp_results["test_r2"].mean()
rf_spatial_mae = np.abs(rf_cvsp_results["test_neg_mean_absolute_error"]).mean()
rf_spatial_rmse = np.abs(rf_cvsp_results["test_neg_root_mean_squared_error"]).mean()

print(f"The Spatial CV R² for RF model is: {rf_spatial_r2:.2f}")
print(f"The Spatial CV MAE for RF model is: {rf_spatial_mae:.2f}")
print(f"The Spatial CV RMSE for RF model is: {rf_spatial_rmse:.2f}")

### Temporal Cross-validation

In [None]:
# Cross-validate the random forest, holding out one temporal fold per split.
# The estimator must be `rf`: the earlier version passed `lr`, so the numbers
# printed as "RF" were in fact the linear model's temporal CV scores.
rf_cvtmp_results = cross_validate(
    rf,
    X,
    y,
    cv=group_cvs.split(X, y, groups=tmp_folds),
    scoring=["r2", "neg_root_mean_squared_error", "neg_mean_absolute_error"],
    error_score='raise',
    n_jobs=-1,
)

# Average across splits; the error scorers are negated, so take magnitudes first.
rf_temporal_r2 = rf_cvtmp_results["test_r2"].mean()
rf_temporal_mae = np.abs(rf_cvtmp_results["test_neg_mean_absolute_error"]).mean()
rf_temporal_rmse = np.abs(rf_cvtmp_results["test_neg_root_mean_squared_error"]).mean()

print(f"The Temporal CV R² for RF model is: {rf_temporal_r2:.2f}")
print(f"The Temporal CV MAE for RF model is: {rf_temporal_mae:.2f}")
print(f"The Temporal CV RMSE for RF model is: {rf_temporal_rmse:.2f}")