In [87]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
import re

In [61]:
# Load the training CSV from data/processed (path is relative to the notebook's working directory).
df = pd.read_csv("../data/processed/train.csv")

In [62]:
# Reserve 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [63]:
# Show the column labels of the training split.
train_set.columns

Index(['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)', 'PT08.S2(NMHC)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'T', 'RH', 'AH'],
      dtype='object')

In [64]:
# Move "Date" into the index and parse it as datetimes.
# The result is rebound instead of mutated with inplace=True, and the guard
# makes the cell safe to re-run: once "Date" is already the index,
# set_index("Date") would raise a KeyError because the column no longer exists.
if train_set.index.name != "Date":
    train_set = train_set.set_index("Date")
    train_set.index = pd.to_datetime(train_set.index)

In [65]:
# Work on an independent copy so later changes do not touch train_set.
# NOTE(review): the name says NO2, but the target selected in the following split is "NOx(GT)" — confirm the intended target.
train_set_no2 = train_set.copy()

In [66]:
# Hold out 20% of the training frame for validation (fixed seed for reproducibility).
# Features: every column except the target "NOx(GT)" and the "T" and "Time" columns.
# Target: "NOx(GT)", kept as a single-column DataFrame.
# NOTE(review): the source frame is named *_no2 but the target column is NOx(GT) — confirm the intended target.
X_train, X_val, y_train, y_val = train_test_split(
    train_set_no2.drop(columns=["NOx(GT)", "T", "Time"]),
    train_set_no2[["NOx(GT)"]],
    test_size=0.2,
    random_state=42,
)

In [67]:
# Summary statistics of the "NO2(GT)" column over the full frame.
# NOTE(review): the models in this notebook are fitted on "NOx(GT)", not "NO2(GT)" — confirm this is the column you meant to inspect.
df["NO2(GT)"].describe()

count   7485.00
mean     109.37
std       46.22
min        2.00
25%       75.00
50%      103.00
75%      135.00
max      340.00
Name: NO2(GT), dtype: float64

In [104]:
# Candidate regressors to compare; these are classes, instantiated later with default settings.
model_list = [
    LinearRegression,
    XGBRegressor,
    ExtraTreesRegressor,
]

def _estimator_name(model):
    """Return the short class name of an estimator class or estimator instance."""
    return model.__name__ if isinstance(model, type) else type(model).__name__


def _cross_validate_models(model_list, X, y, cv, scoring):
    """Cross-validate every entry of ``model_list`` and print its mean score.

    Parameters
    ----------
    model_list : iterable
        Estimator classes (instantiated here with default arguments) or
        already-configured estimator instances (passed through unchanged).
    X, y :
        Features and target forwarded to ``cross_val_score``.
    cv :
        Cross-validation setting forwarded to ``cross_val_score``.
    scoring : str
        Scorer name forwarded to ``cross_val_score``.

    Returns
    -------
    dict
        Maps each estimator's class name to whatever ``cross_val_score``
        returned for it. Entries sharing a class name overwrite each other.
    """
    resultados = {}
    for model in model_list:
        model_name = _estimator_name(model)
        # A class is built with its defaults; an instance keeps its configuration.
        estimator = model() if isinstance(model, type) else model
        resultados[model_name] = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)
        print("<%s>: %.4f" % (model_name, np.mean(resultados[model_name])))
    return resultados


def model_r2(model_list,X = None,y = None, cv = 5, scoring = "r2"):
    """Cross-validate each model and report its scores (default scorer: ``"r2"``).

    Prints ``<ModelName>: mean`` per model and returns a dict mapping the
    model's class name to its per-fold scores.
    """
    return _cross_validate_models(model_list, X, y, cv, scoring)


def model_mse(model_list,X = None,y = None, cv = 5, scoring = "neg_root_mean_squared_error"):
    """Cross-validate each model and report its scores.

    NOTE: despite the function name, the default scorer is
    ``"neg_root_mean_squared_error"`` (root of the squared error), not the
    plain mean squared error. Pass ``scoring="neg_mean_squared_error"``
    explicitly if the un-rooted value is wanted.

    Prints ``<ModelName>: mean`` per model and returns a dict mapping the
    model's class name to its per-fold scores.
    """
    return _cross_validate_models(model_list, X, y, cv, scoring)


def model_mae(model_list,X = None,y = None, cv = 5, scoring = "neg_mean_absolute_error"):
    """Cross-validate each model and report its scores (default scorer: ``"neg_mean_absolute_error"``).

    Prints ``<ModelName>: mean`` per model and returns a dict mapping the
    model's class name to its per-fold scores.
    """
    return _cross_validate_models(model_list, X, y, cv, scoring)

In [105]:
# Cross-validate the candidates on the training split with model_mse's default scorer.
# NOTE(review): the recorded output has squared-error magnitude (thousands), while the
# default scorer of model_mse is "neg_root_mean_squared_error" — the output looks stale; re-run to confirm.
model_mse(model_list, X=X_train, y=y_train)

<LinearRegression>: -6234.2259
<XGBRegressor>: -3470.2195
<ExtraTreesRegressor>: -2923.5610


{'LinearRegression': array([-5839.2637358 , -7008.98688707, -6680.92400559, -5576.6708746 ,
        -6065.28395566]),
 'XGBRegressor': array([-3397.27827362, -3516.61336241, -3567.3242067 , -3593.93304015,
        -3275.94872713]),
 'ExtraTreesRegressor': array([-2573.2169011 , -3336.66249647, -3039.05136284, -2727.65920614,
        -2941.21481971])}

In [106]:
# Cross-validate the candidates on the training split with the negated mean absolute error scorer.
model_mae(model_list, X=X_train, y=y_train)

<LinearRegression>: -57.6544
<XGBRegressor>: -36.6789
<ExtraTreesRegressor>: -33.0096


{'LinearRegression': array([-56.09383987, -59.79144311, -59.11664381, -55.61574647,
        -57.65422091]),
 'XGBRegressor': array([-35.99523987, -36.55264134, -37.92506595, -36.73589804,
        -36.18580143]),
 'ExtraTreesRegressor': array([-32.11912451, -34.37662352, -33.2947016 , -31.30042928,
        -33.95703673])}

In [76]:
# Print the first five rows of the model comparison table.
# The stray trailing comma was removed: it turned the statement into a
# one-element tuple, so the notebook echoed "(None,)" after the table.
# NOTE(review): `models` is not defined in any visible cell — confirm the cell
# that creates it is present so the notebook survives Restart & Run All.
print(models.head(5))

                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
ExtraTreesRegressor                          0.94       0.94 50.78        1.10
RandomForestRegressor                        0.93       0.93 55.16        2.45
LGBMRegressor                                0.93       0.93 55.29        0.08
HistGradientBoostingRegressor                0.92       0.92 56.47        0.56
XGBRegressor                                 0.92       0.92 57.50        0.12


In [None]:
# Print the first five rows of the model comparison table.
# NOTE(review): `models` is not defined in any visible cell — confirm where it is created.
print(models.head(n=5))

                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
HistGradientBoostingRegressor                0.97       0.97  9.62        0.38
XGBRegressor                                 0.97       0.97  9.78        0.19
LGBMRegressor                                0.97       0.97  9.88        0.11
ExtraTreesRegressor                          0.97       0.97 10.61        0.97
RandomForestRegressor                        0.96       0.96 11.06        2.39


In [55]:
# Inspect dtypes and non-null counts of the training features.
# NOTE(review): the recorded output lists "Date" as a column and shows an integer index,
# whereas the current cells move "Date" into the index before X_train is built —
# the output appears to predate that change; re-run to refresh it.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4790 entries, 3299 to 3475
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           4790 non-null   object 
 1   CO(GT)         4790 non-null   float64
 2   PT08.S1(CO)    4790 non-null   float64
 3   C6H6(GT)       4790 non-null   float64
 4   PT08.S2(NMHC)  4790 non-null   float64
 5   PT08.S3(NOx)   4790 non-null   float64
 6   NO2(GT)        4790 non-null   float64
 7   PT08.S4(NO2)   4790 non-null   float64
 8   PT08.S5(O3)    4790 non-null   float64
 9   RH             4790 non-null   float64
 10  AH             4790 non-null   float64
dtypes: float64(10), object(1)
memory usage: 449.1+ KB


In [68]:
# Build the ExtraTreesRegressor (100 trees, fixed seed for reproducibility)
model = ExtraTreesRegressor(n_estimators=100, random_state=42)
# Fit the model on the training split
model.fit(X_train, y_train)

In [69]:
# Predict on the held-out validation split (X_val), not on test_set
predictions = model.predict(X_val)

In [72]:
# Report error metrics of the validation predictions against y_val.
# RMSE is obtained as the square root of the mean squared error.
print("RMSE:", np.sqrt(mean_squared_error(y_val, predictions)))
print("Coeficiente de determinación R^2:", r2_score(y_val, predictions))
print("MAE", mean_absolute_error(y_val, predictions))
print("MAPE", mean_absolute_percentage_error(y_val, predictions))

RMSE: 50.78053039026918
Coeficiente de determinación R^2: 0.9383408660211014
MAE 31.93231968487457
MAPE 0.16201827848540024
