In [230]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import re

from sklearn.model_selection import GridSearchCV

In [231]:
# Load the processed training data.
df = pd.read_csv(filepath_or_buffer="../data/processed/train.csv")

In [232]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# The target is kept as a single-column DataFrame.
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(
        columns=['NOx(GT)', 'clasificacion_nox', 'clasificacion_benceno', 'T', 'Time'],
    ),
    df[["clasificacion_nox"]],
    test_size=0.2,
    random_state=42,
)

In [233]:
# Move "Date" out of the feature columns and use it as a datetime index
# on both splits.
X_train = X_train.set_index("Date")
X_train.index = pd.to_datetime(X_train.index)
X_val = X_val.set_index("Date")
X_val.index = pd.to_datetime(X_val.index)

In [234]:
# Candidate classifier classes (not instances) to compare by cross-validation.
model_list = [
    LogisticRegression,
    RandomForestClassifier,
    SVC,
]

def model_r2(model_list, X=None, y=None, cv=5, scoring="accuracy"):
    """Cross-validate each model and print its mean score.

    Despite the function name, the default metric is classification
    accuracy, not R^2; a different scorer can be selected via ``scoring``.

    Parameters
    ----------
    model_list : iterable of callables
        Estimator classes (or zero-argument factories). Each one is called
        with no arguments to build a fresh estimator.
    X, y :
        Features and target, forwarded unchanged to ``cross_val_score``.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.
    scoring : str, default "accuracy"
        Forwarded to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    dict
        Maps each model's name to the per-fold scores returned by
        ``cross_val_score``.
    """
    resultados = {}
    for model in model_list:
        # Read the callable's own name instead of regex-parsing its repr,
        # which failed for anything whose repr is not "<class '...'>".
        model_name = getattr(model, "__name__", type(model).__name__)
        resultados[model_name] = cross_val_score(model(), X, y, cv=cv, scoring=scoring)
        print("<%s>: %.4f" % (model_name, np.mean(resultados[model_name])))
    return resultados

def model_rmse(model_list, X=None, y=None, cv=5, scoring="neg_mean_squared_error"):
    """Cross-validate each model and print the RMSE derived from its scores.

    The printed value is ``sqrt(-mean(scores))``, which is the RMSE when
    ``scoring`` is the default ``"neg_mean_squared_error"``.

    Parameters
    ----------
    model_list : iterable of callables
        Estimator classes (or zero-argument factories). Each one is called
        with no arguments to build a fresh estimator.
    X, y :
        Features and target, forwarded unchanged to ``cross_val_score``.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.
    scoring : str, default "neg_mean_squared_error"
        Forwarded to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    dict
        Maps each model's name to the raw per-fold scores returned by
        ``cross_val_score`` (negated MSE with the default scorer), not to
        the printed RMSE.
    """
    resultados = {}
    for model in model_list:
        # Read the callable's own name instead of regex-parsing its repr,
        # which failed for anything whose repr is not "<class '...'>".
        model_name = getattr(model, "__name__", type(model).__name__)
        resultados[model_name] = cross_val_score(model(), X, y, cv=cv, scoring=scoring)
        # Scores are negated errors, so flip the sign before the square root.
        rmse = np.sqrt(-1 * np.mean(resultados[model_name]))
        print("<%s>: %.4f" % (model_name, rmse))
    return resultados

In [235]:
# model_r2 scores with its default scoring="accuracy", so the first section
# is labelled as accuracy rather than R^2.
print("Accuracy")
a = model_r2(model_list,X_train,y_train)
print("RMSE")
a = model_rmse(model_list,X_train,y_train)

R^2
<LogisticRegression>: 0.7430
<RandomForestClassifier>: 0.8707
<SVC>: 0.7260
RMSE
<LogisticRegression>: 0.5502
<RandomForestClassifier>: 0.3707
<SVC>: 0.5933


Como vemos, el random forest obtiene los mejores valores (tanto en accuracy de validación cruzada como en RMSE), pero tenemos que tener cuidado de no hacer overfitting.

In [236]:
# Build the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model. y_train is a single-column DataFrame; flatten it to the 1-D
# shape (n_samples,) that fit() expects, which avoids a DataConversionWarning.
model.fit(X_train, y_train.values.ravel())

In [249]:
# Predict on the validation split
predictions = model.predict(X=X_val)

In [250]:
# Error metrics computed on the predicted class labels of the validation split.
print(
    "RMSE:",
    np.sqrt(mean_squared_error(y_true=y_val, y_pred=predictions)),
)
print(
    "Coeficiente de determinación R^2:",
    r2_score(y_true=y_val, y_pred=predictions),
)
print(
    "MAE",
    mean_absolute_error(y_true=y_val, y_pred=predictions),
)

RMSE: 0.36091622440960475
Coeficiente de determinación R^2: 0.8550486076145475
MAE 0.12758851035404142


In [239]:
# Hyperparameter grid: 3 * 3 * 4 * 1 = 36 candidate combinations.
rand_forest_param = {
    'n_estimators': [100, 200, 400],
    'max_depth': [4, 6, 8],
    'max_features': [2, 4, 6, 7],
    'random_state': [42],
}

# 5-fold cross-validated grid search scored by accuracy, using 8 parallel jobs.
gs_rand_forest = GridSearchCV(
    RandomForestClassifier(),
    rand_forest_param,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=8,
)

# y_train is a single-column DataFrame; pass it as a 1-D array of shape
# (n_samples,) so the forests do not emit a DataConversionWarning on every fit.
gs_rand_forest.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [240]:
# Display the best estimator found by the grid search.
gs_rand_forest.best_estimator_

In [251]:
# Predict on the validation split with the fitted grid search
predictions = gs_rand_forest.predict(X=X_val)

# Same error metrics as for the default model, for comparison.
print(
    "RMSE:",
    np.sqrt(mean_squared_error(y_true=y_val, y_pred=predictions)),
)
print(
    "Coeficiente de determinación R^2:",
    r2_score(y_true=y_val, y_pred=predictions),
)
print(
    "MAE",
    mean_absolute_error(y_true=y_val, y_pred=predictions),
)

RMSE: 0.3842246615124569
Coeficiente de determinación R^2: 0.8357217552964871
MAE 0.1449565798263193


Después de hacer el grid search vemos que no conseguimos mejorar el modelo, por lo que lo dejo con los hiperparámetros por defecto, que es la configuración que mejor resultado me ha dado.

In [252]:
# Build the final model: a RandomForestClassifier with default hyperparameters
model = RandomForestClassifier(random_state=42)
# Train the model. y_train is a single-column DataFrame; flatten it to the 1-D
# shape (n_samples,) that fit() expects, which avoids a DataConversionWarning.
model.fit(X_train, y_train.values.ravel())
# Predict on the validation split
predictions = model.predict(X_val)

print("RMSE:", np.sqrt(mean_squared_error(y_val, predictions)))
print("Coeficiente de determinación R^2:", r2_score(y_val, predictions))
print("MAE", mean_absolute_error(y_val, predictions))

RMSE: 0.36091622440960475
Coeficiente de determinación R^2: 0.8550486076145475
MAE 0.12758851035404142


In [254]:
# Load the processed test data.
df_test = pd.read_csv(filepath_or_buffer="../data/processed/test.csv")

In [255]:
# Use "Date" as a datetime index.
df_test = df_test.set_index("Date")
df_test.index = pd.to_datetime(df_test.index)

In [256]:
# Features: everything except the NOx / benzene targets and the 'T' and 'Time'
# columns. Target: the NOx class, kept as a single-column DataFrame.
X = df_test.drop(
    columns=['NOx(GT)', 'clasificacion_nox', 'clasificacion_benceno', 'T', 'Time'],
)
Y = df_test[["clasificacion_nox"]]

In [257]:
# Predict on the test set
predictions = model.predict(X=X)

In [258]:
# Error metrics computed on the predicted class labels of the test set.
print(
    "RMSE:",
    np.sqrt(mean_squared_error(y_true=Y, y_pred=predictions)),
)
print(
    "Coeficiente de determinación R^2:",
    r2_score(y_true=Y, y_pred=predictions),
)
print(
    "MAE",
    mean_absolute_error(y_true=Y, y_pred=predictions),
)

RMSE: 0.3625055260409921
Coeficiente de determinación R^2: 0.8560964756585954
MAE 0.12606837606837606


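Como podemos ver, el modelo obtiene en la muestra de test resultados prácticamente iguales a los de validación (R^2 de 0.856 frente a 0.855, MAE de 0.126 frente a 0.128), por lo que no hay indicios de overfitting. En este caso clasificamos los valores de óxidos de nitrógeno.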