In [214]:
import pandas as pd
import numpy as np
import time
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Machine Learning

## Carga de los datasets ya preparados

In [212]:
# Feature tables are read as DataFrames, one per split.
X_train_full, X_test, X_train, X_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/X_train_full.csv",
        "../data/X_test.csv",
        "../data/X_train.csv",
        "../data/X_val.csv",
    )
)

# Target files are flattened to 1-D arrays with .values.ravel().
y_train_full, y_test, y_train, y_val = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in (
        "../data/y_train_full.csv",
        "../data/y_test.csv",
        "../data/y_train.csv",
        "../data/y_val.csv",
    )
)

## Función para calcular diferentes métricas

In [56]:
def _report_split_metrics(label, y_true, y_pred, *tail):
    """Print RMSLE, RMSE, MAE and R^2 for one data split.

    Each line is prefixed with ``label``. Any extra positional ``tail`` values
    are appended to the R^2 line (used to emit a blank separator line).
    """
    print(f"{label} RMSLE:", np.sqrt(mean_squared_error(np.log(y_true), np.log(y_pred))))
    print(f"{label} RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print(f"{label} MAE:", mean_absolute_error(y_true, y_pred))
    print(f"{label} R^2:", r2_score(y_true, y_pred), *tail)


def calculate_train_val(clf, X_train, y_train, X_val, y_val):
    """Fit ``clf`` on the training split and print metrics for train and validation.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
        X_train, y_train: Features and targets used for fitting and for the train report.
        X_val, y_val: Features and targets used only for the validation report.

    Returns:
        None. All results are written to stdout.

    Note:
        RMSLE is computed as the RMSE of ``np.log`` of targets and predictions,
        so both need to be strictly positive for the value to be meaningful.
    """
    clf.fit(X_train, y_train)

    train_pred = clf.predict(X_train)
    print("--------------------------------------------\n", clf, "\n--------------------------------------------")
    # The trailing "\n" argument leaves an empty line between the two reports.
    _report_split_metrics("Train", y_train, train_pred, "\n")

    val_pred = clf.predict(X_val)
    _report_split_metrics("Validation", y_val, val_pred)

## Baseline

In [57]:
# Baseline estimator for the "Baseline" section: its metrics are the reference
# that the real models below are compared against.
# NOTE(review): DummyRegressor with default arguments presumably predicts the
# training mean (consistent with the Train R^2 of 0.0 shown below) — confirm.
clf = DummyRegressor()

In [58]:
# Fit the baseline on the training split and print its train/validation metrics.
calculate_train_val(clf, X_train, y_train, X_val, y_val)

--------------------------------------------
 DummyRegressor() 
--------------------------------------------
Train RMSLE: 0.4524639971016908
Train RMSE: 257.1718838742388
Train MAE: 176.44950355497159
Train R^2: 0.0 

Validation RMSLE: 0.4499698050847289
Validation RMSE: 253.29672169429384
Validation MAE: 175.9480864613195
Validation R^2: -7.871366087397469e-05


## Probar diferentes modelos

In [204]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [213]:
# Seed shared by the scikit-learn ensembles so that re-running the notebook
# reproduces the same comparison table (the saved model further down also uses 42).
# The third-party boosters (LightGBM, XGBoost, CatBoost) are left at their
# library defaults.
SEED = 42

# Base learners for the (currently disabled) stacking / voting ensembles below.
estimators = [
    ('RFR', RandomForestRegressor(random_state=SEED)),
    ('XGBR', XGBRegressor()),
    ('CAT', CatBoostRegressor(verbose=0)), 
    ]

# Candidate models, keyed by the short name shown in the results table.
models = {
    'LR': LinearRegression(),
    'RFR': RandomForestRegressor(random_state=SEED),
    'KNN': KNeighborsRegressor(),
    'GBR': GradientBoostingRegressor(random_state=SEED),
    'LGBM': lgb.LGBMRegressor(verbose=-1),
    'XGBR': XGBRegressor(),
    'CAT': CatBoostRegressor(verbose=0),
    'HistGBR': HistGradientBoostingRegressor(random_state=SEED),
    #'SR': StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor()),
    #'VR': VotingRegressor(estimators),
}

# Fit every candidate on the training split and score it on the validation split.
# One record per model is collected and the table is built once at the end.
results_list = []
for model_name, model in models.items():
    start_time = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # RMSLE here is the RMSE of the natural logs; it needs strictly positive
    # targets and predictions.
    rmsle = np.sqrt(mean_squared_error(np.log(y_val), np.log(y_pred)))
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    elapsed_time = time.time() - start_time

    results_list.append({
        "Model": model_name,
        "RMSLE": rmsle,
        "RMSE": rmse,
        "MAE": mae,
        "R^2": r2,
        # Keep seconds numeric: a pre-formatted string such as "10.00s" compares
        # lexicographically ("10.00s" < "9.00s") and would highlight the wrong row.
        "Time": elapsed_time,
    })

# Best (lowest) RMSLE first.
results_df = (
    pd.DataFrame(results_list)
    .sort_values(by="RMSLE", ascending=True)
    .reset_index(drop=True)
)

def highlight_min(series):
    """Return one CSS rule per entry: yellow background where the value equals the column minimum."""
    is_min = series == series.min()
    return ['background-color: yellow' if v else '' for v in is_min]

def highlight_max(series):
    """Return one CSS rule per entry: yellow background where the value equals the column maximum."""
    is_max = series == series.max()
    return ['background-color: yellow' if v else '' for v in is_max]

# Lower is better for the error metrics and the time; higher is better for R^2.
# "Time" is only formatted for display ("12.34s"); the underlying data stays numeric.
results = (
    results_df.style
    .apply(highlight_min, subset=["RMSLE", "RMSE", "MAE", "Time"])
    .apply(highlight_max, subset=["R^2"])
    .format({"Time": "{:.2f}s"})
)
results

Unnamed: 0,Model,RMSLE,RMSE,MAE,R^2,Time
0,RFR,0.046536,35.25972,15.260376,0.980621,22.36s
1,XGBR,0.05191,36.229888,17.862954,0.97954,0.33s
2,CAT,0.054008,37.070198,18.781402,0.97858,6.08s
3,LGBM,0.067114,43.937693,22.822676,0.969908,0.25s
4,HistGBR,0.067744,43.651354,23.033331,0.970299,0.93s
5,KNN,0.123373,106.344858,44.613456,0.823718,1.83s
6,GBR,0.132336,78.232958,47.194755,0.904599,7.61s
7,LR,0.172244,104.222067,61.228801,0.830685,0.18s


## Optimización modelos

## Comprobación del modelo

## Guardado del modelo

In [154]:
def save_model(model, X, y, name_model="model1"):
    """Fit ``model`` on ``(X, y)`` and pickle it to ``../models/{name_model}.pkl``.

    Args:
        model: Estimator exposing ``fit(X, y)``; it is fitted in place.
        X: Training features.
        y: Training targets.
        name_model: File name (without extension) used for the pickle.

    Returns:
        None.

    The target directory is created if it does not exist yet, and this is done
    before fitting so that a path problem surfaces before the training time is
    spent rather than after it.
    """
    import os  # local import: the notebook's import cell does not bring in os

    target_path = f'../models/{name_model}.pkl'
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    model.fit(X, y)
    with open(target_path, 'wb') as file:
        pickle.dump(model, file)

**CAMBIAR EL RandomForestRegressor() POR EL MEJOR MODELO**

In [164]:
# Fit a seeded random forest on the training split and pickle it under the
# default name ("model1").
# NOTE(review): this fits on X_train only, although X_train_full is also loaded
# at the top of the notebook — confirm which split the final model should use.
save_model(RandomForestRegressor(random_state=42), X_train, y_train)

#### Comprobar si se ha guardado bien

In [165]:
def loaded_model(name_model="model1"):
    """Load and return the object pickled at ``../models/{name_model}.pkl``.

    Args:
        name_model: File name (without extension) of the pickle to load.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If no pickle with that name exists.
    """
    # SECURITY: pickle.load can execute arbitrary code — only load files that
    # were produced by this project.
    with open(f'../models/{name_model}.pkl', 'rb') as file:
        return pickle.load(file)

In [167]:
# Reload the pickled model and score it on the validation split as a sanity
# check that the file was written correctly.
loaded_model1 = loaded_model()
loaded_val_pred = loaded_model1.predict(X_val)
loaded_val_log_mse = mean_squared_error(np.log(y_val), np.log(loaded_val_pred))
print("Validation RMSLE:", np.sqrt(loaded_val_log_mse))

Validation RMSLE: 0.04655770070954239
