In [1]:
import numpy as np
import xgboost
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Baseline model that works well
def f(random_state=None):
    """Train and evaluate the baseline XGBoost regressor.

    Loads the features and the target from ``datasets/train``, keeps the
    samples whose absolute target value is below 1000, holds out 20 % of them
    as a test set, fits an ``XGBRegressor`` and prints the explained variance,
    R2 and MAE for both the training and the test split.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``train_test_split``. Pass an integer to make the
        train/test partition repeatable; the default keeps the original
        unseeded behaviour.

    Returns
    -------
    None
        The scores are only printed.
    """
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # files from a trusted source.
    features = np.load("datasets/train/X_big.npy", allow_pickle=True)
    target = pd.read_csv("datasets/train/y_big.csv")["energy_consumption_per_annum"]

    # Build the filter once and apply the same boolean mask to both objects so
    # feature rows and target values stay aligned.
    keep = (target.abs() < 1000).to_numpy()
    features, target = features[keep], target[keep]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.20, random_state=random_state
    )

    model = xgboost.XGBRegressor(
        n_estimators=500, max_depth=5, eta=0.3, subsample=0.7, colsample_bytree=0.8
    )
    model.fit(X_train, y_train)

    # Same three metrics for each split, training split first.
    for split, X_split, y_split in (
        ("train", X_train, y_train),
        ("test", X_test, y_test),
    ):
        y_pred = model.predict(X_split)
        print(f"Explained variance {split} : {explained_variance_score(y_split, y_pred):.3f}")
        print(f"R2 score {split} : {r2_score(y_split, y_pred):.3f}")
        print(f"MAE score {split} : {mean_absolute_error(y_split, y_pred):.3f}")


# Run the baseline experiment; it prints the train and test scores.
f()

Explained variance train : 0.797
R2 score train : 0.797
MAE score train : 35.316
Explained variance test : 0.775
R2 score test : 0.775
MAE score test : 36.649


In [2]:
import xgboost
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
    ParameterGrid,
)
import datetime

# Number of parallel workers handed to the estimators / searches through
# their n_jobs argument.
N_JOBS = 10

In [3]:
# Load the feature array and the matching target column.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# files from a trusted source.
X_train = np.load("datasets/train/X_big.npy", allow_pickle=True)
y_train = pd.read_csv("datasets/train/y_big.csv")["energy_consumption_per_annum"]

In [4]:
# Keep only the samples whose absolute target value is below 1000. The mask
# is computed once and drives both the feature rows and the target values.
keep = abs(y_train) < 1000
ind = np.where(keep)
X_train = X_train[ind]
y_train = y_train[keep]

In [5]:
# Sanity check: dimensions of the feature array at this point.
X_train.shape

(1010268, 223)

In [6]:
# Hold out 20 % of the samples for testing. The split is seeded so that
# re-running the notebook reproduces the same partition.
# NOTE: this cell rebinds X_train / y_train, so re-running it on its own
# splits the already reduced training set again; re-run from the loading
# cell instead.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.20, random_state=42
)

In [7]:
def gridsearch(model, cv_params, X_train, y_train, scoring, n_iter=None, n_jobs=1):
    """Run a 3-fold hyper-parameter search, show the best results and save them.

    An exhaustive ``GridSearchCV`` is used when ``n_iter`` is falsy, otherwise
    a ``RandomizedSearchCV`` that evaluates ``n_iter`` candidates. The
    cross-validation results are sorted by mean test score, the five best rows
    are displayed, and the whole table is pickled to
    ``cv_results/<timestamp>.pickle``.

    Parameters
    ----------
    model : estimator
        Estimator instance handed to the search object.
    cv_params : dict
        Mapping from parameter name to the list of values to try.
    X_train, y_train
        Training data passed to ``search.fit``.
    scoring : str or callable
        Forwarded as the ``scoring`` argument of the search object.
    n_iter : int or None, default=None
        Number of sampled candidates; ``None`` (or 0) selects the grid search.
    n_jobs : int, default=1
        Forwarded as the ``n_jobs`` argument of the search object.

    Returns
    -------
    GridSearchCV or RandomizedSearchCV
        The fitted search object, so callers can use ``best_estimator_``,
        ``best_params_`` and ``cv_results_`` afterwards.
    """
    import os  # local import: only needed to prepare the output directory

    # Create the output directory before fitting so that a long search cannot
    # be lost to a missing folder when the results are written.
    output_dir = "cv_results"
    os.makedirs(output_dir, exist_ok=True)

    print("RandomizedSearchCV" if n_iter else "GridSearchCV")
    print(f"Model : {model.__class__.__name__}")
    print("******************")
    print(f"Number of total parameters combinations : {len(ParameterGrid(cv_params))}")

    # Settings shared by both kinds of search.
    shared = dict(scoring=scoring, cv=3, verbose=1, n_jobs=n_jobs)
    if n_iter:
        print(f"Number of total parameters combinations tested : {n_iter}\n")
        search = RandomizedSearchCV(model, cv_params, n_iter=n_iter, **shared)
    else:
        search = GridSearchCV(model, cv_params, **shared)
    search.fit(X_train, y_train)

    cv_results = pd.DataFrame(search.cv_results_).sort_values(
        by="mean_test_score", ascending=False
    )

    print(f"Best params : {search.best_params_}")

    print("Top 5 :")
    # `display` is the IPython rich-display helper available in notebook cells.
    display(cv_results.head(5))

    date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filepath = f"{output_dir}/{date}.pickle"
    cv_results.to_pickle(filepath)
    print(f"CV results saved in {filepath}")

    # Previously nothing was returned, so `regr = gridsearch(...)` was always None.
    return search

In [8]:
# Exhaustive 4 x 3 grid (tree count x depth) for XGBoost; eta, subsample and
# colsample_bytree are held fixed on the estimator.
cv_params = dict(n_estimators=[200, 300, 500, 700], max_depth=[4, 5, 6])

model = xgboost.XGBRegressor(
    n_jobs=N_JOBS,
    eta=0.3,
    subsample=0.7,
    colsample_bytree=0.8,
)

regr = gridsearch(
    model=model,
    cv_params=cv_params,
    X_train=X_train,
    y_train=y_train,
    scoring="explained_variance",
)

GridSearchCV
Model : XGBRegressor
******************
Number of total parameters combinations : 12
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best params : {'max_depth': 5, 'n_estimators': 700}
Top 5 :


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,229.051931,0.115223,1.542183,0.024567,5,700,"{'max_depth': 5, 'n_estimators': 700}",0.774769,0.774171,0.773438,0.774126,0.000544,1
11,270.55241,0.567286,1.636882,0.024094,6,700,"{'max_depth': 6, 'n_estimators': 700}",0.7736,0.771555,0.773394,0.77285,0.000919,2
10,194.063862,0.277247,1.539046,0.020046,6,500,"{'max_depth': 6, 'n_estimators': 500}",0.773349,0.77127,0.772756,0.772458,0.000875,3
6,164.742435,0.350668,1.50411,0.019978,5,500,"{'max_depth': 5, 'n_estimators': 500}",0.772954,0.772324,0.771522,0.772267,0.000586,4
3,189.222854,0.551321,1.54709,0.024097,4,700,"{'max_depth': 4, 'n_estimators': 700}",0.772797,0.771982,0.771815,0.772198,0.000429,5


CV results saved in cv_results/2023-01-15_07-00-38.pickle


In [None]:
# Exhaustive 4 x 3 grid (tree count x depth) for a random forest.
cv_params = dict(n_estimators=[200, 300, 500, 700], max_depth=[4, 5, 6])

model = RandomForestRegressor(n_jobs=N_JOBS)

regr = gridsearch(
    model=model,
    cv_params=cv_params,
    X_train=X_train,
    y_train=y_train,
    scoring="explained_variance",
)

In [None]:
# Randomized search for XGBoost: 10 candidates drawn from a grid of
# 2 * 3 * 3 * 3 * 3 * 3 = 486 combinations.
cv_params = dict(
    n_estimators=[300, 500],
    max_depth=[4, 5, 6],
    subsample=[0.3, 0.5, 0.7],
    eta=[0.1, 0.2, 0.3],
    colsample_bytree=[0.3, 0.5, 0.7],
    max_leaves=[0, 2, 10],
)

model = xgboost.XGBRegressor(n_jobs=N_JOBS)

regr = gridsearch(
    model=model,
    cv_params=cv_params,
    X_train=X_train,
    y_train=y_train,
    scoring="explained_variance",
    n_iter=10,
)

In [None]:
# Randomized search for a random forest: 10 candidates drawn from a grid of
# 2 * 3 * 3 * 3 = 54 combinations.
cv_params = dict(
    n_estimators=[300, 500],
    max_depth=[4, 5, 6],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[2, 5, 10],
)

model = RandomForestRegressor(n_jobs=N_JOBS)

regr = gridsearch(
    model=model,
    cv_params=cv_params,
    X_train=X_train,
    y_train=y_train,
    scoring="explained_variance",
    n_iter=10,
)

In [None]:
# Randomized search for gradient boosting: 10 candidates drawn from a grid of
# 2 * 3 * 3 * 3 * 3 = 162 combinations. The estimator is built without
# n_jobs, so the search itself is run with N_JOBS workers.
cv_params = dict(
    n_estimators=[300, 500],
    max_depth=[4, 5, 6],
    subsample=[0.3, 0.5, 0.7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[2, 5, 10],
)

model = GradientBoostingRegressor()

regr = gridsearch(
    model=model,
    cv_params=cv_params,
    X_train=X_train,
    y_train=y_train,
    scoring="explained_variance",
    n_iter=10,
    n_jobs=N_JOBS,
)

In [None]:
# Exhaustive 4 x 3 grid (tree count x depth) for gradient boosting. The
# estimator is built without n_jobs, so the search itself is run with
# N_JOBS workers.
cv_params = dict(n_estimators=[200, 300, 500, 700], max_depth=[4, 5, 6])

model = GradientBoostingRegressor()

regr = gridsearch(
    model=model,
    cv_params=cv_params,
    X_train=X_train,
    y_train=y_train,
    scoring="explained_variance",
    n_jobs=N_JOBS,
)