# GridSearch para regresión (ventas)

**Candidatos**
- BaggingRegressor
- GradientBoostingRegressor
- RandomForestRegressor

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
import re
import numpy as np
np.seterr(divide='ignore', invalid='ignore');  # warnings inutiles apagados!

In [2]:
from preprocessing import Nothing, CategoriesTokenizer, custom_features


boc_some_values = CountVectorizer(
    tokenizer = CategoriesTokenizer(),
    max_df = 1.0,
    min_df = 0.05  # hiperparametro a optimizar
    # valores para GridSearch : [5%, 10%, 15%] ???
    )


boc_many_values = CountVectorizer(
    tokenizer = CategoriesTokenizer(),
    max_df = 1.0,
    min_df = 1  # hiperparametro a optimizar
    # valores para GridSearch : [5, 10, 15] ???
    )


preprocessing = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        ('unchanged',Nothing(),['english'])
])

In [3]:
df_train = pd.read_pickle('train.pickle')
df_train = custom_features(df_train)

#########################
df_train = df_train[:100]
#########################

X_train, X_eval, y_train, y_eval = train_test_split(df_train, df_train['estimated_sells'], test_size=0.3, random_state=0)

## Regresor Bagging

In [4]:
from sklearn.ensemble import BaggingRegressor

In [12]:
pipeline = Pipeline(steps = [
    ('procesamiento',preprocessing),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ("regressor",BaggingRegressor(random_state=0)),
])

grilla_bagging = {
    "regressor": [BaggingRegressor(random_state=0)],
    "selector__percentiler" : [20, 40, 80, 95, 100],
    "selector__score_func" : [f_classif],
    "regresor__n_estimators" : [5, 10, 15, 20],
    'regresor__base_estimator__max_depth' : [1, 4, 8, None],
    'regresor__max_samples' : [0.05, 0.2, 0.5, 1.0]
}

In [14]:
gs = HalvingGridSearchCV(
    pipeline,
    grilla_bagging,
    scoring = 'r2',
    # n_jobs=-1,
    verbose = 10
)

gs.fit(X_train, y_train)

n_iterations: 2
n_required_iterations: 6
n_possible_iterations: 2
min_resources_: 10
max_resources_: 70
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 320
n_resources: 10
Fitting 5 folds for each of 320 candidates, totalling 1600 fits
[CV 1/5; 1/320] START regresor__base_estimator__max_depth=1, regresor__max_samples=0.05, regresor__n_estimators=5, regressor=BaggingRegressor(random_state=0), selector__percentiler=20, selector__score_func=<function f_classif at 0x7f503d18f2e0>


ValueError: Invalid parameter 'regresor' for estimator Pipeline(steps=[('procesamiento',
                 ColumnTransformer(transformers=[('BoC-plat',
                                                  CountVectorizer(min_df=0.05,
                                                                  tokenizer=<preprocessing.CategoriesTokenizer object at 0x7f502bbfeda0>),
                                                  'platforms'),
                                                 ('BoC-cat',
                                                  CountVectorizer(min_df=0.05,
                                                                  tokenizer=<preprocessing.CategoriesTokenizer object at 0x7f502bbfc280>),
                                                  'categories'),
                                                 ('BoC-genres',
                                                  CountVectorizer(min_df=0.05...
                                                  'publisher'),
                                                 ('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['month']),
                                                 ('MinMaxScaler',
                                                  MinMaxScaler(),
                                                  ['required_age', 'price',
                                                   'release_date']),
                                                 ('BoxCox', PowerTransformer(),
                                                  ['achievements',
                                                   'average_playtime',
                                                   'revenue']),
                                                 ('unchanged', Nothing(),
                                                  ['english'])])),
                ('selector', SelectPercentile(percentile=95)),
                ('regressor', BaggingRegressor(random_state=0))]). Valid parameters are: ['memory', 'steps', 'verbose'].

El mejor modelo encontrado es el siguiente

In [8]:
gs.best_params_ 

AttributeError: 'HalvingGridSearchCV' object has no attribute 'best_params_'

Y sus métricas son:

In [None]:
print("Resultados clasificador ganador de GridSearch Bagging Regressor")

y_pred = gs.predict(X_eval)
print("Error cuadrático medio = {}".format(mean_squared_error(y_eval,y_pred)))
print("Score R2 = {}".format(r2_score(y_eval,y_pred)))

## Regresor GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

## Regresor RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor