# GridSearch para regresión (ventas)

**Candidatos**
- BaggingRegressor
- GradientBoostingRegressor
- RandomForestRegressor

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
import re
import numpy as np
np.seterr(divide='ignore', invalid='ignore');  # warnings inutiles apagados!

In [2]:
def custom_features(dataframe_in):
    df = dataframe_in.copy(deep=True)

    df['month'] = pd.to_datetime(df['release_date']).dt.month
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.to_julian_date())

    df['revenue'] = pd.Series([0 for _ in range(len(dataframe_in))])

    df.loc[df.publisher.str.match('.*microsoft.*', flags=re.IGNORECASE).values, 'revenue'] = 10.260
    df.loc[df.publisher.str.match('.*netease.*', flags=re.IGNORECASE).values, 'revenue'] = 6.668
    df.loc[df.publisher.str.match('.*activision.*', flags=re.IGNORECASE).values, 'revenue'] = 6.388
    df.loc[df.publisher.str.match('.*electronic.*', flags=re.IGNORECASE).values, 'revenue'] = 5.537
    df.loc[df.publisher.str.match('.*bandai.*', flags=re.IGNORECASE).values, 'revenue'] = 3.018
    df.loc[df.publisher.str.match('.*square.*', flags=re.IGNORECASE).values, 'revenue'] = 2.386
    df.loc[df.publisher.str.match('.*nexon.*', flags=re.IGNORECASE).values, 'revenue'] = 2.286
    df.loc[df.publisher.str.match('.*ubisoft.*', flags=re.IGNORECASE).values, 'revenue'] = 1.446
    df.loc[df.publisher.str.match('.*konami.*', flags=re.IGNORECASE).values, 'revenue'] = 1.303
    df.loc[df.publisher.str.match('.*SEGA.*').values, 'revenue'] = 1.153
    df.loc[df.publisher.str.match('.*capcom.*', flags=re.IGNORECASE).values, 'revenue'] = 0.7673
    df.loc[df.publisher.str.match('.*warner.*', flags=re.IGNORECASE).values, 'revenue'] = 0.7324

    return df

In [3]:
from preprocessing import Nothing, CategoriesTokenizer


boc_some_values = CountVectorizer(
    tokenizer = CategoriesTokenizer(),
    max_df = 1.0,
    min_df = 0.05  # hiperparametro a optimizar
    # valores para GridSearch : [5%, 10%, 15%] ???
    )


boc_many_values = CountVectorizer(
    tokenizer = CategoriesTokenizer(),
    max_df = 1.0,
    min_df = 1  # hiperparametro a optimizar
    # valores para GridSearch : [5, 10, 15] ???
    )


preprocessing = ColumnTransformer(
    transformers=[
        ('BoC-plat',boc_some_values,'platforms'),
        ('BoC-cat',boc_some_values,'categories'),
        ('BoC-genres',boc_some_values,'genres'),
        ('BoC-tags',boc_some_values,'tags'),

        ('BoC-dev',boc_many_values,'developer'),
        ('BoC-pub',boc_many_values,'publisher'),

        ('OneHotEncoder',OneHotEncoder(handle_unknown='ignore'),['month']),
        ('MinMaxScaler',MinMaxScaler(),['required_age','price','release_date']),
        ('BoxCox',PowerTransformer(method='yeo-johnson'),['achievements','average_playtime','revenue']),
        ('unchanged',Nothing(),['english'])
])

In [4]:
df_train = pd.read_pickle('train.pickle')
df_train = custom_features(df_train)

#########################
# df_train = df_train[:1000]
#########################

X_train, X_eval, y_train, y_eval = train_test_split(df_train, df_train['estimated_sells'], test_size=0.3, random_state=0)

## Regresor Bagging

In [5]:
from sklearn.ensemble import BaggingRegressor

In [6]:
pipeline = Pipeline(steps = [
    ('procesamiento',preprocessing),
    ("selector", SelectPercentile(f_classif, percentile=95)),
    ("regressor",BaggingRegressor(random_state=0)),
])

grilla_bagging = {
    "regressor": [BaggingRegressor(random_state=0)],
    "selector__percentile" : [75, 85, 95, 100],
    "selector__score_func" : [f_classif],
    "regressor__n_estimators" : [5, 10, 15, 20],
    # 'regressor__base_estimator__max_depth' : [1, 4, 8, None],
    # 'regressor__max_samples' : [0.25, 0.5, 1.0]
    'regressor__max_samples' : [0.5, 0.75, 1.0]
}

In [7]:
gs = HalvingGridSearchCV(
    pipeline,
    grilla_bagging,
    scoring = 'r2',
    n_jobs=-1,
    verbose = 0
)

In [8]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore');

gs.fit(X_train, y_train)

 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] are constant.
  f = msb / msw
  f = msb / msw
  diff_b_a = subtract(b, a)
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

El mejor modelo encontrado es el siguiente

In [9]:
gs.best_params_ 

{'regressor': BaggingRegressor(max_samples=0.75, n_estimators=15, random_state=0),
 'regressor__max_samples': 0.75,
 'regressor__n_estimators': 15,
 'selector__percentile': 75,
 'selector__score_func': <function sklearn.feature_selection._univariate_selection.f_classif(X, y)>}

Y sus métricas son:

In [10]:
print("Resultados clasificador ganador de GridSearch Bagging Regressor")

y_pred = gs.predict(X_eval)
print("Error cuadrático medio = {}".format(mean_squared_error(y_eval,y_pred)))
print("Score R2 = {}".format(r2_score(y_eval,y_pred)))

Resultados clasificador ganador de GridSearch Bagging Regressor
Error cuadrático medio = 928108780472.7578
Score R2 = 0.4823003253459167


## Regresor GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

## Regresor RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor