# <font color=BLUE size=10> 9ª Competição FLAI de 
# <font color=BLUE size=12> MACHINE LEARNING

***
**Author**: [Camila Maestrelli](https://www.linkedin.com/in/camila-maestrelli-leobons/)

Este notebook é relativo à Competição de Machine Learning da Flai -

## <b>Etapas do projeto: </b>

### 1. Definição do problema
### 2. Importação dos dados
### 3. Entendimento dos dados
### 4. Limpeza dos dados
### 5. Análise exploratória
### 6. Pré-processamento
### 7. Algoritmo e criação do modelo
### 8. Interpretação dos resultados e avaliação dos modelos


## <font color=green> <b> 1. Definição do problema </b>

#### <b> Objetivo: </b>

* Prever a demanda de aluguéis de bicicleta a partir de variáveis de clima e dia, obtendo o menor valor de RMSE


## <font color=green> <b> 2. Importação dos dados </b>

In [4]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 100)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv

# ANOVA
#!pip install statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scipy.stats as stats

# Machine Learning library
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import VotingRegressor

#!pip install lightgbm
from lightgbm.sklearn import LGBMRegressor
#!pip install xgboost
from xgboost import XGBRegressor

from sklearn.model_selection import cross_validate, RepeatedKFold, KFold, cross_val_predict
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion


from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler # used when there are many outliers
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer

#!pip install category_encoders
import category_encoders as ce   

from sklearn.impute import SimpleImputer, KNNImputer      
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor

#metrics
from sklearn.metrics import mean_absolute_error , r2_score, mean_squared_error

import warnings
from datetime import datetime

import joblib

In [5]:
train_url = 'https://raw.githubusercontent.com/camilamaestrelli/Demand-Forecasting-Regression_Competition/main/treino.csv'
test_url = 'https://raw.githubusercontent.com/camilamaestrelli/Demand-Forecasting-Regression_Competition/main/teste.csv'

treino = pd.read_csv(train_url)
teste = pd.read_csv(test_url)

display(treino.head())
treino.shape

Unnamed: 0,hora,dia,feriado,estação,temperatura,chuva,umidade,sol,visibilidade,vento,aluguéis
0,16,sábado,não,verão,39.4,0.0,61.1,2.19,92.0%,3.49,1318
1,21,sexta,não,primavera,22.2,0.0,63.3,0.0,32.0%,2.89,686
2,16,segunda,não,verão,40.5,0.0,52.8,2.51,93.0%,3.97,831
3,7,segunda,sim,outono,25.6,0.0,70.9,0.0,97.0%,1.09,15
4,9,segunda,não,verão,39.9,0.0,58.1,1.69,62.0%,1.09,865


(4500, 11)

In [6]:
treino['visibilidade'] = treino['visibilidade'].apply(lambda x: x.split('%')[0])
treino['visibilidade'] = pd.to_numeric(treino['visibilidade'])
treino.visibilidade = treino.visibilidade / 100

teste['visibilidade'] = teste['visibilidade'].apply(lambda x: x.split('%')[0])
teste['visibilidade'] = pd.to_numeric(teste['visibilidade'])
teste.visibilidade = teste.visibilidade / 100

In [7]:
treino.feriado = treino.feriado.map({'sim': 1, 'não': 0})
teste.feriado = teste.feriado.map({'sim': 1, 'não': 0})

In [8]:
dict_dia = {'domingo': 1,
            'segunda': 2,
            'terça': 3,
            'quarta': 4,
            'quinta': 5,
            'sexta': 6, 
            'sábado': 7}



treino.dia = treino.dia.map(dict_dia)
teste.dia = teste.dia.map(dict_dia)

In [9]:
dict_estacao = {'inverno': 1,
                    'primavera': 2,
                    'outono': 3,
                    'verão': 4,
                   }


dict_estacao_reverso = {1: 'inverno',
                    2: 'primavera',
                    3: 'outono',
                    4: 'verão',
                   }


treino['estação'] = treino['estação'].map(dict_estacao)
teste['estação'] = teste['estação'].map(dict_estacao)

In [10]:
np.sqrt(treino.eval('''
    teste = temperatura * visibilidade * sol * vento * hora / umidade
    teste_final = (teste - aluguéis) ** 2
''')['teste_final'].mean())

960.2251272166366

In [11]:
treino

Unnamed: 0,hora,dia,feriado,estação,temperatura,chuva,umidade,sol,visibilidade,vento,aluguéis
0,16,7,0,4,39.4,0.0,61.1,2.19,0.92,3.49,1318
1,21,6,0,2,22.2,0.0,63.3,0.00,0.32,2.89,686
2,16,2,0,4,40.5,0.0,52.8,2.51,0.93,3.97,831
3,7,2,1,3,25.6,0.0,70.9,0.00,0.97,1.09,15
4,9,2,0,4,39.9,0.0,58.1,1.69,0.62,1.09,865
...,...,...,...,...,...,...,...,...,...,...,...
4495,6,6,0,4,30.4,0.0,84.5,0.03,0.13,1.57,723
4496,16,4,0,3,28.5,0.0,48.3,0.87,0.67,3.49,1151
4497,15,6,0,2,35.1,0.0,37.0,2.20,0.95,3.49,1321
4498,18,1,0,3,32.7,0.0,85.2,0.18,0.17,3.85,533


## <font color=green> <b> 6. Pré-processamento </b>

# Transformações

### Funções para transformação

* Função para criar coluna `dia útil`

In [12]:
def aux_dia_util(df):
    df['dia_util'] = df[['feriado', 'dia']].apply(lambda x: 
                                                  0 if ((x.feriado == 1) or (x.dia == 1) or (x.dia == 7)) 
                                                  else 1, axis = 1)   
    print('dia util')
    return df[['dia_util']]

func_dia_util = FunctionTransformer(aux_dia_util, check_inverse=False)

###############################################################################################################################


def aux_dia_util(x):
    if (x['feriado'] == 1) or (x['dia'] == 1 or x['dia'] == 7):
        return 0
    else:
        return 1
    
def dados_dia_util(df):
    df['dia_util'] = df[['feriado', 'dia']].apply(aux_dia_util, axis = 1)   
    return df

treino['dia_util'] = treino[['feriado', 'dia']].apply(aux_dia_util, axis = 1)   
teste['dia_util'] = teste[['feriado', 'dia']].apply(aux_dia_util, axis = 1)


* Funções para criar colunas `horas pico`

In [13]:
def aux_horas_pico_manha(df):
    df['horas_pico_manha'] = df.hora.map(lambda x: 1 if ((x >= 7) and (x <= 9)) else 0)
    print('pico manha')
    return df[['horas_pico_manha']]
    
func_pico_manha = FunctionTransformer(aux_horas_pico_manha, check_inverse=False)
    
def aux_horas_pico_tarde(df):
    df['horas_pico_tarde'] = df.hora.map(lambda x: 1 if ((x >= 17) and (x <= 19)) else 0)
    print('pico tarde')
    return df[['horas_pico_tarde']]

func_pico_tarde = FunctionTransformer(aux_horas_pico_tarde, check_inverse=False)

def aux_horas_picos(df):
    df['horas_picos'] = df[['hora', 'dia_util', 'dia']].apply(lambda x: 1 if ((((x.dia == 18) or (x.dia == 8)) and x.dia_util == 1) or ((x.dia == 1) and (x.hora == 18)) or ((x.dia == 7) and ( 15 <=x.hora <= 18))) else 0, 
                                                       axis=1)
    print('picos')
    return df[['horas_picos']]

func_picos = FunctionTransformer(aux_horas_picos, check_inverse=False)



* Função para criar coluna `horas vale`

In [14]:
def aux_horas_vale(df):
    df['horas_vale'] = df.hora.map(lambda x: 1 if ((x >= 1) and (x <= 6)) else 0)
    print('vale')
    return df[['horas_vale']]

func_horas_vale = FunctionTransformer(aux_horas_vale, check_inverse=False)

* Função para criar coluna `período do dia`

In [15]:
def aux_periodo(x):
    
    if x < 6:
        return 0 # madrugada
    elif x < 12:
        return 1 # manhã
    elif x < 18:
        return 2 # tarde
    else:
        return 3 # noite

def aux_periodo_dia(df):
    df['periodo_dia'] = df.hora.map(aux_periodo)
    print('periodo do dia')
    return df[['periodo_dia']]

func_periodo_dia = FunctionTransformer(aux_periodo_dia, check_inverse=False)

* Funções para trasnformação das variáveis contínuas

In [16]:
# transformações para as colunas numéricas

from sklearn.base import TransformerMixin, BaseEstimator

class CustomInverser(BaseEstimator, TransformerMixin):
    def __init__(self, value = 1):
        self.value = value
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        colunas = X.columns
        X[colunas] = np.divide(self.value, X[colunas])
        X.fillna(0, inplace=True)
        return X
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    
    
def aux_inverso(df):
    coluna = df.columns[0]
    df['inverso_'+coluna] = np.divide(1, df[coluna])
    df.fillna(0, inplace=True)

    print('inverso ', coluna)
    return df[['inverso_'+coluna]]

def aux_raiz(df):
    coluna = df.columns[0]
    df['raiz_'+coluna] = np.sqrt(df[coluna])

    print('raiz ', coluna)
    return df[['raiz_'+coluna]]

def aux_inverso_raiz(df):
    coluna = df.columns[0]
    df['inverso_raiz_'+coluna] = np.divide(1, np.sqrt(df[coluna]))
    df.fillna(0, inplace=True)

    print('inverso da raiz ', coluna)
    return df[['inverso_raiz_'+coluna]]

# para variáveis numéricas
func_inverso = FunctionTransformer(aux_inverso, check_inverse=False)
func_raiz = FunctionTransformer(aux_raiz, check_inverse=False)
func_inverso_raiz = FunctionTransformer(aux_inverso_raiz, check_inverse=False)


In [17]:
# feature: vento
top_vento = treino['vento'].quantile(0.975)


def aux_vento(df):
    
    df.loc[df['vento']>top_vento] = top_vento
    print('auxiliar vento')
    return df

func_vento = FunctionTransformer(aux_vento, check_inverse=False)

In [None]:
# feature: chuva
top_chuva = 10

def aux_chuva(df):
    
    df.loc[df['chuva']>top_chuva] = top_chuva
    print('auxiliar chuva')
    return df

func_chuva = FunctionTransformer(aux_chuva, check_inverse=False)

In [None]:
# feature: chuva


def aux_chuva_binario(df):
    df['chuva'] = df['chuva'].map(lambda x: (0, 1)[x > 0])
    return df
    
#    df.loc[df['chuva']>0] = 1
#    print('auxiliar chuva binario')
#    return df

func_chuva_bin = FunctionTransformer(aux_chuva_binario, check_inverse=False)


def aux_sol_binario(df):
    df['sol'] = df['sol'].map(lambda x: (0, 1)[x > 0])
    return df

func_sol_bin = FunctionTransformer(aux_sol_binario, check_inverse=False)

In [None]:
# feature: umidade
bottom_umidade = treino.umidade.quantile(0.025)


def aux_umidade(df):
    
    df.loc[df['umidade']<bottom_umidade] = bottom_umidade
    print('auxiliar umidade')
    return df

func_umidade = FunctionTransformer(aux_umidade, check_inverse=False)

In [None]:
# feature: temperatura
top_temperatura = treino['temperatura'].quantile(0.975)

bottom_temperatura = treino['temperatura'].quantile(0.025)

def aux_temperatura(df):
    
    df.loc[df['temperatura']<bottom_temperatura] = bottom_temperatura
    print('auxiliar temperatura')
    return df

func_temperatura = FunctionTransformer(aux_temperatura, check_inverse=False)

In [None]:
# feature: visibilidade
top_visibilidade = 0.35


def aux_visibilidade(df):
    
    df.loc[df['visibilidade']>top_visibilidade] = top_visibilidade
    print('auxiliar visibilidade')
    return df

func_visibilidade = FunctionTransformer(aux_visibilidade, check_inverse=False)

In [18]:
# condições ideais
def aux_ideal(df):
    df['ideal'] = df[['temperatura', 'umidade', 'visibilidade', 'dia_util']].apply(lambda x: (0, 1)[x['temperatura'] > 27 and x['umidade'] > 75  and x['visibilidade'] < 0.35  and x['dia_util']==1], axis = 1)
    return df[['ideal']]

func_ideal = FunctionTransformer(aux_ideal, check_inverse=False)


* Função para não alterar nada, retornando o mesmo valor de entrada

In [19]:
def aux_no_change(df):
    print('no change', df.shape)
    return df


no_change = FunctionTransformer(aux_no_change)
  

    
# transformer mixin : https://stackoverflow.com/questions/46162855/fit-transform-takes-2-positional-arguments-but-3-were-given-with-labelbinarize

# Transformações do SKlearn

In [20]:
# Numerical transformations
max_abs = MaxAbsScaler()
sc = StandardScaler()
min_max = MinMaxScaler()
robust = RobustScaler() 
norma = Normalizer()
quanti_uni = QuantileTransformer(output_distribution='uniform')
quanti_norm = QuantileTransformer(output_distribution='normal')
box_cox = PowerTransformer(method = 'box-cox')
log1 = FunctionTransformer(np.log1p, check_inverse=False)
log = FunctionTransformer(np.log, check_inverse=False) 


poly = PolynomialFeatures(degree=2, 
                          interaction_only = True,
                          include_bias = False
                         )

# Categorical encoders
bi_encoder = ce.BinaryEncoder()
target_encoder = ce.target_encoder.TargetEncoder()
out_encoder = ce.leave_one_out.LeaveOneOutEncoder() 
median_encoder = ce.quantile_encoder.QuantileEncoder()

def aux_one_hot(df):
    print('one hot', df.shape)
    return df.iloc[:,:-1]
func_one_hot = FunctionTransformer(aux_one_hot)
  
one_hot = Pipeline(    steps = [
    ("trans", ce.one_hot.OneHotEncoder(return_df=True)),
    ("encode", func_one_hot),  
])

one_hot = ce.one_hot.OneHotEncoder(return_df=True)

one_hot = OneHotEncoder(drop = 'first')




# Criando o pipeline

In [22]:


# Transformações para adicionar colunas na df

# Período do dia
periodo_dia_pipeline = Pipeline(
    steps = [
        ("trans", func_periodo_dia),
        ("encode", one_hot),  # out_encoder, one_hot
        ])

prep_periodo_dia = ColumnTransformer([('periodo_dia', periodo_dia_pipeline, ['hora']),                                                     
                                 ], 
                                remainder='drop',
                                    )

# Dia útil
prep_dia_util = ColumnTransformer([('dia_util', func_dia_util, ['dia', 'feriado']),                                                     
                                 ], 
                                remainder='drop')

# Pico da manhã
prep_horas_pico_manha = ColumnTransformer([('pico_manha', func_pico_manha, ['hora']),                                                     
                                 ], 
                                remainder='drop')

# Pico da tarde
prep_horas_pico_tarde = ColumnTransformer([('pico_tarde', func_pico_tarde, ['hora']),                                                     
                                 ], 
                                remainder='drop')

# máximos dos picos
prep_horas_picos = ColumnTransformer([('picos', func_picos, ['hora', 'dia_util', 'dia']),                                                     
                                 ], 
                                remainder='drop')

# Horas vale
prep_horas_vale = ColumnTransformer([('horas_vale', func_horas_vale, ['hora']),                                                     
                                 ], 
                                remainder='drop')


# Transformações nas colunas previamente existentes

estacao_pipeline = Pipeline(
    steps = [
        ("trans", ce.ordinal.OrdinalEncoder()),
        ("encode", out_encoder),  # out_encoder, no_change ou one_hot
        ])

temp_pipeline = Pipeline(
    steps = [
        ("trans", no_change), # func_inverso, func_raiz, func_inverso_raiz, no_change
        ("scaler", sc),  #
        ])

chuva_pipeline = Pipeline(
    steps = [
        ("trans", no_change), # func_inverso, func_raiz, func_inverso_raiz, no_change
        ("scaler", sc),  #
        ])

umidade_pipeline = Pipeline(
    steps = [
        ("trans", no_change), # func_inverso, func_raiz, func_inverso_raiz, no_change
        ("scaler", sc),  #
        ])

sol_pipeline = Pipeline(
    steps = [
        ("trans", no_change), # func_inverso, func_raiz, func_inverso_raiz, no_change
        ("scaler", sc),  #
        ])

visibilidade_pipeline = Pipeline(
    steps = [
        ("trans", no_change), # func_inverso, func_raiz, func_inverso_raiz, no_change
        ("scaler", sc),  #
        ])

vento_pipeline = Pipeline(
    steps = [
        ("trans", func_vento), # func_inverso, func_raiz, func_inverso_raiz, no_change
        ("scaler", sc),  #
        ])


prep_colunas = ColumnTransformer([('hora', out_encoder, ['hora']), # out_encoder, one_hot, no_change, spline
                                  ('dia', out_encoder, ['dia']), 
                                  ('estacao', estacao_pipeline , ['estação']),
                                  ('temperatura', temp_pipeline,['temperatura']),  
                                  ('chuva', chuva_pipeline,['chuva']),      
                                  ('umidade', umidade_pipeline,['umidade']),  
                                  ('sol', sol_pipeline,['sol']),          
                                  ('visibilidade', visibilidade_pipeline, ['visibilidade']),  
                                  ('vento', vento_pipeline, ['vento']),                                                       
                                 ], 
                                remainder='passthrough',
                                #sparse_threshold = 0, 
                                 verbose=True)

prep_inverso = ColumnTransformer([
                                  ('temperatura', func_inverso,['temperatura']),  
                                     
                                  ('umidade', func_inverso,['umidade']),  
                                         
                                  ('visibilidade', func_inverso, ['visibilidade']),  
                                  ('vento', func_inverso, ['vento']),                                                       
                                 ], 
                                remainder='drop',
                                #sparse_threshold = 0, 
                                 verbose=True)

prep_raiz = ColumnTransformer([
                                  ('temperatura', func_raiz,['temperatura']),  
                                  ('chuva', func_raiz,['chuva']),      
                                  ('umidade', func_raiz,['umidade']),  
                                  ('sol', func_raiz,['sol']),          
                                  ('visibilidade', func_raiz, ['visibilidade']),  
                                  ('vento', func_raiz, ['vento']),                                                       
                                 ], 
                                remainder='drop',
                                #sparse_threshold = 0, 
                                 verbose=True)

prep_inverso_raiz = ColumnTransformer([
                                  ('temperatura', func_inverso_raiz,['temperatura']),  
                          
                                  ('umidade', func_inverso_raiz,['umidade']),  
              
                                  ('visibilidade', func_inverso_raiz, ['visibilidade']),  
                                  ('vento', func_inverso_raiz, ['vento']),                                                       
                                 ], 
                                remainder='drop',
                                #sparse_threshold = 0, 
                                 verbose=True)



# Unindo as transformações criadas acima
uniao = FeatureUnion([('ct1', prep_periodo_dia), 
                      ('ct2', prep_dia_util),
                      ('ct3', prep_horas_pico_manha),
                      ('ct4', prep_horas_pico_tarde),
                      ('ct5', prep_horas_picos),
                      ('ct6', prep_horas_vale),
                      ('ct7', prep_colunas),
                     # ('ct8', prep_inverso),
                     # ('ct9', prep_raiz),
                     # ('ct10', prep_inverso_raiz)
                     ], verbose=True)


preprocessor = Pipeline(steps=[
    ("uniao", uniao),
    ('poly', PolynomialFeatures(degree=2, interaction_only = True, include_bias = False)),
                               
                              ],
                       verbose=True)

lgbm = LGBMRegressor()

pipe_model = Pipeline(steps=[('prep', preprocessor), ('modelo', lgbm)])

pipe_final = TransformedTargetRegressor(regressor=pipe_model,
                                   transformer = StandardScaler())





# Definindo os parâmetros para o Random Search

In [23]:

import scipy.stats.distributions as dists

param_grid = [{
    # periodo do dia
    #'regressor__prep__uniao__ct1': [no_change, prep_periodo_dia],     
    
    # dia útil
    #'regressor__prep__uniao__ct2': [no_change, prep_dia_util],    
    
    # horas pico manhã
    #'regressor__prep__uniao__ct3': [no_change, prep_horas_pico_manha],
    
    # horas pico tarde
    #'regressor__prep__uniao__ct4': [no_change, prep_horas_pico_tarde],
    
    # máximas horas de pico
    #'regressor__prep__uniao__ct5': [no_change, prep_horas_picos],
    
    # horas vale
    #'regressor__prep__uniao__ct6': [no_change, prep_horas_vale],  
    
    
    
    # coluna hora
    'regressor__prep__uniao__ct7__hora': [out_encoder, one_hot, no_change,  #spline_hora
                                         ],             
    
    # coluna dia        
    'regressor__prep__uniao__ct7__dia': [out_encoder, no_change, one_hot,  # spline_dia
                                                ],
    
    # coluna estação
    'regressor__prep__uniao__ct7__estacao': [out_encoder, one_hot],
        
    # coluna temperatura        
    'regressor__prep__uniao__ct7__temperatura': [no_change,sc, min_max, log1, max_abs, robust, func_raiz, func_inverso_raiz, func_inverso],
   # 'regressor__prep__ct7__temperatura__scaler': [sc, min_max, log1, max_abs, robust, no_change],
    
    # coluna chuva
    'regressor__prep__uniao__ct7__chuva': [no_change, sc, min_max, log1, max_abs, robust, func_raiz],
    #'regressor__prep__ct7__chuva__scaler': [sc, min_max, log1, max_abs, robust, no_change],
    
    # coluna umidade
    'regressor__prep__uniao__ct7__umidade': [no_change, sc, min_max, log1, max_abs, robust, func_raiz, func_inverso_raiz, func_inverso],
    #'regressor__prep__ct7__umidade__scaler': [sc, min_max, log1, max_abs, robust, no_change],
    
    # coluna sol
    'regressor__prep__uniao__ct7__sol': [no_change, sc, min_max, log1, max_abs, robust, func_raiz],
    #'regressor__prep__ct7__sol__scaler': [sc, min_max, log1, max_abs, robust, no_change],
    
    # coluna visibilidade,
    'regressor__prep__uniao__ct7__visibilidade': [no_change, sc, min_max, log1, max_abs, robust, func_raiz, func_inverso_raiz, func_inverso],
    #'regressor__prep__ct7__visibilidade__scaler': [sc, min_max, log1, max_abs, robust, no_change],
    
    # coluna vento
    'regressor__prep__uniao__ct7__vento__trans': [no_change, func_vento],
    'regressor__prep__uniao__ct7__vento__scaler': [no_change, sc, min_max, log1, max_abs, robust, func_raiz, func_inverso_raiz, func_inverso],
    #'regressor__prep__ct7__vento__scaler': [sc, min_max, log1, max_abs, robust, no_change],
    
    
    
    # Polynomial Transformer    
    'regressor__prep__poly__interaction_only': [True, False],
    
    
    
    # Parámetros do modelo de regressão LGBM
    'regressor__modelo__reg_alpha': dists.loguniform(1e-8,10,loc=0, scale=1),
    'regressor__modelo__reg_lambda': dists.loguniform(1e-8,10,loc=0, scale=1),
    'regressor__modelo__num_leaves': dists.randint(low=2, high=256),
    'regressor__modelo__colsample_bytree': dists.uniform(loc=0.4, scale=0.6),
    
    'regressor__modelo__subsample': dists.uniform(loc=0.4, scale=0.6),
    'regressor__modelo__subsample_freq': dists.randint(low=1, high=7),
    'regressor__modelo__min_child_samples': dists.randint(low=5, high=100),
    'regressor__modelo__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    
    'regressor__modelo__max_depth': dists.randint(low=6, high=18),
    'regressor__modelo__learning_rate': dists.triang(loc=0.008, c = 0.1, scale=0.18),
    'regressor__modelo__n_estimators':  dists.randint(low=100, high=800),
    'regressor__modelo__min_split_gain': dists.loguniform(1e-8,15,loc=0, scale=1),
    'regressor__modelo__random_state': [1986],
    
    
    # Alterando as transformações na variável target
    'transformer': [sc, box_cox, robust, min_max, max_abs, no_change],
    


    
    }]


    

## Funções úteis

In [None]:
def rmse_eval(df, modelo, kf = KFold(n_splits = 15, random_state = 1986)):

    X = df.drop(columns = ['aluguéis'], axis = 1)
    y = df['aluguéis']

    scTreino = [] 
    scTeste = [] 

    df_treino = df.copy()

    for i_treino,i_teste in kf.split(X): 
        X_treino, y_treino = X.iloc[i_treino],y.iloc[i_treino] 
        X_teste, y_teste = X.iloc[i_teste],y.iloc[i_teste] 
        pipe_final.fit(X_treino,y_treino) 
        y_predTreino = pipe_final.predict(X_treino) 
        y_predTeste = pipe_final.predict(X_teste) 
        df_treino.loc[i_teste,'preditos'] = y_predTeste
        scTreino.append(np.sqrt(mean_squared_error(y_treino,y_predTreino))) 
        scTeste.append(np.sqrt(mean_squared_error(y_teste,y_predTeste))) 

    print("RSME treino e: {:.2f}, std: {:.2f}".format(np.mean(scTreino), np.std(scTreino))) 
    print("RSME teste  e: {:.2f}, std: {:.2f}".format(np.mean(scTeste), np.std(scTeste))) 
    
    df_treino['dif'] = df_treino['aluguéis'] - df_treino.preditos
    df_treino['abs_dif'] = abs(df_treino['aluguéis'] - df_treino.preditos)
    df_treino['erro_quadra'] = df_treino['abs_dif'].pow(2)
    
    return df_treino

def erro_groupby(df, coluna):
    
    df_analise = df.copy()
    
    df_analise = df_analise.groupby(coluna)['erro_quadra', 'abs_dif'].mean()
    df_analise['erro_quadra'] = np.sqrt(df_analise['erro_quadra'])
    
    return df_analise

# Random Search

In [91]:

df = treino.copy()  
    
X = df.drop(columns = ['aluguéis'], axis = 1)
y = df['aluguéis']


kf = KFold(n_splits = 5)

grid_search_pipe = RandomizedSearchCV(pipe_final    ,
                             param_grid,
                             cv=kf,
                             scoring='neg_mean_squared_error',
                             verbose=11,
                             n_jobs=-1,
                             n_iter=500,
                             return_train_score = True,          
                            )
grid_search_pipe.fit(X,y)      

resultados = pd.DataFrame(grid_search_pipe.cv_results_)
resultados.mean_test_score = np.sqrt(abs(resultados.mean_test_score))
resultados.mean_train_score = np.sqrt(abs(resultados.mean_train_score))
resultados = resultados.sort_values(by = 'rank_test_score')
display(resultados)
    
print(np.sqrt(abs(grid_search_pipe.best_score_)))


Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:  3.2min
[Paralle

[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  6.0min
[Paralle

[Parallel(n_jobs=-1)]: Done 398 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 399 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 400 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 401 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 403 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 405 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 406 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 407 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 411 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed:  8.7min
[Paralle

[Parallel(n_jobs=-1)]: Done 531 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 532 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 533 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 534 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 535 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 537 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 538 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 539 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 545 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed: 10.9min
[Paralle

[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 665 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 667 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 669 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 670 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 671 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 679 tasks      | elapsed: 14.2min
[Paralle

[Parallel(n_jobs=-1)]: Done 799 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 801 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 802 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 803 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 805 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 806 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 807 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 810 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 811 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 813 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 814 tasks      | elapsed: 17.3min
[Paralle

[Parallel(n_jobs=-1)]: Done 933 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 934 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 935 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 936 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 937 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 938 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 939 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 940 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 941 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 942 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 943 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 944 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 945 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 946 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 947 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed: 19.9min
[Paralle

[Parallel(n_jobs=-1)]: Done 1065 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Done 1066 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 1067 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 1068 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 1069 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 1070 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 1071 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 1072 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 1073 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1074 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1075 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1076 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1077 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1078 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1079 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1196 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1197 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1198 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1199 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1201 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1203 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1204 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1205 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 1206 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 1207 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 1208 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 1209 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 1210 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 1211 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1328 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 1329 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 1330 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 1331 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1332 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1333 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1334 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1335 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1336 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1337 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1338 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 1339 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 1340 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 1341 tasks      | elapsed: 27.4min
[Parallel(n_jobs=-1)]: Done 1342 tasks      | elapsed: 27.4min
[Parallel(n_jobs=-1)]: Done 1343 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1459 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 1460 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 1461 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 1462 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 1463 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1464 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1465 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1466 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1467 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1468 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1469 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1470 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 1471 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 1472 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 1473 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 1474 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1591 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 1592 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 1593 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 1594 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 1595 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 1596 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1597 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1598 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1599 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1600 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1601 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1602 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1603 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1604 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1605 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1606 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1722 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 1723 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done 1724 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done 1725 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done 1726 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 1727 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 1729 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 1730 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 1731 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 1732 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 1733 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 1734 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 1735 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed: 34.9min
[Parallel(n_jobs=-1)]: Done 1737 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1853 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1854 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1855 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1856 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1857 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1858 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1859 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1860 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1861 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 1862 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 1863 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 1864 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 1865 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 1866 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 1867 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 1868 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 1984 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 1985 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 1986 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 1987 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 1988 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 1989 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 1990 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1991 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1992 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1993 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1994 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1995 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1996 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1997 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1998 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1999 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 2115 tasks      | elapsed: 41.9min
[Parallel(n_jobs=-1)]: Done 2116 tasks      | elapsed: 41.9min
[Parallel(n_jobs=-1)]: Done 2117 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 2118 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 2119 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 2120 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 2121 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2122 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2123 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2124 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2125 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2126 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2127 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2128 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2129 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 2130 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 2246 tasks      | elapsed: 44.1min
[Parallel(n_jobs=-1)]: Done 2247 tasks      | elapsed: 44.1min
[Parallel(n_jobs=-1)]: Done 2248 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 2249 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 2250 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 2251 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 2252 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 2253 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 2254 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 2255 tasks      | elapsed: 44.3min
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed: 44.3min
[Parallel(n_jobs=-1)]: Done 2257 tasks      | elapsed: 44.3min
[Parallel(n_jobs=-1)]: Done 2258 tasks      | elapsed: 44.3min
[Parallel(n_jobs=-1)]: Done 2259 tasks      | elapsed: 44.3min
[Parallel(n_jobs=-1)]: Done 2260 tasks      | elapsed: 44.3min
[Parallel(n_jobs=-1)]: Done 2261 tasks      | elapsed: 

[Parallel(n_jobs=-1)]: Done 2378 tasks      | elapsed: 46.8min
[Parallel(n_jobs=-1)]: Done 2379 tasks      | elapsed: 46.8min
[Parallel(n_jobs=-1)]: Done 2380 tasks      | elapsed: 46.8min
[Parallel(n_jobs=-1)]: Done 2381 tasks      | elapsed: 47.0min
[Parallel(n_jobs=-1)]: Done 2382 tasks      | elapsed: 47.0min
[Parallel(n_jobs=-1)]: Done 2383 tasks      | elapsed: 47.0min
[Parallel(n_jobs=-1)]: Done 2384 tasks      | elapsed: 47.0min
[Parallel(n_jobs=-1)]: Done 2385 tasks      | elapsed: 47.1min
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed: 47.1min
[Parallel(n_jobs=-1)]: Done 2387 tasks      | elapsed: 47.1min
[Parallel(n_jobs=-1)]: Done 2388 tasks      | elapsed: 47.1min
[Parallel(n_jobs=-1)]: Done 2389 tasks      | elapsed: 47.1min
[Parallel(n_jobs=-1)]: Done 2390 tasks      | elapsed: 47.1min
[Parallel(n_jobs=-1)]: Done 2391 tasks      | elapsed: 47.2min
[Parallel(n_jobs=-1)]: Done 2392 tasks      | elapsed: 47.2min
[Parallel(n_jobs=-1)]: Done 2393 tasks      | elapsed: 

periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.3s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.4s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4500, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4500, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...... (3 of 10) Processing estacao, total=   0.0s
raiz  temperatura
[ColumnTransformer] .. (4 of 10) Processing temperatura, total=   0.0s
[ColumnTransformer] ........ (5 of 10) Processing chuva, total=   0.0s
inverso  umidade
[ColumnTransformer] ...... (6 of 10) Processing umidade, total=   0.0s
raiz  sol
[Colum

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__modelo__colsample_bytree,param_regressor__modelo__learning_rate,param_regressor__modelo__max_depth,param_regressor__modelo__min_child_samples,param_regressor__modelo__min_child_weight,param_regressor__modelo__min_split_gain,param_regressor__modelo__n_estimators,param_regressor__modelo__num_leaves,param_regressor__modelo__random_state,param_regressor__modelo__reg_alpha,param_regressor__modelo__reg_lambda,...,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
432,9.764839,0.053694,0.451727,0.033491,0.573099,0.027359,16,17,10.0,0.000038,795,218,1986,2.70502,0.076022,...,-67692.594602,-93363.351161,-66687.643084,-70690.657085,-51873.786617,264.691531,13350.353160,1,-9461.799167,-9445.440702,-9668.451942,-9341.746815,-9756.242704,97.645974,153.286288
52,7.040711,0.313111,0.327819,0.037786,0.477877,0.064228,11,5,0.01,0.000886,427,44,1986,0.000397,0.000003,...,-69907.598318,-96042.573001,-69890.349921,-71234.735046,-55837.491926,269.411488,13015.295325,2,-1358.825530,-1368.281566,-1399.543465,-1373.062281,-1449.096019,37.279509,32.599257
434,3.424639,0.156007,0.296538,0.017340,0.570898,0.093392,15,40,10.0,0.000009,522,101,1986,1.101106,0.016042,...,-69690.863908,-97602.080571,-74957.491055,-76167.807947,-53476.858699,272.725174,14154.321516,3,-20424.507397,-20216.775195,-19578.307853,-20104.628271,-21987.160993,143.046412,811.946173
26,4.099700,0.066850,0.315551,0.022959,0.668983,0.070534,9,51,0.01,0.000014,630,189,1986,0.735385,0.836711,...,-72816.605707,-96362.828969,-73028.117354,-75612.363535,-55443.199547,273.226322,13017.419657,4,-16026.851859,-15162.765969,-14782.197549,-15891.310359,-16356.528370,125.075700,581.330245
228,3.085473,0.055269,0.311615,0.023659,0.471623,0.092158,7,17,10.0,0.001203,692,217,1986,0.000165,0.49203,...,-73445.945113,-97483.787868,-74292.870697,-74479.393137,-56465.344842,274.287200,13053.488203,5,-8124.658487,-8102.916893,-8070.603986,-8763.290687,-8637.304114,91.322258,297.563109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303,1.937628,0.082624,0.271822,0.035086,0.555681,0.042212,11,94,10000.0,0.0,365,45,1986,8.054538,0.000988,...,-960437.715843,-894521.956584,-917448.609705,-985796.560056,-933784.004891,968.709332,31992.641842,493,-932878.679999,-949353.098155,-943630.288660,-926566.201117,-939559.456542,968.709216,7989.923135
470,2.614561,0.103070,0.212702,0.015539,0.482818,0.139546,7,37,10000.0,0.0,649,182,1986,0.0,0.0,...,-960437.715843,-894521.956584,-917448.609705,-985796.560056,-933784.004891,968.709332,31992.641842,493,-932878.679999,-949353.098155,-943630.288660,-926566.201117,-939559.456542,968.709216,7989.923135
382,1.543719,0.057214,0.206553,0.011812,0.599817,0.063685,14,29,1000.0,0.025052,513,100,1986,0.0,0.001781,...,-960437.715843,-894521.956584,-917448.609705,-985796.560056,-933784.004891,968.709332,31992.641842,493,-932878.679999,-949353.098155,-943630.288660,-926566.201117,-939559.456542,968.709216,7989.923135
174,2.139870,0.463144,0.212889,0.009029,0.425257,0.06872,12,90,10000.0,0.0,688,211,1986,0.0641,0.000001,...,-960437.715843,-894521.956584,-917448.609705,-985796.560056,-933784.004891,968.709332,31992.641842,493,-932878.679999,-949353.098155,-943630.288660,-926566.201117,-939559.456542,968.709216,7989.923135


264.6915308610537


In [103]:
grid_best.regressor_[1]

LGBMRegressor(colsample_bytree=0.573099425153106,
              learning_rate=0.02735933533082987, max_depth=16,
              min_child_samples=17, min_child_weight=10.0,
              min_split_gain=3.832627927552628e-05, n_estimators=795,
              num_leaves=218, random_state=1986, reg_alpha=2.7050201034653627,
              reg_lambda=0.07602219372259673, subsample=0.7696869993717743,
              subsample_freq=2)

In [165]:
params = {'regressor__modelo__colsample_bytree': 0.573099425153106, 
           'regressor__modelo__learning_rate': 0.02735933533082987, 
           'regressor__modelo__max_depth': 16, 
           'regressor__modelo__min_child_samples': 17, 
           'regressor__modelo__min_child_weight': 10.0, 
           'regressor__modelo__min_split_gain': 3.832627927552628e-05, 
           'regressor__modelo__n_estimators': 795, 
           'regressor__modelo__num_leaves': 218, 
           'regressor__modelo__random_state': 1986, 
           'regressor__modelo__reg_alpha': 2.7050201034653627, 
           'regressor__modelo__reg_lambda': 0.07602219372259673, 
           'regressor__modelo__subsample': 0.7696869993717743, 
           'regressor__modelo__subsample_freq': 2, 
           'regressor__prep__poly__interaction_only': True, 
           'regressor__prep__uniao__ct7__chuva': StandardScaler(), 
           'regressor__prep__uniao__ct7__dia': no_change, 
           'regressor__prep__uniao__ct7__estacao': OneHotEncoder(drop='first'), 
           'regressor__prep__uniao__ct7__hora': no_change, 
           'regressor__prep__uniao__ct7__sol': func_raiz, 
           'regressor__prep__uniao__ct7__temperatura': func_raiz, 
           'regressor__prep__uniao__ct7__umidade': func_inverso, 
           'regressor__prep__uniao__ct7__vento__scaler': StandardScaler(), 
           'regressor__prep__uniao__ct7__vento__trans': func_vento, 
           'regressor__prep__uniao__ct7__visibilidade': log1, 
           'transformer': StandardScaler()}

In [166]:
SEED = 1986
np.random.seed(SEED)


grid_best = grid_search_pipe.best_estimator_.set_params(**params)
df_treino_one = rmse_eval(treino, grid_best)


# RSME teste  e: 258.39, std: 29.46



periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.2s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.2s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4200, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4200, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...... (3 of 10) Processing estacao, total=   0.0s
raiz  temperatura
[ColumnTransformer] .. (4 of 10) Processing temperatura, total=   0.0s
[ColumnTransformer] ........ (5 of 10) Processing chuva, total=   0.0s
inverso  umidade
[ColumnTransformer] ...... (6 of 10) Processing umidade, total=   0.0s
raiz  sol
[Colum

periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (4200, 1)
no change (4200, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (300, 1)
no change (300, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.2s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.2s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4200, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4200, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...

periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (4200, 1)
no change (4200, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (300, 1)
no change (300, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.2s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.2s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4200, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4200, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...

periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (4200, 1)
no change (4200, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (300, 1)
no change (300, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
RSME treino e: 96.04, std: 0.65
RSME teste  e: 258.39, std: 29.46


In [29]:
erro_groupby(df_treino_one, 'estação')

  df_analise = df_treino_one.groupby('estação')['erro_quadra', 'abs_dif'].mean()


Unnamed: 0_level_0,erro_quadra,abs_dif
estação,Unnamed: 1_level_1,Unnamed: 2_level_1
1,77.803779,52.359051
2,234.827545,150.259562
3,371.874561,233.138761
4,260.232834,165.964567


In [168]:
erro_groupby(df_treino_one, 'dia')

  df_analise = df_treino_one.groupby('dia')['erro_quadra', 'abs_dif'].mean()


Unnamed: 0_level_0,erro_quadra,abs_dif
dia,Unnamed: 1_level_1,Unnamed: 2_level_1
1,327.238826,200.785184
2,236.839312,138.233929
3,285.285587,158.705883
4,234.106594,139.065601
5,227.863262,137.829033
6,257.632709,151.848711
7,235.144493,137.078055


In [169]:
erro_groupby(df_treino_one, 'hora')

  df_analise = df_treino_one.groupby('hora')['erro_quadra', 'abs_dif'].mean()


Unnamed: 0_level_0,erro_quadra,abs_dif
hora,Unnamed: 1_level_1,Unnamed: 2_level_1
0,155.728892,100.886549
1,128.903799,95.869541
2,106.809209,74.648047
3,75.471061,52.721923
4,55.439143,38.213855
5,49.259737,34.769774
6,125.107703,90.031752
7,304.513796,213.354317
8,513.274691,389.149574
9,215.286001,148.702693


In [170]:
df_treino_one.describe()

Unnamed: 0,hora,dia,feriado,estação,temperatura,chuva,umidade,sol,visibilidade,vento,aluguéis,dia_util,preditos,dif,abs_dif,erro_quadra
count,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0,4500.0
mean,11.644889,3.972444,0.049778,2.527111,27.108356,0.161864,62.087133,0.578104,0.69694,2.322933,727.169556,0.689333,726.772599,0.396957,151.997425,67634.72
std,6.901146,1.976669,0.21751,1.110048,7.976684,1.179291,15.176118,0.878158,0.296047,1.243427,646.886075,0.462818,589.703707,260.095353,211.048331,241807.7
min,0.0,1.0,0.0,1.0,6.5,0.0,18.1,0.0,0.02,0.25,5.0,0.0,-45.199793,-2287.679031,0.021776,0.0004741998
25%,6.0,2.0,0.0,2.0,20.9,0.0,50.5,0.0,0.45,1.33,208.75,0.0,243.77364,-75.117696,30.986269,960.1489
50%,12.0,4.0,0.0,3.0,27.8,0.0,61.1,0.01,0.83,2.05,534.5,1.0,562.113089,-2.028427,79.235835,6278.322
75%,18.0,6.0,0.0,4.0,33.5,0.0,73.9,0.94,0.97,3.01,1096.0,1.0,1068.645301,83.222848,179.534604,32232.67
max,23.0,7.0,1.0,4.0,44.6,27.65,92.0,3.52,0.97,9.13,3566.0,1.0,3159.896101,1425.882888,2287.679031,5233475.0


In [172]:
np.sqrt(df_treino_one.erro_quadra.mean())

260.0667550460558

# Modelo 11

In [25]:
# Período do dia
periodo_dia_pipeline = Pipeline(
    steps = [
        ("trans", func_periodo_dia),
        ("encode", one_hot),  # out_encoder, one_hot
        ])

prep_periodo_dia = ColumnTransformer([('periodo_dia', periodo_dia_pipeline, ['hora']),                                                     
                                 ], 
                                remainder='drop',
                                    )

# Dia útil
prep_dia_util = ColumnTransformer([('dia_util', func_dia_util, ['dia', 'feriado']),                                                     
                                 ], 
                                remainder='drop')

# Pico da manhã
prep_horas_pico_manha = ColumnTransformer([('pico_manha', func_pico_manha, ['hora']),                                                     
                                 ], 
                                remainder='drop')

# Pico da tarde
prep_horas_pico_tarde = ColumnTransformer([('pico_tarde', func_pico_tarde, ['hora']),                                                     
                                 ], 
                                remainder='drop')

# máximos dos picos
prep_horas_picos = ColumnTransformer([('picos', func_picos, ['hora', 'dia_util', 'dia']),                                                     
                                 ], 
                                remainder='drop')

# Horas vale
prep_horas_vale = ColumnTransformer([('horas_vale', func_horas_vale, ['hora']),                                                     
                                 ], 
                                remainder='drop')


# Transformações nas colunas previamente existentes

vento_pipeline = Pipeline(
    steps = [
        ("trans", func_vento), # func_inverso, func_raiz, func_inverso_raiz, no_change
        ("scaler", sc),  #
        ])


prep_colunas = ColumnTransformer([('hora', no_change, ['hora']), # out_encoder, one_hot, no_change, spline
                                  ('dia', no_change, ['dia']), 
                                  ('estacao', one_hot , ['estação']),
                                  ('temperatura', func_raiz,['temperatura']),  
                                  ('chuva', sc,['chuva']),      
                                  ('umidade', func_inverso,['umidade']),  
                                  ('sol', func_raiz,['sol']),          
                                  ('visibilidade', log1, ['visibilidade']),  
                                  ('vento', vento_pipeline, ['vento']),                                                       
                                 ], 
                                remainder='passthrough',
                                #sparse_threshold = 0, 
                                 verbose=True)


# Unindo as transformações criadas acima
uniao = FeatureUnion([('ct1', prep_periodo_dia), 
                      ('ct2', prep_dia_util),
                      ('ct3', prep_horas_pico_manha),
                      ('ct4', prep_horas_pico_tarde),
                      ('ct5', prep_horas_picos),
                      ('ct6', prep_horas_vale),
                      ('ct7', prep_colunas),
                     # ('ct8', prep_inverso),
                     # ('ct9', prep_raiz),
                     # ('ct10', prep_inverso_raiz)
                     ], verbose=True)


preprocessor = Pipeline(steps=[
    ("uniao", uniao),
    ('poly', PolynomialFeatures(degree=2, interaction_only = True, include_bias = False)),
                               
                              ],
                       verbose=True)

params_ = {'colsample_bytree': 0.573099425153106, 
           'learning_rate': 0.02735933533082987, 
           'max_depth': 16, 
           'min_child_samples': 17, 
           'min_child_weight': 10.0, 
           'min_split_gain': 3.832627927552628e-05, 
           'n_estimators': 795, 
           'num_leaves': 218, 
           'random_state': 1986, 
           'reg_alpha': 2.7050201034653627, 
           'reg_lambda': 0.07602219372259673, 
           'subsample': 0.7696869993717743, 
           'subsample_freq': 2, 
          }

lgbm = LGBMRegressor(**params_)

pipe_model = Pipeline(steps=[('prep', preprocessor), ('modelo', lgbm)])

pipe_final = TransformedTargetRegressor(regressor=pipe_model,
                                   transformer = StandardScaler())



In [26]:
SEED = 1986
np.random.seed(SEED)


X = treino.drop(columns = ['aluguéis'], axis = 1)
y = treino['aluguéis']

kf = KFold(n_splits = 15)

scTreino = [] 
scTeste = [] 

df_treino_one = treino.copy()




for i_treino,i_teste in kf.split(X): 
    X_treino, y_treino = X.iloc[i_treino],y.iloc[i_treino] 
    X_teste, y_teste = X.iloc[i_teste],y.iloc[i_teste] 
    pipe_final.fit(X_treino,y_treino) 
    y_predTreino = pipe_final.predict(X_treino) 
    y_predTeste = pipe_final.predict(X_teste) 
    df_treino_one.loc[i_teste,'preditos'] = y_predTeste
    scTreino.append(np.sqrt(mean_squared_error(y_treino,y_predTreino))) 
    scTeste.append(np.sqrt(mean_squared_error(y_teste,y_predTeste))) 
    
print("RSME treino e: {:.2f}, std: {:.2f}".format(np.mean(scTreino), np.std(scTreino))) 
print("RSME teste  e: {:.2f}, std: {:.2f}".format(np.mean(scTeste), np.std(scTeste))) 

periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.2s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.2s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4200, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4200, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...... (3 of 10) Processing estacao, total=   0.0s
raiz  temperatura
[ColumnTransformer] .. (4 of 10) Processing temperatura, total=   0.0s
[ColumnTransformer] ........ (5 of 10) Processing chuva, total=   0.0s
inverso  umidade
[ColumnTransformer] ...... (6 of 10) Processing umidade, total=   0.0s
raiz  sol
[Colum

periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (4200, 1)
no change (4200, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (300, 1)
no change (300, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.3s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.3s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4200, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4200, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...

periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (4200, 1)
no change (4200, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (300, 1)
no change (300, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.2s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.3s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4200, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4200, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...

periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (4200, 1)
no change (4200, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
periodo do dia
dia util
pico manha
pico tarde
picos
vale
no change (300, 1)
no change (300, 1)
raiz  temperatura
inverso  umidade
raiz  sol
auxiliar vento
RSME treino e: 96.04, std: 0.65
RSME teste  e: 258.39, std: 29.46


In [27]:
df_treino_one.to_csv('df_treino_one_grid.csv', index = False)

In [32]:
np.sqrt(df_treino_one.erro_quadra.mean())

260.0667550460558

In [35]:
np.sqrt(df_treino_one.loc[df_treino_one['estação'] != 1].erro_quadra.mean())

294.2565817837772

In [39]:
df_treinando5 = treino.copy()
df_testando5 = teste.copy()

SEED = 1986
np.random.seed(SEED)


X_final3 = df_treinando5.drop(columns = ['aluguéis'], axis = 1)
y_final3 = df_treinando5['aluguéis']

pipe_final.fit(X_final3, y_final3)
preditos_final_5 = pipe_final.predict(df_testando5)

df_testando5['preditos7'] = preditos_final_5

df_testando5.to_csv('df_teste_grid_one.csv', index = False)

periodo do dia
[FeatureUnion] ........... (step 1 of 7) Processing ct1, total=   0.0s
dia util
[FeatureUnion] ........... (step 2 of 7) Processing ct2, total=   0.3s
pico manha
[FeatureUnion] ........... (step 3 of 7) Processing ct3, total=   0.0s
pico tarde
[FeatureUnion] ........... (step 4 of 7) Processing ct4, total=   0.0s
picos
[FeatureUnion] ........... (step 5 of 7) Processing ct5, total=   0.4s
vale
[FeatureUnion] ........... (step 6 of 7) Processing ct6, total=   0.0s
no change (4500, 1)
[ColumnTransformer] ......... (1 of 10) Processing hora, total=   0.0s
no change (4500, 1)
[ColumnTransformer] .......... (2 of 10) Processing dia, total=   0.0s
[ColumnTransformer] ...... (3 of 10) Processing estacao, total=   0.0s
raiz  temperatura
[ColumnTransformer] .. (4 of 10) Processing temperatura, total=   0.0s
[ColumnTransformer] ........ (5 of 10) Processing chuva, total=   0.0s
inverso  umidade
[ColumnTransformer] ...... (6 of 10) Processing umidade, total=   0.0s
raiz  sol
[Colum

In [40]:
df_testando5

Unnamed: 0,hora,dia,feriado,estação,temperatura,chuva,umidade,sol,visibilidade,vento,dia_util,preditos7
0,21,5,0,4,33.3,0.12,72.4,0.0,0.88,2.05,1,608.752623
1,0,2,0,3,25.0,0.00,73.1,0.0,0.34,0.85,1,529.713532
2,4,2,0,3,22.1,0.00,67.9,0.0,0.97,1.09,1,161.726900
3,6,3,0,4,36.5,0.00,80.7,0.0,0.53,1.93,1,536.957270
4,21,5,0,4,34.2,0.00,65.6,0.0,0.71,2.05,1,2239.565172
...,...,...,...,...,...,...,...,...,...,...,...,...
2995,23,7,0,2,18.5,0.00,46.8,0.0,0.92,0.49,0,395.259717
2996,21,1,0,1,15.5,0.00,62.6,0.0,0.60,3.85,0,174.728660
2997,23,6,0,1,22.0,0.00,78.4,0.0,0.13,3.49,1,320.330093
2998,4,5,0,4,36.6,0.00,70.1,0.0,0.97,1.45,1,242.877326


In [34]:
df_treino_one

Unnamed: 0,hora,dia,feriado,estação,temperatura,chuva,umidade,sol,visibilidade,vento,aluguéis,dia_util,preditos,dif,abs_dif,erro_quadra
0,16,7,0,4,39.4,0.0,61.1,2.19,0.92,3.49,1318,0,1173.457155,144.542845,144.542845,20892.634080
1,21,6,0,2,22.2,0.0,63.3,0.00,0.32,2.89,686,1,599.705361,86.294639,86.294639,7446.764699
2,16,2,0,4,40.5,0.0,52.8,2.51,0.93,3.97,831,1,1003.956132,-172.956132,172.956132,29913.823690
3,7,2,1,3,25.6,0.0,70.9,0.00,0.97,1.09,15,0,153.647722,-138.647722,138.647722,19223.190680
4,9,2,0,4,39.9,0.0,58.1,1.69,0.62,1.09,865,1,997.553325,-132.553325,132.553325,17570.384021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4495,6,6,0,4,30.4,0.0,84.5,0.03,0.13,1.57,723,1,575.560821,147.439179,147.439179,21738.311589
4496,16,4,0,3,28.5,0.0,48.3,0.87,0.67,3.49,1151,1,1233.086555,-82.086555,82.086555,6738.202522
4497,15,6,0,2,35.1,0.0,37.0,2.20,0.95,3.49,1321,1,1476.238909,-155.238909,155.238909,24099.118886
4498,18,1,0,3,32.7,0.0,85.2,0.18,0.17,3.85,533,0,948.611031,-415.611031,415.611031,172732.529050
