# 3.0 - Modelado, ajuste y evaluación

In [1]:
# librerias

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts 

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

**Datos**

Selecciono la características previamente evaluadas y solamente cargo esas. Elimino los outliers y cambio el tipo de dato.

In [2]:
cols=['accommodates', 'air_conditioning', 'availability_30', 'availability_365', 'availability_60', 'availability_90',
      'bathrooms', 'bedrooms', 'beds', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
      'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'cleaning_fee',
      'dishwasher', 'extra_people', 'guests_included','latitude', 'longitude', 'maximum_nights', 'minimum_nights', 
      'number_of_reviews', 'number_of_reviews_ltm', 'room_type_private_room', 'room_type_shared_room', 'security_deposit',
      'price']

len(cols)

27

In [3]:
# carga de datos
listings=pd.read_csv('../data/transform_data/listings_normal.csv', usecols=cols)

listings=listings[(listings.price>=10) & (listings.price<=196)]  # eliminacion de outliers

# cambio en el tamaño del tipo de dato
for c in listings.select_dtypes(include='int'):
    listings[c]=pd.to_numeric(listings[c], downcast='integer')

for c in listings.select_dtypes(include='float'):
    listings[c]=pd.to_numeric(listings[c], downcast='float')

**Modelo**

Importo y entreno un Catboost Regressor, realizo predicción con el paquete de entrenamiento y de testeo, para darme cuenta de posible sobreajuste, midiendo RMSE, MAE y R2.

In [4]:
ctr=CTR(verbose=0)

In [5]:
X=listings.drop('price', axis=1)
y=listings.price

X_train, X_test, y_train, y_test = tts(X, y, train_size=0.8, test_size=0.2, random_state=42)

ctr.fit(X_train, y_train)

y_pred=ctr.predict(X_train)

print(f'Train RMSE: {mse(y_train, y_pred, squared=False)}')
print(f'Train MAE: {mae(y_train, y_pred)}')
print(f'Train R2: {r2(y_train, y_pred)}')

Train RMSE: 17.09389499575402
Train MAE: 11.909514534360929
Train R2: 0.7995457095749583


In [6]:
y_pred=ctr.predict(X_test)

print(f'Test RMSE: {mse(y_test, y_pred, squared=False)}')
print(f'Test MAE: {mae(y_test, y_pred)}')
print(f'Test R2: {r2(y_test, y_pred)}')

Test RMSE: 21.713858907632996
Test MAE: 14.82603810439655
Test R2: 0.7113638913656304


**Ajuste de hiperparámetros**

Ajuste bayesiano de hiperparámetros.

In [7]:
# aprioris, espacio de hiperparámetros

espacio={
    'n_estimators':hp.quniform('n_estimators', 100, 700, 25),
    
    'learning_rate':hp.uniform('learning_rate', 0.01, 1.0),
    
    'depth':hp.quniform('depth', 7, 16, 1)
}

In [8]:
def objetivo(espacio):
    
    modelo=CTR(verbose=0,
               n_estimators=int(espacio['n_estimators']),
               learning_rate=espacio['learning_rate'],
               depth=int(espacio['depth'])
              )
    
    eval_set=[(X_train, y_train), (X_test, y_test)]
    
    modelo.fit(X_train, y_train, eval_set=eval_set)
    
    y_pred=modelo.predict(X_test)
    
    rmse=mse(y_test, y_pred, squared=False)
    
    return {'loss': rmse, 'status': STATUS_OK}

In [9]:
mejor=fmin(fn=objetivo,
          space=espacio,
          algo=tpe.suggest,
          max_evals=20,
          trials=Trials())

mejor

100%|███████████████████████████████████████████████████████████████████████████| 20/20 [16:07<00:00, 48.39s/trial, best loss: 21.86404944782249]


{'depth': 10.0, 'learning_rate': 0.039754086894035215, 'n_estimators': 575.0}

In [10]:
modelo_ajustado=CTR(
    verbose=0,
    n_estimators=int(mejor['n_estimators']),
    learning_rate=mejor['learning_rate'],
    depth=int(mejor['depth']),
    )


modelo_ajustado.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x1793adb20>

In [11]:
y_pred=modelo_ajustado.predict(X_train)

print(f'Train RMSE: {mse(y_train, y_pred, squared=False)}')
print(f'Train MAE: {mae(y_train, y_pred)}')
print(f'Train R2: {r2(y_train, y_pred)}')

Train RMSE: 15.211099645927568
Train MAE: 10.496648155215967
Train R2: 0.8412716451101935


In [12]:
y_pred=modelo_ajustado.predict(X_test)

print(f'Test RMSE: {mse(y_test, y_pred, squared=False)}')
print(f'Test MAE: {mae(y_test, y_pred)}')
print(f'Test R2: {r2(y_test, y_pred)}')

Test RMSE: 21.864343257948462
Test MAE: 14.850326087135647
Test R2: 0.7073493375446744


**Evaluación**

###### MAE


$$MAE = \frac{1}{n}\sum_{i=1}^{n}|y_i-\hat{y}_i|$$


pertenece al intervalo [0, +$\infty$)

###### RMSE


$$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^{2}}$$


pertenece al intervalo [0, +$\infty$) y se cumple que:

$$MAE <= RMSE <= MAE · \sqrt{n}$$

In [15]:
listings.describe()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,room_type_private_room,room_type_shared_room,air_conditioning,dishwasher
count,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0,18936.0
mean,40.420036,-3.695224,-0.062693,-0.045266,-0.071341,-0.059136,66.749789,-0.032686,-0.044413,-0.02336,-0.008899,0.004723,0.000949,-0.055918,-0.049728,-0.044702,-0.027016,0.051326,0.052987,-0.019191,-0.017139,-0.016245,0.009985,0.361164,0.014734,0.624578,0.229563
std,0.021906,0.02687,0.904652,0.969154,0.898778,0.911714,38.641353,0.860545,0.767335,0.906895,0.813106,0.999886,1.060908,0.972367,0.978803,0.983002,0.993797,1.037925,1.023062,0.9903,1.003426,0.963728,1.057526,0.480351,0.120489,0.484245,0.420563
min,40.33247,-3.86391,-1.139633,-1.747547,-1.574992,-1.292693,10.0,-0.459455,-0.703313,-0.548597,-0.522511,-0.157275,-0.008475,-1.115888,-1.231071,-1.303542,-1.193883,-0.57927,-0.678573,-0.32692,-0.301845,-0.357682,-0.103323,0.0,0.0,0.0,0.0
25%,40.409222,-3.70748,-0.645354,-0.347512,-0.411921,-0.64198,36.0,-0.459455,-0.703313,-0.548597,-0.522511,-0.157275,-0.008417,-1.115888,-1.231071,-1.274689,-1.047364,-0.548849,-0.678573,-0.32692,-0.301845,-0.357682,-0.103323,0.0,0.0,0.0,0.0
50%,40.41803,-3.70142,-0.151076,-0.347512,-0.411921,0.008732,60.0,-0.459455,-0.116803,-0.548597,-0.522511,-0.11098,-0.007005,-0.2301,-0.010284,0.081392,-0.19755,-0.396746,-0.40547,-0.295958,-0.270356,-0.357682,-0.103323,0.0,0.0,1.0,0.0
75%,40.428169,-3.69099,0.343203,-0.347512,0.75115,0.008732,89.0,0.200941,0.32308,0.1634,0.397414,-0.064685,-0.007005,0.832845,0.861707,0.860418,1.055193,0.196457,0.413838,-0.172109,-0.175891,-0.131436,-0.103323,1.0,0.0,1.0,0.0
max,40.56274,-3.52766,6.274544,15.052868,11.218792,31.242945,196.0,19.176319,13.959438,10.131353,16.097458,51.878078,145.258194,1.541475,1.384901,1.29321,1.480101,9.064079,8.834509,7.444613,7.633257,9.144665,18.479164,1.0,1.0,1.0,1.0


###### R2


$$R2 = 1 - \frac{\sum_{i=1}^{n}(y_i-\hat{y}_i)^{2}}{\sum_{i=1}^{n}(y_i-\bar{y})^{2}}$$

Se opta por el modelo Catboost con sus valores por defecto.

In [13]:
mae(y_test, y_pred)/listings.price.mean()*100

22.24774993678672