# 3.0 - Modelado, ajuste y evaluación

In [1]:
# librerias

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts 

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

**Datos**

In [2]:
# Columns read from the listings CSV (passed to `usecols` when loading).
# 'price' is the regression target; every other column is used as a feature.
# NOTE(review): 'x', 'y', 'z' look like encoded coordinates - confirm against the transform step.
cols = [
    'accommodates',
    'air_conditioning',
    'availability_30',
    'availability_365',
    'availability_60',
    'availability_90',
    'bathrooms',
    'bedrooms',
    'beds',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'cleaning_fee',
    'dishwasher',
    'dryer',
    'elevator',
    'extra_people',
    'guests_included',
    'maximum_nights',
    'minimum_nights',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'room_type_private_room',
    'room_type_shared_room',
    'security_deposit',
    'x',
    'y',
    'z',
    'price',
]

In [3]:
# Load only the modelling columns from the transformed listings file.
listings_raw=pd.read_csv('../data/transform_data/listings_normal.csv', usecols=cols)

# Outlier removal: keep listings whose price lies in [PRICE_MIN, PRICE_MAX] (both ends inclusive).
PRICE_MIN=10
PRICE_MAX=196

# `.copy()` makes the filtered frame independent of `listings_raw`, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
listings=listings_raw.loc[listings_raw.price.between(PRICE_MIN, PRICE_MAX)].copy()

# Shrink numeric columns to the smallest dtype that can hold their values.
# np.integer / np.floating match every integer / float width, whereas the strings
# 'int' / 'float' only match the platform default width.
for c in listings.select_dtypes(include=np.integer):
    listings[c]=pd.to_numeric(listings[c], downcast='integer')

for c in listings.select_dtypes(include=np.floating):
    listings[c]=pd.to_numeric(listings[c], downcast='float')

In [4]:
# Baseline model: CatBoost regressor with library-default hyperparameters;
# verbose=0 keeps the training log out of the notebook output.
ctr = CTR(
    verbose=0,
)

In [None]:
# Separate the features from the target ('price').
X=listings.drop('price', axis=1)
y=listings.price

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = tts(X, y, train_size=0.8, test_size=0.2, random_state=42)

# Fit the baseline model on the training partition.
ctr.fit(X_train, y_train)

# Training-set metrics for the baseline model.
y_pred=ctr.predict(X_train)

# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so the root is taken explicitly to work on every version.
print(f'Train RMSE: {np.sqrt(mse(y_train, y_pred))}')
print(f'Train MAE: {mae(y_train, y_pred)}')
print(f'Train R2: {r2(y_train, y_pred)}')

In [None]:
# Test-set metrics for the baseline model.
y_pred=ctr.predict(X_test)

# RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
print(f'Test RMSE: {np.sqrt(mse(y_test, y_pred))}')
print(f'Test MAE: {mae(y_test, y_pred)}')
print(f'Test R2: {r2(y_test, y_pred)}')

In [None]:
# Priors: the hyperopt search space for the CatBoost hyperparameters.
# hp.quniform samples come back as floats, so integer-valued parameters
# (n_estimators, depth) must be cast with int() wherever they are consumed.
space = dict(
    # number of trees: 100 to 700 in steps of 25
    n_estimators=hp.quniform('n_estimators', 100, 700, 25),
    # learning rate: uniform on [0.01, 1.0]
    learning_rate=hp.uniform('learning_rate', 0.01, 1.0),
    # tree depth: 7 to 16 in steps of 1
    depth=hp.quniform('depth', 7, 16, 1),
)

In [None]:
def objetivo(space):
    """Hyperopt objective: validation RMSE of a CatBoost model for one sampled configuration.

    Parameters
    ----------
    space : dict
        One sample from the search space, with keys 'n_estimators',
        'learning_rate' and 'depth'. The integer-valued entries are cast
        with int() before being handed to CatBoost.

    Returns
    -------
    dict
        {'loss': <validation RMSE>, 'status': STATUS_OK}, the format hyperopt's
        fmin expects from an objective function.

    Notes
    -----
    Reads the notebook globals X_train and y_train. The model is scored on a
    validation fold carved out of the training data, not on X_test / y_test:
    tuning against the test set would leak it into model selection and make
    the test metrics reported afterwards optimistically biased.
    """

    modelo=CTR(
        verbose=0,
        n_estimators=int(space['n_estimators']),
        learning_rate=space['learning_rate'],
        depth=int(space['depth'])
    )

    # Fixed seed: every trial is scored on the same validation fold, so the
    # losses of different configurations are directly comparable.
    X_fit, X_val, y_fit, y_val = tts(X_train, y_train, train_size=0.8, test_size=0.2, random_state=42)

    # Only the validation fold is passed as eval_set; the test set is never
    # seen during the search.
    eval_set=[(X_val, y_val)]

    modelo.fit(X_fit, y_fit, eval_set=eval_set)

    y_pred=modelo.predict(X_val)

    # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error
    # was deprecated in scikit-learn 1.4 and later removed.
    rmse=float(np.sqrt(mse(y_val, y_pred)))

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Keep the Trials object in a variable so the search history (losses, sampled
# values) can be inspected after the optimisation finishes.
trials=Trials()

# Seeded TPE search so the tuning run is reproducible.
# NOTE: hyperopt >= 0.2.7 expects a numpy Generator for `rstate`; on older
# releases pass np.random.RandomState(42) instead.
best=fmin(fn=objetivo,
          space=space,
          algo=tpe.suggest,
          max_evals=20,
          trials=trials,
          rstate=np.random.default_rng(42))

best

In [None]:
# Rebuild CatBoost with the best configuration found by the search.
# The integer-valued hyperparameters are cast with int() before use.
mejores_params = {
    'verbose': 0,
    'n_estimators': int(best['n_estimators']),
    'learning_rate': best['learning_rate'],
    'depth': int(best['depth']),
}

modelo_ajustado = CTR(**mejores_params)

# Refit on the full training partition.
modelo_ajustado.fit(X_train, y_train)

In [None]:
# Training-set metrics for the tuned model.
y_pred=modelo_ajustado.predict(X_train)

# RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
print(f'Train RMSE: {np.sqrt(mse(y_train, y_pred))}')
print(f'Train MAE: {mae(y_train, y_pred)}')
print(f'Train R2: {r2(y_train, y_pred)}')

In [None]:
# Test-set metrics for the tuned model.
y_pred=modelo_ajustado.predict(X_test)

# RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
print(f'Test RMSE: {np.sqrt(mse(y_test, y_pred))}')
print(f'Test MAE: {mae(y_test, y_pred)}')
print(f'Test R2: {r2(y_test, y_pred)}')