In [1]:
import pandas as pd
import numpy as np
import pickle
import shap

import mlflow
import pycaret

from airbnb_prediction.mlflow_utils import UiConn
from pycaret.regression import *


# Render floats in pandas output with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Start the tracking UI through the project helper. The cell output reports the
# local address where it can be opened ("Access for UI at: ...").
# NOTE(review): presumably this is the MLflow UI, since the helper is imported
# from `airbnb_prediction.mlflow_utils` — confirm.
conn = UiConn()
conn.create_ui_session()

# Kept disabled so the UI stays up while the notebook runs; uncomment to end
# the UI session when finished.
#conn.terminate_ui_session()

Access for UI at: http://127.0.0.1:5555


In [3]:
# Load the processed modelling table. The context manager closes the file
# handle as soon as loading finishes instead of leaving it open.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own pipeline.
with open('../data/processed/model_data.pickle', 'rb') as model_data_file:
    df = pickle.load(model_data_file)

In [4]:
# Columns handed to the PyCaret `setup` call as `categorical_features`.
categorical_features = [
    'host_response_time', 'host_is_superhost', 'room_type', 'instant_bookable',
    'half_bath', 'regiao', 'property_type_refactor', 'is_host_rj',
]

# Columns handed to the PyCaret `setup` call as `numeric_features`.
numerical_features = [
    # listing capacity and stay limits
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'minimum_nights', 'maximum_nights',
    # review, host and derived columns
    'number_of_reviews', 'days_since_host', 'delta_nights', 'mean_reviews',
    # `count_*` columns
    'count_name', 'count_description',
    'count_neighborhood_overview', 'count_host_about',
]

Experiment with different values for the following `setup` options:
- normalize_method
- transformation
- remove_outliers
- feature_selection

In [5]:
# Configure the PyCaret regression experiment on every column except `id`,
# with `price` as the target.
session = setup(
    df.drop(columns='id'),
    target='price',
    # Feature typing: use the explicit column lists defined earlier.
    categorical_features=categorical_features,
    numeric_features=numerical_features,
    # Preprocessing switches.
    normalize=True,
    feature_selection=True,
    # Experiment logging.
    log_experiment=True,
    experiment_name='mlflow_pycaret',
    log_plots=True,
    log_profile=True,
    log_data=True,
    # Session and console settings.
    session_id=16,
    silent=True,
    verbose=False,
)

Summarize dataset:   0%|          | 0/36 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Compare the available regressors, ranking the results table by RMSE.
# NOTE(review): despite the plural name, this presumably holds a single
# estimator unless `n_select` is passed — confirm against the PyCaret docs.
best_models = compare_models(sort='RMSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,567.2252,8330457.3,2540.5997,0.1266,1.0733,1.5219,0.021
llar,Lasso Least Angle Regression,566.6005,8331816.0288,2542.0413,0.1243,1.0231,1.4715,0.019
br,Bayesian Ridge,624.9164,8300731.8003,2542.4655,0.1177,1.1533,1.8119,0.03
lasso,Lasso Regression,640.3346,8308957.0625,2546.188,0.1122,1.1613,1.9047,0.038
ridge,Ridge Regression,644.1634,8313972.35,2547.6104,0.1106,1.1672,1.9313,0.019
lr,Linear Regression,644.309,8314018.175,2547.6474,0.1105,1.1675,1.9324,0.21
lar,Least Angle Regression,646.1053,8316125.6555,2548.1521,0.11,1.1722,1.9428,0.025
omp,Orthogonal Matching Pursuit,616.8823,8347154.0697,2549.0484,0.115,1.266,1.7219,0.012
huber,Huber Regressor,455.1453,8602866.8533,2593.3318,0.0799,0.8118,0.7038,0.128
lightgbm,Light Gradient Boosting Machine,581.4271,8392817.6783,2600.9377,0.014,0.9761,1.4716,0.105


In [7]:
# Train a LightGBM regressor; the output table lists per-fold metrics.
# NOTE(review): LightGBM has the highest RMSE of the rows shown in the
# comparison table above (2600.94), so the choice is presumably driven by its
# tuning potential rather than its baseline score — confirm.
lightgbm = create_model('lightgbm')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,559.3961,2374530.3727,1540.9511,0.156,0.9715,1.5364
1,619.6864,16522367.8656,4064.7716,0.1361,0.8848,1.308
2,502.1063,3287206.3779,1813.0655,0.0342,0.9308,1.265
3,525.3477,4555865.9622,2134.4475,0.1195,0.9531,1.4711
4,538.632,4166867.3334,2041.2906,0.1329,1.0392,1.6417
5,644.7397,7374146.5537,2715.538,-0.0354,1.0281,1.7104
6,745.5952,26659155.5521,5163.2505,0.0636,0.9151,1.1542
7,526.8571,1315934.1932,1147.1417,-0.4073,1.002,1.5995
8,554.1648,2071003.9617,1439.0983,-0.0344,1.0452,1.5927
9,597.7451,15601098.611,3949.8226,-0.0252,0.9912,1.4372


In [8]:
# Tune the LightGBM model's hyperparameters, optimizing for RMSE.
tuned_lightgbm = tune_model(lightgbm, optimize='RMSE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,481.5258,1979899.3758,1407.089,0.2963,0.9181,1.4418
1,571.9164,16885043.2807,4109.1414,0.1171,0.8549,1.2878
2,475.0367,3040973.4893,1743.8387,0.1065,0.915,1.4006
3,476.347,4555052.4486,2134.2569,0.1196,0.8868,1.4144
4,467.6159,3810632.986,1952.0843,0.2071,0.8846,1.3563
5,541.5397,6672282.2335,2583.0761,0.0632,0.9069,1.4449
6,738.5296,27312135.7276,5226.1014,0.0406,0.9003,1.238
7,440.6254,613140.6915,783.033,0.3443,0.9388,1.5354
8,469.5996,1604005.5497,1266.4934,0.1988,0.9332,1.4263
9,540.3869,14882848.6062,3857.8295,0.022,0.926,1.422


In [9]:
# Wrap the tuned model in an ensemble.
# NOTE: despite the `boosted_` prefix, no `method` is passed here and the
# estimator printed after finalizing is a BaggingRegressor, so this is a
# bagging ensemble rather than a boosting one.
boosted_lightgbm = ensemble_model(tuned_lightgbm)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,508.0808,2055704.3674,1433.7728,0.2694,0.9505,1.5526
1,592.3231,16960615.5225,4118.3268,0.1132,0.8844,1.3819
2,483.46,3046709.9693,1745.4827,0.1048,0.9402,1.4765
3,482.3691,4552057.7884,2133.5552,0.1202,0.8994,1.4501
4,480.01,3857452.9072,1964.0399,0.1973,0.905,1.4286
5,554.301,6688462.1291,2586.2061,0.0609,0.9285,1.5371
6,740.329,27415567.9481,5235.9878,0.037,0.9079,1.24
7,448.7487,626425.4717,791.4704,0.3301,0.9464,1.5479
8,485.9342,1629764.6849,1276.6224,0.1859,0.9571,1.5326
9,541.557,14940287.1048,3865.2668,0.0183,0.9345,1.444


In [10]:
# Finalize the ensemble and keep a reference to the returned model, so it can
# be reused for prediction or saving instead of only being printed.
# NOTE(review): per the PyCaret docs, finalize_model refits on the full dataset
# including the hold-out set — confirm for the installed version.
final_lightgbm = finalize_model(boosted_lightgbm)

# The last expression is displayed, so the cell still shows the estimator.
final_lightgbm

Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations
Finished loading model, total used 210 iterations


BaggingRegressor(base_estimator=LGBMRegressor(bagging_fraction=0.8,
                                              bagging_freq=6,
                                              boosting_type='gbdt',
                                              class_weight=None,
                                              colsample_bytree=1.0,
                                              feature_fraction=0.5,
                                              importance_type='split',
                                              learning_rate=0.01, max_depth=-1,
                                              min_child_samples=36,
                                              min_child_weight=0.001,
                                              min_split_gain=0,
                                              n_estimators=210, n_jobs=-1,
                                              num_leaves=256, objective=None,
                                              random_state=16, reg_alpha=1,
                   