In [1]:
import pandas as pd
import numpy as np
import pickle
import shap

import mlflow
import pycaret

from airbnb_prediction.mlflow_utils import UiConn
from pycaret.regression import *


# Show floats in pandas output with exactly two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Open a UI session through the project's UiConn helper; the cell output
# prints the local address where the UI can be reached.
# NOTE(review): presumably this launches the MLflow tracking UI -- confirm in
# airbnb_prediction.mlflow_utils.
conn = UiConn()
conn.create_ui_session()

# Uncomment to shut the UI session down once the experiments are finished.
#conn.terminate_ui_session()

Access for UI at: http://127.0.0.1:5555


In [3]:
# Load the processed modelling dataset.
# The context manager closes the file handle as soon as the load finishes,
# instead of leaving an open handle for the garbage collector to reclaim.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project's own preprocessing step.
with open('../data/processed/model_data.pickle', 'rb') as model_data_file:
    df = pickle.load(model_data_file)

In [4]:
# Columns handed to PyCaret's setup() as `categorical_features`.
# Order is kept exactly as originally declared.
categorical_features = [
    "host_response_time",
    "host_is_superhost",
    "room_type",
    "instant_bookable",
    "half_bath",
    "regiao",
    "property_type_refactor",
    "is_host_rj",
]

# Columns handed to PyCaret's setup() as `numeric_features`.
# Order is kept exactly as originally declared.
numerical_features = [
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "minimum_nights",
    "maximum_nights",
    "number_of_reviews",
    "days_since_host",
    "delta_nights",
    "mean_reviews",
    "count_name",
    "count_description",
    "count_neighborhood_overview",
    "count_host_about",
]

Experiment with different values for these `setup` options:
- normalize_method
- transformation
- remove_outliers
- feature_selection

In [5]:
# Initialise the PyCaret regression experiment on every column except the
# row identifier, predicting `price`.
session = setup(
    df.drop(columns='id'),
    target='price',
    # Fixed seed so repeated runs are comparable.
    session_id=16,
    # Explicit column typing, using the lists declared earlier in the notebook.
    categorical_features=categorical_features,
    numeric_features=numerical_features,
    # Preprocessing switches currently enabled.
    normalize=True,
    feature_selection=True,
    # Experiment logging options.
    experiment_name='mlflow_pycaret',
    log_experiment=True,
    log_plots=True,
    log_profile=True,
    log_data=True,
    # Keep the call non-interactive and quiet.
    silent=True,
    verbose=False,
)

Summarize dataset:   0%|          | 0/36 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Train and score the available regressors; the results table is sorted by RMSE.
# NOTE(review): despite the plural name, compare_models presumably returns only
# the single top-ranked model unless n_select is passed -- confirm against the
# installed PyCaret version.
best_models = compare_models(sort='RMSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,587.4841,11761591.7244,3325.9698,0.033,0.97,1.4673,1.17
lightgbm,Light Gradient Boosting Machine,681.3105,11844708.1886,3343.8569,0.0701,1.0836,1.7648,0.135
et,Extra Trees Regressor,633.2434,11669061.3304,3346.6743,-0.1373,0.903,1.6436,2.701
knn,K Neighbors Regressor,604.7234,11769280.525,3350.21,-0.0072,0.8501,1.3648,0.771
rf,Random Forest Regressor,630.7414,12017445.6722,3381.7422,-0.139,0.8853,1.5358,4.168
br,Bayesian Ridge,673.8625,13808877.0038,3548.7046,0.057,1.1721,1.927,0.054
lasso,Lasso Regression,689.4229,13808572.4375,3549.8884,0.055,1.1915,2.0161,0.034
ridge,Ridge Regression,693.6237,13811830.325,3550.5341,0.0544,1.192,2.0386,0.016
en,Elastic Net,617.3207,13859277.475,3553.8637,0.0562,1.0616,1.6531,0.019
omp,Orthogonal Matching Pursuit,650.2671,13871636.9903,3556.2522,0.0533,1.4185,1.7842,0.017


In [7]:
# Fit a LightGBM regressor on its own; the output table has rows 0-9,
# presumably one set of metrics per cross-validation fold.
# In the comparison table LightGBM ranks second by RMSE but has the highest R2.
lightgbm = create_model('lightgbm')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,605.9511,2413711.0013,1553.6122,-0.6381,1.1202,1.7987
1,778.378,17492518.725,4182.4059,0.2097,1.1677,1.9655
2,679.068,13644803.7419,3693.8873,0.2869,1.0927,1.6839
3,707.1067,16543552.4799,4067.3766,0.234,1.0632,1.7132
4,686.8229,8987678.9661,2997.9458,-0.1088,1.0834,1.9227
5,629.2266,5099284.2251,2258.1595,0.1185,1.0502,1.7166
6,747.6622,11992809.475,3463.0636,0.2577,1.0899,1.9333
7,616.0929,12898072.4776,3591.3887,-0.022,1.0109,1.489
8,692.2324,17441971.1114,4176.3586,-0.0228,1.0984,1.81
9,670.564,11932679.6829,3454.3711,0.3855,1.059,1.6154


In [8]:
# Tune the LightGBM model's hyperparameters, using RMSE as the metric to optimize.
tuned_lightgbm = tune_model(lightgbm, optimize='RMSE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,550.663,2364798.1509,1537.79,-0.6049,1.1081,2.0613
1,737.242,17960704.0717,4238.0071,0.1886,1.1236,2.147
2,620.8324,14646104.1323,3827.0229,0.2346,1.0911,1.8863
3,686.3339,18077761.5233,4251.7951,0.1629,1.0921,1.9861
4,612.7992,7675377.0209,2770.4471,0.0531,1.114,2.084
5,601.0261,4738363.0198,2176.7781,0.1809,1.1115,2.0272
6,693.7517,13969021.0759,3737.5154,0.1353,1.1189,2.0474
7,590.5449,12435720.3211,3526.4317,0.0146,1.082,1.9052
8,640.4005,16936075.121,4115.3463,0.0069,1.1085,2.0007
9,641.4875,12145344.6387,3485.0172,0.3745,1.0789,1.8878


In [9]:
# Wrap the tuned model in an ensemble. No `method` argument is passed, so the
# library default applies; the estimator printed by the finalize step is a
# BaggingRegressor, i.e. this is bagging rather than boosting despite the
# variable name.
boosted_lightgbm = ensemble_model(tuned_lightgbm)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,532.8214,1719946.1794,1311.4672,-0.1673,1.0977,2.0107
1,712.0951,17747072.5017,4212.7274,0.1982,1.1138,2.0607
2,620.9006,15241040.8246,3903.9776,0.2035,1.096,1.9134
3,676.542,19012287.3734,4360.3082,0.1197,1.0862,1.968
4,615.7303,7726236.5428,2779.6109,0.0468,1.1148,2.094
5,595.9726,4848956.9829,2202.0347,0.1618,1.1002,1.9893
6,676.0913,13253230.5437,3640.4987,0.1796,1.1114,2.0173
7,575.2402,12330516.8464,3511.4836,0.023,1.0694,1.8649
8,626.1549,16628067.8768,4077.7528,0.025,1.1035,1.9969
9,631.68,13016923.3733,3607.8974,0.3297,1.0626,1.822


In [10]:
# Finalize the ensembled model and keep a reference to the result.
# Previously the return value was discarded, so the finalized estimator could
# only be reached through `_` / `Out[N]`, which does not survive a re-run.
final_model = finalize_model(boosted_lightgbm)

# Last expression: display the finalized estimator, as the cell did before.
final_model

Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations
Finished loading model, total used 190 iterations


BaggingRegressor(base_estimator=LGBMRegressor(bagging_fraction=0.8,
                                              bagging_freq=6,
                                              boosting_type='gbdt',
                                              class_weight=None,
                                              colsample_bytree=1.0,
                                              feature_fraction=0.6,
                                              importance_type='split',
                                              learning_rate=0.005, max_depth=-1,
                                              min_child_samples=1,
                                              min_child_weight=0.001,
                                              min_split_gain=0.9,
                                              n_estimators=190, n_jobs=-1,
                                              num_leaves=50, objective=None,
                                              random_state=16, reg_alpha=0.15,
               