## Prepared for the textbook:
-------------------------------------------------------------------
## Data Analysis for Business, Economics, and Policy
#### by Gabor BEKES and  Gabor KEZDI 
----------------------------------
#### Cambridge University Press 2021
-----------------------------------------------------------------------------------------------
#### License: Free to share, modify and use for educational purposes. Not to be used for business purposes.

In [2]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from plotnine import *
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from patsylearn import PatsyModel, PatsyTransformer

In [3]:
pip install git+https://github.com/amueller/patsylearn.git

Collecting git+https://github.com/amueller/patsylearn.git
  Cloning https://github.com/amueller/patsylearn.git to /tmp/pip-req-build-wyxoklbm
  Running command git clone -q https://github.com/amueller/patsylearn.git /tmp/pip-req-build-wyxoklbm
Building wheels for collected packages: patsylearn
  Building wheel for patsylearn (setup.py) ... [?25ldone
[?25h  Created wheel for patsylearn: filename=patsylearn-0.0.1-py3-none-any.whl size=10710 sha256=8ee03e668919980a43af39dff5f593eb2a34e6f45190f241edcc64eb7a902820
  Stored in directory: /tmp/pip-ephem-wheel-cache-tvj3yf6i/wheels/ee/04/89/4c256a72966ee1ab187dde02653b89e17b640d0e4240deaf81
Successfully built patsylearn
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Current working directory of the kernel, as a Path object.
path = Path.cwd()

In [5]:
# Base directory: two levels above the current working directory.
base_dir = path.parent.parent

In [6]:
# Input, output and helper folders, all resolved against the base directory.
_base_dir_str = str(base_dir)
data_in = os.path.join(_base_dir_str, "da_case_studies/ch16-airbnb-random-forest/")
data_out = os.path.join(_base_dir_str, "da_case_studies/ch16-airbnb-random-forest/")
output = os.path.join(_base_dir_str, "da_case_studies/ch16-airbnb-random-forest/output/")
func = os.path.join(_base_dir_str, "da_case_studies/ch00-tech-prep/")

In [7]:
# Make the helper folder (`func`) importable, then pull in every public name
# defined by py_helper_functions.
# NOTE(review): the wildcard import hides which helpers this notebook actually
# uses -- consider importing them by name.
sys.path.append(func)
from py_helper_functions import *

In [8]:
#########################################################################################
#
# PART I
# Loading and preparing data ----------------------------------------------
#
#########################################################################################

In [9]:
# Read the prepared workfile for the chosen area and keep only rows with a known price.
area = "london"
workfile_path = data_in + "airbnb_" + area + "_workfile_adj.csv"
data = pd.read_csv(workfile_path)
data = data[data["price"].notna()]

In [10]:
def count_missing_values(df):
    """Return the number of missing values per column, for columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Missing-value counts indexed by column name, restricted to columns
        with at least one missing value (empty when nothing is missing).
    """
    # Scan the frame once and reuse the counts for both the values and the filter.
    missing_per_column = df.isna().sum()
    return missing_per_column[missing_per_column > 0]

In [11]:
# List the columns that still contain missing values, with their counts.
count_missing_values(data)

Series([], dtype: int64)

In [12]:
# Sample definition and preparation ---------------------------------------

# Restrict the sample to regular apartments: n_accommodates below 8
data = data[data["n_accommodates"] < 8]


In [13]:
# Duplicate n_accommodates under a second name; per the original note, the copy
# is used later in the variable-importance part.
data = data.assign(n_accommodates_copy = data.n_accommodates)

In [14]:

# Basic descriptive statistics -------------------------------------------
# Count, mean, std, min, quartiles and max for each column shown in the output.
data.describe()

Unnamed: 0,usd_price_day,n_accommodates,n_bathrooms,n_review_scores_rating,n_number_of_reviews,n_guests_included,n_reviews_per_month,n_extra_people,n_minimum_nights,n_beds,...,flag_review_scores_rating,flag_reviews_per_month,flag_n_number_of_reviews,ln_days_since,ln_days_since2,ln_days_since3,n_days_since2,n_days_since3,ln_review_scores_rating,n_accommodates_copy
count,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,...,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0,49826.0
mean,88.984044,2.838759,1.22426,92.458616,12.340144,1.35618,1.138253,6.50873,3.300947,1.585959,...,0.31885,0.303998,0.0,5.713297,33.456371,199.607207,293198.5,304653900.0,4.52091,2.838759
std,70.515089,1.494748,0.466448,8.418474,25.889935,0.879263,1.238156,12.31549,29.578062,0.927501,...,0.466035,0.459986,0.0,0.902564,9.506988,80.264045,564036.1,963614000.0,0.120421,1.494748
min,8.0,1.0,0.0,20.0,0.0,1.0,0.01,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.995732,1.0
25%,41.0,2.0,1.0,92.0,0.0,1.0,0.47,0.0,1.0,1.0,...,0.0,0.0,0.0,5.433722,29.525335,160.432461,51984.0,11852350.0,4.521789,2.0
50%,70.0,2.0,1.0,94.0,3.0,1.0,0.77,0.0,2.0,1.0,...,0.0,0.0,0.0,5.793014,33.559007,194.407782,106929.0,34965780.0,4.543295,2.0
75%,110.0,4.0,1.0,97.0,12.0,1.0,1.17,10.0,3.0,2.0,...,1.0,1.0,0.0,6.224558,38.745128,241.171311,254016.0,128024100.0,4.574711,4.0
max,999.0,7.0,8.0,100.0,396.0,16.0,15.0,240.0,5000.0,16.0,...,1.0,1.0,0.0,7.909489,62.560024,494.817853,7409284.0,20168070000.0,4.60517,7.0


In [15]:
# Look at the prepared sample.
# NOTE(review): this displays the whole frame (the notebook truncates the
# rendering); data.head() would keep the stored output smaller.
data

Unnamed: 0,f_property_type,f_room_type,f_cancellation_policy,f_bed_type,f_neighbourhood_cleansed,usd_price_day,n_accommodates,n_bathrooms,n_review_scores_rating,n_number_of_reviews,...,flag_review_scores_rating,flag_reviews_per_month,flag_n_number_of_reviews,ln_days_since,ln_days_since2,ln_days_since3,n_days_since2,n_days_since3,ln_review_scores_rating,n_accommodates_copy
0,Apartment,Private room,flexible,Real Bed,Kingston upon Thames,23.0,1,1.0,100.0,1,...,0,0,0,4.532599,20.544458,93.119801,8464.0,778688.0,4.605170,1
1,Apartment,Private room,moderate,Couch,Kingston upon Thames,50.0,2,1.0,91.0,15,...,0,0,0,6.510258,42.383464,275.927298,450241.0,302111711.0,4.510860,2
2,Apartment,Private room,flexible,Real Bed,Kingston upon Thames,24.0,2,1.0,80.0,2,...,0,0,0,5.493061,30.173724,165.746120,58564.0,14172488.0,4.382027,2
3,House,Private room,flexible,Real Bed,Kingston upon Thames,50.0,2,1.5,94.0,0,...,1,1,0,5.793014,33.559007,194.407782,106929.0,34965783.0,4.543295,2
4,House,Private room,flexible,Real Bed,Kingston upon Thames,25.0,1,1.0,94.0,0,...,1,1,0,5.793014,33.559007,194.407782,106929.0,34965783.0,4.543295,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51641,Apartment,Private room,strict,Real Bed,Westminster,24.0,1,1.0,40.0,1,...,0,0,0,5.181784,26.850881,139.135452,31329.0,5545233.0,3.688879,1
51642,House,Entire home/apt,flexible,Real Bed,Richmond upon Thames,120.0,6,1.0,94.0,0,...,1,1,0,5.793014,33.559007,194.407782,106929.0,34965783.0,4.543295,6
51643,Apartment,Entire home/apt,flexible,Real Bed,Wandsworth,215.0,6,3.0,94.0,0,...,1,1,0,5.793014,33.559007,194.407782,106929.0,34965783.0,4.543295,6
51644,Apartment,Entire home/apt,flexible,Real Bed,Hounslow,150.0,4,2.0,94.0,0,...,1,1,0,5.793014,33.559007,194.407782,106929.0,34965783.0,4.543295,4


In [16]:
# Summary statistics of price, the outcome variable of the models below.
data.price.describe()

count    49826.000000
mean        88.984044
std         70.515089
min          8.000000
25%         41.000000
50%         70.000000
75%        110.000000
max        999.000000
Name: price, dtype: float64

In [17]:
# Number of observations per room type.
data.f_room_type.value_counts()

Entire home/apt    24938
Private room       24351
Shared room          537
Name: f_room_type, dtype: int64

In [18]:
# Number of observations per property type.
data.f_property_type.value_counts()

Apartment    37467
House        12359
Name: f_property_type, dtype: int64

In [19]:
# Number of observations per category of f_number_of_reviews.
data.f_number_of_reviews.value_counts()

1    31576
0    15147
2     3103
Name: f_number_of_reviews, dtype: int64

In [20]:
# create train and holdout samples -------------------------------------------
# train is where we do it all, incl CV

# First pick a smaller-than-usual training set so that the models run faster, and check that everything works.
# If it works, start anew without these two lines.

In [21]:
# Keep 70% of the observations for training and hold out the rest;
# the fixed random_state makes the split reproducible.
data_train, data_holdout = train_test_split(data,train_size=0.7, random_state=2801)

In [22]:
# (rows, columns) of the training and the holdout sample.
data_train.shape, data_holdout.shape

((34878, 91), (14948, 91))

In [39]:
# Basic variables, including the neighbourhood
basic_vars = [
    "n_accommodates",
    "n_beds",
    "n_days_since",
    "f_property_type",
    "f_room_type",
    "f_bathroom",
    "f_cancellation_policy",
    "f_bed_type",
    "f_neighbourhood_cleansed",
]

# Review-related variables together with their flag_ columns
reviews = [
    "n_number_of_reviews",
    "flag_n_number_of_reviews",
    "n_review_scores_rating",
    "flag_review_scores_rating",
]

# Dummy variables: every column whose name starts with "d_"
amenities = [name for name in data.columns if name.startswith("d_")]

# Interactions for the LASSO (taken from ch14)
X1 = [
    "n_accommodates:f_property_type",
    "f_room_type:f_property_type",
    "f_room_type:d_familykidfriendly",
    "d_airconditioning:f_property_type",
    "d_cats:f_property_type",
    "d_dogs:f_property_type",
]

# Interactions with the boroughs
X2 = [
    "f_property_type:f_neighbourhood_cleansed",
    "f_room_type:f_neighbourhood_cleansed",
    "n_accommodates:f_neighbourhood_cleansed",
]

In [40]:
# Nested predictor sets: each one extends the previous set.
predictors_1 = basic_vars
predictors_2 = predictors_1 + reviews + amenities
predictors_E = predictors_2 + X1 + X2

In [42]:
# Check the patsylearn transform on a single interaction term: the result has
# one n_accommodates column per property type (see the output below).
PatsyTransformer("n_accommodates:f_property_type", return_type="dataframe").fit_transform(data).head(2)

Unnamed: 0,n_accommodates:f_property_type[Apartment],n_accommodates:f_property_type[House]
0,1.0,0.0
1,2.0,0.0


In [None]:
#########################################################################################
#
# PART II
# RANDOM FORESTS -------------------------------------------------------
#
#########################################################################################

In [44]:
from skranger.ensemble import RangerForestRegressor

In [45]:
# Ranger random forest regressor, created with its default settings.
rfr = RangerForestRegressor()

In [50]:
# Model 1: price explained by the first predictor set, written as a patsy formula.
rf_formula_1 = "price ~ " + " + ".join(predictors_1)
rf_model_1 = PatsyModel(rfr, rf_formula_1)

In [53]:
# Fit the model on the training sample; the cell output is the repr of the fitted wrapper.
rf_model_1.fit(data_train)

PatsyModel(NA_action='drop', add_intercept=False,
           estimator=RangerForestRegressor(alpha=0.5,
                                           always_split_features=None,
                                           categorical_features=None,
                                           holdout=False, importance='none',
                                           inbag=None, keep_inbag=False,
                                           local_importance=False, max_depth=0,
                                           min_node_size=0, minprop=0.1, mtry=0,
                                           n_estimators=100, n_jobs=-1,
                                           num_random_splits=1, oob_error=False,
                                           regularization...
                                           respect_categorical_features=None,
                                           sample_fraction=None,
                                           save_memory=False,
                         

In [54]:
# Inspect the settings of the wrapper and of the wrapped estimator (estimator__* keys).
rf_model_1.get_params()

{'NA_action': 'drop',
 'add_intercept': False,
 'estimator__alpha': 0.5,
 'estimator__always_split_features': None,
 'estimator__categorical_features': None,
 'estimator__holdout': False,
 'estimator__importance': 'none',
 'estimator__inbag': None,
 'estimator__keep_inbag': False,
 'estimator__local_importance': False,
 'estimator__max_depth': 0,
 'estimator__min_node_size': 0,
 'estimator__minprop': 0.1,
 'estimator__mtry': 0,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': -1,
 'estimator__num_random_splits': 1,
 'estimator__oob_error': False,
 'estimator__regularization_factor': None,
 'estimator__regularization_usedepth': False,
 'estimator__replace': True,
 'estimator__respect_categorical_features': None,
 'estimator__sample_fraction': None,
 'estimator__save_memory': False,
 'estimator__scale_permutation_importance': False,
 'estimator__seed': 42,
 'estimator__split_rule': 'variance',
 'estimator__split_select_weights': None,
 'estimator__verbose': False,
 'estimator': Ran