#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 10
**CH10B Finding a good deal among hotels with multiple regression**

using the hotels-vienna dataset

version 1.0 2021-05-05

In [2]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from mizani.formatters import percent_format
from plotnine import *
from stargazer import stargazer
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import KFold
from math import sqrt
import matplotlib.pyplot as plt
# Install a global "ignore" filter so that no warnings are shown by later cells.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


In [3]:
# Directory that holds the case-study data files. The default is the Codespaces
# location this notebook was written against; set the HOTELS_DATA_DIR
# environment variable to run the notebook from a different checkout.
data_dir = os.environ.get("HOTELS_DATA_DIR", "/workspaces/codespaces-jupyter/data")

# Load the hotels-vienna table; os.path.join now actually joins directory and file name.
hotels = pd.read_csv(os.path.join(data_dir, "hotels-vienna.csv"))

In [5]:
# List the column names of the loaded table.
hotels.columns

Index(['country', 'city_actual', 'rating_count', 'center1label',
       'center2label', 'neighbourhood', 'price', 'city', 'stars', 'ratingta',
       'ratingta_count', 'scarce_room', 'hotel_id', 'offer', 'offer_cat',
       'year', 'month', 'weekend', 'holiday', 'distance', 'distance_alter',
       'accommodation_type', 'nnights', 'rating'],
      dtype='object')

In [11]:
# Display the hotels table (the rich repr abbreviates middle rows/columns with "...").
# NOTE(review): the saved output lists high_rating ... low_rating, which are only
# created by the feature-engineering cell further down — this cell was evidently
# executed out of order, so a fresh top-to-bottom run will show fewer columns.
hotels

Unnamed: 0,country,city_actual,rating_count,center1label,center2label,neighbourhood,price,city,stars,ratingta,...,distance_alter,accommodation_type,nnights,rating,high_rating,high_stars,high_price,long_distance,low_price,low_rating
0,Austria,Vienna,36.0,City centre,Donauturm,17. Hernals,81,Vienna,4.0,4.5,...,4.4,Apartment,1,4.4,False,True,False,False,False,False
1,Austria,Vienna,189.0,City centre,Donauturm,17. Hernals,81,Vienna,4.0,3.5,...,3.8,Hotel,1,3.9,False,True,False,False,False,False
2,Austria,Vienna,53.0,City centre,Donauturm,Alsergrund,85,Vienna,4.0,3.5,...,2.5,Hotel,1,3.7,False,True,False,False,False,False
3,Austria,Vienna,55.0,City centre,Donauturm,Alsergrund,83,Vienna,3.0,4.0,...,2.5,Hotel,1,4.0,False,False,False,False,False,False
4,Austria,Vienna,33.0,City centre,Donauturm,Alsergrund,82,Vienna,4.0,3.5,...,2.8,Hotel,1,3.9,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,Austria,Vienna,2.0,City centre,Donauturm,Wieden,109,Vienna,3.0,3.0,...,3.8,Apartment,1,5.0,True,False,False,False,False,False
424,Austria,Vienna,145.0,City centre,Donauturm,Wieden,185,Vienna,5.0,4.0,...,3.6,Hotel,1,4.3,False,True,True,False,False,False
425,Austria,Vienna,112.0,City centre,Donauturm,Wieden,100,Vienna,4.0,4.5,...,3.7,Hotel,1,4.4,False,True,False,False,False,False
426,Austria,Vienna,169.0,City centre,Donauturm,Wieden,58,Vienna,3.0,3.0,...,4.1,Hotel,1,3.2,False,False,False,False,False,False


In [17]:
# Distinct values found in the city_actual column.
hotels["city_actual"].unique()

array(['Vienna', 'Fischamend', 'Schwechat', 'Voesendorf'], dtype=object)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# The count row doubles as a missing-value check: columns whose count is below the
# number of rows contain missing entries.
hotels.describe()

Unnamed: 0,rating_count,price,stars,ratingta,ratingta_count,scarce_room,hotel_id,offer,year,month,weekend,holiday,distance,distance_alter,nnights,rating
count,393.0,428.0,428.0,325.0,325.0,428.0,428.0,428.0,428.0,428.0,428.0,428.0,428.0,428.0,428.0,393.0
mean,155.048346,131.366822,3.434579,3.990769,556.516923,0.598131,22153.502336,0.679907,2017.0,11.0,0.0,0.0,1.658879,3.718458,1.0,3.970992
std,191.217435,91.580545,0.772278,0.482638,586.874582,0.49085,146.858477,0.467058,0.0,0.0,0.0,0.0,1.595673,1.631341,0.0,0.577444
min,1.0,27.0,1.0,2.0,2.0,0.0,21894.0,0.0,2017.0,11.0,0.0,0.0,0.0,0.6,1.0,1.0
25%,27.0,83.0,3.0,3.5,129.0,0.0,22027.75,0.0,2017.0,11.0,0.0,0.0,0.7,2.7,1.0,3.7
50%,84.0,109.5,3.5,4.0,335.0,1.0,22155.5,1.0,2017.0,11.0,0.0,0.0,1.3,3.4,1.0,4.0
75%,203.0,146.0,4.0,4.5,811.0,1.0,22279.25,1.0,2017.0,11.0,0.0,0.0,2.0,4.4,1.0,4.4
max,1541.0,1012.0,5.0,5.0,3171.0,1.0,22409.0,1.0,2017.0,11.0,0.0,0.0,13.0,13.0,1.0,5.0


In [21]:
# --- Boolean threshold flags ------------------------------------------------
# Element-wise comparisons against fixed cut-offs. A missing value compares as
# False, so a hotel without a rating is False in both high_rating and low_rating.
hotels["high_rating"] = hotels["rating"].ge(4.5)
hotels["high_stars"] = hotels["stars"].ge(4)
hotels["high_price"] = hotels["price"].ge(150)
hotels["long_distance"] = hotels["distance"].ge(10)
hotels["low_price"] = hotels["price"].lt(50)
hotels["low_rating"] = hotels["rating"].lt(3)

# --- 0/1 indicators derived from text columns --------------------------------
# Location indicators.
hotels["city_center"] = np.where(hotels["center1label"].eq("City centre"), 1, 0)
hotels["vienna"] = np.where(hotels["city_actual"].eq("Vienna"), 1, 0)

# One indicator per accommodation type of interest.
hotels["hotel_type"] = np.where(hotels["accommodation_type"].eq("Hotel"), 1, 0)
hotels["apartment_type"] = np.where(hotels["accommodation_type"].eq("Apartment"), 1, 0)
hotels["BB_type"] = np.where(hotels["accommodation_type"].eq("Bed and breakfast"), 1, 0)
hotels["hostel_type"] = np.where(hotels["accommodation_type"].eq("Hostel"), 1, 0)

# --- Polynomial term ----------------------------------------------------------
# Squared distance, used as the quadratic term alongside distance.
hotels["distancesq"] = hotels["distance"].pow(2)

In [20]:
# Number of rows (observations) in the table.
hotels.shape[0]

428

In [22]:
# Regression formula: price on distance and squared distance.
# Presumably passed to a statsmodels formula call (smf is imported above) —
# confirm against the cell that fits the model.
model1 = "price ~ distance + distancesq"
# NOTE(review): the commented-out formulas below refer to age, odometer, gas, dealer,
# etc., none of which are columns of `hotels`; they look like leftovers from a
# different case study and should be replaced with hotel variables or removed.
#model2 = "price ~ age + agesq + odometer"
#model3 = "price ~ age + agesq + odometer + odometersq + gas + cond_excellent + cond_good + dealer"
#model4 = "price ~ age + agesq + odometer + odometersq + gas + hybrid + maybe_electric + old_car + new_car + manual + cond_likenew + cond_excellent + cond_good + cylind6 + dealer"
#model5 = "price ~ age + agesq + odometer + odometersq + gas * age + hybrid * age + maybe_electric * age + old_car * age + new_car * age + manual * age + cond_likenew * age + cond_excellent * age + cond_good * age + cylind6 * age + odometer * age + dealer * age"

#model_equations = [model1, model2, model3, model4, model5]