In [31]:
from statsmodels.iolib.summary2 import summary_col
import pandas as pd
import numpy as np
import statsmodels.api as sm

%store -r time_dummy
%store -r room_type_dummy

### read in cleaned data

In [32]:
df = pd.read_csv('airbnbdata_cleaned.csv', index_col = 0)
time_dummy = list(time_dummy.columns.values)
room_type_dummy = list(room_type_dummy.columns.values)
time_dummy = df[time_dummy]
room_type_dummy = df[room_type_dummy]

In [70]:
len(df.loc[df['review_scores_rating'] == 0, :])

0

In [71]:
len(df.loc[df['review_scores_rating'] == 1, :])

1221

In [73]:
df.describe()['review_scores_rating']

count    249557.000000
mean          4.742605
std           0.500412
min           1.000000
25%           4.650000
50%           4.950000
75%           5.000000
max           5.000000
Name: review_scores_rating, dtype: float64

### first try baseline model -- not a valid model

In [15]:
listing_attr = df[['property_type','accommodates','bathrooms','bedrooms', 'beds','amenities_counts','info_count']]
x_list = [listing_attr,room_type_dummy,time_dummy]
df_x1 = pd.concat(x_list,axis =1)
df_x_const = sm.add_constant(df_x1).apply(pd.to_numeric)

df_y = df['price in Euro']
modelbase = sm.OLS(df_y, df_x_const).fit() 
print(modelbase.summary())

                            OLS Regression Results                            
Dep. Variable:          price in Euro   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     1074.
Date:                Thu, 05 Dec 2019   Prob (F-statistic):               0.00
Time:                        19:14:31   Log-Likelihood:            -1.6183e+06
No. Observations:              249557   AIC:                         3.237e+06
Df Residuals:                  249531   BIC:                         3.237e+06
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const              -30.1608      2.014  

### Model 1 with host attributies only 

In [16]:
listing_attr = df[['property_type','accommodates','bathrooms','bedrooms', 'beds','amenities_counts','info_count']]
x_list = [listing_attr,room_type_dummy,time_dummy]
df_x1 = pd.concat(x_list,axis =1)
df_x_const = sm.add_constant(df_x1).apply(pd.to_numeric)

df_y = df['price in Euro'].apply(lambda x: np.log(x))
model1 = sm.OLS(df_y, df_x_const).fit() 
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:          price in Euro   R-squared:                       0.568
Model:                            OLS   Adj. R-squared:                  0.568
Method:                 Least Squares   F-statistic:                 1.313e+04
Date:                Thu, 05 Dec 2019   Prob (F-statistic):               0.00
Time:                        19:14:34   Log-Likelihood:            -1.6745e+05
No. Observations:              249557   AIC:                         3.350e+05
Df Residuals:                  249531   BIC:                         3.352e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.3655      0.006  

### Model-2 with listing attributes and distance to center  -- first we tried without normlization

In [17]:
dis_modle = df[['property_type','accommodates','bathrooms','bedrooms', 'beds','amenities_counts','info_count',\
                     'distance_Km']]
x_list = [dis_modle,room_type_dummy,time_dummy]
df_x2 = pd.concat(x_list,axis =1)
df_x_const = sm.add_constant(df_x2).apply(pd.to_numeric)

df_y = df['price in Euro'].apply(lambda x: np.log(x))
#df_y = df_listings['price in Euro']
model2 = sm.OLS(df_y, df_x_const).fit() 
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:          price in Euro   R-squared:                       0.568
Model:                            OLS   Adj. R-squared:                  0.568
Method:                 Least Squares   F-statistic:                 1.263e+04
Date:                Thu, 05 Dec 2019   Prob (F-statistic):               0.00
Time:                        19:14:43   Log-Likelihood:            -1.6742e+05
No. Observations:              249557   AIC:                         3.349e+05
Df Residuals:                  249530   BIC:                         3.352e+05
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.3621      0.006  

### Model-2 with listing attributes and distance to center - with normalization and got better result

In [18]:
df['distance_Km'] = df['distance_Km'].apply(lambda x: np.log(x))

In [19]:
dis_modle = df[['property_type','accommodates','bathrooms','bedrooms', 'beds','amenities_counts','info_count',\
                     'distance_Km']]
x_list = [dis_modle,room_type_dummy,time_dummy]
df_x2 = pd.concat(x_list,axis =1)
df_x_const = sm.add_constant(df_x2).apply(pd.to_numeric)

df_y = df['price in Euro'].apply(lambda x: np.log(x))
#df_y = df_listings['price in Euro']
model2 = sm.OLS(df_y, df_x_const).fit() 
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:          price in Euro   R-squared:                       0.571
Model:                            OLS   Adj. R-squared:                  0.570
Method:                 Least Squares   F-statistic:                 1.275e+04
Date:                Thu, 05 Dec 2019   Prob (F-statistic):               0.00
Time:                        19:14:50   Log-Likelihood:            -1.6674e+05
No. Observations:              249557   AIC:                         3.335e+05
Df Residuals:                  249530   BIC:                         3.338e+05
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.5127      0.007  

### Model-3 listing attributes+ distance to center+Policy Attributes

In [20]:
# instant_bookable true = 1 ## instant bookable ones has lower price cause those ones normamly want to attact users
#  strict = 1 strict has higher price 
# the beds one we can explain as special hosting or non special hosting
df3 = df[['minimum_nights','cancellation_policy','instant_bookable']]
df_x3 = pd.concat([df3,df_x2],axis =1)
df_x_const = sm.add_constant(df_x3).apply(pd.to_numeric)

df_y = df['price in Euro'].apply(lambda x: np.log(x))
#df_y = df_listings['price in Euro']
model3 = sm.OLS(df_y, df_x_const).fit() 
print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:          price in Euro   R-squared:                       0.573
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                 1.154e+04
Date:                Thu, 05 Dec 2019   Prob (F-statistic):               0.00
Time:                        19:15:07   Log-Likelihood:            -1.6604e+05
No. Observations:              249557   AIC:                         3.321e+05
Df Residuals:                  249527   BIC:                         3.325e+05
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   3.5404    

### Model-4 listing attributes+ distance to center+Policy Attributes+reviews

In [21]:
df4 = df.dropna(subset = ['number_of_reviews','review_scores_rating'])
df4 = df4[['number_of_reviews','review_scores_rating']]
df_x4 = pd.concat([df4,df_x3],axis =1)
df_x_const = sm.add_constant(df_x4).apply(pd.to_numeric)
df_x_const = df_x_const.dropna(subset = ['number_of_reviews','review_scores_rating'])
df4index = list(df.index.values)

df_y = df['price in Euro'].apply(lambda x: np.log(x))
#df_y = df_listings['price in Euro']
model4 = sm.OLS(df_y, df_x_const).fit() 
print(model4.summary())

                            OLS Regression Results                            
Dep. Variable:          price in Euro   R-squared:                       0.576
Model:                            OLS   Adj. R-squared:                  0.575
Method:                 Least Squares   F-statistic:                 1.091e+04
Date:                Thu, 05 Dec 2019   Prob (F-statistic):               0.00
Time:                        19:15:15   Log-Likelihood:            -1.6529e+05
No. Observations:              249557   AIC:                         3.307e+05
Df Residuals:                  249525   BIC:                         3.310e+05
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    3.4646 

In [22]:
df_x4.columns.values

array(['number_of_reviews', 'review_scores_rating', 'minimum_nights',
       'cancellation_policy', 'instant_bookable', 'property_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'amenities_counts', 'info_count', 'distance_Km', 'Private room',
       'Shared room', '2018-05', '2018-07', '2018-08', '2018-09',
       '2018-10', '2018-11', '2018-12', '2019-01', '2019-02', '2019-03',
       '2019-04', '2019-05', '2019-06', '2019-07', '2019-08', '2019-09'],
      dtype=object)

### Model-5 listing attributes+ distance to center+host attributes+Policy Attributes+reviews+hosts

In [23]:
df5 = df.dropna(subset = ['host_is_superhost','host_identity_verified'])
x_list = ['host_is_superhost','host_identity_verified']
df4list = list(df_x4.columns.values)
x_list.extend(df4list)
df_x5 = df5[x_list]
df_x_const = sm.add_constant(df_x5).apply(pd.to_numeric)

df_y = df5['price in Euro'].apply(lambda x: np.log(1+x))
#df_y = df_listings['price in Euro']
model5 = sm.OLS(df_y, df_x_const).fit() 
print(model5.summary())

                            OLS Regression Results                            
Dep. Variable:          price in Euro   R-squared:                       0.575
Model:                            OLS   Adj. R-squared:                  0.575
Method:                 Least Squares   F-statistic:                 1.022e+04
Date:                Thu, 05 Dec 2019   Prob (F-statistic):               0.00
Time:                        19:15:19   Log-Likelihood:            -1.5956e+05
No. Observations:              249476   AIC:                         3.192e+05
Df Residuals:                  249442   BIC:                         3.195e+05
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      3

In [27]:
info_dict={'R-squared' : lambda x: f"{x.rsquared:.6f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

results_table = summary_col(results=[model1,model2,model3,model4,model5],
                            float_format='%0.6f',
                            stars = True,
                            model_names=['Lisingt_attributes',
                                         'Add_distance',
                                         'Add_policy',
                                         'Add_review',
                                         'Add_host'],
                            info_dict=info_dict,
                            regressor_order = ['info_count',
                                             'amenities_counts',
                                             'beds',
                                             'bedrooms',
                                             'bathrooms',
                                             'accommodates',
                                             'property_type',
                                             'Private room',
                                             'Shared room',
                                             'distance_Km',
                                             'minimum_nights','cancellation_policy','instant_bookable',
                                             'number_of_reviews','review_scores_rating',
                                             'host_is_superhost','host_identity_verified'],
                           drop_omitted = True)
                                        

results_table.add_title('Airbnb-Price-Determinate-Regression')

print(results_table)

                             Airbnb-Price-Determinate-Regression
                       Lisingt_attributes Add_distance  Add_policy   Add_review    Add_host  
---------------------------------------------------------------------------------------------
info_count             0.000207***        0.000181***  0.000189***  0.000245***  0.000240*** 
                       (0.000006)         (0.000006)   (0.000006)   (0.000006)   (0.000006)  
amenities_counts       0.001912***        0.001982***  0.001902***  0.002218***  0.002212*** 
                       (0.000082)         (0.000081)   (0.000082)   (0.000083)   (0.000082)  
beds                   -0.030155***       -0.030670*** -0.030121*** -0.028323*** -0.026939***
                       (0.001426)         (0.001422)   (0.001418)   (0.001415)   (0.001383)  
bedrooms               0.112718***        0.115539***  0.117078***  0.113846***  0.112492*** 
                       (0.002128)         (0.002124)   (0.002119)   (0.002115)   (0.00206