In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

In [2]:
df_cleaned = pd.read_csv('data/listings_cleaned.csv', low_memory=False)
df_cleaned.head()

Unnamed: 0,id,price,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,bed_type,...,instant_bookable,is_business_travel_ready,cancellation_policy,host_is_superhost,listing_url,cleaning_fee,security_deposit,extra_people,guests_included,region
0,2265,225.0,30.2775,-97.71398,House,Entire home/apt,4,2.0,2.0,Real Bed,...,f,f,strict_14_with_grace_period,t,https://www.airbnb.com/rooms/2265,100.0,500.0,30.0,4,East
1,5245,100.0,30.27577,-97.71379,House,Private room,2,1.0,1.0,Real Bed,...,f,f,strict_14_with_grace_period,t,https://www.airbnb.com/rooms/5245,75.0,500.0,35.0,2,East
2,5456,95.0,30.26112,-97.73448,Guesthouse,Entire home/apt,3,1.0,1.0,Real Bed,...,f,f,strict_14_with_grace_period,t,https://www.airbnb.com/rooms/5456,0.0,100.0,45.0,2,East
3,5769,40.0,30.45697,-97.78422,House,Private room,2,1.0,1.0,Real Bed,...,f,f,moderate,t,https://www.airbnb.com/rooms/5769,0.0,0.0,0.0,2,North
4,6413,99.0,30.24829,-97.73726,Guesthouse,Entire home/apt,2,1.0,1.0,Real Bed,...,t,f,strict_14_with_grace_period,t,https://www.airbnb.com/rooms/6413,50.0,0.0,25.0,2,South


In [3]:
df_cleaned.drop(['amenities', 'cancellation_policy','instant_bookable', 'listing_url','bed_type', 
                 'review_scores_rating', 'guests_included']
                #, 'property_type','accommodates', ,'is_business_travel_ready','host_is_superhost', ]#], 
                ,axis=1, inplace=True)

In [4]:
categorical = df_cleaned.select_dtypes(include='object').columns
categorical

Index(['property_type', 'room_type', 'is_business_travel_ready',
       'host_is_superhost', 'region'],
      dtype='object')

In [5]:
# Create dummy variables for categorical columns in the df
df_cleaned = pd.get_dummies(df_cleaned, drop_first=True)
df_cleaned.head()

Unnamed: 0,id,price,latitude,longitude,accommodates,bathrooms,bedrooms,minimum_nights,number_of_reviews,availability_365,...,property_type_Serviced apartment,property_type_Townhouse,room_type_Hotel room,room_type_Private room,room_type_Shared room,host_is_superhost_t,region_East,region_North,region_South,region_West
0,2265,225.0,30.2775,-97.71398,4,2.0,2.0,30,24,0,...,0,0,0,0,0,1,1,0,0,0
1,5245,100.0,30.27577,-97.71379,2,1.0,1.0,30,9,0,...,0,0,0,1,0,1,1,0,0,0
2,5456,95.0,30.26112,-97.73448,3,1.0,1.0,2,529,334,...,0,0,0,0,0,1,1,0,0,0
3,5769,40.0,30.45697,-97.78422,2,1.0,1.0,1,257,14,...,0,0,0,1,0,1,0,1,0,0
4,6413,99.0,30.24829,-97.73726,2,1.0,1.0,3,112,0,...,0,0,0,0,0,1,0,0,1,0


In [6]:
numerical = df_cleaned.select_dtypes(include=['int64', 'float']).drop(['price', 'latitude', 'longitude'], axis=1).columns
numerical

Index(['id', 'accommodates', 'bathrooms', 'bedrooms', 'minimum_nights',
       'number_of_reviews', 'availability_365', 'cleaning_fee',
       'security_deposit', 'extra_people'],
      dtype='object')

In [7]:
scaler = StandardScaler()
df_scaled = df_cleaned.copy()
df_scaled[numerical] = scaler.fit_transform(df_scaled[numerical])

In [8]:
X = df_cleaned.drop(['price', 'id'], axis=1)
y = np.log(df_cleaned[['price']])

X.drop(['property_type_Bungalow', 'property_type_Guesthouse', 'property_type_House', 'property_type_Serviced apartment',
       'region_West'], axis=1, inplace=True)

X.shape

(11541, 23)

In [9]:
X = sm.add_constant(X)

ols_model = sm.OLS(y, X.astype(float))
res = ols_model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.511
Model:                            OLS   Adj. R-squared:                  0.510
Method:                 Least Squares   F-statistic:                     523.3
Date:                Wed, 05 Aug 2020   Prob (F-statistic):               0.00
Time:                        13:40:15   Log-Likelihood:                -11535.
No. Observations:               11541   AIC:                         2.312e+04
Df Residuals:                   11517   BIC:                         2.329e+04
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 