In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import datetime
from collections import Counter
import models
import math

import warnings
# Silence every Python warning for the rest of the session. This also hides
# pandas warnings (for example chained-assignment and deprecation notices).
# NOTE(review): consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

In [2]:
# Open a connection to the local SQLite database file 'airbnb.db' (path is
# relative to the current working directory). sqlite3 creates an empty database
# when the file does not exist, so a wrong working directory only shows up
# later as a missing-table error.
# NOTE(review): the connection is not closed in the cells visible here.
connection = sqlite3.connect('airbnb.db')

In [3]:
# Load every column of `most_recent_listings` from airbnb.db into a DataFrame.
# NOTE(review): presumably this holds the latest scrape of each listing — confirm against the database schema.
vwListings = pd.read_sql('SELECT * FROM most_recent_listings', connection)

Because the original data set is quite noisy, we clean it before feeding it to the model. Based on the EDA step, we drop certain observations, convert data types, and transform several variables.

In [4]:
# room_type_mod: same as room_type, except that hotel-like property types are
# relabelled as 'Hotel room'
hotel_like_types = ['Room in hotel', 'Room in boutique hotel',
                    'Private room in bed and breakfast']
is_hotel_like = vwListings['property_type'].isin(hotel_like_types)
vwListings['room_type_mod'] = vwListings['room_type'].where(~is_hotel_like, 'Hotel room')

# Remove listings whose price is 0
vwListings = vwListings.loc[~(vwListings['price'] == 0)].reset_index(drop=True)

# Remove listings flagged as unavailable
vwListings = vwListings.loc[~(vwListings['has_availability'] == 'f')].reset_index(drop=True)

# Missing reviews_per_month (stored as '') becomes 0
vwListings['reviews_per_month'] = vwListings['reviews_per_month'].replace({'': 0})

# Remove listings reporting more than 31 reviews per month
# (two listings according to the original analysis)
vwListings = vwListings.loc[~(vwListings['reviews_per_month'] > 31)].reset_index(drop=True)

# Missing host_since is filled with the most common value, 2019-09-13, then parsed as a date
host_since_filled = vwListings['host_since'].replace({'': '2019-09-13'})
vwListings['host_since'] = pd.to_datetime(host_since_filled)

# Missing host_response_time becomes the explicit label 'N/A'
vwListings['host_response_time'] = vwListings['host_response_time'].replace({'': 'N/A'})

# Host rates: treat 'N/A' as 0%, strip the percent sign and convert to numbers
for rate_column in ('host_response_rate', 'host_acceptance_rate'):
    rate_text = vwListings[rate_column].str.replace('N/A', '0%')
    rate_text = rate_text.str.replace('%', '')
    vwListings[rate_column] = pd.to_numeric(rate_text)

# Missing values in these host columns are filled with 'f'
for host_column in ('host_is_superhost', 'host_verifications',
                    'host_has_profile_pic', 'host_identity_verified'):
    vwListings[host_column] = vwListings[host_column].replace({'': 'f'})

We convert the bathroom text to a number of bathrooms using a few hand-written rules. For null values, and for listings that have only a half bathroom, we assume the number of bathrooms equals the mean number of bathrooms, which is 1.

In [5]:
# Normalise bathrooms_text so that a bathroom count can be extracted from it.
# Rules, applied to the original text of each row (first match wins):
#   * "0 shared baths"               -> "<bedrooms> bathrooms"
#   * "0 baths"                      -> "1 bathroom"
#   * half-bath-only descriptions    -> "1 bathroom"
#   * missing (null or empty text)   -> "1 bathroom"
# Every other value is left as it is.
# The column is rebuilt in one assignment rather than written cell by cell via
# chained indexing (vwListings[col][i] = ...), which pandas does not guarantee
# to write back to the DataFrame.
bath_text_raw = vwListings['bathrooms_text']
bath_missing = bath_text_raw.isna() | bath_text_raw.astype(str).str.strip().eq('')
bath_text = bath_text_raw.where(~bath_missing, '').astype(str)

# The look-behind keeps values such as "10 shared baths" or "10 baths" from
# being mistaken for the zero-bathroom cases.
zero_shared_baths = bath_text.str.contains(r'(?<![\d.])0 shared baths', regex=True)
zero_baths = bath_text.str.contains(r'(?<![\d.])0 baths', regex=True)
half_bath_only = (bath_text.str.contains('Half-bath', regex=False)
                  | bath_text.str.contains('Private half-bath', regex=False)
                  | bath_text.str.contains('Shared half-bath', regex=False))

bedrooms_as_baths = vwListings['bedrooms'].astype(str) + ' bathrooms'
defaults_to_one = ~zero_shared_baths & (zero_baths | half_bath_only | bath_missing)

bath_text = bath_text.mask(zero_shared_baths, bedrooms_as_baths)
bath_text = bath_text.mask(defaults_to_one, '1 bathroom')
vwListings['bathrooms_text'] = bath_text

# Extract the number of bathrooms from bathrooms_text.
# Only the first run of digits is captured, so "1.5 baths" yields 1.
bathroom_digits = vwListings['bathrooms_text'].str.extract(r'(\d+)', expand=False)
vwListings['bathroom_numbers'] = pd.to_numeric(bathroom_digits)

In [6]:
# Prepare for one-hot encoding: recode the category labels into
# identifier-friendly names, so the dummy columns built from them can be
# referenced directly in a model formula.
# Each column is recoded in a single assignment instead of row by row through
# chained indexing (vwListings[col][i] = ...), which pandas does not guarantee
# to write back to the DataFrame.
RESPONSE_TIME_CODES = {
    'within an hour': 'within_an_hour',
    'N/A': 'na',
    'within a few hours': 'within_a_few_hours',
    'within a day': 'within_a_day',
    'a few days or more': 'a_few_days_or_more',
}
ROOM_TYPE_CODES = {
    'Entire home/apt': 'entire_home_apt',
    'Private room': 'private_room',
    'Hotel room': 'hotel_room',
    'Shared room': 'shared_room',
}


def _recode_label(label, codes):
    """Return the code of the first key of `codes` found inside `label`.

    Keys are tried in insertion order; a label that contains none of them is
    returned unchanged.
    """
    for fragment, code in codes.items():
        if fragment in label:
            return code
    return label


vwListings['host_response_time'] = vwListings['host_response_time'].map(
    lambda label: _recode_label(label, RESPONSE_TIME_CODES))
vwListings['room_type_mod'] = vwListings['room_type_mod'].map(
    lambda label: _recode_label(label, ROOM_TYPE_CODES))

Next, we clean the bedrooms and beds variables using the following rules:
- If bedrooms is null and beds is not null, bedrooms is set to the value of beds.
- If beds is null and bedrooms is not null, beds is set to the value of bedrooms.
- If both are null, each is set to the mean of the corresponding variable (2 in both cases). Rows where both values are present are left unchanged.

In [7]:
# Fill missing beds / bedrooms ('' marks a missing value here):
#   * both missing          -> both set to 2
#   * only beds missing     -> beds takes the value of bedrooms
#   * only bedrooms missing -> bedrooms takes the value of beds
# The masks are computed before any assignment so every row is classified from
# its original values. .loc is used instead of chained indexing
# (vwListings[col][i] = ...), which pandas does not guarantee to write back.
beds_missing = vwListings['beds'] == ''
bedrooms_missing = vwListings['bedrooms'] == ''
both_missing = beds_missing & bedrooms_missing
only_beds_missing = beds_missing & ~bedrooms_missing
only_bedrooms_missing = bedrooms_missing & ~beds_missing

vwListings.loc[only_beds_missing, 'beds'] = vwListings.loc[only_beds_missing, 'bedrooms']
vwListings.loc[only_bedrooms_missing, 'bedrooms'] = vwListings.loc[only_bedrooms_missing, 'beds']
vwListings.loc[both_missing, ['beds', 'bedrooms']] = 2

# Once the '' placeholders are gone, make sure both columns have a numeric
# dtype rather than object.
# NOTE(review): formula-based model builders commonly treat object columns as
# categorical — confirm this matches how the `models` module consumes them.
vwListings['beds'] = pd.to_numeric(vwListings['beds'])
vwListings['bedrooms'] = pd.to_numeric(vwListings['bedrooms'])

In [8]:
# Cast every review-score column to a numeric dtype
review_score_suffixes = ('rating', 'accuracy', 'cleanliness', 'checkin',
                         'communication', 'location', 'value')
for score_suffix in review_score_suffixes:
    score_column = 'review_scores_' + score_suffix
    vwListings[score_column] = pd.to_numeric(vwListings[score_column])

In [9]:
# Landmark coordinates (latitude, longitude in decimal degrees) taken from Google Maps.
smithsonian_lat = 38.8889532077163
smithsonian_long = -77.0259992316524
lincoln_memorial_lat = 38.8893771334115
lincoln_memorial_long = -77.0501761044084
capitol_lat = 38.8900557831996
capitol_long = -77.0090290467363
# Misspelled name kept as an alias because other cells refer to it.
capttol_long = capitol_long
# The White House, 1600 Pennsylvania Avenue NW. The values used before
# (38.9103..., -77.0221...) lie about a mile north-east of the building.
white_house_lat = 38.8976763
white_house_long = -77.0365298
library_of_congress_lat = 38.8888008852652
library_of_congress_long = -77.0047082755724
nationals_park_lat = 38.8731981381059
nationals_park_long = -77.0074329
national_zoo_lat = 38.9294590679026
national_zoo_long = -77.0492384835061
union_station_lat = 38.8973065709087
union_station_long = -77.00629767372
mcpherson_sq_lat = 38.9028682064423
mcpherson_sq_long = -77.0324745881233

In [10]:
# Great-circle distance (spherical law of cosines) from each listing to the
# Smithsonian, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(smithsonian_lat)
long_diff_rad = np.radians(smithsonian_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_smitsonian"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [11]:
# Great-circle distance (spherical law of cosines) from each listing to the
# Lincoln Memorial, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(lincoln_memorial_lat)
long_diff_rad = np.radians(lincoln_memorial_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_lincoln"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [12]:
# Great-circle distance (spherical law of cosines) from each listing to the
# Capitol, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(capitol_lat)
long_diff_rad = np.radians(capttol_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_capitol"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [13]:
# Great-circle distance (spherical law of cosines) from each listing to the
# White House coordinates, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(white_house_lat)
long_diff_rad = np.radians(white_house_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_white_house"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [14]:
# Great-circle distance (spherical law of cosines) from each listing to the
# Library of Congress, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(library_of_congress_lat)
long_diff_rad = np.radians(library_of_congress_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_library_congress"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [15]:
# Great-circle distance (spherical law of cosines) from each listing to
# Nationals Park, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(nationals_park_lat)
long_diff_rad = np.radians(nationals_park_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_nationals_park"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [16]:
# Great-circle distance (spherical law of cosines) from each listing to the
# National Zoo, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(national_zoo_lat)
long_diff_rad = np.radians(national_zoo_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_national_zoo"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [17]:
# Great-circle distance (spherical law of cosines) from each listing to
# Union Station, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(union_station_lat)
long_diff_rad = np.radians(union_station_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_railway"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

In [18]:

# Great-circle distance (spherical law of cosines) from each listing to
# McPherson Square, in statute miles.
# 3963 is the Earth's radius in statute miles, so the angle times 3963 is
# already in miles; do not also multiply by 1.15077945 (nautical -> statute
# miles), which would overstate every distance by about 15%.
listing_lat_rad = np.radians(vwListings["latitude"])
landmark_lat_rad = np.radians(mcpherson_sq_lat)
long_diff_rad = np.radians(mcpherson_sq_long - vwListings["longitude"])
cos_central_angle = (np.sin(listing_lat_rad) * np.sin(landmark_lat_rad)
                     + np.cos(listing_lat_rad) * np.cos(landmark_lat_rad) * np.cos(long_diff_rad))
# Clip to [-1, 1] so floating-point rounding cannot make arccos return NaN.
vwListings["distance_mcpherson"] = np.arccos(np.clip(cos_central_angle, -1.0, 1.0)) * 3963

We will one-hot encode the following categorical variables: host_response_time, host_is_superhost, host_has_profile_pic, host_identity_verified, and room_type_mod.

In [19]:
# One-hot encode the categorical columns and append the indicator columns to
# the frame. Each entry is (source column, prefix for the new column names);
# a prefix of None leaves the category labels as the column names.
dummy_specs = [
    ('host_response_time', 'response_time'),
    ('host_is_superhost', 'superhost'),
    ('host_identity_verified', 'identity_verified'),
    ('host_has_profile_pic', 'profile'),
    ('room_type_mod', None),
]
for source_column, dummy_prefix in dummy_specs:
    indicator_columns = pd.get_dummies(vwListings[source_column],
                                       prefix=dummy_prefix, prefix_sep='_')
    vwListings = pd.concat([vwListings, indicator_columns], axis=1)

In [20]:
# Print the column names, non-null counts and dtypes of the prepared frame.
vwListings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8164 entries, 0 to 8163
Data columns (total 95 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   id                                            8164 non-null   int64         
 1   MAX(last_scraped)                             8164 non-null   object        
 2   source_name                                   8164 non-null   object        
 3   scrape_id                                     8164 non-null   int64         
 4   source                                        8164 non-null   object        
 5   name                                          8164 non-null   object        
 6   description                                   8164 non-null   object        
 7   neighborhood_overview                         8164 non-null   object        
 8   host_id                                       8164 non-null   int64 

### MODEL 1

In [21]:
# NEED TO CONFIRM WITH STEVE AGAIN
# Convert the date time of last review to numeric
#vwListings['last_review'] = pd.to_datetime(vwListings['last_review'])

In [None]:
# Model 1: regress price on host, listing, review-score and landmark-distance
# features, then summarise the bootstrapped fit.
# last_review is temporarily left out of the formula.
# NOTE(review): host_since was converted to datetime earlier in this notebook;
# confirm how models.bootstrap_linear_regression handles a datetime term
# (formula parsers commonly treat non-numeric columns as categorical).
# NOTE(review): no random seed is set in the visible cells — confirm whether
# the bootstrap is reproducible between runs.
model = "price ~ host_since + response_time_a_few_days_or_more + response_time_within_a_day \
        + response_time_within_a_few_hours + response_time_within_an_hour + host_response_rate \
        + host_acceptance_rate + superhost_t \
        + profile_t + identity_verified_t + entire_home_apt + hotel_room + private_room + accommodates \
        + bathroom_numbers + bedrooms + beds + minimum_nights + maximum_nights + number_of_reviews \
        + review_scores_rating + review_scores_accuracy + review_scores_cleanliness + review_scores_checkin \
        + review_scores_communication + review_scores_location + review_scores_value + instant_bookable \
        + calculated_host_listings_count + distance_smitsonian + distance_lincoln + distance_capitol \
        + distance_white_house + distance_library_congress + distance_nationals_park + distance_national_zoo \
        + distance_railway + distance_mcpherson"
result1 = models.bootstrap_linear_regression(model, data=vwListings)
models.describe_bootstrap_lr(result1)