I will be running a predictive analysis for a Dallas Airbnb dataset containing 6191 listings from June 2022. The dataset was sourced from http://insideairbnb.com/dallas/.

In [32]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as pn
import plydata.cat_tools as cat
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, XGBRegressor

In [17]:
# Location of the Inside Airbnb listings export. Set the DALLAS_LISTINGS_CSV
# environment variable to run this notebook on another machine; when it is
# unset, the original local path is used, so existing runs are unaffected.
DALLAS_LISTINGS_CSV = os.environ.get(
    "DALLAS_LISTINGS_CSV",
    "/Users/adedayo/Desktop/career/Dallas_listings.csv",
)

dallas_df = pd.read_csv(DALLAS_LISTINGS_CSV)

# Preview the first three listings.
dallas_df.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,54038873.0,https://www.airbnb.com/rooms/54038873,20220600000000.0,6/10/22,Branch Style Living,Located just a short 15 minutes from central D...,Beautiful River and nice places to walk,https://a0.muscache.com/pictures/0c58757a-e5bc...,393029413,https://www.airbnb.com/users/show/393029413,...,5.0,5.0,5.0,,t,2,2,0,0,0.81
1,42709525.0,https://www.airbnb.com/rooms/42709525,20220600000000.0,6/10/22,The Best Airbnb in North Dallas | Far North Da...,Two-Bedroom + den ( 2 bedrooms whit king beds...,Hop on Dallas North Tollway for a quick commut...,https://a0.muscache.com/pictures/63dd9ec1-7ca6...,108514926,https://www.airbnb.com/users/show/108514926,...,3.67,4.33,3.67,,t,323,323,0,0,0.12
2,15050487.0,https://www.airbnb.com/rooms/15050487,20220600000000.0,6/10/22,"Cozy, Relax and, Charming 3/2 Home",Cozy and Relaxing Vacation Home within a natur...,The house is located in a quaint and quiet are...,https://a0.muscache.com/pictures/d14a29ea-49a5...,16354066,https://www.airbnb.com/users/show/16354066,...,4.89,4.91,4.85,,f,1,1,0,0,2.41


## Data Cleaning

In [18]:
# Number of listings (rows) and features (columns) in the raw data:

dallas_df.shape

(6191, 74)

There are 6191 listings and 74 features in the dataset.

In [19]:
# List every column available in the raw dataset:

dallas_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

In [20]:
# Keep only the columns useful to my analysis.
#
# Selecting with a list of labels returns a frame derived from the raw one;
# the trailing .copy() makes it explicitly independent, so the column
# assignments in the cleaning cells below do not trigger pandas'
# SettingWithCopyWarning.

dallas_df = dallas_df[[
    "id", "name", "host_id",
    'host_name', 'host_since', 'host_location', 'host_response_time',
    'host_response_rate', 'host_total_listings_count', 'neighbourhood_cleansed',
    'latitude', 'longitude', 'property_type', 'accommodates', 'bathrooms', "bedrooms",
    "beds", "price", 'minimum_nights', 'maximum_nights',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'number_of_reviews', 'first_review',
    'last_review', 'review_scores_rating', 'reviews_per_month',
]].copy()

dallas_df

Unnamed: 0,id,name,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_total_listings_count,neighbourhood_cleansed,...,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,first_review,last_review,review_scores_rating,reviews_per_month
0,5.403887e+07,Branch Style Living,393029413,R.J.,3/17/21,"Washington, District of Columbia, United States",within an hour,100%,4.0,District 6,...,365,1,1,8,8,3,2/20/22,4/13/22,5.00,0.81
1,4.270952e+07,The Best Airbnb in North Dallas | Far North Da...,108514926,Dalls,12/26/16,"Houston, Texas, United States",within an hour,98%,337.0,District 13,...,220,0,11,41,316,3,4/29/20,5/8/22,3.67,0.12
2,1.505049e+07,"Cozy, Relax and, Charming 3/2 Home",16354066,Daniel,6/3/14,"Grand Prairie, Texas, United States",within an hour,100%,13.0,District 3,...,1125,9,35,65,337,168,9/18/16,5/1/22,4.82,2.41
3,6.187800e+04,MODERN LIVING AND FURNISHINGS,300211,Rita,11/26/10,"Roanoke, Texas, United States",within a few hours,100%,1.0,District 2,...,365,0,0,0,102,45,12/29/10,2/28/22,4.73,0.32
4,2.477940e+05,Private Room in North Dallas,78344,J. Yuriko,2/6/10,"Dallas, Texas, United States",within a day,100%,1.0,District 10,...,365,9,39,69,344,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6186,5.302339e+07,NEW! Peaceful Garland House w/ Yard - Pets Wel...,124060715,Evolve,4/3/17,United States,within an hour,98%,842.0,District 9,...,1125,17,42,65,333,15,11/14/21,5/15/22,4.40,2.15
6187,4.924705e+07,★ Cozy North Dallas Gem ★ Beautiful Home,85748009,Daniel,7/24/16,"Carrollton, Texas, United States",within an hour,100%,10.0,District 6,...,1125,12,22,48,318,109,4/25/21,5/27/22,4.72,7.94
6188,6.219410e+17,"Gorgeous 5 Bedroom, Pool, BBQ, Fire Pit + Turf",118440196,Lindsay,2/27/17,"Carrollton, Texas, United States",within an hour,100%,3.0,District 12,...,45,11,34,34,34,0,,,,
6189,5.130939e+07,Cozy and elegant! Convenient to all!,100889303,Immaculate,10/23/16,"Dallas, Texas, United States",within a few hours,90%,1.0,District 6,...,365,23,53,83,173,15,8/22/21,5/2/22,5.00,1.54


In [21]:
# Check for duplicates: count rows flagged as repeats of an earlier row (True)
# versus rows that are not (False):

dallas_df.duplicated().value_counts()

False    6191
dtype: int64

There are no duplicates in the dataset.

In [22]:
# Number of missing values in each column:

dallas_df.isna().sum()

id                              0
name                            0
host_id                         0
host_name                     105
host_since                    105
host_location                 113
host_response_time            521
host_response_rate            521
host_total_listings_count     105
neighbourhood_cleansed          0
latitude                        0
longitude                       0
property_type                   0
accommodates                    0
bathrooms                    6191
bedrooms                      216
beds                           78
price                           0
minimum_nights                  0
maximum_nights                  0
availability_30                 0
availability_60                 0
availability_90                 0
availability_365                0
number_of_reviews               0
first_review                 1168
last_review                  1168
review_scores_rating         1168
reviews_per_month            1168
dtype: int64

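The output above shows that `bathrooms` is missing for every listing (6191 of 6191), and that `first_review`, `last_review`, `review_scores_rating` and `reviews_per_month` are each missing for 1168 listings. I will drop `bathrooms`, `first_review` and `last_review`. The missing values for `review_scores_rating` and `reviews_per_month` can be set to zero, since they are most likely due to guests not having left any reviews.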

In [23]:
# bathrooms is missing for all 6191 listings, and first_review / last_review
# are missing for 1168 of them, so those three columns are dropped.
#
# Missing review_scores_rating and reviews_per_month are filled with 0.
# NOTE: 0 here is a stand-in for "no reviews yet", not a genuine rating of
# zero -- keep that in mind when these columns are used as model features.
#
# Written as a single non-mutating chain (no inplace=True, no column
# assignment on a derived frame) so the cell does not trigger pandas'
# SettingWithCopyWarning.
dallas_df = (
    dallas_df
    .drop(columns=['bathrooms', 'first_review', 'last_review'])
    .fillna({'review_scores_rating': 0, 'reviews_per_month': 0})
)

In [24]:
# Re-check the number of missing values per column:

dallas_df.isna().sum()

id                             0
name                           0
host_id                        0
host_name                    105
host_since                   105
host_location                113
host_response_time           521
host_response_rate           521
host_total_listings_count    105
neighbourhood_cleansed         0
latitude                       0
longitude                      0
property_type                  0
accommodates                   0
bedrooms                     216
beds                          78
price                          0
minimum_nights                 0
maximum_nights                 0
availability_30                0
availability_60                0
availability_90                0
availability_365               0
number_of_reviews              0
review_scores_rating           0
reviews_per_month              0
dtype: int64

In [25]:
# Drop listings where host_name, host_since and host_location are all missing.
# how='all' removes a row only when every column in `subset` is NaN, so rows
# missing just one of the three are kept.

dallas_df = dallas_df.dropna(
    subset=['host_name', 'host_since', 'host_location'],
    how='all',
)

In [26]:
# Number of missing values per column at this point in the cleaning:
dallas_df.isna().sum()

id                             0
name                           0
host_id                        0
host_name                      0
host_since                     0
host_location                  8
host_response_time           416
host_response_rate           416
host_total_listings_count      0
neighbourhood_cleansed         0
latitude                       0
longitude                      0
property_type                  0
accommodates                   0
bedrooms                     213
beds                          78
price                          0
minimum_nights                 0
maximum_nights                 0
availability_30                0
availability_60                0
availability_90                0
availability_365               0
number_of_reviews              0
review_scores_rating           0
reviews_per_month              0
dtype: int64

In [27]:
# Drop listings where host_response_time and host_response_rate are both
# missing (how='all' requires every column in `subset` to be NaN).

dallas_df = dallas_df.dropna(
    subset=['host_response_time', 'host_response_rate'],
    how='all',
)

In [28]:
# Drop listings where either bedrooms or beds is missing (how='any' removes a
# row as soon as one column in `subset` is NaN).

dallas_df = dallas_df.dropna(subset=['bedrooms', 'beds'], how='any')

In [29]:
# Number of missing values per column at this point in the cleaning:
dallas_df.isna().sum()

id                           0
name                         0
host_id                      0
host_name                    0
host_since                   0
host_location                8
host_response_time           0
host_response_rate           0
host_total_listings_count    0
neighbourhood_cleansed       0
latitude                     0
longitude                    0
property_type                0
accommodates                 0
bedrooms                     0
beds                         0
price                        0
minimum_nights               0
maximum_nights               0
availability_30              0
availability_60              0
availability_90              0
availability_365             0
number_of_reviews            0
review_scores_rating         0
reviews_per_month            0
dtype: int64

In [30]:
# The host's location is not important to my analysis, so the column is
# removed:

dallas_df = dallas_df.drop(columns=["host_location"])

In [31]:
# Rows and columns remaining at this point in the cleaning:
dallas_df.shape

(5427, 25)

The dataset has been reduced to 5427 listings with 25 features.

## Questions:

1. What can we learn about different hosts and areas?

2. What can we learn from predictions (e.g., locations, prices, reviews)?

3. Which hosts are the busiest and why?

4. Is there any noticeable difference in traffic between areas, and what could be the reason for it?