In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import random
import time


## 1. Business questions

- Q1: How do listing descriptions differ among different neighborhoods?
- Q2: What are the busiest times of a year to visit Chicago? By how much do prices spike?
- Q3: Is there a general upward trend of both new Airbnb listings and total Airbnb visitors to Chicago?
- Q4: What are the factors that explain the listing price the most?

## 2. Exploratory data analysis

### 2.1. Load the data

Airbnb data: Chicago, Illinois, United States <br>
Data source: http://insideairbnb.com/get-the-data.html

- listings.csv.gz:	Detailed Listings data for Chicago
- calendar.csv.gz:	Detailed Calendar Data for listings in Chicago
- reviews.csv.gz:	Detailed Review Data for listings in Chicago
- listings.csv:	Summary information and metrics for listings in Chicago (good for visualisations).
- reviews.csv:	Summary Review data and Listing ID (to facilitate time based analytics and visualisations linked to a listing).
- neighbourhoods.csv:	Neighbourhood list for geo filter. Sourced from city or open source GIS files.

In [14]:
# Detailed listings data for Chicago (listings.csv.gz)
listing = pd.read_csv("listings.csv.gz")
# Show the table label and its (rows, cols) shape, then preview the first rows.
print("Listings table:", f"rows, cols: {listing.shape}", sep="\n")
listing.head(5)

Listings table:
rows, cols: (7666, 96)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2384,https://www.airbnb.com/rooms/2384,20181011131034,2018-10-11,Hyde Park-Walk to UChicago or Theological Semi...,"As the sole guest in my quiet, vintage (1924) ...","The spacious bedroom has a queen size bed, che...","As the sole guest in my quiet, vintage (1924) ...",none,My building is located one block from beautifu...,...,t,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,strict_14_with_grace_period,f,f,1,2.91
1,4505,https://www.airbnb.com/rooms/4505,20181011131034,2018-10-11,1 Great Apartment. 352 Great Reviews. 1 bad one.,Across the street from CTA train. Runs every 6...,"We travel a lot, we know what people need. We...",Across the street from CTA train. Runs every 6...,none,,...,t,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",t,f,moderate,f,f,1,3.11
2,6715,https://www.airbnb.com/rooms/6715,20181011131034,2018-10-11,Lincoln Park Oasis - Unit 2 ONLY,Unit 1 & Unit 2 are rented separately. They ca...,License #: (Phone number hidden by Airbnb) Be...,Unit 1 & Unit 2 are rented separately. They ca...,none,Things To Do & Close to: - An awesome Children...,...,t,2114275,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,strict_14_with_grace_period,f,f,2,0.82
3,9811,https://www.airbnb.com/rooms/9811,20181011131034,2018-10-11,Barbara's Hideaway - Old Town,One-bedroom hideaway tucked into Old Town step...,"This lovely one bedroom ""hideaway"" is located ...",One-bedroom hideaway tucked into Old Town step...,none,Chicago’s Old Town neighborhood is squeezed be...,...,t,2079260,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",t,f,strict_14_with_grace_period,f,f,8,0.55
4,10610,https://www.airbnb.com/rooms/10610,20181011131034,2018-10-11,3 Comforts of Cooperative Living,The condo is the 2nd floor in a lovely 1912 3-...,Newly furnished with queen bed and the comfort...,The condo is the 2nd floor in a lovely 1912 3-...,none,It's a 10 minute walk from the lakefront bike ...,...,t,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",t,f,moderate,f,f,5,0.64


In [15]:
# Summary information and metrics for listings in Chicago (listings.csv)
listing_summary = pd.read_csv("listings.csv")
# Report the table dimensions, then preview the first rows.
print("Listings summary table:")
print(f"rows, cols: {listing_summary.shape}")
listing_summary.head()

Listings summay table:
rows, cols: (7666, 16)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2384,Hyde Park-Walk to UChicago or Theological Semi...,2613,Rebecca,,Hyde Park,41.788865,-87.586709,Private room,50,2,133,2018-10-01,2.91,1,241
1,4505,1 Great Apartment. 352 Great Reviews. 1 bad one.,5775,Craig & Kathleen,,South Lawndale,41.854953,-87.696962,Entire home/apt,120,2,363,2018-09-24,3.11,1,188
2,6715,Lincoln Park Oasis - Unit 2 ONLY,15365,Reem,,Lincoln Park,41.929262,-87.660091,Entire home/apt,255,4,93,2018-08-12,0.82,2,352
3,9811,Barbara's Hideaway - Old Town,33004,At Home Inn,,Lincoln Park,41.917689,-87.637879,Entire home/apt,150,3,30,2018-08-13,0.55,8,349
4,10610,3 Comforts of Cooperative Living,2140,Lois And Ed,,Hyde Park,41.797085,-87.591949,Private room,35,2,31,2018-07-29,0.64,5,144


In [10]:
# Names of the `listing` columns whose dtype compares equal to 'object'
listing_object_list = [
    column for column, dtype in listing.dtypes.items() if dtype == 'object'
]

In [16]:
# Detailed calendar data for listings in Chicago (calendar.csv.gz)
calendar = pd.read_csv("calendar.csv.gz")
# Show the table label and its (rows, cols) shape, then preview the first rows.
print("Calendar table:", f"rows, cols: {calendar.shape}", sep="\n")
calendar.head(5)

Calendar table:
rows, cols: (2798090, 4)


Unnamed: 0,listing_id,date,available,price
0,2384,2019-07-27,f,
1,2384,2019-07-26,f,
2,2384,2019-07-25,f,
3,2384,2019-07-24,f,
4,2384,2019-07-23,f,


In [33]:
# Every calendar row belonging to listing 2384
calendar[calendar['listing_id'].eq(2384)]

Unnamed: 0,listing_id,date,available,price
0,2384,2019-07-27,f,
1,2384,2019-07-26,f,
2,2384,2019-07-25,f,
3,2384,2019-07-24,f,
4,2384,2019-07-23,f,
5,2384,2019-07-22,f,
6,2384,2019-07-21,f,
7,2384,2019-07-20,f,
8,2384,2019-07-19,f,
9,2384,2019-07-18,f,


In [17]:
# Detailed review data for listings in Chicago (reviews.csv.gz)
review = pd.read_csv("reviews.csv.gz")
# Show the table label and its (rows, cols) shape, then preview the first rows.
print("Reviews table:", f"rows, cols: {review.shape}", sep="\n")
review.head(5)

Reviews table:
rows, cols: (259883, 6)


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2384,25218143,2015-01-09,14385014,Ivan,it's a wonderful trip experience. I didn't exc...
1,2384,28475392,2015-03-24,16241178,Namhaitou,This is my first trip using Airbnb. I was a li...
2,2384,30273263,2015-04-19,26101401,Patrick,The reservation was canceled 80 days before ar...
3,2384,30974202,2015-04-30,26247321,Cristina,Sólo puedo decir cosas buenas de Rebecca. La h...
4,2384,31363208,2015-05-04,31293837,SuJung,Rebecca was an absolutely wonderful host.\r\n\...


In [18]:
# Summary review data (listing id and review date only) from reviews.csv
review_summary = pd.read_csv("reviews.csv")
# Report the table dimensions, then preview the first rows.
print("Review summary table:")
print(f"rows, cols: {review_summary.shape}")
review_summary.head()

Review summay table:
rows, cols: (259883, 2)


Unnamed: 0,listing_id,date
0,2384,2015-01-09
1,2384,2015-03-24
2,2384,2015-04-19
3,2384,2015-04-30
4,2384,2015-05-04


## 2. Is there a general upward trend of both new Airbnb listings and total Airbnb visitors to Chicago?

In [21]:
# Smallest `date` value per listing. The dates are loaded without parsing and
# appear as YYYY-MM-DD in the preview, so the minimum is the earliest review.
review_summary.groupby('listing_id')['date'].min()

listing_id
2384        2015-01-09
4505        2009-03-06
6715        2009-06-07
9811        2014-04-15
10610       2014-10-25
10945       2014-04-28
12068       2015-02-28
12140       2015-06-29
22362       2013-12-08
22651       2010-06-07
24833       2010-05-24
25267       2010-09-19
25269       2010-06-11
25879       2010-06-14
37738       2010-07-20
39742       2016-05-29
44020       2011-08-19
46151       2010-09-12
46154       2010-09-29
53497       2010-10-21
56802       2011-01-01
71930       2011-05-23
79101       2011-03-21
80640       2011-09-13
84042       2011-04-18
110705      2011-05-23
126280      2011-06-16
133262      2012-11-16
144840      2016-03-08
145659      2015-10-15
               ...    
28838019    2018-10-04
28839032    2018-10-07
28839981    2018-09-30
28840745    2018-10-07
28842296    2018-10-08
28859527    2018-10-04
28862616    2018-10-07
28862988    2018-10-03
28865800    2018-10-08
28875456    2018-10-05
28884148    2018-10-10
28909247    2018-10-07


In [32]:
# Inspect how `host_since` was typed on load (not parsed as a datetime by read_csv)
listing['host_since'].dtype

dtype('O')