In [1]:
import pandas
import matplotlib.pyplot as pyplot
import sklearn
import numpy

# Load the calendar and listings tables from CSV files in the working directory
calendar_dataframe, listings_dataframe = (
    pandas.read_csv(csv_path) for csv_path in ('./calendar.csv', './listings.csv')
)

The third question is whether an ML model can be built to predict a listing price. Start by reviewing the data that was loaded in.

In [2]:
# Preview the first five calendar rows to inspect the raw column formats
calendar_dataframe.head(n=5)

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
2,241032,2016-01-06,f,
3,241032,2016-01-07,f,
4,241032,2016-01-08,f,


In [3]:
# Preview the first five listings rows to inspect the raw column formats
listings_dataframe.head(n=5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,10.0,f,,WASHINGTON,f,strict,f,f,2,1.15
3,7421966,https://www.airbnb.com/rooms/7421966,20160104002432,2016-01-04,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,...,,f,,WASHINGTON,f,flexible,f,f,1,
4,278830,https://www.airbnb.com/rooms/278830,20160104002432,2016-01-04,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,...,9.0,f,,WASHINGTON,f,strict,f,f,1,0.89


First, clean and set up the data in the dataframes as needed.

In [5]:
# Convert the listing id to a string (object dtype) because it will be used as a
# categorical variable rather than as a number
calendar_dataframe['listing_id'] = calendar_dataframe['listing_id'].astype(str)

# Turn price text such as "$85.00" into floats and impute missing prices with 0.0.
# - The result is assigned back to the column instead of calling
#   `fillna(..., inplace=True)` on a column selection, which is deprecated and is a
#   no-op under pandas Copy-on-Write.
# - Going through `astype(str)` keeps the cell re-runnable: the `.str` accessor would
#   raise on a column that has already been converted to float.
# - The regex is a raw string; '$' needs no escaping inside a character class.
calendar_dataframe['price'] = (
    calendar_dataframe['price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
    .fillna(0.0)
)

# Check the calendar dataframe. info() prints its summary and returns None, so it is
# called on its own line and head() is left as the cell's displayed value.
calendar_dataframe.info()
calendar_dataframe.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1393570 entries, 0 to 1393569
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   listing_id  1393570 non-null  object 
 1   date        1393570 non-null  object 
 2   available   1393570 non-null  object 
 3   price       1393570 non-null  float64
dtypes: float64(1), object(3)
memory usage: 42.5+ MB


(  listing_id        date available  price
 0     241032  2016-01-04         t   85.0
 1     241032  2016-01-05         t   85.0
 2     241032  2016-01-06         f    0.0
 3     241032  2016-01-07         f    0.0
 4     241032  2016-01-08         f    0.0,
 None)

In [6]:
# Columns of the listings dataframe that are kept in a separate dataframe.
# NOTE(review): in the preview above 'instant_bookable' holds 't'/'f' text, so it will
# presumably need encoding before it can be used as a numeric feature - confirm.
quantitative_columns = ['id', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights', 'number_of_reviews', 'review_scores_rating', 'instant_bookable']

# .copy() gives an independent frame, so the edits below never touch listings_dataframe
listings_dataframe_quantitative = listings_dataframe[quantitative_columns].copy()

# Convert the id to a string (object dtype) because it will be used as a categorical variable
listings_dataframe_quantitative['id'] = listings_dataframe_quantitative['id'].astype(str)

# Impute missing prices with '$0.00', then strip dollar signs and thousands separators
# so every price becomes a float. The result is assigned back to the column instead of
# using a chained `fillna(..., inplace=True)`, which is deprecated and does nothing
# under pandas Copy-on-Write. The regex is a raw string to avoid an invalid '\$' escape.
listings_dataframe_quantitative['price'] = (
    listings_dataframe_quantitative['price']
    .fillna('$0.00')
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Check which columns still have null values
listings_dataframe_quantitative.isnull().sum()

id                        0
accommodates              0
bathrooms                16
bedrooms                  6
beds                      1
price                     0
minimum_nights            0
number_of_reviews         0
review_scores_rating    647
instant_bookable          0
dtype: int64

In [35]:
# Impute every remaining NaN with 0. A single whole-frame fillna gives the same result
# as filling each null-containing column in turn, and avoids calling
# `fillna(..., inplace=True)` on a column selection (deprecated, and a no-op under
# pandas Copy-on-Write).
# NOTE(review): this also sets missing review_scores_rating values to 0 - confirm that
# a zero rating is the intended stand-in for listings without a rating.
listings_dataframe_quantitative = listings_dataframe_quantitative.fillna(0)

# Verify there are no more NaN values in the columns
listings_dataframe_quantitative.isnull().sum()

id                      0
accommodates            0
bathrooms               0
bedrooms                0
beds                    0
price                   0
minimum_nights          0
number_of_reviews       0
review_scores_rating    0
instant_bookable        0
dtype: int64

In [26]:
# NOTE(review): this cell repeats the calendar clean-up performed in an earlier cell.
# It is written to be idempotent so it gives the same result whether the price column
# still holds raw text or has already been converted to float.

# Convert the listing id to a string (object dtype) because it will be used as a
# categorical variable
calendar_dataframe['listing_id'] = calendar_dataframe['listing_id'].astype(str)

# Strip dollar signs and thousands separators, convert to float, then impute missing
# prices with 0.0. The fill happens AFTER the numeric conversion: filling with the
# integer 0 first and then using the .str accessor would turn those zeros back into
# NaN on a text column, and .str raises on a column that is already float.
calendar_dataframe['price'] = (
    calendar_dataframe['price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
    .fillna(0.0)
)