# Case Study AirBnB Forecasting

In [115]:
# Data Representation
import numpy as np
import pandas as pd

# Processing & Modeling
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


from sklearn import set_config
set_config(display='diagram')   

import statsmodels.api as sm

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go

random_state = 42
pd.set_option('display.max_rows', 100)

Inspiration --> https://www.kaggle.com/nikomata/time-series-forecasting-airbnb-data-fbprophet

In [40]:
link = 'https://drive.google.com/file/d/1OxHShzKji6vodNnTkw1bNZyTaM5hv7mC/view?usp=sharing'
path = 'https://drive.google.com/uc?export=download&id='+link.split('/')[-2]

In [41]:
calendar_df = pd.read_csv(path)
calendar_df.head(1)

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00


In [42]:
calendar_df.head(1).T

Unnamed: 0,0
listing_id,241032
date,2016-01-04
available,t
price,$85.00


In [43]:
calendar_df.shape

(1393570, 4)

In [44]:
# % of NaN values
missing_series = pd.Series(1 - calendar_df.count() / len(calendar_df))
missing_series

listing_id    0.00000
date          0.00000
available     0.00000
price         0.32939
dtype: float64

> Since there's just a few prices missing and there's a lot of observations (1,393,570), we will just drop those missing observations

In [45]:
calendar_df.dropna(subset=['price'], inplace=True)

In [46]:
calendar_df.shape

(934542, 4)

In [47]:
calendar_df['price_float'] = calendar_df['price'].replace('[\$\,]',"",regex=True).astype(float)

In [48]:
calendar_df.dtypes

listing_id       int64
date            object
available       object
price           object
price_float    float64
dtype: object

## Segmenting Time Series Data into Temporal Frequency
- Daily
- Monthly
- Quarterly

In [66]:
calendar_df.shape

(934542, 5)

In [69]:
calendar_df['date'].value_counts().head(10)

2017-01-01    2922
2016-12-31    2859
2016-12-30    2840
2016-12-29    2835
2016-12-28    2833
2016-12-27    2831
2016-12-26    2831
2016-12-25    2829
2016-12-23    2822
2016-12-24    2822
Name: date, dtype: int64

In [70]:
calendar_df['date'].value_counts().mean()

2560.38904109589

> This tells us that across this dataset, there are on average 2,560 listing prices per a given day.

In [52]:
daily_average_prices_df = calendar_df.groupby('date')[["price_float"]].mean()
daily_average_prices_df.columns = ['Average']
daily_average_prices_df.index = pd.to_datetime(daily_average_prices_df.index)
daily_average_prices_df.head()

Unnamed: 0_level_0,Average
date,Unnamed: 1_level_1
2016-01-04,122.085879
2016-01-05,120.681307
2016-01-06,120.928258
2016-01-07,120.958896
2016-01-08,127.640853


In [65]:
daily_average_prices_df.shape

(365, 1)

In [57]:
daily_average_prices_df.index.dtype

dtype('<M8[ns]')

In [61]:
monthly_average_prices_df = daily_average_prices_df[['Average']].resample('M').mean()
monthly_average_prices_df.head()

Unnamed: 0_level_0,Average
date,Unnamed: 1_level_1
2016-01-31,121.692505
2016-02-29,124.315614
2016-03-31,128.640797
2016-04-30,135.10946
2016-05-31,139.539566


In [64]:
monthly_average_prices_df.shape

(13, 1)

In [62]:
quarterly_average_prices_df = daily_average_prices_df[['Average']].resample('Q').mean()
quarterly_average_prices_df.head()

Unnamed: 0_level_0,Average
date,Unnamed: 1_level_1
2016-03-31,125.004632
2016-06-30,140.693254
2016-09-30,148.73778
2016-12-31,136.663931
2017-03-31,136.849867


In [63]:
quarterly_average_prices_df.shape # We see that the data is spread across 5 quarters

(5, 1)

In [86]:
px.line(daily_average_prices_df,
        labels={'value':'Average Price Per Night ($)', 'date':'Date'},
        title="Monthly Averages Across Time")

In [87]:
px.line(monthly_average_prices_df,
        labels={'value':'Average Price Per Night ($)', 'date':'Date'},
        title="Monthly Averages Across Time")

In [130]:
px.line(quarterly_average_prices_df,
        labels={'value':'Average Price Per Night ($)', 'date':'Date'},
        title="Quarterly Averages Across Time")

## Let's Forecast
We will be using Facebook's Prophet time series modeling library.

### Forecast Daily Averages

In [157]:
from fbprophet import Prophet

In [158]:
prophet = Prophet(interval_width = 0.95,
                  daily_seasonality = True,
                #   weekly_seasonality = True,
                  yearly_seasonality = True,
                  changepoint_prior_scale = 0.095
                  )
prophet

<fbprophet.forecaster.Prophet at 0x7fc2b8172f50>

`prophet` expects to fit a dataframe with columns: df[['ds', 'y']]

In [159]:
# we are going to use the df w/daily averages
calendar_prophet_df = daily_average_prices_df.copy()
calendar_prophet_df.reset_index(inplace=True)
calendar_prophet_df.columns = ['ds', 'y']
calendar_prophet_df.head()

Unnamed: 0,ds,y
0,2016-01-04,122.085879
1,2016-01-05,120.681307
2,2016-01-06,120.928258
3,2016-01-07,120.958896
4,2016-01-08,127.640853


In [160]:
prophet.fit(calendar_prophet_df)

<fbprophet.forecaster.Prophet at 0x7fc2b8172f50>

In [161]:
y_pred_future = prophet.make_future_dataframe(periods = 60, freq = 'd')
y_pred_future['cap'] = 5.05
forecast = prophet.predict(y_pred_future)

In [162]:
forecast.head()

Unnamed: 0,ds,trend,cap,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,...,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2016-01-04,123.495795,5.05,118.359024,122.946137,123.495795,123.495795,-2.672515,-2.672515,-2.672515,...,-2.164526,-2.164526,-2.164526,-6.43803,-6.43803,-6.43803,0.0,0.0,0.0,120.823281
1,2016-01-05,123.543654,5.05,118.254163,122.870616,123.543654,123.543654,-3.003575,-3.003575,-3.003575,...,-2.431518,-2.431518,-2.431518,-6.5021,-6.5021,-6.5021,0.0,0.0,0.0,120.540079
2,2016-01-06,123.591512,5.05,118.103771,123.086058,123.591512,123.591512,-3.112452,-3.112452,-3.112452,...,-2.449518,-2.449518,-2.449518,-6.592975,-6.592975,-6.592975,0.0,0.0,0.0,120.479061
3,2016-01-07,123.639371,5.05,118.983832,123.755688,123.639371,123.639371,-2.298657,-2.298657,-2.298657,...,-1.519862,-1.519862,-1.519862,-6.708837,-6.708837,-6.708837,0.0,0.0,0.0,121.340714
4,2016-01-08,123.687229,5.05,125.370796,130.152045,123.687229,123.687229,4.098942,4.098942,4.098942,...,5.016244,5.016244,5.016244,-6.847345,-6.847345,-6.847345,0.0,0.0,0.0,127.786171


In [163]:
previous_future_prices = forecast.set_index('ds').join(calendar_prophet_df.set_index('ds'))
previous_future_prices = previous_future_prices[['y', 'yhat', 'yhat_upper', 'yhat_lower']]
previous_future_prices.head()

Unnamed: 0_level_0,y,yhat,yhat_upper,yhat_lower
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-04,122.085879,120.823281,122.946137,118.359024
2016-01-05,120.681307,120.540079,122.870616,118.254163
2016-01-06,120.928258,120.479061,123.086058,118.103771
2016-01-07,120.958896,121.340714,123.755688,118.983832
2016-01-08,127.640853,127.786171,130.152045,125.370796


In [1]:
# help(go.Scatter)

In [218]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['y'], 
                mode='lines+markers',
                name='y',
                line_color='blue')
)

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['yhat'], 
                mode='lines+markers',
                line_color='red',
                name='yhat'),
)

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['yhat_upper'],
                fillcolor='rgba(0,0,0,.2)', 
                #opacity=0.1,
                fill='tonexty', 
                mode='none', 
                name='upper')
)

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['yhat_lower'],
                fillcolor='rgba(0,0,0,.2)', 
                #opacity=0.01,
                fill='tonexty', 
                mode='none', 
                name='lower')
)

fig.update_layout(
    title="Past, Present, and Predicted Future Daily Average Prices",
    xaxis_title="Date",
    yaxis_title="Average Price Per Night ($)",
    legend_title="Legend",
)


fig.show()

In [182]:
px.line(previous_future_prices,
        labels={'value':'Average Price Per Night ($)', 'ds':'Date'},
        title="Past, Present, and Predicted Future Daily Average Prices")

### Forecast Monthly Averages

In [228]:
prophet = Prophet(interval_width = 0.95,
                  daily_seasonality = True,
                  weekly_seasonality = True,
                  changepoint_prior_scale = 0.095
                  )
prophet

<fbprophet.forecaster.Prophet at 0x7fc2b9a39dd0>

In [229]:
# we are going to use the df w/daily averages
calendar_prophet_df = monthly_average_prices_df.copy()
calendar_prophet_df.reset_index(inplace=True)
calendar_prophet_df.columns = ['ds', 'y']
calendar_prophet_df.head()

Unnamed: 0,ds,y
0,2016-01-31,121.692505
1,2016-02-29,124.315614
2,2016-03-31,128.640797
3,2016-04-30,135.10946
4,2016-05-31,139.539566


In [230]:
calendar_prophet_df.shape

(13, 2)

In [231]:
prophet.fit(calendar_prophet_df)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 9.


<fbprophet.forecaster.Prophet at 0x7fc2b9a39dd0>

In [232]:
y_pred_future = prophet.make_future_dataframe(periods = 10, freq = 'm')
y_pred_future['cap'] = 5.05
forecast = prophet.predict(y_pred_future)

In [233]:
forecast.head()

Unnamed: 0,ds,trend,cap,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,daily,daily_lower,daily_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2016-01-31,7.078695,5.05,118.640498,125.640723,7.078695,7.078695,114.986149,114.986149,114.986149,113.261867,113.261867,113.261867,1.724281,1.724281,1.724281,0.0,0.0,0.0,122.064843
1,2016-02-29,11.805638,5.05,118.665816,126.325112,11.805638,11.805638,110.501116,110.501116,110.501116,113.261867,113.261867,113.261867,-2.760751,-2.760751,-2.760751,0.0,0.0,0.0,122.306754
2,2016-03-31,16.858577,5.05,127.195479,134.381978,16.858577,16.858577,113.777402,113.777402,113.777402,113.261867,113.261867,113.261867,0.515535,0.515535,0.515535,0.0,0.0,0.0,130.63598
3,2016-04-30,21.750014,5.05,131.719637,139.044654,21.750014,21.750014,113.742929,113.742929,113.742929,113.261867,113.261867,113.261867,0.481062,0.481062,0.481062,0.0,0.0,0.0,135.492944
4,2016-05-31,26.804499,5.05,137.528027,145.22407,26.804499,26.804499,114.616051,114.616051,114.616051,113.261867,113.261867,113.261867,1.354183,1.354183,1.354183,0.0,0.0,0.0,141.42055


In [234]:
previous_future_prices = forecast.set_index('ds').join(calendar_prophet_df.set_index('ds'))
previous_future_prices = previous_future_prices[['y', 'yhat', 'yhat_upper', 'yhat_lower']]
previous_future_prices.head()

Unnamed: 0_level_0,y,yhat,yhat_upper,yhat_lower
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-31,121.692505,122.064843,125.640723,118.640498
2016-02-29,124.315614,122.306754,126.325112,118.665816
2016-03-31,128.640797,130.63598,134.381978,127.195479
2016-04-30,135.10946,135.492944,139.044654,131.719637
2016-05-31,139.539566,141.42055,145.22407,137.528027


In [235]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['y'], 
                mode='lines+markers',
                name='y',
                line_color='blue')
)

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['yhat'], 
                mode='lines+markers',
                line_color='red',
                name='yhat'),
)

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['yhat_upper'],
                fillcolor='rgba(0,0,0,.2)', 
                #opacity=0.1,
                fill='tonexty', 
                mode='none', 
                name='upper')
)

fig.add_trace(
    go.Scatter(x=previous_future_prices.index, 
                y=previous_future_prices['yhat_lower'],
                fillcolor='rgba(0,0,0,.2)', 
                #opacity=0.01,
                fill='tonexty', 
                mode='none', 
                name='lower')
)

fig.update_layout(
    title="Past, Present, and Predicted Future Monthly Average Prices",
    xaxis_title="Date",
    yaxis_title="Average Price Per Night ($)",
    legend_title="Legend",
)


fig.show()

In [156]:
px.line(previous_future_prices,
        title="Past, Present, and Predicted Future Monthly Average Prices",
        labels={'value':'Average Price Per Night ($)', 'ds':'Date'})