# San Francisco Airbnb Listings

This dashboard explores the distribution of Airbnb listing prices in San Francisco, 
their variation by neighborhood, the relationship between the number of reviews and listing prices,
and the availability of listings throughout the year.


In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
# Number of listings loaded from the top of the CSV.
# NOTE(review): presumably chosen to stay under Altair's default 5,000-row
# limit for embedded data -- confirm before raising it.
MAX_ROWS = 4000

# `nrows` stops parsing after MAX_ROWS rows instead of reading the whole file
# and discarding the remainder afterwards.
df = pd.read_csv('~/datasets/airbnb/listings.csv', nrows=MAX_ROWS)

# The notebook previously carried a disabled cleaning step for prices formatted
# like "$1,250.00". Apply it only when the column is not already numeric, so the
# cell works for both numeric and text price columns.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = df['price'].str.replace('[$,]', '', regex=True).astype(float)

# Log-scale price. Non-positive prices are masked to NaN first: log10(0) is
# -inf, and a single -inf would drag any later mean of `log_price` to -inf.
positive_price = df['price'].where(df['price'] > 0)
df['log_price'] = np.log10(positive_price)

# Interquartile range of the raw price
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: anything more than 1.5 * IQR outside the quartiles is an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Listings whose price falls outside the fences
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]

In [3]:
# Hover selection for the main layer: picks the point nearest the cursor and
# selects nothing while the mouse is elsewhere (empty='none').
hover = alt.selection_single(on='mouseover', nearest=True, empty='none')

# Encodings shared by every layer of the map, defined once so the layers
# cannot drift apart. The fixed domains frame the plotted area.
x_position = alt.X(
    'longitude:Q', title='Longitude', scale=alt.Scale(domain=[-122.51246, -122.36823])
)
y_position = alt.Y(
    'latitude:Q', title='Latitude', scale=alt.Scale(domain=[37.70854, 37.80954])
)
listing_tooltip = [
    alt.Tooltip('name:N', title='Name'),
    alt.Tooltip('price:Q', title='Price'),
    alt.Tooltip('room_type:N', title='Room Type'),
    alt.Tooltip('neighbourhood:N', title='Neighbourhood'),
]

# Base chart for all listings, coloured by price
base = alt.Chart(df).encode(
    x=x_position,
    y=y_position,
    color=alt.Color('price:Q', title='Price', scale=alt.Scale(scheme='viridis')),
    tooltip=listing_tooltip,
)

# Points layer carrying the hover selection
points = base.mark_circle(size=60).add_selection(hover)

# Text layer: shows the listing name only for the hovered point
text = base.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(hover, 'name:N', alt.value(''))
)

# Base chart for price outliers. It deliberately has NO colour encoding: an
# encoding channel takes precedence over the mark's own colour, so keeping
# `price:Q` here would repaint these points with the viridis scale and the red
# set on the mark below would never be visible.
outliers_base = alt.Chart(outliers).encode(
    x=x_position,
    y=y_position,
    tooltip=listing_tooltip,
)

# Separate hover selection attached to the outlier layer. Nothing in this cell
# is conditioned on it; it is kept so the name stays available.
outlier_hover = alt.selection_single(on='mouseover', empty='none')

# Outlier points drawn in a fixed red on top of the other layers
outlier_points = outliers_base.mark_circle(size=60, color='red', opacity=0.8).add_selection(
    outlier_hover
)

# Combine the listing points, hover labels and outlier highlights
interactive_plot = (points + text + outlier_points).properties(
    title="Plot 1: San Francisco Airbnb Listings", width=600, height=400
)
interactive_plot

In [4]:
# Histogram of log10 prices in bins 0.1 wide.
# The layered chart built at the end of this cell sets its own title.
hist = (
    alt.Chart(df)
    .mark_bar(color='#6495ED')
    .encode(
        alt.X("log_price:Q", bin=alt.Bin(step=0.1), title='Price (log10)'),
        y=alt.Y('count()', title='Number of listings'),
    )
    .properties(
        width=600, height=400, title='Distribution of Airbnb listing prices in San Francisco'
    )
)

# Vertical rule at the mean of the log10 prices (not the mean of the raw price).
# Only finite values are averaged: log10 maps a price of 0 to -inf, and one
# -inf would make the mean -inf, leaving the rule with no drawable position.
log_prices = df['log_price']
mean_price = log_prices[np.isfinite(log_prices)].mean()
mean_line = (
    alt.Chart(pd.DataFrame({'mean_price': [mean_price]}))
    .mark_rule(color='#FFA07A')
    .encode(x='mean_price:Q')
)

# Overlay the rule on the histogram
chart = (hist + mean_line).properties(
    title="Plot 2: Distribution of Airbnb Listing Prices in San Francisco"
)

# Display the chart
chart

In [5]:
# Box plot of listing price for each neighbourhood.
# `.interactive()` already attaches an interval selection bound to the scales
# (pan and zoom), so no second scale-bound selection is added afterwards.
# NOTE(review): the x axis is nominal, so pan/zoom is expected to affect only
# the price axis -- confirm in the rendered chart.
price_by_neighborhood = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(
        alt.X("neighbourhood:N", title="Neighborhood"),
        alt.Y("price:Q", title="Listing Price"),
        # One colour per neighbourhood; the legend would only repeat the x axis
        color=alt.Color("neighbourhood:N", legend=None),
        tooltip=["neighbourhood:N", "price"],
    )
    .properties(
        title="Plot 3: Listing Prices by Neighborhood in San Francisco", width=800, height=400
    )
    .interactive()
)

price_by_neighborhood

In [6]:
# Legend-bound selection: clicking legend entries selects neighbourhoods
neighborhood_selection = alt.selection_multi(fields=["neighbourhood"], bind="legend")

# Scatter plot of review count against price.
# `.interactive()` already attaches an interval selection bound to the scales
# (pan and zoom), so no second scale-bound selection is added afterwards.
reviews_vs_price = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        alt.X("number_of_reviews:Q", title="Number of Reviews"),
        alt.Y("price:Q", title="Listing Price"),
        color=alt.Color("neighbourhood:N", legend=alt.Legend(title="Neighborhood")),
        tooltip=["neighbourhood", "number_of_reviews", "price"],
        # Selected neighbourhoods are opaque; everything else is faded
        opacity=alt.condition(neighborhood_selection, alt.value(1), alt.value(0.1)),
    )
    .properties(
        title="Plot 4: Relationship between Number of Reviews and Listing Price",
        width=800,
        height=400,
    )
    .add_selection(neighborhood_selection)
    .interactive()
)

reviews_vs_price