# Data Scientist Portfolio Project: Airbnb Listing Analysis in Python

## Imports

In [1]:
import polars as pl
import altair as alt

## Objective 1: Profile & QA the Data

1) Import the `Listings` data file (read here from `airbnb-data/Listings.parquet`)
2) Cast any date columns as a datetime format

In [2]:
# Lazily scan the listings file; nothing is read from disk until .collect().
listings = pl.scan_parquet(
    "airbnb-data/Listings.parquet",
)

# Preview the first five rows to check the schema and the parsed date column.
listings.head(5).collect()

listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood,district,city,latitude,longitude,property_type,room_type,accommodates,bedrooms,amenities,price,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
i64,str,i64,date,str,str,str,str,str,i64,str,str,str,str,str,f64,f64,str,str,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str
281420,"""Beautiful Flat in le Village M…",1466919,2011-12-03,"""Paris, Ile-de-France, France""",,,,"""f""",1,"""t""","""f""","""Buttes-Montmartre""",,"""Paris""",48.88668,2.33343,"""Entire apartment""","""Entire place""",2,1,"""[""Heating"", ""Kitchen"", ""Washer…",53,2,1125,100,10,10,10,10,10,10,"""f"""
3705183,"""39 mÂ² Paris (Sacre CÅ“ur)""",10328771,2013-11-29,"""Paris, Ile-de-France, France""",,,,"""f""",1,"""t""","""t""","""Buttes-Montmartre""",,"""Paris""",48.88617,2.34515,"""Entire apartment""","""Entire place""",2,1,"""[""Shampoo"", ""Heating"", ""Kitche…",120,2,1125,100,10,10,10,10,10,10,"""f"""
4082273,"""Lovely apartment with Terrace,…",19252768,2014-07-31,"""Paris, Ile-de-France, France""",,,,"""f""",1,"""t""","""f""","""Elysee""",,"""Paris""",48.88112,2.31712,"""Entire apartment""","""Entire place""",2,1,"""[""Heating"", ""TV"", ""Kitchen"", ""…",89,2,1125,100,10,10,10,10,10,10,"""f"""
4797344,"""Cosy studio (close to Eiffel t…",10668311,2013-12-17,"""Paris, Ile-de-France, France""",,,,"""f""",1,"""t""","""t""","""Vaugirard""",,"""Paris""",48.84571,2.30584,"""Entire apartment""","""Entire place""",2,1,"""[""Heating"", ""TV"", ""Kitchen"", ""…",58,2,1125,100,10,10,10,10,10,10,"""f"""
4823489,"""Close to Eiffel Tower - Beauti…",24837558,2014-12-14,"""Paris, Ile-de-France, France""",,,,"""f""",1,"""t""","""f""","""Passy""",,"""Paris""",48.855,2.26979,"""Entire apartment""","""Entire place""",2,1,"""[""Heating"", ""TV"", ""Kitchen"", ""…",60,2,1125,100,10,10,10,10,10,10,"""f"""


3) Filter the data down to rows where the city is Paris, and keep only the columns `host_since`, `neighbourhood`, `city`, `accommodates`, and `price` in your table

In [3]:
# Restrict to Paris and keep only the five columns used by the later cells.
paris_listings = listings.filter(
    pl.col("city").eq("Paris")
).select(
    ["host_since", "neighbourhood", "city", "accommodates", "price"]
)

# Preview the first five rows of the reduced table.
paris_listings.head(5).collect()

host_since,neighbourhood,city,accommodates,price
date,str,str,i64,i64
2011-12-03,"""Buttes-Montmartre""","""Paris""",2,53
2013-11-29,"""Buttes-Montmartre""","""Paris""",2,120
2014-07-31,"""Elysee""","""Paris""",2,89
2013-12-17,"""Vaugirard""","""Paris""",2,58
2014-12-14,"""Passy""","""Paris""",2,60


4) QA the Paris listings data: check for missing values, and calculate the minimum, maximum, and average for each numeric field

In [4]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) per column;
# the quartiles are spelled out here and match the library defaults.
paris_listings.describe(percentiles=(0.25, 0.50, 0.75))

statistic,host_since,neighbourhood,city,accommodates,price
str,str,str,str,f64,f64
"""count""","""64657""","""64690""","""64690""",64690.0,64690.0
"""null_count""","""33""","""0""","""0""",0.0,0.0
"""mean""","""2015-11-01 11:06:05.528867""",,,3.037997,113.096445
"""std""",,,,1.588766,214.433668
"""min""","""2008-08-30""","""Batignolles-Monceau""","""Paris""",0.0,0.0
"""25%""","""2014-03-09""",,,2.0,59.0
"""50%""","""2015-07-07""",,,2.0,80.0
"""75%""","""2017-05-29""",,,4.0,120.0
"""max""","""2021-02-07""","""Vaugirard""","""Paris""",16.0,12000.0


In [5]:
# Count the rows where either the capacity or the price is recorded as zero.
# .count() reports the number of non-null values per column after filtering.
paris_listings.filter(
    pl.col("accommodates").eq(0).or_(pl.col("price").eq(0))
).count().collect()

host_since,neighbourhood,city,accommodates,price
u32,u32,u32,u32,u32
62,62,62,62,62


A small fraction of the data has a missing `host_since` (33 rows) or a zero `accommodates` or `price` value (62 rows), possibly due to incorrect or incomplete entries. Out of 64,690 Paris listings, these are few enough that they can simply be ignored, as they will not have a material effect on the analysis.

## Objective 2: Prepare for Visualization

1) Create a table named `paris_listings_neighbourhood` that groups Paris listings by `neighbourhood` and calculates the mean price for each neighbourhood, sorted from lowest to highest average price.

In [6]:
# Mean listing price per neighbourhood, ordered from cheapest to most expensive.
paris_listings_neighbourhood = (
    paris_listings
    .group_by("neighbourhood")
    .agg(pl.col("price").mean().alias("avg_price"))
    .sort("avg_price")
)

# The last five rows are the five most expensive neighbourhoods.
paris_listings_neighbourhood.tail(5).collect()

neighbourhood,avg_price
str,f64
"""Luxembourg""",155.638639
"""Palais-Bourbon""",156.856578
"""Passy""",161.144635
"""Louvre""",175.379972
"""Elysee""",210.536765


2) Create a table named `paris_listings_accomodations`. This table should be filtered down to the most expensive neighbourhood in Paris, grouped by the `accommodates` column, and contain the mean price for each value of `accommodates`, sorted from lowest to highest average price.

In [7]:
# Mean price per guest capacity within a single neighbourhood, ordered from
# cheapest to most expensive.
# "Elysee" is the neighbourhood with the highest average price in
# `paris_listings_neighbourhood` (last row of that ascending-sorted table).
paris_listings_accomodations = (
    paris_listings
    .filter(pl.col("neighbourhood") == "Elysee")
    .group_by("accommodates")
    .agg(avg_price=pl.col("price").mean())
    .sort("avg_price", descending=False)
)

# The last five rows are the capacities with the highest average price.
paris_listings_accomodations.tail().collect()

accommodates,avg_price
i64,f64
12,529.625
16,800.0
11,805.0
13,842.5
14,971.0


3) Create a table called `paris_listings_over_time` which is grouped by the year of the `host_since` column. Calculate a count of rows, representing total number of new hosts, and the average price for each year.

In [8]:
# Per-year summary keyed on the year each host joined: number of listings
# (used as the count of new hosts) and their mean price.
# Rows without a host_since date are excluded because they have no year.
paris_listings_over_time = (
    paris_listings
    .filter(pl.col("host_since").is_not_null())
    .group_by(pl.col("host_since").dt.year().alias("year"))
    .agg(
        pl.len().alias("new_hosts"),
        pl.col("price").mean().alias("avg_price"),
    )
    .sort("year")
)

paris_listings_over_time.collect()



year,new_hosts,avg_price
i32,u32,f64
2008,4,77.75
2009,106,159.641509
2010,416,125.03125
2011,1339,124.82823
2012,4592,111.578615
…,…,…
2017,4585,108.658888
2018,4294,138.209362
2019,5694,129.757113
2020,3412,141.456038


## Objective 3: Visualize the Data

1) Create a horizontal bar chart of the average price by `neighbourhood` in Paris. Make sure to add a title and change axis labels as needed.

In [9]:
# Horizontal bar chart of mean price per neighbourhood; the y-axis categories
# are ordered by the x value (average price), highest first.
alt.Chart(
    paris_listings_neighbourhood.collect(),
    title="Average Price by Neighborhood in Paris",
).mark_bar().encode(
    x=alt.X("avg_price", title="Average Price (Euros)"),
    y=alt.Y("neighbourhood", title="Neighbourhood", sort="-x"),
)

2) Create a horizontal bar chart of the average price by `accommodates` in Paris' most expensive neighborhood. Make sure to add a title and change axis labels as needed.

In [10]:
# Horizontal bar chart of mean price per guest capacity in Elysee, the
# neighbourhood that `paris_listings_accomodations` is filtered to.
(
    alt.Chart(
        paris_listings_accomodations.collect(),
        # The title names the neighbourhood because the data covers Elysee only,
        # not all of Paris.
        title="Average Price by 'Accommodates' in Elysee, Paris"
    ).mark_bar()
    .encode(
        alt.X('avg_price').title('Average Price (Euros)'),
        # Order the capacity categories by the x value (average price), highest
        # first, matching the neighbourhood chart; a y channel sorted by '-y'
        # would only reference itself.
        alt.Y('accommodates:O').title('Accommodates').sort('-x')
    )
)

3) Create two line charts: one of the count of new hosts over time, and one of average price. Make sure to set the lower limit of the y-axis to 0, add a title, and change axis labels as needed.

**Challenge**: Create a dual axis line chart that contains both new hosts and average price over time.

In [11]:
# Shared base for both layers: the data, the title and the year axis.
# No mark is set here because each layer picks its own (both are lines).
base = (
    alt.Chart(
        paris_listings_over_time.collect(),
        title="Number of new hosts onboarded each year vs average prices"
    )
    .encode(
        x=alt.X("year:O", title="Year")
    )
)

# Left axis: number of new hosts per year.
new_hosts_line = base.mark_line(color="steelblue").encode(
    y=alt.Y(
        "new_hosts:Q",
        title="New Hosts",
        # Anchor the axis at zero explicitly rather than relying on scale defaults.
        scale=alt.Scale(zero=True),
        axis=alt.Axis(titleColor="steelblue")
    )
)

# Right axis: average listing price per year.
avg_price_line = base.mark_line(color="orange").encode(
    y=alt.Y(
        "avg_price:Q",
        title="Average Price (Euros)",
        # Anchor the axis at zero explicitly rather than relying on scale defaults.
        scale=alt.Scale(zero=True),
        axis=alt.Axis(titleColor="orange")
    )
)

# Overlay the two lines; independent y scales give each series its own axis.
alt.layer(
    new_hosts_line,
    avg_price_line
).resolve_scale(
    y="independent"
)

4) Based on your findings, what insights do you have about the impact of the 2015 regulations on new hosts and prices?