# Comparison of Superhosts and Regular Hosts

This notebook loads the data, cleans and augments it, and generates visualisations and printed comparisons to better understand how Superhosts differ from regular hosts in the dataset.

In [1]:
from helpers.analyses import (
    run_ttest_independent,
    get_percent_count_groupby,
)
from helpers.transformations import (
    add_columns_to_df,
    clean_dataframe,
    remove_outliers,
)
from helpers.visualisations import (
    create_bar_chart,
    create_box_plot,
    create_histogramms_boxplots,
    create_pie_chart,
    create_subplots,
    visualize_accomodations_on_map,
    update_fig_layout,
    DEFAULT_COLOR_MAPPING
)
import os
import pandas as pd
import plotly.express as px

### Get the data, clean and augment it

In [2]:
# Inside Airbnb listings for Amsterdam (the snapshot date 2022-03-08 is part of the URL).
# The address is spelled out as a plain string: os.path.join is meant for
# filesystem paths and should not be used to assemble URLs.
LISTINGS_URL = (
    "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2022-03-08/data/"
    "listings.csv.gz"
)

df = pd.read_csv(LISTINGS_URL, compression="gzip")

# Both helpers are called only for their effect on `df`; their return values
# are not used here.
# NOTE(review): presumably they modify the frame in place — confirm in helpers.transformations.
clean_dataframe(df)
add_columns_to_df(df)

# Splitting datasets
# `host_is_superhost` is used directly as a boolean mask, so it must hold
# booleans without missing values at this point.
is_superhost = df["host_is_superhost"]
df_super = df.loc[is_superhost]
df_regular = df.loc[~is_superhost]

Dropping the following columns containing '_url':
listing_url  
picture_url  
host_url  
host_thumbnail_url  
host_picture_url


Dropping the following columns containing 'scrape':
scrape_id  
last_scraped  
calendar_last_scraped


The following columns with 90.0% or more missing values were dropped:
bathrooms
calendar_updated
neighbourhood_group_cleansed




### Location

Check for differences in listing locations between Superhosts and regular hosts.

In [3]:
# Plot the listings in `df` on a map using the project helper.
# NOTE(review): save_output=False presumably skips writing the figure to disk —
# confirm in helpers.visualisations.
visualize_accomodations_on_map(df, save_output=False)

In [19]:
# Scatter plot of listing coordinates, coloured by host type.
# Longitude goes on the x-axis and latitude on the y-axis so that east is to
# the right and north is up, i.e. the usual map orientation.
fig = px.scatter(
    df,
    x="longitude",
    y="latitude",
    color="host_type",
    color_discrete_map=DEFAULT_COLOR_MAPPING,
    opacity=0.4,  # semi-transparent markers keep dense areas readable
)
update_fig_layout(fig, width=1000, height=600, xaxis_title="longitude", yaxis_title="latitude")
fig

In [25]:
# Horizontal bar chart of the "percent" values that get_percent_count_groupby
# returns for every host_type / neighbourhood combination.
metric = "neighbourhood_cleansed"
neighbourhood_percentages = get_percent_count_groupby(df, ["host_type", metric])

create_bar_chart(
    neighbourhood_percentages,
    x="percent",
    xaxis_title="percent",
    y=metric,
    yaxis_title=metric,
    color="host_type",
    orientation="h",
    save_output=False,
)

In [3]:
# Distance to centre
distance_col = "distance_to_centre_km"
super_distance = df_super[distance_col]
regular_distance = df_regular[distance_col]

# Print mean and median side by side for both host groups, rounded to 2 decimals.
for heading, super_value, regular_value in (
    ("Average distance to centre", super_distance.mean(), regular_distance.mean()),
    ("Median distance to centre", super_distance.median(), regular_distance.median()),
):
    print(heading)
    print(f"Superhost listings: {round(super_value, 2)} km, regular host listings: {round(regular_value, 2)} km")
    print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, distance_col)

Average distance to centre
Superhost listings: 2.77 km, regular host listings: 2.89 km


Median distance to centre
Superhost listings: 2.18 km, regular host listings: 2.52 km
distance_to_centre_km: 0.03


### Authenticity

Compare Superhosts and regular hosts on host response time, identity verification and review scores.

In [None]:
# Bar charts
# One horizontal bar chart per categorical host attribute, coloured by host type.
# The loop starts at column 0: an indented top-level statement raises an
# IndentationError when the cell is run.
for metric in [
    "host_response_time",
    "host_identity_verified",
]:
    create_bar_chart(
        get_percent_count_groupby(df, ["host_type", metric]),
        x="percent",
        y=metric,
        color="host_type",
        orientation="h",
        fig_name=metric,
        width=600,
        height=400,
        # No output directory is defined anywhere in this notebook, so the
        # chart is only displayed, as in the neighbourhood bar chart.
        # NOTE(review): pass viz_dir=<path> instead if the figures should be saved.
        save_output=False,
    )

In [None]:
# Call print_metric_mean_by_host_type once per review-score column.
# The call belongs inside the loop and must use the loop variable: a `for`
# statement without a body is a SyntaxError.
# NOTE(review): print_metric_mean_by_host_type is not among the imports in the
# first cell of this notebook — import it from the helpers package before
# running this cell. Judging by its name it prints the mean of the column per
# host type; confirm against its definition.
for metric_col in [
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
]:
    print_metric_mean_by_host_type(df, metric_col)

In [None]:


### Print metrics


# Number of listings
# For each host group: the fraction of its listings whose host has more than
# one listing (host_total_listings_count > 1), printed as a percentage.
for group_label, group_listings in (
    ("Superhost", df_super),
    ("regular host", df_regular),
):
    multi_listing_share = (
        (group_listings["host_total_listings_count"] > 1).sum() / group_listings.shape[0]
    )
    print(
        f"Share of {group_label} listings whose host has more than one listing: "
        f"{multi_listing_share:.1%}"
    )

# Reviews
# NOTE(review): print_metric_mean_by_host_type is not among the imports in the
# first cell of this notebook — import it before running this cell.
for review_metric in [
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
]:
    print_metric_mean_by_host_type(df, review_metric)

### T-tests: distance_to_centre_km
