# Comparison of Superhosts and Regular Hosts

A Jupyter Notebook that loads the data, cleans and augments it, and generates visualisations and printed comparisons to better understand how Superhosts differ from regular hosts in the dataset.

In [19]:
from helpers.analyses import (
    run_ttest_independent,
    get_percent_count_groupby,
)
from helpers.transformations import (
    add_columns_to_df,
    clean_dataframe,
    remove_outliers,
)
from helpers.visualisations import (
    create_bar_chart,
    create_box_plot,
    create_histogramms_boxplots,
    create_pie_chart,
    create_subplots,
    visualize_accomodations_on_map,
    update_fig_layout,
    DEFAULT_COLOR_MAPPING
)
import os
import pandas as pd
import plotly.express as px

### Get the data, clean and augment it

In [20]:
# Inside Airbnb listings for Amsterdam (snapshot dated 2022-03-08 per the URL).
# The address is spelled out as a plain URL: os.path.join is meant for
# filesystem paths and inserts the OS-specific separator, so it only produces
# a valid URL by accident (here, because the base ended with "/").
LISTINGS_URL = (
    "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2022-03-08/data/"
    "listings.csv.gz"
)

df = pd.read_csv(LISTINGS_URL, compression="gzip")

# Both helpers are called for their effect on `df`; their return values are not used.
# NOTE(review): presumably they modify `df` in place — confirm in helpers.transformations.
clean_dataframe(df)
add_columns_to_df(df)

# Split the listings by the boolean mask in "host_is_superhost".
df_super = df.loc[df["host_is_superhost"]]
df_regular = df.loc[~df["host_is_superhost"]]

Dropping the following columns containing '_url':
listing_url  
picture_url  
host_url  
host_thumbnail_url  
host_picture_url


Dropping the following columns containing 'scrape':
scrape_id  
last_scraped  
calendar_last_scraped


The following columns with 90.0% or more missing values were dropped:
neighbourhood_group_cleansed
calendar_updated
bathrooms




### Location

Check for differences in listing locations between Superhosts and regular hosts.

In [21]:
# Draw the full listings frame on a map using the project helper.
# NOTE(review): save_output=False presumably skips writing the figure to disk — confirm in helpers.visualisations.
visualize_accomodations_on_map(df, save_output=False)

In [22]:
# Scatter of listing coordinates coloured by host type.
# Longitude goes on the x-axis and latitude on the y-axis so the plot has the
# same orientation as a map (east to the right, north at the top); the
# original had the two swapped, which transposed the city layout.
fig = px.scatter(
    df,
    x="longitude",
    y="latitude",
    color="host_type",
    color_discrete_map=DEFAULT_COLOR_MAPPING,
    opacity=0.4,
)
update_fig_layout(fig, width=1000, height=600, xaxis_title="longitude", yaxis_title="latitude")
fig

In [23]:
metric = "neighbourhood_cleansed"

# Percentage counts grouped by host type and neighbourhood, as returned by the
# project helper, drawn as a horizontal bar chart coloured by host type.
neighbourhood_shares = get_percent_count_groupby(df, ["host_type", metric])
create_bar_chart(
    neighbourhood_shares,
    x="percent",
    xaxis_title="percent",
    y=metric,
    yaxis_title=metric,
    color="host_type",
    orientation="h",
    save_output=False,
)

In [24]:
# Distance to centre: print mean and median per host group, then test the difference.
distance_col = "distance_to_centre_km"

for stat_label, stat_func in (("Average", pd.Series.mean), ("Median", pd.Series.median)):
    super_value = round(stat_func(df_super[distance_col]), 2)
    regular_value = round(stat_func(df_regular[distance_col]), 2)
    print(f"{stat_label} distance to centre")
    print(f"Superhost listings: {super_value} km, regular host listings: {regular_value} km")
    print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, distance_col)

Average distance to centre
Superhost listings: 2.77 km, regular host listings: 2.89 km


Median distance to centre
Superhost listings: 2.18 km, regular host listings: 2.52 km


Significance of Independent T-Test comparing Superhosts and regular hosts on distance_to_centre_km:
p = 0.03


### Authenticity

Compare Superhosts and regular hosts on indicators of commercial activity.

In [25]:
# Share of rows whose host has more than one listing, per host group.
# NOTE(review): the rows of df_super / df_regular appear to be listings, so a
# host with several listings is counted once per listing even though the
# printed label says "hosts" — confirm whether a per-host count is intended.
listings_col = "host_total_listings_count"

super_multiple_listings = df_super[listings_col].gt(1).sum()
regular_multiple_listings = df_regular[listings_col].gt(1).sum()

# int() truncates the percentage towards zero (no rounding).
super_pct = int(super_multiple_listings / len(df_super) * 100)
regular_pct = int(regular_multiple_listings / len(df_regular) * 100)

print("Percentage of hosts with more than one listing")
print(f"Superhost: {super_pct}%, regular hosts: {regular_pct}%")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, listings_col)

Percentage of hosts with more than one listing
Superhost: 38%, regular hosts: 27%


Significance of Independent T-Test comparing Superhosts and regular hosts on host_total_listings_count:
p = 0.35


In [26]:
metric = "host_acceptance_rate"

# Mean and standard deviation of the metric for each host group.
for group_label, group_df in (("Superhosts", df_super), ("Regular hosts", df_regular)):
    group_mean = round(group_df[metric].mean(), 2)
    group_std = round(group_df[metric].std(), 2)
    print(f"{group_label} average {metric}: {group_mean} (standard deviation: {group_std})")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, metric)

# Box plot per host type on rows that have a value for the metric,
# after passing them through the project's remove_outliers helper.
plot_df = remove_outliers(df.dropna(subset=[metric]), metric)
create_box_plot(
    plot_df,
    x="host_type",
    y=metric,
    yaxis_title=metric,
    fig_name=metric,
    save_output=False,
)

Superhosts average host_acceptance_rate: 0.89 (standard deviation: 0.2)
Regular hosts average host_acceptance_rate: 0.74 (standard deviation: 0.32)


Significance of Independent T-Test comparing Superhosts and regular hosts on host_acceptance_rate:
p < 0.001



More than 5.0% of values removed (337)!



In [27]:
metric = "reviews_per_month"

# Mean and standard deviation of the metric for each host group.
for group_label, group_df in (("Superhosts", df_super), ("Regular hosts", df_regular)):
    group_mean = round(group_df[metric].mean(), 2)
    group_std = round(group_df[metric].std(), 2)
    print(f"{group_label} average {metric}: {group_mean} (standard deviation: {group_std})")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, metric)

# Box plot per host type on rows that have a value for the metric,
# after passing them through the project's remove_outliers helper.
plot_df = remove_outliers(df.dropna(subset=[metric]), metric)
create_box_plot(
    plot_df,
    x="host_type",
    y=metric,
    yaxis_title=metric,
    fig_name=metric,
    save_output=False,
)

Superhosts average reviews_per_month: 1.73 (standard deviation: 1.75)
Regular hosts average reviews_per_month: 0.87 (standard deviation: 2.15)


Significance of Independent T-Test comparing Superhosts and regular hosts on reviews_per_month:
p < 0.001


In [28]:
# Share of instant-bookable rows per host group; summing the column counts the True values.
bookable_col = "instant_bookable"

super_instant = df_super[bookable_col].sum()
regular_instant = df_regular[bookable_col].sum()

# int() truncates the percentage towards zero (no rounding).
super_instant_pct = int(super_instant / len(df_super) * 100)
regular_instant_pct = int(regular_instant / len(df_regular) * 100)

print("Percentage of instant bookable listings")
print(f"Superhost: {super_instant_pct}%, regular hosts: {regular_instant_pct}%")

Percentage of instant bookable listings
Superhost: 38%, regular hosts: 30%


In [29]:
metric = "availability_365"

# Mean and standard deviation of the metric for each host group.
for group_label, group_df in (("Superhosts", df_super), ("Regular hosts", df_regular)):
    group_mean = round(group_df[metric].mean(), 2)
    group_std = round(group_df[metric].std(), 2)
    print(f"{group_label} average {metric}: {group_mean} (standard deviation: {group_std})")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, metric)

# Box plot per host type on rows that have a value for the metric,
# after passing them through the project's remove_outliers helper.
plot_df = remove_outliers(df.dropna(subset=[metric]), metric)
create_box_plot(
    plot_df,
    x="host_type",
    y=metric,
    yaxis_title=metric,
    fig_name=metric,
    save_output=False,
)

Superhosts average availability_365: 110.89 (standard deviation: 114.22)
Regular hosts average availability_365: 94.36 (standard deviation: 123.31)


Significance of Independent T-Test comparing Superhosts and regular hosts on availability_365:
p < 0.001



More than 5.0% of values removed (308)!



### Experience

Compare Superhosts and regular hosts on communication metrics and reviews.

In [30]:
metric = "host_response_rate"

# Mean and standard deviation of the metric for each host group.
for group_label, group_df in (("Superhosts", df_super), ("Regular hosts", df_regular)):
    group_mean = round(group_df[metric].mean(), 2)
    group_std = round(group_df[metric].std(), 2)
    print(f"{group_label} average {metric}: {group_mean} (standard deviation: {group_std})")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, metric)

# Box plot per host type on rows that have a value for the metric,
# after passing them through the project's remove_outliers helper.
plot_df = remove_outliers(df.dropna(subset=[metric]), metric)
create_box_plot(
    plot_df,
    x="host_type",
    y=metric,
    yaxis_title=metric,
    fig_name=metric,
    save_output=False,
)

Superhosts average host_response_rate: 0.99 (standard deviation: 0.05)
Regular hosts average host_response_rate: 0.92 (standard deviation: 0.19)


Significance of Independent T-Test comparing Superhosts and regular hosts on host_response_rate:
p < 0.001


In [31]:
metric = "host_response_time"
create_bar_chart(
    get_percent_count_groupby(df, ["host_type", metric]),
    x="percent",
    y=metric,
    yaxis_title=metric,
    xaxis_title="percent",
    color="host_type",
    orientation="h",
    save_output=False
)

In [32]:
metric = "review_scores_rating"

# Mean and standard deviation of the metric for each host group.
for group_label, group_df in (("Superhosts", df_super), ("Regular hosts", df_regular)):
    group_mean = round(group_df[metric].mean(), 2)
    group_std = round(group_df[metric].std(), 2)
    print(f"{group_label} average {metric}: {group_mean} (standard deviation: {group_std})")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, metric)

# Box plot per host type on rows that have a value for the metric,
# after passing them through the project's remove_outliers helper.
plot_df = remove_outliers(df.dropna(subset=[metric]), metric)
create_box_plot(
    plot_df,
    x="host_type",
    y=metric,
    yaxis_title=metric,
    fig_name=metric,
    save_output=False,
)

Superhosts average review_scores_rating: 4.87 (standard deviation: 0.13)
Regular hosts average review_scores_rating: 4.76 (standard deviation: 0.38)


Significance of Independent T-Test comparing Superhosts and regular hosts on review_scores_rating:
p < 0.001


In [33]:
metric = "review_scores_accuracy"

# Mean and standard deviation of the metric for each host group.
for group_label, group_df in (("Superhosts", df_super), ("Regular hosts", df_regular)):
    group_mean = round(group_df[metric].mean(), 2)
    group_std = round(group_df[metric].std(), 2)
    print(f"{group_label} average {metric}: {group_mean} (standard deviation: {group_std})")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, metric)

# Box plot per host type on rows that have a value for the metric,
# after passing them through the project's remove_outliers helper.
plot_df = remove_outliers(df.dropna(subset=[metric]), metric)
create_box_plot(
    plot_df,
    x="host_type",
    y=metric,
    yaxis_title=metric,
    fig_name=metric,
    save_output=False,
)

Superhosts average review_scores_accuracy: 4.89 (standard deviation: 0.15)
Regular hosts average review_scores_accuracy: 4.81 (standard deviation: 0.29)


Significance of Independent T-Test comparing Superhosts and regular hosts on review_scores_accuracy:
p < 0.001


In [34]:
metric = "review_scores_cleanliness"

# Mean and standard deviation of the metric for each host group.
for group_label, group_df in (("Superhosts", df_super), ("Regular hosts", df_regular)):
    group_mean = round(group_df[metric].mean(), 2)
    group_std = round(group_df[metric].std(), 2)
    print(f"{group_label} average {metric}: {group_mean} (standard deviation: {group_std})")
print("\n")

# Check significance of the difference between the two groups
run_ttest_independent(df_super, df_regular, metric)

# Box plot per host type on rows that have a value for the metric,
# after passing them through the project's remove_outliers helper.
plot_df = remove_outliers(df.dropna(subset=[metric]), metric)
create_box_plot(
    plot_df,
    x="host_type",
    y=metric,
    yaxis_title=metric,
    fig_name=metric,
    save_output=False,
)

Superhosts average review_scores_cleanliness: 4.85 (standard deviation: 0.21)
Regular hosts average review_scores_cleanliness: 4.71 (standard deviation: 0.35)


Significance of Independent T-Test comparing Superhosts and regular hosts on review_scores_cleanliness:
p < 0.001
