In [13]:
import itertools
import math
import numpy as np
import pandas as pd
# import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [14]:
colors = px.colors.qualitative.Set2
colors.append(px.colors.qualitative.Pastel2)
colors.append(px.colors.qualitative.Dark2)

In [21]:
def get_url(city, file_name):
    """"""
    return f"http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2022-03-08/visualisations/listings.csv"

def drop_na_columns(df, threshold):
    """"""
    for c in df.columns:
        percent_missing = df[c].isna().sum() / df.shape[0]
        if  percent_missing >= threshold:
            print(f"Dropping column '{c}' having {percent_missing * 100}% missing values.")
            df.drop(c, inplace=True)

def create_subplots(df, plot_type="histogram", exclude_cols=None):
    """Takes a pandas dataframe and returns plotly sublots with histograms or boxplots for numeric columns."""
    numeric_cols = [c for c in df.select_dtypes(include=["int64", "float64"]).columns]
    if exclude_cols:
        numeric_cols = [c for c in numeric_cols if c not in exclude_cols]

    subplot_dim = {
        "rows": {"histogram": math.ceil(len(numeric_cols)/2), "boxplot": math.ceil(len(numeric_cols)/4)},
        "cols": {"histogram": 2, "boxplot": 4},
        "height": 300 * math.ceil(len(numeric_cols)/2),
        "width": 1400,
    }
    fig = make_subplots(rows=subplot_dim["rows"][plot_type], cols=subplot_dim["cols"][plot_type], subplot_titles=numeric_cols)
    rows_cols = itertools.product(range(1, subplot_dim["rows"][plot_type]+1), range(1, subplot_dim["cols"][plot_type]+1))
    for c, row_col in zip(numeric_cols, rows_cols):
        if plot_type == "histogram":
            fig.add_trace(go.Histogram(x=df[c], name=c), row=row_col[0], col=row_col[1])
        elif plot_type == "boxplot":
            fig.add_trace(go.Box(y=df[c], name=c), row=row_col[0], col=row_col[1])

    fig.update_layout(showlegend=False, height=subplot_dim["height"], width=subplot_dim["width"], title_text=plot_type.capitalize())
    return fig

def create_histogramms_boxplots(df, exclude_cols=None):
    """Takes a pandas dataframe and returns plotly sublots with histograms or boxplots for numeric columns."""
    numeric_cols = [c for c in df.select_dtypes(include=["int64", "float64"]).columns]
    if exclude_cols:
        numeric_cols = [c for c in numeric_cols if c not in exclude_cols]

    fig = make_subplots(rows=len(numeric_cols), cols=2, column_widths=[0.8, 0.2])
    for i, c in enumerate(numeric_cols):
        fig.add_trace(go.Histogram(x=df[c], name=c, marker_color=colors[i]), row=i+1, col=1)
        fig.add_trace(go.Box(y=df[c], name=c, marker_color=colors[i]), row=i+1, col=2)

    fig.update_layout(showlegend=False, height=400*len(numeric_cols), width=1600)
    return fig

create_histogramms_boxplots(df_listings, exclude_cols=["latitude", "longitude"]).show()

In [17]:
# Get data
df_listings = pd.read_csv("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2022-03-08/visualisations/listings.csv")

In [18]:
# Fix data types
df_listings["id"] = df_listings["id"].astype("str")
df_listings["host_id"] = df_listings["host_id"].astype("str")

# Remove columns with all missing values
df_listings.dropna(axis=1, inplace=True, how="all")

In [170]:
df_listings.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,2818,Quiet Garden View Room & Super Fast WiFi,3159,Daniel,Oostelijk Havengebied - Indische Buurt,52.36435,4.94358,Private room,49,3,285,2021-11-21,1.81,1,62,7,0363 5F3A 5684 6750 D14D
1,20168,Studio with private bathroom in the centre 1,59484,Alexander,Centrum-Oost,52.36407,4.89393,Private room,106,1,339,2020-04-09,2.3,2,0,0,0363 CBB3 2C10 0C2A 1E29
2,27886,"Romantic, stylish B&B houseboat in canal district",97647,Flip,Centrum-West,52.38761,4.89188,Private room,134,2,228,2022-02-20,1.84,1,189,9,0363 974D 4986 7411 88D8
3,28871,Comfortable double room,124245,Edwin,Centrum-West,52.36775,4.89092,Private room,75,2,379,2022-03-05,2.7,2,146,43,0363 607B EA74 0BD8 2F6F
4,29051,Comfortable single room,124245,Edwin,Centrum-Oost,52.36584,4.89111,Private room,55,2,532,2022-03-04,3.98,2,170,53,0363 607B EA74 0BD8 2F6F


In [171]:
# Get summary statistics
df_listings.describe()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
count,5732.0,5732.0,5732.0,5732.0,5732.0,5156.0,5732.0,5732.0,5732.0
mean,52.366941,4.891896,174.613922,3.893057,49.57083,1.107719,2.205862,98.579728,7.329728
std,0.017964,0.038285,200.142824,22.91633,90.739099,2.085598,3.127854,121.258191,22.258302
min,52.27752,4.75334,0.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,52.356277,4.868655,100.0,2.0,4.0,0.25,1.0,0.0,0.0
50%,52.36647,4.888655,145.0,2.0,17.0,0.51,1.0,34.0,2.0
75%,52.37674,4.90932,200.0,3.0,48.0,1.1925,2.0,187.0,6.0
max,52.43076,5.0701,8812.0,1001.0,939.0,78.94,20.0,365.0,687.0


In [173]:
# Histograms
histograms = create_subplots(df_listings, plot_type="histogram", exclude_cols=["latitude", "longitude"])
histograms.show()

In [174]:
# Boxplots
boxplots = create_subplots(df_listings, plot_type="boxplot", exclude_cols=["latitude", "longitude"])
boxplots.show()

In [None]:
# Missing values


In [None]:
# Correlations

In [102]:
7/2.round()

SyntaxError: invalid syntax (2866120221.py, line 1)

2

In [107]:
import math.ceil
print(int(math.ceil(4.2)))

ModuleNotFoundError: No module named 'math.ceil'; 'math' is not a package