# Imports

In [None]:
import calendar
from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from shapely import wkb

# Fetch data from the API

In [None]:
def fetch_data(
    dataset_id: str = "comptage-multimodal-comptages",
    export_format: str = "parquet",
    save_path: str = "../data/01_raw/comptage-multimodal-comptages.parquet",
    timeout: float = 60.0,
) -> None:
    """
    Download the multimodal counting data (bikes + scooters) for Boulevard Sébastopol
    from the OpenDataSoft API (Parquet export by default) and save it locally.

    Parameters
    ----------
    dataset_id : str
        Dataset identifier inserted into the export endpoint URL.
    export_format : str
        Export format segment of the endpoint URL (e.g. "parquet").
    save_path : str
        Destination file path. Missing parent directories are created.
    timeout : float
        Seconds passed to ``requests.get`` as its ``timeout`` argument.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # Build the endpoint URL
    url = (
        f"https://parisdata.opendatasoft.com/api/explore/v2.1/catalog/"
        f"datasets/{dataset_id}/exports/{export_format}"
    )

    # Query parameters: filter to both bike and scooter modes on the two Sebastopol counters
    params = {
        "refine": [
            "mode:Trottinettes + vélos",
            "mode:Trottinettes",
            "mode:Vélos",
            "label:CF1461_113 boulevard de Sébastopol",
            "label:CF0001_9 boulevard de Sébastopol",
        ],
        "timezone": "UTC",
        "limit": -1,
        "parquet_compression": "snappy",
    }

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would freeze a "Run All" of the notebook.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    # The whole payload is already in memory at this point, so the file is only
    # written once the download has fully succeeded.
    destination = Path(save_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(response.content)

    print(f"File downloaded and saved to: {save_path}")


In [211]:
# Download the export with the default arguments; the file is written to
# ../data/01_raw/comptage-multimodal-comptages.parquet (fetch_data's default save_path).
fetch_data()

File downloaded and saved to: ../data/01_raw/comptage-multimodal-comptages.parquet


# Load data from filesystem

In [None]:
# Load the raw export from disk; this is the same path fetch_data() writes to by default.
df = pd.read_parquet(
    "../data/01_raw/comptage-multimodal-comptages.parquet", engine="pyarrow"
)

# Inspect data

In [415]:
# Show the first few rows of the DataFrame
display(df.head())

# Show column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df.info()

# Show unique values for key columns
for x in [
    "id_site",
    "label",
    "sens",
    "id_trajectoire",
    "trajectoire",
    "coordonnees_geo",
]:
    print(f"Unique values in {x}: {df[x].unique()} Total: {df[x].nunique()}")

# Check for missing values (count of NaN per column)
print(df.isna().sum())

Unnamed: 0,id_trajectoire,id_site,label,t,mode,nb_usagers,voie,sens,trajectoire,coordonnees_geo
0,10030_4 -> 2,10030,CF0001_9 boulevard de Sébastopol,2024-03-15 03:00:00+00:00,Trottinettes,4,Piste cyclable,N-S,4 -> 2,b'\x01\x01\x00\x00\x00\x01\xa5\xa1F!\xc9\x02@\...
1,10030_2 -> 1,10030,CF0001_9 boulevard de Sébastopol,2024-07-24 09:00:00+00:00,Vélos,12,Voie de circulation générale,S-N,2 -> 1,b'\x01\x01\x00\x00\x00\x01\xa5\xa1F!\xc9\x02@\...
2,10030_3 -> 2,10030,CF0001_9 boulevard de Sébastopol,2024-06-03 05:00:00+00:00,Trottinettes,32,Piste cyclable,S-N,3 -> 2,b'\x01\x01\x00\x00\x00\x01\xa5\xa1F!\xc9\x02@\...
3,10030_1 -> 1,10030,CF0001_9 boulevard de Sébastopol,2024-06-01 17:00:00+00:00,Vélos,26,Voie de circulation générale,S-N,1 -> 1,b'\x01\x01\x00\x00\x00\x01\xa5\xa1F!\xc9\x02@\...
4,10030_2 -> 1,10030,CF0001_9 boulevard de Sébastopol,2023-11-10 18:00:00+00:00,Vélos,16,Voie de circulation générale,S-N,2 -> 1,b'\x01\x01\x00\x00\x00\x01\xa5\xa1F!\xc9\x02@\...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262799 entries, 0 to 262798
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype              
---  ------           --------------   -----              
 0   id_trajectoire   262799 non-null  object             
 1   id_site          262799 non-null  object             
 2   label            262799 non-null  object             
 3   t                262799 non-null  datetime64[ms, UTC]
 4   mode             262799 non-null  object             
 5   nb_usagers       262799 non-null  int64              
 6   voie             262799 non-null  object             
 7   sens             262799 non-null  object             
 8   trajectoire      262799 non-null  object             
 9   coordonnees_geo  262799 non-null  object             
dtypes: datetime64[ms, UTC](1), int64(1), object(8)
memory usage: 20.1+ MB
None
Unique values in id_site: ['10030' '10180'] Total: 2
Unique values in label: ['CF0001_9 boulevard de 

# Basic transformations

In [None]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` enriched with a Paris-local timestamp and plain coordinates.

    The input frame is left untouched. Compared with the input, the result has:
      - ``t_paris``: the ``t`` column converted to the Europe/Paris timezone;
      - ``lon`` / ``lat``: the ``x`` / ``y`` attributes of the geometry decoded
        from the WKB bytes stored in ``coordonnees_geo``;
      - no ``coordonnees_geo`` column (and no intermediate ``geometry`` column).
    """
    enriched = df.assign(
        # Timestamp expressed in local Paris time
        t_paris=lambda frame: frame["t"].dt.tz_convert("Europe/Paris"),
        # WKB bytes -> Shapely geometry (temporary column, dropped below)
        geometry=lambda frame: frame["coordonnees_geo"].apply(wkb.loads),
        # Longitude is the geometry's x, latitude its y
        lon=lambda frame: frame["geometry"].apply(lambda pt: pt.x),
        lat=lambda frame: frame["geometry"].apply(lambda pt: pt.y),
    )
    return enriched.drop(columns=["coordonnees_geo", "geometry"])


# Build the working frame (adds t_paris, lon, lat; drops the raw WKB column) and preview it.
df_processed = preprocess_df(df)
display(df_processed.head())


Unnamed: 0,id_trajectoire,id_site,label,t,mode,nb_usagers,voie,sens,trajectoire,t_paris,lon,lat
0,10030_4 -> 2,10030,CF0001_9 boulevard de Sébastopol,2024-03-15 03:00:00+00:00,Trottinettes,4,Piste cyclable,N-S,4 -> 2,2024-03-15 04:00:00+01:00,2.348208,48.858628
1,10030_2 -> 1,10030,CF0001_9 boulevard de Sébastopol,2024-07-24 09:00:00+00:00,Vélos,12,Voie de circulation générale,S-N,2 -> 1,2024-07-24 11:00:00+02:00,2.348208,48.858628
2,10030_3 -> 2,10030,CF0001_9 boulevard de Sébastopol,2024-06-03 05:00:00+00:00,Trottinettes,32,Piste cyclable,S-N,3 -> 2,2024-06-03 07:00:00+02:00,2.348208,48.858628
3,10030_1 -> 1,10030,CF0001_9 boulevard de Sébastopol,2024-06-01 17:00:00+00:00,Vélos,26,Voie de circulation générale,S-N,1 -> 1,2024-06-01 19:00:00+02:00,2.348208,48.858628
4,10030_2 -> 1,10030,CF0001_9 boulevard de Sébastopol,2023-11-10 18:00:00+00:00,Vélos,16,Voie de circulation générale,S-N,2 -> 1,2023-11-10 19:00:00+01:00,2.348208,48.858628


# Plot site locations

In [None]:
def plot_sites_location(df: pd.DataFrame) -> go.Figure:
    """
    Build a Plotly map figure showing one labelled marker per row of ``df``.

    ``df`` must provide the columns ``lat``, ``lon``, ``label`` and ``id_site``;
    each marker is captioned "<label> (<id_site>)" and the map is centred on the
    mean latitude / longitude of the rows.
    """
    latitudes = df["lat"]
    longitudes = df["lon"]
    captions = df["label"] + " (" + df["id_site"] + ")"

    site_markers = go.Scattermap(
        lat=latitudes,
        lon=longitudes,
        mode="markers+text",
        marker={"size": 14, "symbol": "marker"},
        text=captions,
        textposition="top right",
        textfont={"size": 12, "color": "black"},
    )
    fig = go.Figure(site_markers)

    # Centre the view on the average position of the plotted sites
    map_center = {"lat": df["lat"].mean(), "lon": df["lon"].mean()}
    fig.update_layout(
        map={"center": map_center, "zoom": 13.75},
        showlegend=False,
        margin={"l": 30, "r": 30, "t": 50, "b": 30},
        title="Locations of Counting Sites",
        height=400,
        width=800,
    )

    return fig


In [400]:
# Keep one row per distinct (id_site, label, lon, lat) combination so each site is drawn once.
df_map_plot = df_processed[["id_site", "label", "lon", "lat"]].drop_duplicates()
plot_sites_location(df_map_plot).show()

# Hourly Bike and Scooter Count

In [407]:
def plot_multimodal_site(df: pd.DataFrame, site_id: str) -> go.Figure:
    """
    Creates a Plotly line chart of multimodal counts for a given site.

    Parameters
    ----------
    df : pd.DataFrame
        Counts with at least the columns ``id_site``, ``t_paris``, ``mode``
        and ``nb_usagers``.
    site_id : str
        Value of ``id_site`` to plot.

    Returns
    -------
    go.Figure
        One line per transport mode; each point is the sum of ``nb_usagers``
        over all rows sharing the same timestamp and mode at that site.
    """
    df_site = df[df["id_site"] == site_id]

    # id_site is constant after the filter above, so grouping on timestamp and
    # mode alone yields the same totals.
    df_plot = df_site.groupby(["t_paris", "mode"], as_index=False)["nb_usagers"].sum()

    fig = px.line(
        data_frame=df_plot,
        x="t_paris",
        y="nb_usagers",
        color="mode",
        title=f"Multimodal Count - Site {site_id}",
        labels={
            "t_paris": "Date and Time",
            "nb_usagers": "Number of Users",
            "mode": "Transport Mode",
        },
    )
    return fig

In [409]:
# Preview the per-site, per-timestamp, per-mode totals that the charts are built from
df_plot = df_processed.groupby(
    ["id_site", "t_paris", "mode"], as_index=False
)["nb_usagers"].sum()
display(df_plot.head())

# One chart per counting site
for site in pd.unique(df_processed["id_site"]):
    plot_multimodal_site(df_processed, site).show()

Unnamed: 0,id_site,t_paris,mode,nb_usagers
0,10030,2021-06-04 17:00:00+02:00,Trottinettes,47
1,10030,2021-06-04 17:00:00+02:00,Vélos,873
2,10030,2021-06-04 18:00:00+02:00,Trottinettes,66
3,10030,2021-06-04 18:00:00+02:00,Vélos,1313
4,10030,2021-06-04 19:00:00+02:00,Trottinettes,64


# Bike focus

## Date Aggregation

In [403]:
# Restrict the scope to bike ("Vélos") counts at site 10030
df_scope = df_processed.loc[
    df_processed["id_site"].eq("10030") & df_processed["mode"].eq("Vélos")
]

# Total users per timestamp, plus day / month labels formatted as strings
df_agg = (
    df_scope.groupby(["id_site", "t_paris"])["nb_usagers"]
    .sum()
    .reset_index()
    .assign(
        day=lambda d: d["t_paris"].dt.strftime("%Y-%m-%d"),
        month=lambda d: d["t_paris"].dt.strftime("%Y-%m"),
    )
)

In [404]:
def plot_aggregated_counts(df: pd.DataFrame, site_id: str) -> go.Figure:
    """
    Creates an interactive Plotly figure with buttons to display
    aggregated user counts at hourly, daily, and monthly levels.

    Parameters
    ----------
    df : pd.DataFrame
        Counts with at least the columns ``id_site``, ``t_paris`` and ``nb_usagers``.
    site_id : str
        Value of ``id_site`` to plot.

    Returns
    -------
    go.Figure
        Three traces (hourly, daily, monthly sums); only the hourly one is
        visible initially and the buttons switch between them.

    Notes
    -----
    ``resample(...).sum()`` yields 0 for periods that contain no rows, so gaps in
    the data are drawn as zero counts.
    """
    # Series of counts for the selected site, indexed by timestamp
    site_counts = (
        df.loc[df["id_site"] == site_id]
        .assign(t_paris=lambda d: pd.to_datetime(d["t_paris"]))
        .set_index("t_paris")["nb_usagers"]
    )

    # (trace / button label, resample rule, x-axis title shown for that view).
    # "D" is the documented offset alias for calendar days.
    levels = [
        ("Hourly", "h", "Date / Time"),
        ("Daily", "D", "Date"),
        ("Monthly", "MS", "Month"),
    ]

    traces = []
    buttons = []
    for position, (label, rule, x_title) in enumerate(levels):
        resampled = site_counts.resample(rule).sum().reset_index()
        traces.append(
            go.Scatter(
                x=resampled["t_paris"],
                y=resampled["nb_usagers"],
                name=label,
                visible=(position == 0),  # start on the hourly view
            )
        )
        buttons.append(
            dict(
                label=label,
                method="update",
                args=[
                    # Show only the trace that belongs to this button
                    {"visible": [i == position for i in range(len(levels))]},
                    {
                        "title": f"{label} Counts - Site {site_id}",
                        "xaxis": {"title": x_title},
                    },
                ],
            )
        )

    fig = go.Figure(data=traces)
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                direction="right",
                x=0.5,
                y=1.15,
                showactive=True,
                buttons=buttons,
            )
        ],
        title=f"Hourly Counts - Site {site_id}",
        yaxis_title="Number of Users",
        xaxis=dict(type="date", title="Date / Time"),
    )

    return fig

In [405]:
def plot_average_profiles(df: pd.DataFrame, site_id: str) -> go.Figure:
    """
    Creates an interactive Plotly figure with buttons to visualize
    the average usage profiles by hour of day, weekday, and month.

    Parameters
    ----------
    df : pd.DataFrame
        Counts with at least the columns ``id_site``, ``t_paris`` (timezone-aware)
        and ``nb_usagers``.
    site_id : str
        Value of ``id_site`` to plot.

    Returns
    -------
    go.Figure
        Three traces: mean hourly total per hour of day, mean daily total per
        weekday, and mean monthly total per calendar month. Only the hourly
        trace is visible initially; the buttons switch between them.

    Notes
    -----
    ``resample(...).sum()`` yields 0 for periods that contain no rows, and those
    zeros take part in the averages.
    """
    # Series of counts for the selected site, indexed by Europe/Paris timestamp
    site_counts = (
        df.loc[df["id_site"] == site_id]
        .assign(
            t_paris=lambda d: pd.to_datetime(d["t_paris"]).dt.tz_convert("Europe/Paris")
        )
        .set_index("t_paris")["nb_usagers"]
    )

    # 1) Average by hour of the day (0–23)
    hourly_totals = site_counts.resample("h").sum()
    df_hour = (
        hourly_totals.groupby(hourly_totals.index.hour)
        .mean()
        .rename_axis("hour")
        .reset_index(name="nb_usagers")
    )

    # 2) Average daily total by weekday ("D" is the documented calendar-day alias)
    daily_totals = site_counts.resample("D").sum()
    df_wd = (
        daily_totals.groupby(daily_totals.index.weekday)
        .mean()
        .rename_axis("weekday")
        .reset_index(name="nb_usagers")
    )
    # weekday 0 is Monday, matching calendar.day_name[0]
    df_wd["weekday_name"] = df_wd["weekday"].map(dict(enumerate(calendar.day_name)))

    # 3) Average monthly total by month of the year
    monthly_totals = site_counts.resample("MS").sum()
    df_mo = (
        monthly_totals.groupby(monthly_totals.index.month)
        .mean()
        .rename_axis("month")
        .reset_index(name="nb_usagers")
    )
    # calendar.month_name is 1-indexed (index 0 is an empty string)
    df_mo["month_name"] = df_mo["month"].map(dict(enumerate(calendar.month_name)))

    # One entry per view:
    # (trace / button label, x values, y values, marker symbol, figure title, x-axis title)
    views = [
        (
            "Hourly",
            df_hour["hour"],
            df_hour["nb_usagers"],
            "circle",
            f"Average Usage by Hour - Site {site_id}",
            "Hour of Day (0–23)",
        ),
        (
            "By Weekday",
            df_wd["weekday_name"],
            df_wd["nb_usagers"],
            "square",
            f"Average Daily Total by Weekday - Site {site_id}",
            "Weekday",
        ),
        (
            "By Month",
            df_mo["month_name"],
            df_mo["nb_usagers"],
            "diamond",
            f"Average Usage by Month - Site {site_id}",
            "Month of Year",
        ),
    ]

    traces = []
    buttons = []
    for position, (label, x_values, y_values, symbol, title, x_title) in enumerate(views):
        traces.append(
            go.Scatter(
                x=x_values,
                y=y_values,
                mode="lines+markers",
                marker=dict(symbol=symbol, size=6),
                name=label,
                visible=(position == 0),  # start on the hourly profile
            )
        )
        buttons.append(
            dict(
                label=label,
                method="update",
                args=[
                    # Show only the trace that belongs to this button
                    {"visible": [i == position for i in range(len(views))]},
                    {
                        "title": title,
                        "xaxis": {"title": x_title},
                        "yaxis": {"title": "Average Number of Users"},
                    },
                ],
            )
        )

    fig = go.Figure(data=traces)
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                direction="right",
                x=0.5,
                y=1.15,
                showactive=True,
                buttons=buttons,
            )
        ],
        title=f"Average Usage by Hour - Site {site_id}",
        yaxis_title="Average Number of Users",
        xaxis_title="Hour of Day (0–23)",
    )

    return fig


## Aggregated counts + average profiles

In [406]:
# df_agg holds the per-timestamp bike ("Vélos") totals for site 10030.
plot_aggregated_counts(df_agg, site_id="10030").show()

In [258]:
# Same bike-only input (df_agg, site 10030), shown as average hour / weekday / month profiles.
plot_average_profiles(df_agg, site_id="10030").show()

In [1]:
# TODO: Deepen the EDA with decomposition, seasonality, and trend analysis.
# TODO: Spot holidays, special events, or other major events that drive anomalous traffic.
# TODO: Identify gaps in the data and handle them appropriately.