In [83]:
import pandas as pd
import polars as pl
import requests

from vcub_keeper.transform.features_factory import get_transactions_all, get_transactions_in, get_transactions_out

%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Objectifs

- Obtenir les données depuis l'api de Damien
- Structuré ces données pour les utiliser avec le pipeline de prédiction

# Get data from API

## API Oslandia

### Dev

In [81]:
station_id = 25
# station_id="25,102"
start_date = "2021-10-14"
stop_date = "2021-10-20"

# url = "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/106?start=2020-10-09&stop=2020-10-17"

url = (
    "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/"
    + str(station_id)
    + "?start="
    + start_date
    + "&stop="
    + stop_date
)

response = requests.get(url)  # noqa: S113

In [84]:
url

'http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/25?start=2021-10-14&stop=2021-10-20'

In [85]:
response.json()

{'data': [{'available_bikes': [5,
    5,
    5,
    5,
    5,
    5,
    5,
    5,
    5,
    5,
    5,
    5,
    6,
    6,
    6,
    6,
    6,
    7,
    7,
    7,
    7,
    7,
    7,
    7,
    7,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    7,
    7,
    7,
    7,
    7,
    7,
    7,
    6,
    7,
    7,
    7,
    7,
    6,
    5,
    4,
    3,
    3,
    3,
    1,
    1,
    0,
    0,
    0,
    0,
    1,
    1,
    1,
    1,
    0,
    0,
    0,
    0,
    0,
    0,
    1,
    1,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,


In [None]:
station_df = pd.DataFrame(response.json()["data"][0])

In [5]:
station_df.id.unique()

array(['25'], dtype=object)

In [6]:
# Status mapping
status_dict = {"open": 1, "closed": 0}
station_df["status"] = station_df["status"].map(status_dict)
station_df["status"] = station_df["status"].astype("uint8")

# Naming
station_df.rename(columns={"id": "station_id"}, inplace=True)
station_df.rename(columns={"ts": "date"}, inplace=True)

# Casting date & sorting DataFrame on station_id & date
station_df["date"] = pd.to_datetime(station_df["date"])
station_df = station_df.sort_values(["station_id", "date"], ascending=[1, 1])

# Reset index
station_df = station_df.reset_index(drop=True)

# Dropduplicate station_id / date rows
station_df = station_df.drop_duplicates(subset=["station_id", "date"]).reset_index(drop=True)

# Create features
station_df = get_transactions_in(station_df)
station_df = get_transactions_out(station_df)
station_df = get_transactions_all(station_df)

In [7]:
station_df.tail(10)

Unnamed: 0,available_bikes,available_stands,station_id,name,nb_stands,status,date,transactions_in,transactions_out,transactions_all
1619,0,0,25,François de Sourdis,14,0,2020-10-19 23:13:13,0.0,0.0,0.0
1620,0,0,25,François de Sourdis,14,0,2020-10-19 23:18:13,0.0,0.0,0.0
1621,0,0,25,François de Sourdis,14,0,2020-10-19 23:24:13,0.0,0.0,0.0
1622,0,0,25,François de Sourdis,14,0,2020-10-19 23:29:13,0.0,0.0,0.0
1623,0,0,25,François de Sourdis,14,0,2020-10-19 23:33:13,0.0,0.0,0.0
1624,0,0,25,François de Sourdis,14,0,2020-10-19 23:37:13,0.0,0.0,0.0
1625,0,0,25,François de Sourdis,14,0,2020-10-19 23:42:13,0.0,0.0,0.0
1626,0,0,25,François de Sourdis,14,0,2020-10-19 23:47:13,0.0,0.0,0.0
1627,0,0,25,François de Sourdis,14,0,2020-10-19 23:53:13,0.0,0.0,0.0
1628,0,0,25,François de Sourdis,14,0,2020-10-19 23:57:13,0.0,0.0,0.0


### Industrialisation

In [20]:
from vcub_keeper.config import ROOT_DATA_REF, THRESHOLD_PROFILE_STATION
from vcub_keeper.production.data import get_data_from_api_by_station, transform_json_station_data_to_df
from vcub_keeper.reader.reader import read_station_profile

# THRESHOLD_PROFILE_STATION = 0.3

station_profile = read_station_profile(path_directory=ROOT_DATA_REF).to_pandas()

stations_id_to_pred = station_profile[station_profile["mean"] >= THRESHOLD_PROFILE_STATION]["station_id"].unique()

In [800]:
station_id = 1
station_id = [19, 105, 102]
start_date = "2020-10-14"
stop_date = "2020-10-17"

station_json = get_data_from_api_by_station(station_id=station_id, start_date=start_date, stop_date=stop_date)

station_df = transform_json_station_data_to_df(station_json)

In [21]:
from datetime import date, timedelta

# Get date
date_today = date.today()
end_date = date_today + timedelta(days=1)
start_date = date_today - timedelta(days=8)

start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
print(start_date_str, end_date_str)

2024-10-13 2024-10-22


In [801]:
def transform_json_station_data_to_df(station_json):
    """ """

    # Si il y a plusieurs stations dans le json
    if len(station_json["data"]) > 1:
        station_df = pd.DataFrame()
        for i in range(0, len(station_json["data"])):
            temp_station_df = pd.DataFrame(station_json["data"][i])
            station_df = pd.concat([station_df, temp_station_df])
    # Il y une seule station dans le json
    else:
        station_df = pd.DataFrame(station_json["data"][0])

    # Status mapping
    status_dict = {"open": 1, "closed": 0}
    station_df["status"] = station_df["status"].map(status_dict)
    station_df["status"] = station_df["status"].astype("uint8")

    # Naming
    station_df.rename(columns={"id": "station_id"}, inplace=True)
    station_df.rename(columns={"ts": "date"}, inplace=True)

    # Casting & sorting DataFrame on station_id & date
    station_df["date"] = pd.to_datetime(station_df["date"])
    station_df["station_id"] = station_df["station_id"].astype(int)
    station_df = station_df.sort_values(["station_id", "date"], ascending=[1, 1])

    # Dropduplicate station_id / date rows
    station_df = station_df.drop_duplicates(subset=["station_id", "date"]).reset_index(drop=True)

    # Create features
    station_df = get_transactions_in(pl.from_pandas(station_df), output_type="pandas")
    station_df = get_transactions_out(pl.from_pandas(station_df), output_type="pandas")
    station_df = get_transactions_all(pl.from_pandas(station_df), output_type="pandas")

    ## Resampling

    # cf Bug Pandas : https://github.com/pandas-dev/pandas/issues/33548
    station_df = station_df.set_index("date")

    station_df_resample = (
        station_df.groupby("station_id")
        .resample(
            "10min",
            label="right",
        )
        .agg(
            {
                "available_stands": "last",
                "available_bikes": "last",
                "status": "max",  # Empeche les micro déconnection à la station
                "transactions_in": "sum",
                "transactions_out": "sum",
                "transactions_all": "sum",
            }
        )
        .reset_index()
    )
    return station_df_resample

In [802]:
# Actual Pandas
# %%timeit -r 50
station_df_pandas = pd.DataFrame()
for i in range(0, len(station_json["data"])):
    temp_station_df_pandas = pd.DataFrame(station_json["data"][i])
    station_df_pandas = pd.concat([station_df_pandas, temp_station_df_pandas])

In [507]:
# Polars as Pandas
# %%timeit -r 50
if len(station_json["data"]) > 1:
    station_df = pl.DataFrame()
    for i in range(0, len(station_json["data"])):
        temp_station_df = pl.DataFrame(station_json["data"][i])
        station_df = pl.concat([station_df, temp_station_df])

In [803]:
# Polars Optimized
# %%timeit -r 50
station_df = pl.DataFrame(station_json["data"]).explode("available_bikes", "available_stands", "status", "ts")

In [804]:
from pandas.testing import assert_frame_equal

assert_frame_equal(station_df.to_pandas(), station_df_pandas.reset_index(drop=True))

In [805]:
status_dict = {"open": 1, "closed": 0}

In [806]:
# %%timeit -r 30
station_df_pandas["status"] = station_df_pandas["status"].map(status_dict)

In [807]:
# %%timeit -r 30
station_df = station_df.with_columns(status=pl.col("status").replace(status_dict).cast(pl.UInt8))

In [808]:
station_df.head()

available_bikes,available_stands,id,name,nb_stands,status,ts
i64,i64,str,str,i64,u8,str
8,12,"""102""","""Place de la Bourse""",20,1,"""2020-10-14T00:02:13"""
8,12,"""102""","""Place de la Bourse""",20,1,"""2020-10-14T00:07:13"""
8,12,"""102""","""Place de la Bourse""",20,1,"""2020-10-14T00:12:13"""
8,12,"""102""","""Place de la Bourse""",20,1,"""2020-10-14T00:17:13"""
9,11,"""102""","""Place de la Bourse""",20,1,"""2020-10-14T00:23:13"""


In [539]:
station_df_pandas.head()

Unnamed: 0,available_bikes,available_stands,id,name,nb_stands,status,ts
0,8,12,102,Place de la Bourse,20,1,2020-10-14T00:02:13
1,8,12,102,Place de la Bourse,20,1,2020-10-14T00:07:13
2,8,12,102,Place de la Bourse,20,1,2020-10-14T00:12:13
3,8,12,102,Place de la Bourse,20,1,2020-10-14T00:17:13
4,9,11,102,Place de la Bourse,20,1,2020-10-14T00:23:13


In [540]:
station_df.collect_schema()

Schema([('available_bikes', Int64),
        ('available_stands', Int64),
        ('id', String),
        ('name', String),
        ('nb_stands', Int64),
        ('status', UInt8),
        ('ts', String)])

In [541]:
from pandas.testing import assert_series_equal

assert_series_equal(station_df.to_pandas()["status"], station_df_pandas["status"], check_index=False, check_dtype=False)

In [809]:
# %%timeit -r 10
station_df_pandas.rename(columns={"id": "station_id"}, inplace=True)
station_df_pandas.rename(columns={"ts": "date"}, inplace=True)

# Casting & sorting DataFrame on station_id & date
station_df_pandas["date"] = pd.to_datetime(station_df_pandas["date"])
station_df_pandas["station_id"] = station_df_pandas["station_id"].astype(int)
station_df_pandas = station_df_pandas.sort_values(["station_id", "date"], ascending=[1, 1])

# Dropduplicate station_id / date rows
station_df_pandas = station_df_pandas.drop_duplicates(subset=["station_id", "date"]).reset_index(drop=True)

In [810]:
station_df = station_df.rename({"id": "station_id", "ts": "date"})

station_df = station_df.with_columns(station_id=pl.col("station_id").cast(pl.Int32()))
station_df = station_df.with_columns(date=pl.col("date").str.to_datetime(format="%Y-%m-%dT%H:%M:%S"))

station_df = station_df.unique(subset=["station_id", "date"])
station_df = station_df.sort(["station_id", "date"], descending=[False, False])

In [544]:
station_df.head()

available_bikes,available_stands,station_id,name,nb_stands,status,date
i64,i64,i32,str,i64,u8,datetime[μs]
10,20,19,"""Place Tourny""",30,1,2020-10-14 00:02:13
10,20,19,"""Place Tourny""",30,1,2020-10-14 00:07:13
10,20,19,"""Place Tourny""",30,1,2020-10-14 00:12:13
10,20,19,"""Place Tourny""",30,1,2020-10-14 00:17:13
10,20,19,"""Place Tourny""",30,1,2020-10-14 00:23:13


In [545]:
station_df_pandas.head()

Unnamed: 0,available_bikes,available_stands,station_id,name,nb_stands,status,date
0,10,20,19,Place Tourny,30,1,2020-10-14 00:02:13
1,10,20,19,Place Tourny,30,1,2020-10-14 00:07:13
2,10,20,19,Place Tourny,30,1,2020-10-14 00:12:13
3,10,20,19,Place Tourny,30,1,2020-10-14 00:17:13
4,10,20,19,Place Tourny,30,1,2020-10-14 00:23:13


In [811]:
# Create features
station_df_pandas = get_transactions_in(pl.from_pandas(station_df_pandas), output_type="pandas")
station_df_pandas = get_transactions_out(pl.from_pandas(station_df_pandas), output_type="pandas")
station_df_pandas = get_transactions_all(pl.from_pandas(station_df_pandas), output_type="pandas")

In [812]:
# Create features
station_df = get_transactions_in(station_df)
station_df = get_transactions_out(station_df)
station_df = get_transactions_all(station_df)

In [813]:
station_df_pandas = station_df_pandas.set_index("date")

In [814]:
# %%timeit -r 10
# cf Bug Pandas : https://github.com/pandas-dev/pandas/issues/33548
# station_df_pandas = station_df_pandas.set_index("date")

station_df_pandas_resample = (
    station_df_pandas.groupby("station_id")
    .resample(
        "10min",
        label="right",
    )
    .agg(
        {
            "available_stands": "last",
            "available_bikes": "last",
            "status": "max",  # Empeche les micro déconnection à la station
            "transactions_in": "sum",
            "transactions_out": "sum",
            "transactions_all": "sum",
        }
    )
    .reset_index()
)

In [815]:
# %%timeit -r 10
station_df_resample = station_df.group_by_dynamic("date", group_by="station_id", every="10m", label="right").agg(
    pl.col("available_stands").last(),
    pl.col("available_bikes").last(),
    pl.col("status").max(),  # Empeche les micro déconnection à la station
    pl.col("transactions_in").sum(),
    pl.col("transactions_out").sum(),
    pl.col("transactions_all").sum(),
)  # .sort(["date", "station_id"]).upsample("date", every="10m", group_by="station_id")
station_df_resample = station_df_resample.sort(["station_id", "date"], descending=[False, False]).upsample(
    "date", every="10m", group_by="station_id"
)
station_df_resample = station_df_resample.with_columns(station_id=pl.col.station_id.forward_fill())
station_df_resample = station_df_resample.with_columns(
    transactions_in=pl.col.transactions_in.fill_null(0),
    transactions_out=pl.col.transactions_out.fill_null(0),
    transactions_all=pl.col.transactions_all.fill_null(0),
)

In [816]:
from polars.testing import assert_frame_equal

In [821]:
# assert_frame_equal(station_df_resample.to_pandas(), station_df_pandas_resample[station_df_pandas_resample["available_bikes"].notnull()].reset_index(drop=True), check_dtype=False)
# station_df_resample_pandas = station_df_resample.to_pandas()
station_df_pandas_resample = station_df_pandas_resample[
    [
        "date",
        "station_id",
        "available_stands",
        "available_bikes",
        "status",
        "transactions_in",
        "transactions_out",
        "transactions_all",
    ]
]
# # Ensure the same data type for 'station_id' column
# station_df_resample_pandas['station_id'] = station_df_resample_pandas['station_id'].astype(int)
# station_df_resample_pandas['transactions_in'] = station_df_resample_pandas['transactions_in'].astype(float)
assert_frame_equal(station_df_resample, pl.from_pandas(station_df_pandas_resample), check_dtypes=False)

In [820]:
station_df_resample.columns

['date',
 'station_id',
 'available_stands',
 'available_bikes',
 'status',
 'transactions_in',
 'transactions_out',
 'transactions_all']

In [818]:
station_df_resample.slice(70, 10)

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
datetime[μs],i32,i64,i64,u8,i64,i64,i64
2020-10-14 11:50:00,19,15,15,1,1,0,1
2020-10-14 12:00:00,19,15,15,1,0,0,0
2020-10-14 12:10:00,19,15,15,1,0,0,0
2020-10-14 12:20:00,19,15,15,1,0,0,0
2020-10-14 12:30:00,19,15,15,1,0,0,0
2020-10-14 12:40:00,19,16,14,1,0,1,1
2020-10-14 12:50:00,19,16,14,1,0,0,0
2020-10-14 13:00:00,19,15,15,1,1,0,1
2020-10-14 13:10:00,19,13,17,1,2,0,2
2020-10-14 13:20:00,19,13,17,1,0,0,0


In [819]:
station_df_pandas_resample.iloc[70:80]

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
70,19,2020-10-14 11:50:00,15.0,15.0,1.0,1,0,1
71,19,2020-10-14 12:00:00,15.0,15.0,1.0,0,0,0
72,19,2020-10-14 12:10:00,15.0,15.0,1.0,0,0,0
73,19,2020-10-14 12:20:00,15.0,15.0,1.0,0,0,0
74,19,2020-10-14 12:30:00,15.0,15.0,1.0,0,0,0
75,19,2020-10-14 12:40:00,16.0,14.0,1.0,0,1,1
76,19,2020-10-14 12:50:00,16.0,14.0,1.0,0,0,0
77,19,2020-10-14 13:00:00,15.0,15.0,1.0,1,0,1
78,19,2020-10-14 13:10:00,13.0,17.0,1.0,2,0,2
79,19,2020-10-14 13:20:00,13.0,17.0,1.0,0,0,0


In [628]:
station_df.filter((pl.col("station_id") == 19) & (pl.col("date") >= pl.datetime(2020, 10, 15, 13, 00, 0))).head(10)

available_bikes,available_stands,station_id,name,nb_stands,status,date,transactions_in,transactions_out,transactions_all
i64,i64,i32,str,i64,u8,datetime[μs],i64,i64,i64
22,8,19,"""Place Tourny""",30,1,2020-10-15 13:02:13,0,1,1
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:18:13,0,12,12
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:23:13,0,0,0
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:29:13,0,0,0
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:34:13,0,0,0
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:37:13,0,0,0
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:42:13,0,0,0
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:48:13,0,0,0
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:54:13,0,0,0
10,20,19,"""Place Tourny""",30,1,2020-10-15 21:59:13,0,0,0


In [629]:
station_df_resample.filter(
    (pl.col("station_id") == 19) & (pl.col("date") >= pl.datetime(2020, 10, 15, 13, 00, 0))
).head(10)

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
datetime[μs],i32,i64,i64,u8,i64,i64,i64
2020-10-15 13:00:00,19,7.0,23.0,1.0,0,0,0
2020-10-15 13:10:00,19,8.0,22.0,1.0,0,1,1
2020-10-15 13:20:00,19,,,,0,0,0
2020-10-15 13:30:00,19,,,,0,0,0
2020-10-15 13:40:00,19,,,,0,0,0
2020-10-15 13:50:00,19,,,,0,0,0
2020-10-15 14:00:00,19,,,,0,0,0
2020-10-15 14:10:00,19,,,,0,0,0
2020-10-15 14:20:00,19,,,,0,0,0
2020-10-15 14:30:00,19,,,,0,0,0


In [630]:
station_df_pandas_resample[
    (station_df_pandas_resample["station_id"] == 19) & (station_df_pandas_resample["date"] >= "2020-10-15 13:00:00")
].head(10)

Unnamed: 0,date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
221,2020-10-15 13:00:00,19,7.0,23.0,1.0,0,0,0
222,2020-10-15 13:10:00,19,8.0,22.0,1.0,0,1,1
223,2020-10-15 13:20:00,19,,,,0,0,0
224,2020-10-15 13:30:00,19,,,,0,0,0
225,2020-10-15 13:40:00,19,,,,0,0,0
226,2020-10-15 13:50:00,19,,,,0,0,0
227,2020-10-15 14:00:00,19,,,,0,0,0
228,2020-10-15 14:10:00,19,,,,0,0,0
229,2020-10-15 14:20:00,19,,,,0,0,0
230,2020-10-15 14:30:00,19,,,,0,0,0


In [571]:
import datetime

data = [
    {"date": datetime.datetime(2020, 10, 14, 0, 28, 13), "nb": 20},
    {"date": datetime.datetime(2020, 10, 14, 0, 43, 13), "nb": 20},
    {"date": datetime.datetime(2020, 10, 14, 0, 49, 13), "nb": 21},
    {"date": datetime.datetime(2020, 10, 14, 0, 54, 13), "nb": 21},
]

df = pl.DataFrame(data)
df

date,nb
datetime[μs],i64
2020-10-14 00:28:13,20
2020-10-14 00:43:13,20
2020-10-14 00:49:13,21
2020-10-14 00:54:13,21


In [572]:
df_resample = (
    df.group_by_dynamic("date", every="10m", label="right")
    .agg(pl.col("nb").last())
    .sort("date")
    .upsample("date", every="10m")
)
df_resample

date,nb
datetime[μs],i64
2020-10-14 00:30:00,20.0
2020-10-14 00:40:00,
2020-10-14 00:50:00,21.0
2020-10-14 01:00:00,21.0


### Script

In [822]:
from vcub_keeper.production.data import get_data_from_api_by_station, transform_json_station_data_to_df
from vcub_keeper.transform.features_factory import get_consecutive_no_transactions_out
from vcub_keeper.visualisation import plot_station_activity

In [823]:
# station_id=37
station_id = [19, 105, 102]
start_date = "2022-02-01"
stop_date = "2022-09-01"

In [824]:
station_json = get_data_from_api_by_station(station_id=station_id, start_date=start_date, stop_date=stop_date)

station_df = transform_json_station_data_to_df(station_json)

In [825]:
station_df.tail(10)

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
datetime[μs],i32,i64,i64,u8,i64,i64,i64
2022-02-22 13:10:00,105,11,9,1,0,0,0
2022-02-22 13:20:00,105,11,9,1,0,0,0
2022-02-22 13:30:00,105,11,9,1,0,0,0
2022-02-22 13:40:00,105,11,9,1,0,0,0
2022-02-22 13:50:00,105,11,9,1,0,0,0
2022-02-22 14:00:00,105,11,9,1,0,0,0
2022-02-22 14:10:00,105,11,9,1,0,0,0
2022-02-22 14:20:00,105,11,9,1,0,0,0
2022-02-22 14:30:00,105,12,8,1,0,1,1
2022-02-22 14:40:00,105,12,8,1,0,0,0


In [826]:
station_df = get_consecutive_no_transactions_out(station_df.to_pandas())

In [827]:
station_df

Unnamed: 0,date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
0,2022-02-01 00:10:00,19,19,11,1,0,0,0,0
1,2022-02-01 00:20:00,19,19,11,1,0,0,0,1
2,2022-02-01 00:30:00,19,19,11,1,0,0,0,2
3,2022-02-01 00:40:00,19,19,11,1,0,0,0,3
4,2022-02-01 00:50:00,19,19,11,1,0,0,0,4
...,...,...,...,...,...,...,...,...,...
9331,2022-02-22 14:00:00,105,11,9,1,0,0,0,14
9332,2022-02-22 14:10:00,105,11,9,1,0,0,0,15
9333,2022-02-22 14:20:00,105,11,9,1,0,0,0,16
9334,2022-02-22 14:30:00,105,12,8,1,0,1,1,0


In [None]:
plot_station_activity(
    station_df,
    station_id=19,  # 105 #station_id
    features_to_plot=[
        "available_bikes",  #'available_stands',
        "consecutive_no_transactions_out",
        "status",
    ],
    # start_date=start_date,
    # end_date=end_date,
    return_data=False,
)

In [830]:
station_df[station_df.status == 0]

Unnamed: 0,date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
252,2022-02-02 18:10:00,19,15,15,0,0,0,0,0
253,2022-02-02 18:20:00,19,15,15,0,0,0,0,0
277,2022-02-02 22:20:00,19,15,15,0,0,0,0,0
530,2022-02-04 16:30:00,19,16,14,0,0,0,0,0
531,2022-02-04 16:40:00,19,16,14,0,0,0,0,0
532,2022-02-04 16:50:00,19,16,14,0,0,0,0,0
1419,2022-02-10 20:40:00,19,6,24,0,0,0,0,0
1420,2022-02-10 20:50:00,19,6,24,0,0,0,0,0
1421,2022-02-10 21:00:00,19,6,24,0,0,0,0,0
1422,2022-02-10 21:10:00,19,6,24,0,0,0,0,0


In [36]:
station_df.status.value_counts()

status
1.0    4044
0.0     151
Name: count, dtype: int64

## API de open data Bordeaux

### Dev

In [25]:
# Day
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-31T15:50:00&&rangeEnd=2022-03-31T16:00:00&rangeStep=day"
# 5min
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-31T15:50:00&&rangeEnd=2022-03-31T16:00:00&rangeStep=5min"
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-31T15:50:00&&rangeEnd=2022-03-31T16:01:00&rangeStep=5min"

#
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-24T15:50:00&&rangeEnd=2022-03-31T16:01:00&rangeStep=5min"  # KO
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-28&&rangeEnd=2022-03-31&rangeStep=5min"  # KO
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-30&&rangeEnd=2022-03-31&rangeStep=5min"  # KO


# avec filtre station vcub
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-31T15:50:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min'  # OK

# filtre station vcub + 1 semaine data
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T00:00:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min'  # OK

# filtre station vcub + 1 semaine data + 1ere données


# 1 +/- semaine de data (toutes stations)
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]'  # KO

# 1 jours de data (toutes stations)
# 17,9 s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]'  # OK

# 2 jours de data (toutes stations)
# 38,8 s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]'  # KO

# avec filtre sur plusieurs station vcub
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-31T15:50:00&filter={"ident":{"$in":[1,2]}}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min'  # OK

# URL optimisé (on choisit les attribut de retour)
attributes = ["nom", "etat", "ident", "nbplaces", "nbvelos", "gid"]
attributes = {"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}
# 0,4s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&filter={"ident":1}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]'  # OK
# 0,4s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&filter={"ident":1}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'  # OK
# 0,7s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&filter={"ident":1}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&'  # OK

In [26]:
# Toute les stations sur une période déterminé

# 1 jours : 18 sec / 53835 lignes
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'  # OK

# 2 jours : 37,8 sec --> HS
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'  # OK

# 1.5 jours : 30,2 sec --> HS
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T23:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'  # OK

In [120]:
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T00:00:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'  # OK

Récapitulatif : 

- Pour les requêtes d'affichage d'une station (avec graph + anomalie) :
    Possibilité d'utiliser une requête de ce type qui est assez rapide (+/- 1 sec)
   
    `URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T00:00:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'`

- Pour les requêtes lors de la détection d'ano toutes les 10 minutes : 
    On ne peut pas utiliser la même requête que plus haut sans filtre de stations (pas de retour de la requête / trop long / limitation API). Par contre il est possible de demander +/- 24h de données pour toutes les stations (+/- 18 sec)
    
    `URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'`


In [121]:
response = requests.get(URL)  # noqa: S113
response

<Response [403]>

In [123]:
response.json()
# {"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}

{'error': 'Clé logicielle invalide'}

In [None]:
len(response.json()["features"])

In [32]:
station_df = pl.json_normalize(response.json()["features"], max_level=1)

# Naming from JSON DataFrame
station_df = station_df.rename(
    mapping={
        "properties.time": "date",
        "properties.ident": "station_id",
        "properties.etat": "status",
        "properties.nbplaces": "available_stands",
        "properties.nbvelos": "available_bikes",
    }
).drop("type", "properties.gid", "properties.nom")  # drop unused columns

In [34]:
station_df.tail()

date,station_id,status,available_stands,available_bikes
str,i64,str,i64,i64
"""2024-10-22T01:35:00+02:00""",1,"""CONNECTEE""",39,2
"""2024-10-22T01:40:00+02:00""",1,"""CONNECTEE""",39,2
"""2024-10-22T01:45:00+02:00""",1,"""CONNECTEE""",40,2
"""2024-10-22T01:50:00+02:00""",1,"""CONNECTEE""",40,1
"""2024-10-22T01:55:00+02:00""",1,"""CONNECTEE""",40,1


In [35]:
station_df = station_df.with_columns(
    # cast into datetime with tz_aware to Paris to none
    date=pl.col("date")
    .str.to_datetime(format="%Y-%m-%dT%H:%M:%S%z", time_zone="Europe/Paris")
    .dt.replace_time_zone(None),
)

In [36]:
station_df.tail()

date,station_id,status,available_stands,available_bikes
datetime[μs],i64,str,i64,i64
2024-10-22 01:35:00,1,"""CONNECTEE""",39,2
2024-10-22 01:40:00,1,"""CONNECTEE""",39,2
2024-10-22 01:45:00,1,"""CONNECTEE""",40,2
2024-10-22 01:50:00,1,"""CONNECTEE""",40,1
2024-10-22 01:55:00,1,"""CONNECTEE""",40,1


In [43]:
station_df["date"].n_unique()

2592

In [46]:
station_df["date"].min(), station_df["date"].max()

(datetime.datetime(2024, 10, 13, 2, 0), datetime.datetime(2024, 10, 22, 1, 55))

In [47]:
station_df.tail()

date,station_id,status,available_stands,available_bikes
datetime[μs],i64,str,i64,i64
2024-10-22 01:35:00,1,"""CONNECTEE""",39,2
2024-10-22 01:40:00,1,"""CONNECTEE""",39,2
2024-10-22 01:45:00,1,"""CONNECTEE""",40,2
2024-10-22 01:50:00,1,"""CONNECTEE""",40,1
2024-10-22 01:55:00,1,"""CONNECTEE""",40,1


In [48]:
response.json()["features"][0]["properties"]

{'time': '2024-10-13T02:00:00+02:00',
 'gid': 1,
 'ident': 1,
 'nom': 'Meriadeck',
 'etat': 'MAINTENANCE',
 'nbplaces': 43,
 'nbvelos': 0}

### Industrialisation

In [114]:
from vcub_keeper.production.data import get_data_from_api_bdx_by_station, transform_json_api_bdx_station_data_to_df

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [125]:
station_id = 174
# station_id=[124,  15,  60,]
# statoin_id = [124,  15,  60,  18,  10,  68, 130,   2, 105, 120,  11, 110,   9,
#         23,   3,  16,  20, 136,  42,  21, 131,  59,  45, 172,   6,  24,
#         36, 108,  19, 125, 135,  37, 139,  99,  28,  57,   7,  98,   8,
#         41,  40,  58,  55, 109,   1,   4, 100, 134, 174, 101,  56, 104,
#         43,  54,  44, 102, 133, 103,   5, 127,  65,  22, 123,  39, 106]
start_date = "2024-10-18"
stop_date = "2024-10-19"

In [126]:
from datetime import date, timedelta

# Get date
date_today = date.today()
end_date = date_today + timedelta(days=1)
start_date = date_today - timedelta(days=8)

start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
print(start_date_str, end_date_str)

2024-10-14 2024-10-23


In [127]:
station_json = get_data_from_api_bdx_by_station(station_id=station_id, start_date=start_date, stop_date=stop_date)

station_df = transform_json_api_bdx_station_data_to_df(station_json)

https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=148889KPWZ&rangeStart=2024-10-14&filter={"ident":174}&rangeEnd=2024-10-19&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max", "ident": "min"}


In [128]:
station_df.collect().tail(10)

station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
i32,datetime[μs],i64,i64,u8,i64,i64,i64
174,2024-10-19 00:30:00,42,0,1,0,0,0
174,2024-10-19 00:40:00,42,0,1,0,0,0
174,2024-10-19 00:50:00,42,0,1,0,0,0
174,2024-10-19 01:00:00,42,0,1,0,0,0
174,2024-10-19 01:10:00,42,0,1,0,0,0
174,2024-10-19 01:20:00,42,0,1,0,0,0
174,2024-10-19 01:30:00,42,0,1,0,0,0
174,2024-10-19 01:40:00,42,0,1,0,0,0
174,2024-10-19 01:50:00,42,0,1,0,0,0
174,2024-10-19 02:00:00,42,0,1,0,0,0


In [129]:
station_df.collect()

station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
i32,datetime[μs],i64,i64,u8,i64,i64,i64
174,2024-10-14 02:10:00,40,2,1,0,0,0
174,2024-10-14 02:20:00,40,2,1,0,0,0
174,2024-10-14 02:30:00,40,2,1,0,0,0
174,2024-10-14 02:40:00,40,2,1,0,0,0
174,2024-10-14 02:50:00,40,2,1,0,0,0
…,…,…,…,…,…,…,…
174,2024-10-19 01:20:00,42,0,1,0,0,0
174,2024-10-19 01:30:00,42,0,1,0,0,0
174,2024-10-19 01:40:00,42,0,1,0,0,0
174,2024-10-19 01:50:00,42,0,1,0,0,0


In [130]:
station_df.collect_schema()

Schema([('station_id', Int32),
        ('date', Datetime(time_unit='us', time_zone=None)),
        ('available_stands', Int64),
        ('available_bikes', Int64),
        ('status', UInt8),
        ('transactions_in', Int64),
        ('transactions_out', Int64),
        ('transactions_all', Int64)])

In [773]:
# 73.3 ms ± 550 μs per loop (mean ± std. dev. of 10 runs, 10 loops each)
# %%timeit -r 10
station_df_pandas = transform_json_api_bdx_station_data_to_df(station_json)

73.3 ms ± 550 μs per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [782]:
# %%timeit -r 10
station_df = transform_json_api_bdx_station_data_to_df(station_json)

In [756]:
station_df_pandas

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
0,1,2024-09-21 02:10:00+02:00,33,10,1,0,0,0
1,1,2024-09-21 02:20:00+02:00,33,10,1,0,0,0
2,1,2024-09-21 02:30:00+02:00,33,10,1,0,0,0
3,1,2024-09-21 02:40:00+02:00,34,10,1,0,1,0
4,1,2024-09-21 02:50:00+02:00,34,9,1,0,0,1
...,...,...,...,...,...,...,...,...
1265,1,2024-09-29 21:00:00+02:00,39,4,1,0,1,1
1266,1,2024-09-29 21:10:00+02:00,39,4,1,0,0,0
1267,1,2024-09-29 21:20:00+02:00,39,4,1,0,0,0
1268,1,2024-09-29 21:30:00+02:00,39,4,1,0,0,0


In [64]:
station_df["station_id"].unique().to_list()

TypeError: 'LazyFrame' object is not subscriptable (aside from slicing)

Use `select()` or `filter()` instead.

In [696]:
# station_json["features"]

In [65]:
station_df_pandas = pd.json_normalize(station_json, record_path=["features"])
station_df_pandas.head()

Unnamed: 0,type,properties.time,properties.gid,properties.ident,properties.nom,properties.etat,properties.nbplaces,properties.nbvelos
0,Feature,2024-10-14T02:00:00+02:00,1,1,Meriadeck,MAINTENANCE,43,0
1,Feature,2024-10-14T02:05:00+02:00,1,1,Meriadeck,MAINTENANCE,43,0
2,Feature,2024-10-14T02:10:00+02:00,1,1,Meriadeck,MAINTENANCE,43,0
3,Feature,2024-10-14T02:15:00+02:00,1,1,Meriadeck,MAINTENANCE,43,0
4,Feature,2024-10-14T02:20:00+02:00,1,1,Meriadeck,MAINTENANCE,43,0


In [757]:
# station_df = pl.DataFrame(station_json["data"]).explode("available_bikes", "available_stands", "status", "ts")
station_df = pl.json_normalize(station_json["features"], max_level=1)
station_df.head()

type,properties.time,properties.gid,properties.ident,properties.nom,properties.etat,properties.nbplaces,properties.nbvelos
str,str,i64,i64,str,str,i64,i64
"""Feature""","""2024-09-21T02:00:00+02:00""",1,1,"""Meriadeck""","""CONNECTEE""",33,10
"""Feature""","""2024-09-21T02:05:00+02:00""",1,1,"""Meriadeck""","""CONNECTEE""",33,10
"""Feature""","""2024-09-21T02:10:00+02:00""",1,1,"""Meriadeck""","""CONNECTEE""",33,10
"""Feature""","""2024-09-21T02:15:00+02:00""",1,1,"""Meriadeck""","""CONNECTEE""",33,10
"""Feature""","""2024-09-21T02:20:00+02:00""",1,1,"""Meriadeck""","""CONNECTEE""",33,10


In [751]:
from polars.testing import assert_frame_equal

assert_frame_equal(station_df, pl.from_pandas(station_df_pandas))

AssertionError: columns ['type', 'properties.time', 'properties.gid', 'properties.ident', 'properties.nom', 'properties.etat', 'properties.nbplaces', 'properties.nbvelos'] in left DataFrame, but not in right

In [678]:
station_df_pandas = station_df_pandas.rename(
    columns={
        "properties.time": "date",
        "properties.ident": "station_id",
        "properties.nom": "name",
        "properties.etat": "status",
        "properties.nbplaces": "available_stands",
        "properties.nbvelos": "available_bikes",
    }
)

In [679]:
station_df_pandas.head()

Unnamed: 0,type,date,properties.gid,station_id,name,status,available_stands,available_bikes
0,Feature,2024-09-21T02:00:00+02:00,1,1,Meriadeck,CONNECTEE,33,10
1,Feature,2024-09-21T02:05:00+02:00,1,1,Meriadeck,CONNECTEE,33,10
2,Feature,2024-09-21T02:10:00+02:00,1,1,Meriadeck,CONNECTEE,33,10
3,Feature,2024-09-21T02:15:00+02:00,1,1,Meriadeck,CONNECTEE,33,10
4,Feature,2024-09-21T02:20:00+02:00,1,1,Meriadeck,CONNECTEE,33,10


In [758]:
station_df = station_df.rename(
    mapping={
        "properties.time": "date",
        "properties.ident": "station_id",
        "properties.nom": "name",
        "properties.etat": "status",
        "properties.nbplaces": "available_stands",
        "properties.nbvelos": "available_bikes",
    }
).drop("type", "properties.gid", "name")  # drop unused columns

In [759]:
station_df.head()

date,station_id,status,available_stands,available_bikes
str,i64,str,i64,i64
"""2024-09-21T02:00:00+02:00""",1,"""CONNECTEE""",33,10
"""2024-09-21T02:05:00+02:00""",1,"""CONNECTEE""",33,10
"""2024-09-21T02:10:00+02:00""",1,"""CONNECTEE""",33,10
"""2024-09-21T02:15:00+02:00""",1,"""CONNECTEE""",33,10
"""2024-09-21T02:20:00+02:00""",1,"""CONNECTEE""",33,10


In [699]:
status_dict = {"CONNECTEE": 1, "DECONNECTEE": 0}
station_df_pandas["status"] = station_df_pandas["status"].map(status_dict)

In [760]:
station_df = station_df.with_columns(status=pl.col("status").replace(status_dict).cast(pl.UInt8))

In [761]:
station_df = station_df.with_columns(station_id=pl.col("station_id").cast(pl.Int32()))
station_df = station_df.with_columns(
    date=pl.col("date").str.to_datetime(format="%Y-%m-%dT%H:%M:%S%z", time_zone="Europe/Paris")
)

station_df = station_df.unique(subset=["station_id", "date"])
station_df = station_df.sort(["station_id", "date"], descending=[False, False])

In [762]:
station_df.head()

date,station_id,status,available_stands,available_bikes
"datetime[μs, Europe/Paris]",i32,u8,i64,i64
2024-09-21 02:00:00 CEST,1,1,33,10
2024-09-21 02:05:00 CEST,1,1,33,10
2024-09-21 02:10:00 CEST,1,1,33,10
2024-09-21 02:15:00 CEST,1,1,33,10
2024-09-21 02:20:00 CEST,1,1,33,10


In [763]:
# Create features
station_df = get_transactions_in(station_df)
station_df = get_transactions_out(station_df)
station_df = get_transactions_all(station_df)

In [764]:
station_df_resample = station_df.group_by_dynamic("date", group_by="station_id", every="10m", label="right").agg(
    pl.col("available_stands").last(),
    pl.col("available_bikes").last(),
    pl.col("status").max(),  # Empeche les micro déconnection à la station
    pl.col("transactions_in").sum(),
    pl.col("transactions_out").sum(),
    pl.col("transactions_all").sum(),
)  # .sort(["date", "station_id"]).upsample("date", every="10m", group_by="station_id")
# station_df_resample = station_df_resample.sort(["station_id", "date"],
#                                                descending=[False, False]).upsample("date", every="10m", group_by="station_id")
# station_df_resample = station_df_resample.with_columns(station_id=pl.col.station_id.forward_fill())
# station_df_resample = station_df_resample.with_columns(
#     transactions_in=pl.col.transactions_in.fill_null(0),
#     transactions_out=pl.col.transactions_out.fill_null(0),
#     transactions_all=pl.col.transactions_all.fill_null(0),
# )

# station_df_resample = station_df_resample.select("station_id", "date", "available_stands", "available_bikes", "status",
#          "transactions_in", "transactions_out", "transactions_all")

In [768]:
station_df_pandas.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
1265,1,2024-09-29 21:00:00+02:00,39,4,1,0,1,1
1266,1,2024-09-29 21:10:00+02:00,39,4,1,0,0,0
1267,1,2024-09-29 21:20:00+02:00,39,4,1,0,0,0
1268,1,2024-09-29 21:30:00+02:00,39,4,1,0,0,0
1269,1,2024-09-29 21:40:00+02:00,39,4,1,0,0,0


In [770]:
station_df_resample.tail()

station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
i32,"datetime[μs, Europe/Paris]",i64,i64,u8,i64,i64,i64
1,2024-09-29 21:00:00 CEST,39,4,1,0,1,1
1,2024-09-29 21:10:00 CEST,39,4,1,0,0,0
1,2024-09-29 21:20:00 CEST,39,4,1,0,0,0
1,2024-09-29 21:30:00 CEST,39,4,1,0,0,0
1,2024-09-29 21:40:00 CEST,39,4,1,0,0,0


In [772]:
assert_frame_equal(station_df_resample, pl.from_pandas(station_df_pandas), check_dtypes=False)

In [16]:
station_id = 106
start_date = "2023-07-03"
stop_date = "2023-08-03"

station_json = get_data_from_api_bdx_by_station(station_id=station_id, start_date=start_date, stop_date=stop_date)

station_df = transform_json_api_bdx_station_data_to_df(station_json)

In [17]:
station_df.groupby("station_id", as_index=False)["date"].min()

Unnamed: 0,station_id,date
0,106,2023-07-03 02:10:00+02:00


In [18]:
assert station_df["station_id"].nunique() == 1  # noqa: S101
assert station_df["station_id"].unique() == [106]  # noqa: S101

### Test de rapidité API Open Data VS Oslandia

In [835]:
from vcub_keeper.production.data import (
    get_data_from_api_bdx_by_station,
    get_data_from_api_by_station,
    transform_json_api_bdx_station_data_to_df,
    transform_json_station_data_to_df,
)


def test_api_bdx(station_id=station_id, start_date=start_date, stop_date=stop_date):
    """ """
    station_json = get_data_from_api_bdx_by_station(station_id=station_id, start_date=start_date, stop_date=stop_date)

    station_df = transform_json_api_bdx_station_data_to_df(station_json)
    return station_df


def test_api_oslandia(station_id=station_id, start_date=start_date, stop_date=stop_date):
    """ """
    station_json = get_data_from_api_by_station(station_id=station_id, start_date=start_date, stop_date=stop_date)

    station_df = transform_json_station_data_to_df(station_json)
    return station_df

#### Test de perf data pour une station sur une semaine

In [11]:
station_id = 2
start_date = "2020-10-14"
stop_date = "2020-10-17"

In [12]:
# 1.16 s ± 74.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit test_api_bdx(station_id=station_id, start_date=start_date, stop_date=stop_date)

1.53 s ± 365 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# TO do en attendant que le serveur Oslandia soit UP
%timeit test_api_oslandia(station_id=station_id, start_date=start_date, stop_date=stop_date)

1.08 s ± 3.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Test de perf data pour plusieurs stations sur une semaine

In [14]:
station_id = [
    124,
    15,
    60,
    18,
    10,
    68,
    130,
    2,
    105,
    120,
    11,
    110,
    9,
    23,
    3,
    16,
    20,
    136,
    42,
    21,
    131,
    59,
    45,
    172,
    6,
    24,
    36,
    108,
    19,
    125,
    135,
    37,
    139,
    99,
    28,
    57,
    7,
    98,
    8,
    41,
    40,
    58,
    55,
    109,
    1,
    4,
    100,
    134,
    174,
    101,
    56,
    104,
    43,
    54,
    44,
    102,
    133,
    103,
    5,
    127,
    65,
    22,
    123,
    39,
    106,
]
station_id = [19, 105, 102]
start_date = "2024-05-01"
stop_date = "2024-09-01"

In [15]:
# 40.1 s ± 3.39 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit test_api_bdx(station_id=station_id, start_date=start_date, stop_date=stop_date)

40.1 s ± 3.39 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# 1.77 s ± 2.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit test_api_oslandia(station_id=station_id, start_date=start_date, stop_date=stop_date)

1.77 s ± 2.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


L'API d'Oslandia est plus rapide que celle de Bordeaux Métropole pour les requêtes de données sur une semaine pour un groupe de stations.