In [1]:
import pandas as pd
import json

from vcub_keeper.reader.reader import read_activity_vcub
from vcub_keeper.production.data import transform_json_api_bdx_station_data_to_df


from vcub_keeper.config import ROOT_TESTS_DATA

%load_ext autoreload
%autoreload 2

Permet d'extraire les données non simulées afin de faire les tests de benchmark pour les fonctions suivantes :
- `get_transactions_out()`
- `get_transactions_in()`

In [47]:
# Load the station activity history (reader called with its default arguments)
activite = read_activity_vcub()

In [48]:
# Stations and date window kept for the benchmark fixture
list_station_id = [106, 22, 43, 102, 123]
start_date = "2017-07-10"
end_date = "2017-07-14"

# Both bounds are inclusive. They are date-only strings compared against the
# `date` column, so the upper bound sits at the very start of `end_date`.
export = activite.loc[
    activite["station_id"].isin(list_station_id) & activite["date"].between(start_date, end_date)
].copy()
export.shape

(5630, 8)

In [49]:
export.station_id.unique()

array([ 22,  43, 102, 106, 123], dtype=uint8)

In [50]:
export.head()

Unnamed: 0,gid,station_id,type,name,state,available_stands,available_bikes,date
454307,92,22,VLS,Hotel de Ville,1,31,2,2017-07-10 00:04:04
454308,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:09:04
454309,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:14:05
454310,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:19:04
454311,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:24:05


In [51]:
export.dtypes

gid                          uint8
station_id                   uint8
type                      category
name                string[python]
state                     category
available_stands             uint8
available_bikes              uint8
date                datetime64[ns]
dtype: object

In [52]:
# Put the frame back into the layout of the original data: textual state
# labels and the `ident` / `ts` column names.
# Run once per fresh `export`: the keys of `state_dict` are the numeric codes,
# so mapping an already-converted column would not find any match.
state_dict = {1: "CONNECTEE", 0: "DECONNECTEE"}
export = export.assign(state=export["state"].map(state_dict)).rename(
    columns={"station_id": "ident", "date": "ts"}
)

### Export des données de test

In [53]:
# Write the test fixture; index=False keeps the pandas index out of the file.
export.to_csv(ROOT_TESTS_DATA + "activite_data.csv", index=False)

In [54]:
# Read the freshly written fixture back through the project reader (round-trip check).
result_read_df = read_activity_vcub(file_path=ROOT_TESTS_DATA + "activite_data.csv")

In [55]:
# Reference frame: the same station / date selection, taken straight from the
# in-memory activity data (both date bounds inclusive).
export_test = activite.loc[
    activite["station_id"].isin(list_station_id) & activite["date"].between(start_date, end_date)
].copy()

In [56]:
# The frame read back from the CSV must match the in-memory selection.
# Dtype and categorical checks are relaxed; the index is reset on both sides
# so only positions are compared.
pd.testing.assert_frame_equal(
    left=result_read_df.reset_index(drop=True),
    right=export_test.reset_index(drop=True),
    check_dtype=False,
    check_categorical=False,
)

In [57]:
result_read_df.reset_index(drop=True).head()

Unnamed: 0,gid,station_id,type,name,state,available_stands,available_bikes,date
0,92,22,VLS,Hotel de Ville,1,31,2,2017-07-10 00:04:04
1,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:09:04
2,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:14:05
3,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:19:04
4,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:24:05


In [58]:
export_test.reset_index(drop=True).head()

Unnamed: 0,gid,station_id,type,name,state,available_stands,available_bikes,date
0,92,22,VLS,Hotel de Ville,1,31,2,2017-07-10 00:04:04
1,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:09:04
2,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:14:05
3,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:19:04
4,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:24:05


# Amélioration de la performance de la fonction `transform_json_api_bdx_station_data_to_df()`

cf : https://github.com/armgilles/vcub_keeper/issues/103

In [1]:
import json

import pandas as pd

from vcub_keeper.config import ROOT_TESTS_DATA
from vcub_keeper.transform.features_factory import get_transactions_all, get_transactions_in, get_transactions_out


In [2]:
# Fonction de lecture des données de tests
def read_json_data(file_name="data_test_api_from_bdx.json"):
    """
    Load a JSON test fixture from the tests data directory.

    Adapted from notebooks/04_tests/03_test_data_activite.ipynb.

    Parameters
    ----------
    file_name : str
        Name of the JSON file; it is appended to ROOT_TESTS_DATA.

    Returns
    -------
    station_json_loaded : dict or list
        The decoded JSON document.
    """

    # Be explicit about the encoding: without it, open() falls back to the
    # platform default, which is not UTF-8 everywhere.
    with open(ROOT_TESTS_DATA + file_name, encoding="utf-8") as f:
        station_json_loaded = json.load(f)
    return station_json_loaded


# Load the default fixture once; it is the input of the benchmark below.
station_json_loaded = read_json_data()

In [58]:
# Original function, kept unchanged: it is the baseline of the timings recorded in this notebook
def transform_json_api_bdx_station_data_to_df(station_json):
    """
    Turn the activity time series of one or several stations into a DataFrame.

    The input is the JSON returned by get_data_from_api_bdx_by_station().
    Applies several transformations, similar to create_activity_time_series()
    in create/creator.py:
        - naming of the JSON columns
        - structuring
        - renaming
        - feature creation
        - resampling on 10 min

    Parameters
    ----------
    station_json : json
        Activity time series of one (or several) station(s), as JSON.
    Returns
    -------
    station_df_resample : DataFrame
        Activity time series of one or several stations as a DataFrame,
        resampled on 10 min.

    Examples
    --------

    station_df = transform_json_api_bdx_station_data_to_df(station_json)

    """

    # One row per entry of the "features" list; nested keys become "properties.<key>" columns
    station_df = pd.json_normalize(station_json, record_path=["features"])

    # Naming from JSON DataFrame (strip the "properties." prefix)
    station_df.rename(columns={"properties.time": "time"}, inplace=True)
    station_df.rename(columns={"properties.ident": "ident"}, inplace=True)
    station_df.rename(columns={"properties.nom": "nom"}, inplace=True)
    station_df.rename(columns={"properties.etat": "etat"}, inplace=True)
    station_df.rename(columns={"properties.nbplaces": "nbplaces"}, inplace=True)
    station_df.rename(columns={"properties.nbvelos": "nbvelos"}, inplace=True)

    # naming api Bdx to vanilla api (get_data_from_api_by_station) from DataFrame
    # Naming
    station_df.rename(columns={"time": "date"}, inplace=True)
    station_df.rename(columns={"ident": "station_id"}, inplace=True)
    station_df.rename(columns={"nom": "name"}, inplace=True)
    station_df.rename(columns={"etat": "status"}, inplace=True)
    station_df.rename(columns={"nbvelos": "available_bikes"}, inplace=True)
    station_df.rename(columns={"nbplaces": "available_stands"}, inplace=True)

    # Status mapping: labels missing from the dict become NaN, then 0
    status_dict = {"CONNECTEE": 1, "DECONNECTEE": 0}
    station_df["status"] = station_df["status"].map(status_dict).fillna(0)
    station_df["status"] = station_df["status"].astype("uint8")

    # Casting & sorting DataFrame on station_id & date
    # NOTE(review): this first conversion is repeated by the try block just below.
    station_df["date"] = pd.to_datetime(station_df["date"])
    try:
        station_df["date"] = pd.to_datetime(station_df["date"])
    except:  # Daylight-saving time change, see https://github.com/armgilles/vcub_watcher/issues/44  # noqa: E722
        station_df["date"] = pd.to_datetime(station_df["date"], utc=True)
    # Localize to Europe/Paris first; if that raises, fall back to converting the time zone.
    # NOTE(review): both handlers are bare `except:` clauses and catch every exception.
    try:
        station_df["date"] = station_df["date"].dt.tz_localize("Europe/Paris")
    except:  # try to convert TZ  # noqa: E722
        station_df["date"] = station_df["date"].dt.tz_convert("Europe/Paris")

    station_df["station_id"] = station_df["station_id"].astype(int)
    station_df = station_df.sort_values(["station_id", "date"], ascending=[1, 1])

    # Reset index
    station_df = station_df.reset_index(drop=True)

    # Drop duplicated station_id / date rows
    station_df = station_df.drop_duplicates(subset=["station_id", "date"]).reset_index(drop=True)

    # Create features (project helpers; they provide the transactions_* columns aggregated below)
    station_df = get_transactions_in(station_df)
    station_df = get_transactions_out(station_df)
    station_df = get_transactions_all(station_df)

    ## Resampling

    # # cf pandas bug: https://github.com/pandas-dev/pandas/issues/33548
    station_df = station_df.set_index("date")

    # Per station, 10-minute windows labelled with their right edge
    station_df_resample = (
        station_df.groupby("station_id")
        .resample(
            "10min",
            label="right",
        )
        .agg(
            {
                "available_stands": "last",
                "available_bikes": "last",
                "status": "max",  # "max" hides micro-disconnections of the station within a window
                "transactions_in": "sum",
                "transactions_out": "sum",
                "transactions_all": "sum",
            }
        )
        .reset_index()
    )
    return station_df_resample

In [97]:
# New function, optimized & cleaned up
def transform_json_api_bdx_station_data_to_df(station_json):
    """
    Turn the activity time series of one or several stations into a DataFrame.

    The input is the JSON returned by get_data_from_api_bdx_by_station().
    Applies several transformations, similar to create_activity_time_series()
    in create/creator.py:
        - flattening and renaming of the JSON columns
        - status encoding (1 for "CONNECTEE", 0 otherwise)
        - dates expressed in the Europe/Paris time zone
        - de-duplication on (station_id, date)
        - transaction features (get_transactions_in / _out / _all)
        - resampling on 10 min, per station

    Parameters
    ----------
    station_json : json
        Activity time series of one (or several) station(s), as JSON with a
        top-level "features" list.
    Returns
    -------
    station_df_resample : DataFrame
        Activity time series of one or several stations as a DataFrame,
        resampled on 10 min (windows labelled with their right edge).

    Examples
    --------

    station_df = transform_json_api_bdx_station_data_to_df(station_json)

    """

    # One row per entry of "features"; nested keys come out as "properties.<key>"
    # and are renamed to the project's column names in a single pass.
    station_df = pd.json_normalize(station_json, record_path=["features"]).rename(
        columns={
            "properties.time": "date",
            "properties.ident": "station_id",
            "properties.nom": "name",
            "properties.etat": "status",
            "properties.nbplaces": "available_stands",
            "properties.nbvelos": "available_bikes",
        }
    )

    # Status mapping: labels missing from the dict become NaN, then 0
    status_dict = {"CONNECTEE": 1, "DECONNECTEE": 0}
    station_df["status"] = station_df["status"].map(status_dict).fillna(0).astype("uint8")

    # Parsing with utc=True copes with mixed winter/summer offsets and always
    # yields a tz-aware UTC column, so tz_localize() could only ever raise here:
    # convert straight to Europe/Paris instead of going through try/except.
    # NOTE(review): with utc=True, timestamps without an offset are taken as UTC.
    station_df["date"] = pd.to_datetime(station_df["date"], utc=True).dt.tz_convert("Europe/Paris")

    station_df["station_id"] = station_df["station_id"].astype(int)
    station_df = station_df.sort_values(["station_id", "date"])

    # Drop duplicated station_id / date rows
    station_df = station_df.drop_duplicates(subset=["station_id", "date"]).reset_index(drop=True)

    # Create features (project helpers; they provide the transactions_* columns aggregated below)
    station_df = get_transactions_in(station_df)
    station_df = get_transactions_out(station_df)
    station_df = get_transactions_all(station_df)

    ## Resampling

    # cf pandas bug: https://github.com/pandas-dev/pandas/issues/33548
    station_df = station_df.set_index("date")

    station_df_resample = (
        station_df.groupby("station_id")
        .resample(
            "10min",
            label="right",
        )
        .agg(
            {
                "available_stands": "last",
                "available_bikes": "last",
                "status": "max",  # "max" hides micro-disconnections of the station within a window
                "transactions_in": "sum",
                "transactions_out": "sum",
                "transactions_all": "sum",
            }
        )
        .reset_index()
    )
    return station_df_resample

In [95]:
%timeit -o -r 50 station_df_from_json = transform_json_api_bdx_station_data_to_df(station_json_loaded)

23.9 ms ± 97.3 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)


<TimeitResult : 23.9 ms ± 97.3 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)>

In [96]:
# `_` is the output of the previous cell (the TimeitResult returned by `%timeit -o`),
# so this cell only works when run right after it.
# Best run, converted from seconds to milliseconds.
min(_.timings) * 1_000

23.862433433532715

In [None]:
# Original : 25.5 ms ± 433 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)
# improve rename cols : 24.6 ms ± 994 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)
# delete a reset_index() : 24.2 ms ± 826 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)
# convert datetime utc=true by default : 23.9 ms ± 97.3 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)

In [84]:
# Test of how winter-time (+01:00) and summer-time (+02:00) timestamps are read
dates = [
    "2023-03-10T01:00:00+01:00",  # winter time
    "2023-06-10T01:00:00+02:00",  # summer time
    "2023-12-10T01:00:00+01:00",  # winter time
    "2023-09-10T01:00:00+02:00",  # summer time
]

# Build the DataFrame; the column deliberately holds the raw, unparsed strings
station_df = pd.DataFrame({"date": dates})

In [85]:
station_df

Unnamed: 0,date
0,2023-03-10T01:00:00+01:00
1,2023-06-10T01:00:00+02:00
2,2023-12-10T01:00:00+01:00
3,2023-09-10T01:00:00+02:00


In [86]:
station_df.dtypes

date    object
dtype: object

In [87]:
# Parse the offset-carrying strings into a single tz-aware column normalised to UTC
station_df["date"] = pd.to_datetime(station_df["date"], utc=True)

In [88]:
station_df

Unnamed: 0,date
0,2023-03-10 00:00:00+00:00
1,2023-06-09 23:00:00+00:00
2,2023-12-10 00:00:00+00:00
3,2023-09-09 23:00:00+00:00


In [89]:
# Express the UTC instants in Europe/Paris local time (the column is already tz-aware)
station_df["date"] = station_df["date"].dt.tz_convert("Europe/Paris")

In [90]:
station_df

Unnamed: 0,date
0,2023-03-10 01:00:00+01:00
1,2023-06-10 01:00:00+02:00
2,2023-12-10 01:00:00+01:00
3,2023-09-10 01:00:00+02:00
