In [12]:
import json

import pandas as pd

from vcub_keeper.config import ROOT_TESTS_DATA
from vcub_keeper.reader.reader import read_activity_vcub

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Permet d'extraire les données non simulées afin de faire les tests de benchmark pour les fonctions suivantes :
- `get_transactions_out()`
- `get_transactions_in()`

In [13]:
# Load the station activity data (reader called with its default arguments)
activite = read_activity_vcub()

In [14]:
# Display the loaded frame
activite

gid,station_id,type,name,state,available_stands,available_bikes,date
u8,u8,cat,str,str,u8,u8,datetime[μs]
83,1,"""VLS""","""Meriadeck""","""1""",18,2,2017-07-09 00:03:04
83,1,"""VLS""","""Meriadeck""","""1""",18,2,2017-07-09 00:04:04
83,1,"""VLS""","""Meriadeck""","""1""",18,2,2017-07-09 00:09:04
83,1,"""VLS""","""Meriadeck""","""1""",18,2,2017-07-09 00:14:03
83,1,"""VLS""","""Meriadeck""","""1""",18,2,2017-07-09 00:19:04
…,…,…,…,…,…,…,…
176,174,"""VLS""","""Darwin""","""1""",1,19,2017-09-26 14:39:02
176,174,"""VLS""","""Darwin""","""1""",0,20,2017-09-26 14:44:05
176,174,"""VLS""","""Darwin""","""1""",0,20,2017-09-26 14:49:05
176,174,"""VLS""","""Darwin""","""1""",0,20,2017-09-26 14:54:04


In [15]:
# Column names and dtypes of the loaded frame
activite.collect_schema()

Schema([('gid', UInt8),
        ('station_id', UInt8),
        ('type', Categorical(ordering='physical')),
        ('name', String),
        ('state', String),
        ('available_stands', UInt8),
        ('available_bikes', UInt8),
        ('date', Datetime(time_unit='us', time_zone=None))])

In [None]:
# The rest of this notebook is still pandas code (work in progress)

# TODO
# `state` is a String column with Polars, whereas it was a categorical with pandas

In [48]:
# Scope of the test fixture: five stations over a short date window.
# Both bounds are inclusive; the end bound is a bare date, so it is compared as
# 2017-07-14 00:00:00.
start_date, end_date = "2017-07-10", "2017-07-14"
list_station_id = [106, 22, 43, 102, 123]

# Build each condition separately, then keep the rows matching both
station_mask = activite["station_id"].isin(list_station_id)
period_mask = activite["date"].between(start_date, end_date)
export = activite.loc[station_mask & period_mask].copy()
export.shape

(5630, 8)

In [49]:
# Check which stations are present in the subset
export.station_id.unique()

array([ 22,  43, 102, 106, 123], dtype=uint8)

In [50]:
# Preview the first rows of the subset
export.head()

Unnamed: 0,gid,station_id,type,name,state,available_stands,available_bikes,date
454307,92,22,VLS,Hotel de Ville,1,31,2,2017-07-10 00:04:04
454308,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:09:04
454309,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:14:05
454310,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:19:04
454311,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:24:05


In [51]:
# Column dtypes of the subset, before any column is converted
export.dtypes

gid                          uint8
station_id                   uint8
type                      category
name                string[python]
state                     category
available_stands             uint8
available_bikes              uint8
date                datetime64[ns]
dtype: object

In [52]:
# Some columns have to be changed to simulate the original (raw) data


# Replace the coded `state` values with text labels.
# NOTE(review): the keys are ints — if `state` holds strings (as in the Polars schema),
# nothing matches and the whole column becomes missing values. Confirm the dtype first.
# NOTE(review): not idempotent — running this cell twice maps the labels again and loses them.
state_dict = {1: "CONNECTEE", 0: "DECONNECTEE"}
export["state"] = export["state"].map(state_dict)

# Rename to "ident" / "ts" — presumably the column names of the raw files; verify
export = export.rename(columns={"station_id": "ident", "date": "ts"})

### Export des données de test

In [53]:
# Write the fixture into the tests data directory, without the index column
export.to_csv(ROOT_TESTS_DATA + "activite_data.csv", index=False)

In [54]:
# Read the fixture back through the project reader to check the round trip
result_read_df = read_activity_vcub(file_path=ROOT_TESTS_DATA + "activite_data.csv")

In [55]:
# Reference subset: the same station/date filter as `export`, taken again from `activite`
# so it is not affected by the renaming and state mapping applied to `export`.
export_test = activite.loc[
    activite["station_id"].isin(list_station_id) & activite["date"].between(start_date, end_date)
].copy()

In [56]:
# The fixture read back from CSV must hold the same values as the reference subset.
# Both indexes are reset so rows are compared by position; dtype and categorical checks
# are disabled, so differences in column dtypes alone do not fail the comparison.
pd.testing.assert_frame_equal(
    result_read_df.reset_index(drop=True),
    export_test.reset_index(drop=True),
    check_categorical=False,
    check_dtype=False,
)

In [57]:
# Visual check: first rows of the frame read back from the CSV fixture
result_read_df.reset_index(drop=True).head()

Unnamed: 0,gid,station_id,type,name,state,available_stands,available_bikes,date
0,92,22,VLS,Hotel de Ville,1,31,2,2017-07-10 00:04:04
1,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:09:04
2,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:14:05
3,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:19:04
4,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:24:05


In [58]:
# Visual check: first rows of the reference subset, to compare with the frame read back
export_test.reset_index(drop=True).head()

Unnamed: 0,gid,station_id,type,name,state,available_stands,available_bikes,date
0,92,22,VLS,Hotel de Ville,1,31,2,2017-07-10 00:04:04
1,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:09:04
2,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:14:05
3,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:19:04
4,92,22,VLS,Hotel de Ville,1,33,0,2017-07-10 00:24:05


# Amélioration de la performance de la fonction `transform_json_api_bdx_station_data_to_df()`

cf : https://github.com/armgilles/vcub_keeper/issues/103

In [1]:
import pandas as pd

from vcub_keeper.config import ROOT_TESTS_DATA

In [2]:
# Fonction de lecture des données de tests
def read_json_data(file_name="data_test_api_from_bdx.json"):
    """
    Load a JSON test fixture from the tests data directory.

    From notebooks/04_tests/03_test_data_activite.ipynb

    Parameters
    ----------
    file_name : str
        Name of the JSON file; it is appended to ``ROOT_TESTS_DATA`` to build the path.

    Returns
    -------
    The Python object decoded from the file by ``json.load``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale encoding,
    # so non-ASCII characters are read the same way on every OS.
    with open(ROOT_TESTS_DATA + file_name, encoding="utf-8") as f:
        return json.load(f)


# Load the default fixture once; it is the input of the benchmarked transform
station_json_loaded = read_json_data()

In [95]:
# Benchmark the JSON -> DataFrame transform: 50 repeats; `-o` returns the TimeitResult.
# NOTE(review): transform_json_api_bdx_station_data_to_df is not imported in any cell shown
# here — import it before running on a fresh kernel.
%timeit -o -r 50 station_df_from_json = transform_json_api_bdx_station_data_to_df(station_json_loaded)

23.9 ms ± 97.3 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)


<TimeitResult : 23.9 ms ± 97.3 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)>

In [96]:
# Fastest run in milliseconds, read from the TimeitResult that the previous cell left in `_`
# (kept commented out: `_` only holds that result right after the benchmark cell has run)
# min(_.timings) * 1_000

23.862433433532715

In [None]:
# Original : 25.5 ms ± 433 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)
# improve rename cols : 24.6 ms ± 994 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)
# delete a reset_index() : 24.2 ms ± 826 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)
# convert datetime utc=true by default : 23.9 ms ± 97.3 μs per loop (mean ± std. dev. of 50 runs, 10 loops each)

In [84]:
# Check how timestamps are parsed across summer time and winter time:
# ISO-8601 strings carrying either a +01:00 or a +02:00 UTC offset.
dates = [
    "2023-03-10T01:00:00+01:00",  # winter time
    "2023-06-10T01:00:00+02:00",  # summer time
    "2023-12-10T01:00:00+01:00",  # winter time
    "2023-09-10T01:00:00+02:00",  # summer time
]

# Single-column frame holding the raw, still unparsed strings
station_df = pd.DataFrame(dates, columns=["date"])

In [85]:
# Raw strings, before any parsing
station_df

Unnamed: 0,date
0,2023-03-10T01:00:00+01:00
1,2023-06-10T01:00:00+02:00
2,2023-12-10T01:00:00+01:00
3,2023-09-10T01:00:00+02:00


In [86]:
# dtype of the column before parsing
station_df.dtypes

date    object
dtype: object

In [87]:
# Parse the strings; utc=True converts the mixed +01:00 / +02:00 offsets to one UTC column
station_df["date"] = pd.to_datetime(station_df["date"], utc=True)

In [88]:
# Every timestamp is now expressed in UTC
station_df

Unnamed: 0,date
0,2023-03-10 00:00:00+00:00
1,2023-06-09 23:00:00+00:00
2,2023-12-10 00:00:00+00:00
3,2023-09-09 23:00:00+00:00


In [89]:
# Convert the UTC timestamps to Europe/Paris local time
station_df["date"] = station_df["date"].dt.tz_convert("Europe/Paris")

In [90]:
# Paris local time: the +01:00 / +02:00 offsets of the input strings are back
station_df

Unnamed: 0,date
0,2023-03-10 01:00:00+01:00
1,2023-06-10 01:00:00+02:00
2,2023-12-10 01:00:00+01:00
3,2023-09-10 01:00:00+02:00
