In [34]:
import pandas as pd
import requests

from vcub_keeper.config import *
from vcub_keeper.reader.reader import *
from vcub_keeper.reader.reader_utils import filter_periode
from vcub_keeper.visualisation import *
from vcub_keeper.transform.features_factory import *

from sklearn.manifold import Isomap
from scipy import stats
from joblib import dump, load

%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Objectifs

- Obtenir les données depuis l'api de Damien
- Structuré ces données pour les utiliser avec le pipeline de prédiction

# Get data from API

## API Oslandia

### Dev

In [30]:
station_id=25
#station_id="25,102"
start_date='2021-10-14'
stop_date='2021-10-20'

#url = "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/106?start=2020-10-09&stop=2020-10-17"

url = \
    "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/"+ str(station_id) +\
    "?start=" + start_date + "&stop=" + stop_date
    
response = requests.get(url)

In [31]:
url

'http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/25?start=2021-10-14&stop=2021-10-20'

In [32]:
response.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [33]:
station_df = pd.DataFrame(response.json()['data'][0])

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [5]:
station_df.id.unique()

array(['25'], dtype=object)

In [6]:
# Status mapping
status_dict = {'open' : 1,
               'closed' : 0
              }
station_df['status'] = station_df['status'].map(status_dict)
station_df['status'] = station_df['status'].astype('uint8')

# Naming
station_df.rename(columns={'id':'station_id'}, inplace=True)
station_df.rename(columns={'ts':'date'}, inplace=True)

# Casting date & sorting DataFrame on station_id & date
station_df['date'] = pd.to_datetime(station_df['date'])
station_df = station_df.sort_values(['station_id', 'date'], ascending=[1, 1])

# Reset index
station_df = station_df.reset_index(drop=True)

# Dropduplicate station_id / date rows
station_df = station_df.drop_duplicates(subset=['station_id', 'date']).reset_index(drop=True)

# Create features
station_df = get_transactions_in(station_df)
station_df = get_transactions_out(station_df)
station_df = get_transactions_all(station_df)

In [7]:
station_df.tail(10)

Unnamed: 0,available_bikes,available_stands,station_id,name,nb_stands,status,date,transactions_in,transactions_out,transactions_all
1619,0,0,25,François de Sourdis,14,0,2020-10-19 23:13:13,0.0,0.0,0.0
1620,0,0,25,François de Sourdis,14,0,2020-10-19 23:18:13,0.0,0.0,0.0
1621,0,0,25,François de Sourdis,14,0,2020-10-19 23:24:13,0.0,0.0,0.0
1622,0,0,25,François de Sourdis,14,0,2020-10-19 23:29:13,0.0,0.0,0.0
1623,0,0,25,François de Sourdis,14,0,2020-10-19 23:33:13,0.0,0.0,0.0
1624,0,0,25,François de Sourdis,14,0,2020-10-19 23:37:13,0.0,0.0,0.0
1625,0,0,25,François de Sourdis,14,0,2020-10-19 23:42:13,0.0,0.0,0.0
1626,0,0,25,François de Sourdis,14,0,2020-10-19 23:47:13,0.0,0.0,0.0
1627,0,0,25,François de Sourdis,14,0,2020-10-19 23:53:13,0.0,0.0,0.0
1628,0,0,25,François de Sourdis,14,0,2020-10-19 23:57:13,0.0,0.0,0.0


### Industrialisation

In [8]:
from vcub_keeper.config import *
from vcub_keeper.reader.reader import *
from vcub_keeper.config import THRESHOLD_PROFILE_STATION

#THRESHOLD_PROFILE_STATION = 0.3

station_profile = read_station_profile(path_directory=ROOT_DATA_REF)

stations_id_to_pred = \
    station_profile[station_profile['mean'] >= THRESHOLD_PROFILE_STATION]['station_id'].unique()

In [9]:
type(stations_id_to_pred)

numpy.ndarray

In [10]:
','.join(map(str, stations_id_to_pred))

'124,15,60,18,10,68,130,2,105,120,11,110,9,23,3,16,20,136,42,21,131,59,45,172,6,24,36,108,19,125,135,37,139,99,28,57,7,98,8,41,40,58,55,109,1,4,100,134,174,101,56,104,43,54,44,102,133,103,5,127,65,22,123,39,106'

In [11]:
def get_data_from_api_by_station(station_id, start_date, stop_date):
    """
    Permet d'obtenir les données d'activité d'une station via une API
    
    Parameters
    ----------
    station_id : Int or List
        Numéro de la station de Vcub
    start_date : str
        Date de début de la Time Serie
    stop_date : str
        Date de fin de la Time Serie
    
    Returns
    -------
    Time serie in Json format
        
    Examples
    --------
    
    station_json = get_data_from_api_by_station(station_id=19, 
                                                start_date='2020-10-14',
                                                stop_date='2020-10-17')
    """
    
    if isinstance(station_id, (list, np.ndarray)):
        station_id = ','.join(map(str, station_id))
    
    url = \
        "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/"+ str(station_id) +\
        "?start=" + start_date + "&stop=" + stop_date
    
    response = requests.get(url)
    return response.json()

def transform_json_station_data_to_df(station_json):
    """
    Tranforme la Time Serie d'activité d'une ou plusieurs station en DataFrame
    Effectue plusieurs transformation comme la fonction create/creator.py 
    create_activity_time_series()
        - Structuration
        - Naming
        - Ajout de variables
        - Resampling sur 10min
    
    Parameters
    ----------
    station_json : json
        Time serie au format json de l'activité d'une station (ou plusieurs)
    Returns
    -------
    station_df_resample : DataFrame
        Time serie au format DataFrame de l'activité d'une ou plusieurs station
        resampler sur 10 min.
        
    Examples
    --------
    
    station_df = transform_json_station_data_to_df(station_json)
    
    """
    
    # Si il y a plusieurs stations dans le json
    if len(station_json['data']) > 1:
        station_df = pd.DataFrame()
        for i in range(0, len(station_json['data'])):
            temp_station_df = pd.DataFrame(station_json['data'][i])
            station_df = pd.concat([station_df, temp_station_df])
    # Il y une seule station dans le json
    else:        
        station_df = pd.DataFrame(station_json['data'][0])
    
    # Status mapping
    status_dict = {'open' : 1,
                   'closed' : 0
                  }
    station_df['status'] = station_df['status'].map(status_dict)
    station_df['status'] = station_df['status'].astype('uint8')

    # Naming
    station_df.rename(columns={'id':'station_id'}, inplace=True)
    station_df.rename(columns={'ts':'date'}, inplace=True)

    # Casting & sorting DataFrame on station_id & date
    station_df['date'] = pd.to_datetime(station_df['date'])
    station_df['station_id'] = station_df['station_id'].astype(int)
    station_df = station_df.sort_values(['station_id', 'date'], ascending=[1, 1])

    # Reset index
    station_df = station_df.reset_index(drop=True)

    # Dropduplicate station_id / date rows
    station_df = station_df.drop_duplicates(subset=['station_id', 'date']).reset_index(drop=True)

    # Create features
    station_df = get_transactions_in(station_df)
    station_df = get_transactions_out(station_df)
    station_df = get_transactions_all(station_df)

    ## Resampling

    # cf Bug Pandas : https://github.com/pandas-dev/pandas/issues/33548
    station_df = station_df.set_index('date')

    station_df_resample = \
        station_df.groupby('station_id').resample('10T', 
                                                  label='right',
                                                 ).agg({'available_stands' : 'last',
                                                        'available_bikes' : 'last',
                                                        'status' : 'max', # Empeche les micro déconnection à la station
                                                        'transactions_in' : 'sum',
                                                        'transactions_out' : 'sum',
                                                        'transactions_all' : 'sum'}).reset_index()
    return station_df_resample

In [12]:
station_id='19'
#station_id=[19, 105, 102]
start_date='2020-10-14'
stop_date='2020-10-17'

station_json = get_data_from_api_by_station(station_id=station_id, 
                                            start_date=start_date,
                                            stop_date=stop_date)

station_df = transform_json_station_data_to_df(station_json)

In [13]:
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
427,19,2020-10-16 23:20:00,18.0,12.0,1.0,0.0,0.0,0.0
428,19,2020-10-16 23:30:00,18.0,12.0,1.0,0.0,0.0,0.0
429,19,2020-10-16 23:40:00,18.0,12.0,1.0,0.0,0.0,0.0
430,19,2020-10-16 23:50:00,18.0,12.0,1.0,0.0,0.0,0.0
431,19,2020-10-17 00:00:00,19.0,11.0,1.0,0.0,1.0,1.0


In [14]:
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
427,19,2020-10-16 23:20:00,18.0,12.0,1.0,0.0,0.0,0.0
428,19,2020-10-16 23:30:00,18.0,12.0,1.0,0.0,0.0,0.0
429,19,2020-10-16 23:40:00,18.0,12.0,1.0,0.0,0.0,0.0
430,19,2020-10-16 23:50:00,18.0,12.0,1.0,0.0,0.0,0.0
431,19,2020-10-17 00:00:00,19.0,11.0,1.0,0.0,1.0,1.0


In [15]:
station_df.dtypes

station_id                   int64
date                datetime64[ns]
available_stands           float64
available_bikes            float64
status                     float64
transactions_in            float64
transactions_out           float64
transactions_all           float64
dtype: object

### Script

In [15]:
from vcub_keeper.production.data import (get_data_from_api_by_station,
                                         transform_json_station_data_to_df)
from vcub_keeper.transform.features_factory import get_consecutive_no_transactions_out
from vcub_keeper.visualisation import *

In [2]:
station_id=37
#station_id=[19, 105, 102]
start_date='2022-02-01'
stop_date='2022-09'

In [3]:
station_json = get_data_from_api_by_station(station_id=station_id, 
                                            start_date=start_date,
                                            stop_date=stop_date)

station_df = transform_json_station_data_to_df(station_json)

In [10]:
test_df = pd.DataFrame(station_json['data'][0])
test_df.tail(25)

Unnamed: 0,available_bikes,available_stands,id,name,nb_stands,status,ts
2115,14,15,37,Jardin Public,30,open,2022-02-08T08:22:44
2116,14,15,37,Jardin Public,30,open,2022-02-08T08:27:44
2117,14,15,37,Jardin Public,30,open,2022-02-08T08:32:44
2118,14,15,37,Jardin Public,30,open,2022-02-08T08:37:44
2119,15,14,37,Jardin Public,30,open,2022-02-08T08:43:44
2120,14,15,37,Jardin Public,30,open,2022-02-08T08:48:44
2121,14,15,37,Jardin Public,30,open,2022-02-08T08:54:44
2122,15,14,37,Jardin Public,30,open,2022-02-08T08:56:44
2123,16,13,37,Jardin Public,30,open,2022-02-08T09:01:44
2124,15,14,37,Jardin Public,30,open,2022-02-08T09:07:44


In [5]:
station_df.tail(10)

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
1060,37,2022-02-08 08:50:00,15,14,1,1.0,1.0,2.0
1061,37,2022-02-08 09:00:00,14,15,1,1.0,0.0,1.0
1062,37,2022-02-08 09:10:00,14,15,1,1.0,1.0,2.0
1063,37,2022-02-08 09:20:00,12,17,1,2.0,0.0,2.0
1064,37,2022-02-08 09:30:00,10,19,1,2.0,0.0,2.0
1065,37,2022-02-08 09:40:00,10,19,1,0.0,0.0,0.0
1066,37,2022-02-08 09:50:00,11,18,1,0.0,1.0,1.0
1067,37,2022-02-08 10:00:00,11,18,1,0.0,0.0,0.0
1068,37,2022-02-08 10:10:00,11,18,1,0.0,0.0,0.0
1069,37,2022-02-08 10:20:00,12,17,1,0.0,1.0,1.0


In [20]:
station_df = get_consecutive_no_transactions_out(station_df) 

In [21]:
station_df

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
0,19,2020-10-14 00:10:00,20.0,10.0,1.0,0.0,0.0,0.0,0
1,19,2020-10-14 00:20:00,20.0,10.0,1.0,0.0,0.0,0.0,1
2,19,2020-10-14 00:30:00,20.0,10.0,1.0,0.0,0.0,0.0,2
3,19,2020-10-14 00:40:00,20.0,10.0,1.0,0.0,0.0,0.0,3
4,19,2020-10-14 00:50:00,21.0,9.0,1.0,0.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...
3451,105,2020-10-21 23:20:00,5.0,15.0,1.0,0.0,0.0,0.0,15
3452,105,2020-10-21 23:30:00,6.0,14.0,1.0,0.0,1.0,1.0,0
3453,105,2020-10-21 23:40:00,6.0,14.0,1.0,0.0,0.0,0.0,1
3454,105,2020-10-21 23:50:00,6.0,14.0,1.0,0.0,0.0,0.0,2


In [22]:
plot_station_activity(station_df, station_id=105, #105 #station_id
                      features_to_plot=['available_bikes', #'available_stands',
                                       'consecutive_no_transactions_out',
                                        'status'
                                       ],
                      #start_date=start_date,
                      #end_date=end_date,
                      return_data=False)

In [23]:
station_df[station_df.status == 0]

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
1861,102,2020-10-18 22:20:00,5.0,15.0,0.0,0.0,0.0,0.0,0
1862,102,2020-10-18 22:30:00,5.0,15.0,0.0,0.0,0.0,0.0,0
3011,105,2020-10-18 22:00:00,1.0,19.0,0.0,0.0,0.0,0.0,0
3012,105,2020-10-18 22:10:00,1.0,19.0,0.0,0.0,0.0,0.0,0
3013,105,2020-10-18 22:20:00,1.0,19.0,0.0,0.0,0.0,0.0,0


In [25]:
station_df.status.value_counts()

1.0    3307
0.0       5
Name: status, dtype: int64

## API de open data Bordeaux

### Dev

In [285]:
# Day
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-31T15:50:00&&rangeEnd=2022-03-31T16:00:00&rangeStep=day"
# 5min
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-31T15:50:00&&rangeEnd=2022-03-31T16:00:00&rangeStep=5min"
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-31T15:50:00&&rangeEnd=2022-03-31T16:01:00&rangeStep=5min"

# 
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-24T15:50:00&&rangeEnd=2022-03-31T16:01:00&rangeStep=5min" # KO
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-28&&rangeEnd=2022-03-31&rangeStep=5min" # KO
URL = "https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&&rangeStart=2022-03-30&&rangeEnd=2022-03-31&rangeStep=5min" # KO


# avec filtre station vcub
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-31T15:50:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min' #OK

# filtre station vcub + 1 semaine data
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T00:00:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min' #OK

# filtre station vcub + 1 semaine data + 1ere données


# 1 +/- semaine de data (toutes stations)
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]' # KO

# 1 jours de data (toutes stations)
# 17,9 s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]' # OK

# 2 jours de data (toutes stations)
# 38,8 s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]' # KO

# avec filtre sur plusieurs station vcub
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-31T15:50:00&filter={"ident":{"$in":[1,2]}}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min' # OK

# URL optimisé (on choisit les attribut de retour)
attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]
attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}
# 0,4s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&filter={"ident":1}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes=["nom","etat", "ident", "nbplaces", "nbvelos", "gid"]' #OK
# 0,4s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&filter={"ident":1}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}' #OK
# 0,7s
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&filter={"ident":1}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&' #OK


In [503]:
# Toute les stations sur une période déterminé 

# 1 jours : 18 sec / 53835 lignes
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}' #OK

# 2 jours : 37,8 sec --> HS
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}' #OK

# 1.5 jours : 30,2 sec --> HS
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-29T23:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}' #OK

In [507]:
URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T00:00:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}' #OK

Récapitulatif : 

- Pour les requêtes d'affichage d'une station (avec graph + anomalie) :
    Possibilité d'utiliser une requête de ce type qui est assez rapide (+/- 1 sec)
   
    `URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-24T00:00:00&filter={"ident":2}&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'`

- Pour les requêtes lors de la détection d'ano toutes les 10 minutes : 
    On ne peut pas utiliser la même requête que plus haut sans filtre de stations (pas de retour de la requête / trop long / limitation API). Par contre il est possible de demander +/- 24h de données pour toutes les stations (+/- 18 sec)
    
    `URL = 'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=[KEY]&rangeStart=2022-03-30T15:50:00&rangeEnd=2022-03-31T16:01:00&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'`


In [535]:
response = requests.get(URL)
response

<Response [200]>

In [536]:
response.json()
#{"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}

{'error': "Erreur interne. Si le problème persiste contactez l'administrateur"}

In [533]:
len(response.json()['features'])

2124

In [523]:
# Create Df with data in list
station_df = pd.json_normalize(response.json(), record_path =['features'])

# Naming
station_df.rename(columns={'properties.time':'time'}, inplace=True)
station_df.rename(columns={'properties.gid':'gid'}, inplace=True)
station_df.rename(columns={'properties.nom':'nom'}, inplace=True)
station_df.rename(columns={'properties.etat':'etat'}, inplace=True)
station_df.rename(columns={'properties.nbplaces':'nbplaces'}, inplace=True)
station_df.rename(columns={'properties.nbvelos':'nbvelos'}, inplace=True)

try :
# type 
    station_df['time'] = pd.to_datetime(station_df['time'])
except:
# AVEC backintime
    station_df['mdate'] = pd.to_datetime(station_df['mdate'])
    station_df.rename(columns={'mdate':'time'}, inplace=True)

In [524]:
station_df.time.unique()

array([datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tzoffset(None, 3600)),
       datetime.datetime(2022, 3, 25, 0, 5, tzinfo=tzoffset(None, 3600)),
       datetime.datetime(2022, 3, 25, 0, 10, tzinfo=tzoffset(None, 3600)),
       ...,
       datetime.datetime(2022, 4, 1, 9, 45, tzinfo=tzoffset(None, 7200)),
       datetime.datetime(2022, 4, 1, 9, 50, tzinfo=tzoffset(None, 7200)),
       datetime.datetime(2022, 4, 1, 9, 55, tzinfo=tzoffset(None, 7200))],
      dtype=object)

In [525]:
station_df.time.min(), station_df.time.max()

(datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tzoffset(None, 3600)),
 datetime.datetime(2022, 4, 1, 9, 55, tzinfo=tzoffset(None, 7200)))

In [526]:
station_df.tail(45)

Unnamed: 0,type,time,gid,nom,etat,nbplaces,nbvelos
2079,Feature,2022-04-01 06:15:00+02:00,2,St Bruno,CONNECTEE,8,12
2080,Feature,2022-04-01 06:20:00+02:00,2,St Bruno,CONNECTEE,8,12
2081,Feature,2022-04-01 06:25:00+02:00,2,St Bruno,CONNECTEE,8,12
2082,Feature,2022-04-01 06:30:00+02:00,2,St Bruno,CONNECTEE,8,12
2083,Feature,2022-04-01 06:35:00+02:00,2,St Bruno,CONNECTEE,8,12
2084,Feature,2022-04-01 06:40:00+02:00,2,St Bruno,CONNECTEE,8,12
2085,Feature,2022-04-01 06:45:00+02:00,2,St Bruno,CONNECTEE,8,12
2086,Feature,2022-04-01 06:50:00+02:00,2,St Bruno,CONNECTEE,8,12
2087,Feature,2022-04-01 06:55:00+02:00,2,St Bruno,CONNECTEE,9,12
2088,Feature,2022-04-01 07:00:00+02:00,2,St Bruno,CONNECTEE,9,11


In [527]:
station_df.gid.nunique()

1

In [529]:
response.json()['features'][0]['properties']

{'time': '2022-03-25T00:00:00+01:00',
 'gid': 2,
 'nom': 'St Bruno',
 'etat': 'CONNECTEE',
 'nbplaces': 10,
 'nbvelos': 10}

### Industrialisation

In [52]:
def get_data_from_api_bdx_by_station(station_id, start_date, stop_date):
    """
    Permet d'obtenir les données d'activité d'une station via une API d'open data Bordeaux
    
    Parameters
    ----------
    station_id : Int or List
        Numéro de la station de Vcub
    start_date : str
        Date de début de la Time Serie
    stop_date : str
        Date de fin de la Time Serie
    
    Returns
    -------
    Time serie in Json format
        
    Examples
    --------
    station_json = get_data_from_api_bdx_by_station(station_id=19, 
                                                    start_date='2020-10-14',
                                                    stop_date='2020-10-17')
    """
    
    # Si plusieurs station_id ([124,  15,  60,])
    if isinstance(station_id, (list, np.ndarray)):
        station_id = ','.join(map(str, station_id))

        url = \
            'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=' + KEY_API_BDX + '&rangeStart=' + str(start_date) +\
                '&filter={"ident":{"$in":[' + str(station_id) +']}}&rangeEnd=' + str(stop_date) +\
                '&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'
    # Si une seul station_id
    else:
        url = \
            'https://data.bordeaux-metropole.fr/geojson/aggregate/ci_vcub_p?key=' + KEY_API_BDX + '&rangeStart=' + str(start_date) +\
                '&filter={"ident":' + str(station_id) +'}&rangeEnd=' + str(stop_date) +\
                '&rangeStep=5min&attributes={"nom": "mode", "etat": "mode", "nbplaces": "max", "nbvelos": "max"}'


    response = requests.get(url)
    return response.json()

def transform_json_api_bdx_station_data_to_df(station_json):
    """
    Tranforme la Time Serie d'activité d'une ou plusieurs station en DataFrame
    à partir de la fonction get_data_from_api_bdx_by_station()
    Effectue plusieurs transformation comme la fonction create/creator.py 
    create_activity_time_series()
        - Naming des colonnes json
        - Structuration
        - Naming
        - Ajout de variables
        - Resampling sur 10min
    
    Parameters
    ----------
    station_json : json
        Time serie au format json de l'activité d'une station (ou plusieurs)
    Returns
    -------
    station_df_resample : DataFrame
        Time serie au format DataFrame de l'activité d'une ou plusieurs station
        resampler sur 10 min.
        
    Examples
    --------
    
    station_df = transform_json_api_bdx_station_data_to_df(station_json)
    
    """

    station_df = pd.json_normalize(station_json, record_path =['features'])

    # Naming from JSON DataFrame
    station_df.rename(columns={'properties.time':'time'}, inplace=True)
    station_df.rename(columns={'properties.gid':'gid'}, inplace=True)
    station_df.rename(columns={'properties.nom':'nom'}, inplace=True)
    station_df.rename(columns={'properties.etat':'etat'}, inplace=True)
    station_df.rename(columns={'properties.nbplaces':'nbplaces'}, inplace=True)
    station_df.rename(columns={'properties.nbvelos':'nbvelos'}, inplace=True)

    # naming api Bdx to vanilla api (get_data_from_api_by_station) from DataFrame
    # Naming
    station_df.rename(columns={'time':'date'}, inplace=True)
    station_df.rename(columns={'gid':'station_id'}, inplace=True)
    station_df.rename(columns={'nom':'name'}, inplace=True)
    station_df.rename(columns={'etat':'status'}, inplace=True)
    station_df.rename(columns={'nbvelos':'available_bikes'}, inplace=True)
    station_df.rename(columns={'nbplaces':'available_stands'}, inplace=True)

    # Status mapping
    status_dict = {'CONNECTEE' : 1,
                   'DECONNECTEE' : 0}
    station_df['status'] = station_df['status'].map(status_dict)
    station_df['status'] = station_df['status'].astype('uint8')

    # Casting & sorting DataFrame on station_id & date
    station_df['date'] = pd.to_datetime(station_df['date'])
    try:
        station_df['date'] = \
            pd.to_datetime(station_df['date'])
    except: # Changemnent d'horraire https://github.com/armgilles/vcub_watcher/issues/44
        station_df['date'] = \
            pd.to_datetime(station_df['date'], utc=True)
    try:
        station_df['date'] = \
            station_df['date'].dt.tz_localize('Europe/Paris')
    except:  # try to convert TZ
        station_df['date'] = \
            station_df['date'].dt.tz_convert('Europe/Paris')

    station_df['station_id'] = station_df['station_id'].astype(int)
    station_df = station_df.sort_values(['station_id', 'date'], ascending=[1, 1])

    # Reset index
    station_df = station_df.reset_index(drop=True)

    # Dropduplicate station_id / date rows
    station_df = station_df.drop_duplicates(subset=['station_id', 'date']).reset_index(drop=True)

    # Create features
    station_df = get_transactions_in(station_df)
    station_df = get_transactions_out(station_df)
    station_df = get_transactions_all(station_df)

    ## Resampling

    # # cf Bug Pandas : https://github.com/pandas-dev/pandas/issues/33548
    station_df = station_df.set_index('date')

    station_df_resample = \
    station_df.groupby('station_id').resample('10T', 
                                                label='right',
                                                ).agg({'available_stands' : 'last',
                                                       'available_bikes' : 'last',
                                                       'status' : 'max', # Empeche les micro déconnection à la station
                                                       'transactions_in' : 'sum',
                                                       'transactions_out' : 'sum',
                                                       'transactions_all' : 'sum'}).reset_index()
    return station_df_resample

In [1]:
from vcub_keeper.production.data import get_data_from_api_bdx_by_station, transform_json_api_bdx_station_data_to_df

In [2]:
station_id=2
# station_id=[124,  15,  60,]
# statoin_id = [124,  15,  60,  18,  10,  68, 130,   2, 105, 120,  11, 110,   9,
#         23,   3,  16,  20, 136,  42,  21, 131,  59,  45, 172,   6,  24,
#         36, 108,  19, 125, 135,  37, 139,  99,  28,  57,   7,  98,   8,
#         41,  40,  58,  55, 109,   1,   4, 100, 134, 174, 101,  56, 104,
#         43,  54,  44, 102, 133, 103,   5, 127,  65,  22, 123,  39, 106]
start_date='2022-03-24'
stop_date='2022-04-02'

In [3]:
from datetime import timedelta, date

# Get date
date_today = date.today()
end_date = date_today + timedelta(days=1)
start_date = date_today - timedelta(days=8)

start_date_str = start_date.strftime('%Y-%m-%d')
end_date_str = end_date.strftime('%Y-%m-%d')
print(start_date_str, end_date_str)

2022-03-24 2022-04-02


In [4]:
station_json = get_data_from_api_bdx_by_station(station_id=station_id, 
                                                start_date=start_date,
                                                stop_date=stop_date)

station_df = transform_json_api_bdx_station_data_to_df(station_json)

In [5]:
station_df

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
0,2,2022-03-24 01:10:00+01:00,18,2,1,0.0,0.0,0.0
1,2,2022-03-24 01:20:00+01:00,18,2,1,0.0,0.0,0.0
2,2,2022-03-24 01:30:00+01:00,18,2,1,0.0,0.0,0.0
3,2,2022-03-24 01:40:00+01:00,18,2,1,0.0,0.0,0.0
4,2,2022-03-24 01:50:00+01:00,18,2,1,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1228,2,2022-04-01 14:50:00+02:00,13,7,1,0.0,0.0,0.0
1229,2,2022-04-01 15:00:00+02:00,13,8,1,1.0,0.0,1.0
1230,2,2022-04-01 15:10:00+02:00,11,9,1,1.0,0.0,1.0
1231,2,2022-04-01 15:20:00+02:00,11,9,1,0.0,0.0,0.0


### Test de rapidité API Open Data VS Oslandia

In [17]:
def test_api_bdx(station_id=station_id, start_date=start_date, stop_date=stop_date):
    """
    """
    station_json = get_data_from_api_bdx_by_station(station_id=station_id, 
                                                start_date=start_date,
                                                stop_date=stop_date)

    station_df = transform_json_api_bdx_station_data_to_df(station_json)
    return station_df


from vcub_keeper.production.data import (get_data_from_api_by_station,
                                         transform_json_station_data_to_df)
from vcub_keeper.transform.features_factory import get_consecutive_no_transactions_out
from vcub_keeper.visualisation import *

def test_api_oslandia(station_id=station_id, start_date=start_date, stop_date=stop_date):
    """
    """
    station_json = get_data_from_api_by_station(station_id=station_id, 
                                            start_date=start_date,
                                            stop_date=stop_date)

    station_df = transform_json_station_data_to_df(station_json)
    return station_df

#### Test de perf data pour une station sur une semaine

In [23]:
station_id=2
start_date='2022-03-24'
stop_date='2022-04-02'

In [24]:
# 1.16 s ± 74.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit test_api_bdx(station_id=station_id, start_date=start_date, stop_date=stop_date)

1.16 s ± 74.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# TO do en attendant que le serveur Oslandia soit UP
%timeit test_api_oslandia(station_id=station_id, start_date=start_date, stop_date=stop_date)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

#### Test de perf data pour plusieurs stations sur une semaine

In [27]:
statoin_id = [124,  15,  60,  18,  10,  68, 130,   2, 105, 120,  11, 110,   9,
        23,   3,  16,  20, 136,  42,  21, 131,  59,  45, 172,   6,  24,
        36, 108,  19, 125, 135,  37, 139,  99,  28,  57,   7,  98,   8,
        41,  40,  58,  55, 109,   1,   4, 100, 134, 174, 101,  56, 104,
        43,  54,  44, 102, 133, 103,   5, 127,  65,  22, 123,  39, 106]
start_date='2022-03-24'
stop_date='2022-04-02'

In [28]:
# 3.1 s ± 182 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit test_api_bdx(station_id=station_id, start_date=start_date, stop_date=stop_date)

1.16 s ± 70.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# TO do en attendant que le serveur Oslandia soit UP
%timeit test_api_oslandia(station_id=station_id, start_date=start_date, stop_date=stop_date)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)