In [1]:
import pandas as pd
import requests

from vcub_keeper.config import *
from vcub_keeper.reader.reader import *
from vcub_keeper.reader.reader_utils import filter_periode
from vcub_keeper.visualisation import *
from vcub_keeper.transform.features_factory import *

from sklearn.manifold import Isomap
from scipy import stats
from joblib import dump, load

%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

## Objectifs

- Obtenir les données depuis l'API de Damien
- Structurer ces données pour les utiliser avec le pipeline de prédiction

## Get data from API

### Dev

In [2]:
# Query parameters for the request (dates are 'YYYY-MM-DD' strings)
station_id = 25
# station_id = "25,102"  # several stations: comma-separated ids
start_date = '2020-10-14'
stop_date = '2020-10-20'

# Example of a fully spelled-out URL:
# http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/106?start=2020-10-09&stop=2020-10-17

# Time-series endpoint for the chosen station(s) and date window
url = (
    "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/{}?start={}&stop={}"
    .format(station_id, start_date, stop_date)
)

# Without a timeout, requests waits indefinitely on an unresponsive server;
# raise_for_status() surfaces HTTP errors here rather than at .json() time.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [3]:
url

'http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/25?start=2020-10-14&stop=2020-10-20'

In [4]:
# Build a DataFrame from the first entry of the payload's 'data' list
# (only one station was requested above, so this is the only entry expected)
station_df = pd.DataFrame(response.json()['data'][0])

In [5]:
# Sanity check: which station ids are present (ids are still strings at this point)
station_df.id.unique()

array(['25'], dtype=object)

In [6]:
# Status mapping: encode the textual status as 1 (open) / 0 (closed)
status_dict = {'open': 1, 'closed': 0}
station_df['status'] = station_df['status'].map(status_dict).astype('uint8')

# Naming: 'id' -> 'station_id' and 'ts' -> 'date' (single rename, no inplace mutation)
station_df = station_df.rename(columns={'id': 'station_id', 'ts': 'date'})

# Casting: real datetimes, and integer station ids so that sorting is numeric
# rather than lexicographic (as a string, '102' would sort before '25')
station_df['date'] = pd.to_datetime(station_df['date'])
station_df['station_id'] = station_df['station_id'].astype(int)

# Sort on station_id & date, drop duplicated station_id / date rows,
# then rebuild a clean 0..n-1 index
station_df = (
    station_df
    .sort_values(['station_id', 'date'])
    .drop_duplicates(subset=['station_id', 'date'])
    .reset_index(drop=True)
)

# Create features
station_df = get_transactions_in(station_df)
station_df = get_transactions_out(station_df)
station_df = get_transactions_all(station_df)

In [7]:
station_df.tail(10)

Unnamed: 0,available_bikes,available_stands,station_id,name,nb_stands,status,date,transactions_in,transactions_out,transactions_all
1619,18,2,105,Rue du Mirail,20,1,2020-10-19 23:13:13,0.0,0.0,0.0
1620,18,2,105,Rue du Mirail,20,1,2020-10-19 23:18:13,0.0,0.0,0.0
1621,18,2,105,Rue du Mirail,20,1,2020-10-19 23:24:13,0.0,0.0,0.0
1622,18,2,105,Rue du Mirail,20,1,2020-10-19 23:29:13,0.0,0.0,0.0
1623,18,2,105,Rue du Mirail,20,1,2020-10-19 23:33:13,0.0,0.0,0.0
1624,18,2,105,Rue du Mirail,20,1,2020-10-19 23:37:13,0.0,0.0,0.0
1625,18,2,105,Rue du Mirail,20,1,2020-10-19 23:42:13,0.0,0.0,0.0
1626,18,2,105,Rue du Mirail,20,1,2020-10-19 23:47:13,0.0,0.0,0.0
1627,18,2,105,Rue du Mirail,20,1,2020-10-19 23:53:13,0.0,0.0,0.0
1628,18,2,105,Rue du Mirail,20,1,2020-10-19 23:57:13,0.0,0.0,0.0


### Industrialisation

In [1]:
from vcub_keeper.config import *
from vcub_keeper.reader.reader import *
from vcub_keeper.config import THRESHOLD_PROFILE_STATION

#THRESHOLD_PROFILE_STATION = 0.3

# Load the station profile reference table
station_profile = read_station_profile(path_directory=ROOT_DATA_REF)

# Keep the ids of the stations whose 'mean' value reaches the configured threshold
reaches_threshold = station_profile['mean'] >= THRESHOLD_PROFILE_STATION
stations_id_to_pred = station_profile.loc[reaches_threshold, 'station_id'].unique()

In [2]:
# Check the container type of the selected ids (a NumPy array, per the output below)
type(stations_id_to_pred)

numpy.ndarray

In [3]:
# Comma-separated id string: the form in which several station ids are put in the API URL
','.join(map(str, stations_id_to_pred))

'124,118,111,10,62,66,105,130,112,2,68,23,11,60,120,9,3,45,136,24,138,21,108,16,131,36,20,110,139,6,42,125,19,135,59,41,28,7,109,4,57,55,37,8,58,99,134,104,40,98,172,1,56,43,44,100,133,103,101,102,5,65,174,123,54,39,22,127,106'

In [9]:
def get_data_from_api_by_station(station_id, start_date, stop_date, timeout=30):
    """
    Fetch the activity time series of one or several Vcub stations from the API.

    Parameters
    ----------
    station_id : int, str or iterable of int/str
        Vcub station number. Any non-string iterable (list, tuple,
        NumPy array, ...) is joined into a comma-separated string so that
        several stations are requested at once.
    start_date : str
        Start date of the time series (e.g. '2020-10-14')
    stop_date : str
        End date of the time series (e.g. '2020-10-17')
    timeout : float, optional
        Number of seconds to wait for the server before giving up (default 30).

    Returns
    -------
    Time series in JSON format (decoded body of the HTTP response)

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with an HTTP error status.
    requests.exceptions.Timeout
        If the server does not answer within `timeout` seconds.

    Examples
    --------

    station_json = get_data_from_api_by_station(station_id=19,
                                                start_date='2020-10-14',
                                                stop_date='2020-10-17')
    """

    # Scalars (int, str, NumPy integer, ...) are used as-is; every other
    # iterable is treated as a collection of ids. Duck-typing avoids depending
    # on a NumPy import and also covers tuples.
    is_single_id = isinstance(station_id, (str, bytes)) or not hasattr(station_id, '__iter__')
    if not is_single_id:
        station_id = ','.join(map(str, station_id))

    url = (
        "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/" + str(station_id)
        + "?start=" + start_date + "&stop=" + stop_date
    )

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

def transform_json_station_data_to_df(station_json):
    """
    Turn the activity time series of one or several stations into a DataFrame.

    Applies several transformations, like create_activity_time_series()
    in create/creator.py:
        - Structuring
        - Naming
        - Feature creation
        - Resampling to 10 min

    Parameters
    ----------
    station_json : json
        Activity time series of one (or several) station(s) in JSON format.
        station_json['data'] holds one entry per station.

    Returns
    -------
    station_df_resample : DataFrame
        Activity time series of one or several stations as a DataFrame,
        resampled to 10 min.

    Raises
    ------
    IndexError
        If station_json['data'] is empty (no station to transform).

    Examples
    --------

    station_df = transform_json_station_data_to_df(station_json)

    """

    stations_records = station_json['data']
    if len(stations_records) == 0:
        # Same exception type as the former `['data'][0]` lookup, with a clearer message.
        raise IndexError("station_json['data'] is empty: no station time series to transform")

    # One frame per station, concatenated in a single call (one or many
    # stations alike) instead of growing a DataFrame inside a loop.
    station_df = pd.concat(
        [pd.DataFrame(records) for records in stations_records],
        ignore_index=True,
    )

    # Status mapping: 1 = open, 0 = closed
    status_dict = {'open': 1, 'closed': 0}
    station_df['status'] = station_df['status'].map(status_dict).astype('uint8')

    # Naming
    station_df = station_df.rename(columns={'id': 'station_id', 'ts': 'date'})

    # Casting & sorting DataFrame on station_id & date
    station_df['date'] = pd.to_datetime(station_df['date'])
    station_df['station_id'] = station_df['station_id'].astype(int)

    # Sort, drop duplicated station_id / date rows, then rebuild a clean index
    station_df = (
        station_df
        .sort_values(['station_id', 'date'])
        .drop_duplicates(subset=['station_id', 'date'])
        .reset_index(drop=True)
    )

    # Create features
    station_df = get_transactions_in(station_df)
    station_df = get_transactions_out(station_df)
    station_df = get_transactions_all(station_df)

    ## Resampling

    # 'date' is moved to the index before groupby/resample,
    # cf. pandas bug: https://github.com/pandas-dev/pandas/issues/33548
    station_df = station_df.set_index('date')

    aggregations = {
        'available_stands': 'last',
        'available_bikes': 'last',
        # 'max' keeps the window open unless every sample in it is closed,
        # which filters out micro-disconnections of the station
        'status': 'max',
        'transactions_in': 'sum',
        'transactions_out': 'sum',
        'transactions_all': 'sum',
    }

    # '10min' is the non-deprecated spelling of the former '10T' alias
    station_df_resample = (
        station_df
        .groupby('station_id')
        .resample('10min', label='right')
        .agg(aggregations)
        .reset_index()
    )
    return station_df_resample

In [10]:
# Request parameters: one station over a three-day window
station_id = '19'
# station_id = [19, 105, 102]  # alternative: several stations at once
start_date = '2020-10-14'
stop_date = '2020-10-17'

# Step 1: download the raw JSON time series
station_json = get_data_from_api_by_station(
    station_id=station_id,
    start_date=start_date,
    stop_date=stop_date,
)

# Step 2: structure it as a DataFrame resampled to 10 min
station_df = transform_json_station_data_to_df(station_json)

In [11]:
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
427,19,2020-10-16 23:20:00,18.0,12.0,1.0,0.0,0.0,0.0
428,19,2020-10-16 23:30:00,18.0,12.0,1.0,0.0,0.0,0.0
429,19,2020-10-16 23:40:00,18.0,12.0,1.0,0.0,0.0,0.0
430,19,2020-10-16 23:50:00,18.0,12.0,1.0,0.0,0.0,0.0
431,19,2020-10-17 00:00:00,19.0,11.0,1.0,0.0,1.0,1.0


In [12]:
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
427,19,2020-10-16 23:20:00,18.0,12.0,1.0,0.0,0.0,0.0
428,19,2020-10-16 23:30:00,18.0,12.0,1.0,0.0,0.0,0.0
429,19,2020-10-16 23:40:00,18.0,12.0,1.0,0.0,0.0,0.0
430,19,2020-10-16 23:50:00,18.0,12.0,1.0,0.0,0.0,0.0
431,19,2020-10-17 00:00:00,19.0,11.0,1.0,0.0,1.0,1.0


In [13]:
# Check the column dtypes of the resampled frame
station_df.dtypes

station_id                   int64
date                datetime64[ns]
available_stands           float64
available_bikes            float64
status                     float64
transactions_in            float64
transactions_out           float64
transactions_all           float64
dtype: object

### Script

In [6]:
from vcub_keeper.production.data import (get_data_from_api_by_station,
                                         transform_json_station_data_to_df)
from vcub_keeper.transform.features_factory import get_consecutive_no_transactions_out
from vcub_keeper.visualisation import *

In [7]:
# Stations and date window to fetch
station_id = [19, 105, 102]
# station_id = 102  # alternative: a single station
start_date = '2020-10-14'
stop_date = '2020-10-22'

In [8]:
# Download the raw JSON time series for the selected stations and dates
station_json = get_data_from_api_by_station(
    station_id=station_id,
    start_date=start_date,
    stop_date=stop_date,
)

# Structure the JSON payload as a DataFrame
station_df = transform_json_station_data_to_df(station_json)

In [9]:
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
3451,105,2020-10-21 23:20:00,5.0,15.0,1.0,0.0,0.0,0.0
3452,105,2020-10-21 23:30:00,6.0,14.0,1.0,0.0,1.0,1.0
3453,105,2020-10-21 23:40:00,6.0,14.0,1.0,0.0,0.0,0.0
3454,105,2020-10-21 23:50:00,6.0,14.0,1.0,0.0,0.0,0.0
3455,105,2020-10-22 00:00:00,6.0,14.0,1.0,0.0,0.0,0.0


In [10]:
# Add the 'consecutive_no_transactions_out' column (see the output of the next cell)
# NOTE(review): re-running this cell re-applies the helper to its own output — confirm it is idempotent
station_df = get_consecutive_no_transactions_out(station_df) 

In [11]:
station_df

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
0,19,2020-10-14 00:10:00,20.0,10.0,1.0,0.0,0.0,0.0,0
1,19,2020-10-14 00:20:00,20.0,10.0,1.0,0.0,0.0,0.0,1
2,19,2020-10-14 00:30:00,20.0,10.0,1.0,0.0,0.0,0.0,2
3,19,2020-10-14 00:40:00,20.0,10.0,1.0,0.0,0.0,0.0,3
4,19,2020-10-14 00:50:00,21.0,9.0,1.0,0.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...
3451,105,2020-10-21 23:20:00,5.0,15.0,1.0,0.0,0.0,0.0,15
3452,105,2020-10-21 23:30:00,6.0,14.0,1.0,0.0,1.0,1.0,0
3453,105,2020-10-21 23:40:00,6.0,14.0,1.0,0.0,0.0,0.0,1
3454,105,2020-10-21 23:50:00,6.0,14.0,1.0,0.0,0.0,0.0,2


In [13]:
# Series to draw for the station ('available_stands' is left out here)
features_to_plot = [
    'available_bikes',
    'consecutive_no_transactions_out',
    'status',
]

# Plot the activity of station 105; start_date / end_date are not passed,
# presumably plotting the whole available period — TODO confirm
plot_station_activity(
    station_df,
    station_id=105,
    features_to_plot=features_to_plot,
    return_data=False,
)

In [14]:
# Rows where status is 0 (0 is the code 'closed' is mapped to in this notebook's transform)
station_df[station_df.status == 0]

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
1861,102,2020-10-18 22:20:00,5.0,15.0,0.0,0.0,0.0,0.0,0
1862,102,2020-10-18 22:30:00,5.0,15.0,0.0,0.0,0.0,0.0,0
3011,105,2020-10-18 22:00:00,1.0,19.0,0.0,0.0,0.0,0.0,0
3012,105,2020-10-18 22:10:00,1.0,19.0,0.0,0.0,0.0,0.0,0
3013,105,2020-10-18 22:20:00,1.0,19.0,0.0,0.0,0.0,0.0,0


In [15]:
# Count rows per status value
station_df.status.value_counts()

1.0    3307
0.0       5
Name: status, dtype: int64