In [1]:
import pandas as pd
import requests

from vcub_keeper.config import *
from vcub_keeper.reader.reader import *
from vcub_keeper.reader.reader_utils import filter_periode
from vcub_keeper.visualisation import *
from vcub_keeper.transform.features_factory import *

from sklearn.manifold import Isomap
from scipy import stats
from joblib import dump, load

%load_ext autoreload
%autoreload 2

# Raise pandas' display limits so wide / long frames are not truncated in outputs.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

## Objectifs

- Obtenir les données depuis l'API de Damien
- Structurer ces données pour les utiliser avec le pipeline de prédiction

## Get data from API

### Dev

In [31]:
# Parameters of the exploratory request.
station_id = 25
start_date = '2020-10-14'
stop_date = '2020-10-20'

# Example of a fully expanded URL:
# http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/106?start=2020-10-09&stop=2020-10-17
url = (
    "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/"
    f"{station_id}?start={start_date}&stop={stop_date}"
)

# A timeout prevents the cell from hanging forever if the server does not answer,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# KeyError / JSON error in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [32]:
# Inspect the fields of the first series, reusing the response fetched in the
# previous cell rather than sending an identical second HTTP request.
response.json()['data'][0].keys()

dict_keys(['available_bikes', 'available_stands', 'id', 'name', 'nb_stands', 'status', 'ts'])

In [33]:
# Build a DataFrame from the first entry of the payload's 'data' list
# (its keys, shown in the output above, become the columns).
station_df = pd.DataFrame(response.json()['data'][0])

In [6]:
# Status mapping: encode the textual status as 1 (open) / 0 (closed).
status_dict = {'open': 1,
               'closed': 0
              }
station_df['status'] = station_df['status'].map(status_dict)
station_df['status'] = station_df['status'].astype('uint8')

# Naming: align the API field names with the names used in the rest of this notebook.
station_df = station_df.rename(columns={'id': 'station_id', 'ts': 'date'})

# Casting: parse timestamps and cast station_id to int, as
# transform_json_station_data_to_df() does further down (a later `dtypes`
# output in this notebook shows station_id as `object` when it is not cast).
station_df['date'] = pd.to_datetime(station_df['date'])
station_df['station_id'] = station_df['station_id'].astype(int)

# Sort chronologically per station, drop duplicated (station_id, date) rows
# and rebuild a clean 0..n-1 index.
station_df = (
    station_df
    .sort_values(['station_id', 'date'], ascending=[True, True])
    .drop_duplicates(subset=['station_id', 'date'])
    .reset_index(drop=True)
)

# Create features (helpers are not defined in this notebook; presumably they
# come from the star-imported vcub_keeper modules; the output below shows the
# resulting transactions_in / transactions_out / transactions_all columns).
station_df = get_transactions_in(station_df)
station_df = get_transactions_out(station_df)
station_df = get_transactions_all(station_df)

In [7]:
# Preview the last 10 rows of the transformed frame.
station_df.tail(10)

Unnamed: 0,available_bikes,available_stands,station_id,name,nb_stands,status,date,transactions_in,transactions_out,transactions_all
1619,18,2,105,Rue du Mirail,20,1,2020-10-19 23:13:13,0.0,0.0,0.0
1620,18,2,105,Rue du Mirail,20,1,2020-10-19 23:18:13,0.0,0.0,0.0
1621,18,2,105,Rue du Mirail,20,1,2020-10-19 23:24:13,0.0,0.0,0.0
1622,18,2,105,Rue du Mirail,20,1,2020-10-19 23:29:13,0.0,0.0,0.0
1623,18,2,105,Rue du Mirail,20,1,2020-10-19 23:33:13,0.0,0.0,0.0
1624,18,2,105,Rue du Mirail,20,1,2020-10-19 23:37:13,0.0,0.0,0.0
1625,18,2,105,Rue du Mirail,20,1,2020-10-19 23:42:13,0.0,0.0,0.0
1626,18,2,105,Rue du Mirail,20,1,2020-10-19 23:47:13,0.0,0.0,0.0
1627,18,2,105,Rue du Mirail,20,1,2020-10-19 23:53:13,0.0,0.0,0.0
1628,18,2,105,Rue du Mirail,20,1,2020-10-19 23:57:13,0.0,0.0,0.0


### Industrialisation

In [11]:
def get_data_from_api_by_station(station_id, start_date, stop_date, timeout=30):
    """
    Fetch the activity time series of one station from the bikes API.

    Parameters
    ----------
    station_id : int or str
        Vcub station number (converted with ``str()`` to build the URL).
    start_date : str
        Start date of the time series (e.g. '2020-10-14').
    stop_date : str
        End date of the time series (e.g. '2020-10-17').
    timeout : float, optional
        Maximum number of seconds to wait for the server (default 30).
        Without it a stalled connection would block the caller indefinitely.

    Returns
    -------
    Time series as returned by the API, decoded from JSON.

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with a 4xx / 5xx status code.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.

    Examples
    --------

    station_json = get_data_from_api_by_station(station_id=19,
                                                start_date='2020-10-14',
                                                stop_date='2020-10-17')
    """
    url = "http://data.oslandia.io/bikes/api/bordeaux/timeseries/station/" + str(station_id)

    # Let requests build (and URL-encode) the query string.
    response = requests.get(url,
                            params={"start": start_date, "stop": stop_date},
                            timeout=timeout)

    # Fail loudly on HTTP errors instead of returning an error payload that
    # would only break later, when the caller indexes into it.
    response.raise_for_status()
    return response.json()

def transform_json_station_data_to_df(station_json):
    """
    Turn the activity time series of a station (API JSON) into a DataFrame.

    Applies several transformations, like create_activity_time_series() in
    create/creator.py:
        - Structuring
        - Naming
        - Feature creation
        - Resampling on 10 minutes

    Parameters
    ----------
    station_json : json
        Activity time series of a station as returned by the API. Only the
        first entry of ``station_json['data']`` is used.

    Returns
    -------
    station_df_resample : DataFrame
        Activity time series of the station, resampled on 10 min, with the
        columns station_id, date, available_stands, available_bikes, status,
        transactions_in, transactions_out and transactions_all.

    Raises
    ------
    ValueError
        If the 'status' field holds a value other than 'open' / 'closed'.

    Examples
    --------

    station_df = transform_json_station_data_to_df(station_json)

    """

    station_df = pd.DataFrame(station_json['data'][0])

    # Status mapping
    status_dict = {'open': 1,
                   'closed': 0
                  }
    status_encoded = station_df['status'].map(status_dict)

    # An unmapped status becomes NaN and would make the uint8 cast fail with an
    # opaque "cannot convert non-finite values" error; report the offending
    # values explicitly instead (still a ValueError).
    unknown_status = station_df.loc[status_encoded.isna(), 'status'].unique()
    if len(unknown_status) > 0:
        raise ValueError(
            f"Unknown station status value(s): {list(unknown_status)}; "
            f"expected one of {list(status_dict)}"
        )
    station_df['status'] = status_encoded.astype('uint8')

    # Naming
    station_df = station_df.rename(columns={'id': 'station_id', 'ts': 'date'})

    # Casting
    station_df['date'] = pd.to_datetime(station_df['date'])
    station_df['station_id'] = station_df['station_id'].astype(int)

    # Sort on station_id & date, drop duplicated station_id / date rows
    # and rebuild a clean 0..n-1 index.
    station_df = (
        station_df
        .sort_values(['station_id', 'date'], ascending=[True, True])
        .drop_duplicates(subset=['station_id', 'date'])
        .reset_index(drop=True)
    )

    # Create features (helpers are defined outside this notebook).
    station_df = get_transactions_in(station_df)
    station_df = get_transactions_out(station_df)
    station_df = get_transactions_all(station_df)

    ## Resampling

    # 'date' is moved to the index before groupby().resample() because of a
    # pandas bug: https://github.com/pandas-dev/pandas/issues/33548
    station_df = station_df.set_index('date')

    # '10min' is the same frequency as the former '10T'; the 'T' alias is
    # deprecated in recent pandas versions.
    station_df_resample = (
        station_df
        .groupby('station_id')
        .resample('10min', label='right')
        .agg({'available_stands': 'last',
              'available_bikes': 'last',
              'status': 'max',  # Masks micro-disconnections of the station within a bin
              'transactions_in': 'sum',
              'transactions_out': 'sum',
              'transactions_all': 'sum'})
        .reset_index()
    )
    return station_df_resample

In [12]:
# Station number as an int, as documented by get_data_from_api_by_station()
# and as used in the "Script" section below (it is converted with str() when
# the URL is built, so the request is unchanged).
station_id = 19
start_date = '2020-10-14'
stop_date = '2020-10-17'

station_json = get_data_from_api_by_station(station_id=station_id,
                                            start_date=start_date,
                                            stop_date=stop_date)

station_df = transform_json_station_data_to_df(station_json)

In [13]:
# Check the last rows of the resampled frame (one row per 10-minute bin).
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
427,19,2020-10-16 23:20:00,18.0,12.0,1.0,0.0,0.0,0.0
428,19,2020-10-16 23:30:00,18.0,12.0,1.0,0.0,0.0,0.0
429,19,2020-10-16 23:40:00,18.0,12.0,1.0,0.0,0.0,0.0
430,19,2020-10-16 23:50:00,18.0,12.0,1.0,0.0,0.0,0.0
431,19,2020-10-17 00:00:00,19.0,11.0,1.0,0.0,1.0,1.0


In [130]:
# NOTE(review): the saved output of this cell is stale: it carries execution
# count 130, shows station 106 and a `consecutive_no_transactions_out` column
# that has not been computed at this point of the notebook. Re-run or delete.
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,transactions_in,transactions_out,transactions_all,status,consecutive_no_transactions_out
381,106,2020-10-16 15:40:00,25.0,15.0,0.0,1.0,1.0,1,0
382,106,2020-10-16 15:50:00,25.0,15.0,0.0,0.0,0.0,1,1
383,106,2020-10-16 16:00:00,22.0,18.0,3.0,0.0,3.0,1,2
384,106,2020-10-16 16:10:00,21.0,19.0,1.0,0.0,1.0,1,3
385,106,2020-10-16 16:20:00,22.0,18.0,0.0,1.0,1.0,1,0


In [131]:
# NOTE(review): the saved output looks stale (execution count 131): it reports
# `station_id` as object, whereas transform_json_station_data_to_df() casts it
# to int. Re-run to confirm the current dtypes.
station_df.dtypes

station_id                                 object
date                               datetime64[ns]
available_stands                          float64
available_bikes                           float64
transactions_in                           float64
transactions_out                          float64
transactions_all                          float64
status                                      int64
consecutive_no_transactions_out             int64
dtype: object

### Script

In [24]:
from vcub_keeper.production.data import (get_data_from_api_by_station,
                                         transform_json_station_data_to_df)
from vcub_keeper.visualisation import *

In [25]:
# Query parameters for the run based on the packaged functions.
station_id, start_date, stop_date = 106, '2020-10-14', '2020-10-17'

In [26]:
# Fetch the raw time series, then convert it to the resampled DataFrame,
# this time with the functions imported from vcub_keeper.production.data.
station_json = get_data_from_api_by_station(
    station_id=station_id,
    start_date=start_date,
    stop_date=stop_date,
)
station_df = transform_json_station_data_to_df(station_json)

In [27]:
# Last rows of the frame produced by the packaged functions.
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
427,106,2020-10-16 23:20:00,19.0,21.0,1.0,0.0,0.0,0.0
428,106,2020-10-16 23:30:00,19.0,21.0,1.0,0.0,0.0,0.0
429,106,2020-10-16 23:40:00,19.0,21.0,1.0,0.0,0.0,0.0
430,106,2020-10-16 23:50:00,18.0,22.0,1.0,1.0,0.0,1.0
431,106,2020-10-17 00:00:00,18.0,22.0,1.0,0.0,0.0,0.0


In [28]:
# Add the `consecutive_no_transactions_out` column (visible in the next output).
# The helper is not defined in this notebook; presumably it comes from the
# star-imported vcub_keeper.transform.features_factory (TODO confirm).
station_df = get_consecutive_no_transactions_out(station_df)

In [29]:
# Show only the first rows instead of dumping the whole frame into the output.
station_df.head(10)

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
0,106,2020-10-14 00:10:00,16.0,24.0,1.0,0.0,0.0,0.0,0
1,106,2020-10-14 00:20:00,16.0,24.0,1.0,0.0,0.0,0.0,1
2,106,2020-10-14 00:30:00,16.0,24.0,1.0,0.0,0.0,0.0,2
3,106,2020-10-14 00:40:00,16.0,24.0,1.0,1.0,1.0,2.0,0
4,106,2020-10-14 00:50:00,16.0,24.0,1.0,0.0,0.0,0.0,1
5,106,2020-10-14 01:00:00,16.0,24.0,1.0,0.0,0.0,0.0,2
6,106,2020-10-14 01:10:00,16.0,24.0,1.0,0.0,0.0,0.0,3
7,106,2020-10-14 01:20:00,16.0,24.0,1.0,0.0,0.0,0.0,4
8,106,2020-10-14 01:30:00,16.0,24.0,1.0,0.0,0.0,0.0,5
9,106,2020-10-14 01:40:00,17.0,23.0,1.0,0.0,1.0,1.0,0


In [30]:
# Plot the selected features for the station.
# 'available_stands' can be added to the list; the plot can presumably also be
# restricted to a period through start_date / end_date arguments (not used here).
plot_station_activity(
    station_df,
    station_id=station_id,
    features_to_plot=[
        'available_bikes',
        'consecutive_no_transactions_out',
        'status',
    ],
    return_data=False,
)