In [22]:
import pandas as pd

from vcub_keeper.reader.reader import *
from vcub_keeper.visualisation import plot_station_activity
from vcub_keeper.transform.features_factory import *

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Objectifs :

- Profilage des stations par leurs activités et fréquences
- Export d'un fichier de référence
- [Issue Github](https://github.com/armgilles/vcub_keeper/issues/28)

## Profilage des stations selon leurs activités et fréquence

In [4]:
# Load the station activity time series
ts_activity = read_time_serie_activity()

# Run the feature builders in order; each one receives the current frame and
# its return value replaces it.
for add_feature in (
    get_transactions_in,
    get_transactions_out,
    get_transactions_all,
    get_consecutive_no_transactions_out,
):
    ts_activity = add_feature(ts_activity)

In [5]:
# Per-station summary statistics of `transactions_all`, computed only on rows
# where status == 1.
#
# Named aggregation (output column name -> function) is used instead of the
# dict-renaming form `.agg({'name': func})` on a single selected column: that
# form was deprecated and raises SpecificationError ("nested renamer is not
# supported") in pandas >= 1.0.
activity_stats = {
    'total_point': 'size',
    'mean': 'mean',
    'median': 'median',
    'std': 'std',
    '95%': lambda x: x.quantile(0.95),
    '98%': lambda x: x.quantile(0.98),
    '99%': lambda x: x.quantile(0.99),
    'max': 'max',
}
profile_station = (
    ts_activity[ts_activity['status'] == 1]
    .groupby('station_id')['transactions_all']
    .agg(**activity_stats)
    # Bring station_id back as a regular column (same layout as as_index=False)
    .reset_index()
    # Least active stations first
    .sort_values('mean')
)

On classe les stations selon leur moyenne d'activité (nombre de transactions, basé sur une fréquence de 10 minutes), découpée en 3 classes de largeur égale : low / medium / hight.

In [19]:
# Cut the range of per-station means into 3 equal-width bins, labelled in
# increasing order of activity.
# NOTE(review): the label "hight" (sic) is kept as-is because it is the value
# written to the exported file.
activity_labels = ["low", "medium", "hight"]
profile_station['profile_station_activity'] = pd.cut(
    profile_station['mean'], bins=3, labels=activity_labels
)

In [33]:
# Round the numeric statistics to 2 decimal places
profile_station = profile_station.round(2)

In [34]:
# Show the per-station statistics together with the activity label
profile_station

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
180,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low
159,160,71946,0.0,0.0,0.06,0.0,0.0,0.0,5.0,low
185,250,17835,0.01,0.0,0.08,0.0,0.0,0.0,2.0,low
160,161,71666,0.01,0.0,0.1,0.0,0.0,0.0,6.0,low
179,180,39308,0.01,0.0,0.12,0.0,0.0,0.0,8.0,low
91,92,72857,0.01,0.0,0.12,0.0,0.0,0.0,6.0,low
166,167,70073,0.01,0.0,0.14,0.0,0.0,0.0,9.0,low
182,183,24909,0.01,0.0,0.14,0.0,0.0,0.0,8.0,low
167,168,56094,0.01,0.0,0.13,0.0,0.0,0.0,6.0,low
80,81,72673,0.01,0.0,0.16,0.0,0.0,1.0,8.0,low


In [35]:
# Number of stations in each activity class
profile_station.profile_station_activity.value_counts()

low       131
medium     49
hight       7
Name: profile_station_activity, dtype: int64

In [49]:
# Export: write only the station id and its activity label to the reference file
export_columns = ['station_id', 'profile_station_activity']
profile_station[export_columns].to_csv(
    ROOT_DATA_REF+'station_profile.csv', index=False, encoding='utf-8'
)

## Industrialisation

In [59]:
def create_station_profilage_activity():
    """
    Build the reference file that classifies stations by activity level.

    Reads the activity time series, computes per-station statistics of
    `transactions_all` over the rows where `status == 1`, bins the per-station
    mean into 3 equal-width classes (low / medium / hight) and writes the
    result to `station_profile.csv` in ROOT_DATA_REF.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Examples
    --------

    create_station_profilage_activity()
    """

    # Read the activity file
    ts_activity = read_time_serie_activity()

    # Some features
    ts_activity = get_transactions_in(ts_activity)
    ts_activity = get_transactions_out(ts_activity)
    ts_activity = get_transactions_all(ts_activity)
    ts_activity = get_consecutive_no_transactions_out(ts_activity)

    # Per-station aggregation of the activity.
    # Named aggregation (output column name -> function) is used instead of the
    # dict-renaming form `.agg({'name': func})` on a single selected column:
    # that form raises SpecificationError ("nested renamer is not supported")
    # in pandas >= 1.0.
    activity_stats = {
        'total_point': 'size',
        'mean': 'mean',
        'median': 'median',
        'std': 'std',
        '95%': lambda x: x.quantile(0.95),
        '98%': lambda x: x.quantile(0.98),
        '99%': lambda x: x.quantile(0.99),
        'max': 'max',
    }
    profile_station = (
        ts_activity[ts_activity['status'] == 1]
        .groupby('station_id')['transactions_all']
        .agg(**activity_stats)
        # Bring station_id back as a regular column (same layout as as_index=False)
        .reset_index()
        .sort_values('mean')
    )

    # Classification into 3 equal-width activity classes (low / medium / hight).
    # NOTE(review): the label "hight" (sic) is kept unchanged because it is the
    # value stored in the exported file.
    profile_station['profile_station_activity'] = \
        pd.cut(profile_station['mean'], 3, labels=["low", "medium", "hight"])

    # Export
    profile_station.to_csv(ROOT_DATA_REF+'station_profile.csv',
                           index=False, encoding='utf-8')

In [60]:
def read_station_profile(file_name='station_profile.csv'):
    """
    Load the file that classifies stations by activity and usage frequency.

    The file is looked up in ROOT_DATA_REF.

    Parameters
    ----------
    file_name : str
        Name of the file, appended to ROOT_DATA_REF.

    Returns
    -------
    station_profile : DataFrame
        Content of the comma-separated file.

    Examples
    --------

    station_profile = read_station_profile()
    """
    profile_path = ROOT_DATA_REF+file_name
    return pd.read_csv(profile_path, sep=',')
    

In [97]:
from vcub_keeper.reader.reader import read_station_profile
from vcub_keeper.create.creator import create_station_profilage_activity

In [98]:
# Run the packaged version imported from vcub_keeper.create.creator
# (presumably rewrites station_profile.csv like the prototype of the same name
# defined earlier in this notebook; confirm against the module)
create_station_profilage_activity()

In [99]:
# Load the station profile through the reader imported from vcub_keeper.reader.reader
station_profile = read_station_profile()

In [100]:
# Show the profile as read back from the file
station_profile

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low
1,160,71946,0.002614,0.0,0.0641,0.0,0.0,0.0,5.0,low
2,250,17835,0.006169,0.0,0.079016,0.0,0.0,0.0,2.0,low
3,161,71666,0.006337,0.0,0.096354,0.0,0.0,0.0,6.0,low
4,180,39308,0.007176,0.0,0.115033,0.0,0.0,0.0,8.0,low
5,92,72857,0.008581,0.0,0.118668,0.0,0.0,0.0,6.0,low
6,167,70073,0.009108,0.0,0.138421,0.0,0.0,0.0,9.0,low
7,183,24909,0.011363,0.0,0.137929,0.0,0.0,0.0,8.0,low
8,168,56094,0.011378,0.0,0.125753,0.0,0.0,0.0,6.0,low
9,81,72673,0.014233,0.0,0.157725,0.0,0.0,1.0,8.0,low


### Analyse de l'activité des stations :

In [101]:
# Read the station activity profile
station_profile = read_station_profile()

# Read the station attributes
stations = read_stations_attributes()

In [102]:
# Number of stations in each activity class
station_profile.profile_station_activity.value_counts()

low       131
medium     49
hight       7
Name: profile_station_activity, dtype: int64

In [103]:
# Column filtering: keep station_id (used as the merge key below) plus the
# COMMUNE and NOM columns
stations = stations[['station_id', 'COMMUNE', 'NOM']]

In [104]:
# Left join on station_id: every profiled station is kept, and stations with no
# match in `stations` get missing COMMUNE / NOM values
station_profile = station_profile.merge(stations, on='station_id', how='left')

In [107]:
# Show the profile table with its COMMUNE and NOM columns
station_profile

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity,COMMUNE,NOM
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low,,
1,160,71946,0.002614,0.0,0.0641,0.0,0.0,0.0,5.0,low,SAINT-AUBIN-DE-MEDOC,Eglise St Aubin
2,250,17835,0.006169,0.0,0.079016,0.0,0.0,0.0,2.0,low,,
3,161,71666,0.006337,0.0,0.096354,0.0,0.0,0.0,6.0,low,LE TAILLAN-MEDOC,Le Taillan Mairie
4,180,39308,0.007176,0.0,0.115033,0.0,0.0,0.0,8.0,low,,
5,92,72857,0.008581,0.0,0.118668,0.0,0.0,0.0,6.0,low,SAINT-MEDARD-EN-JALLES,St Médard République
6,167,70073,0.009108,0.0,0.138421,0.0,0.0,0.0,9.0,low,SAINT-MEDARD-EN-JALLES,Centre commercial St Médard
7,183,24909,0.011363,0.0,0.137929,0.0,0.0,0.0,8.0,low,,
8,168,56094,0.011378,0.0,0.125753,0.0,0.0,0.0,6.0,low,BLANQUEFORT,Mairie de Blanquefort
9,81,72673,0.014233,0.0,0.157725,0.0,0.0,1.0,8.0,low,PESSAC,Morin Cazalet


In [106]:
# Box plot of the per-station mean, one box per activity class, with every
# station drawn as an individual point.
hover_columns = ['NOM', 'total_point', 'mean', '95%', 'std', 'COMMUNE']
axis_labels = {
    'profile_station_activity' : 'Profile activité',
    'NOM' : 'Nom',
    'COMMUNE' : 'Commune',
}
fig = px.box(
    station_profile,
    x="profile_station_activity",
    y="mean",
    color='profile_station_activity',
    points='all',
    boxmode='group',
    hover_data=hover_columns,
    labels=axis_labels,
)
fig.show()