In [161]:
import pandas as pd

from vcub_keeper.reader.reader_utils import filter_periode
from vcub_keeper.reader.reader import *
from vcub_keeper.visualisation import plot_station_activity
from vcub_keeper.transform.features_factory import *

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Objectifs :

- Profilage des stations par leurs activités et fréquences
- On exclut la période de confinement et les stations non utilisées par le grand public.
- Export d'un fichier de référence
- [Issue Github](https://github.com/armgilles/vcub_keeper/issues/28)

## Profilage des stations selon leurs activités et fréquence

In [114]:
# Load the activity time series
ts_activity = read_time_serie_activity()

# Add the transaction features, one step after the other
for add_feature in (get_transactions_in,
                    get_transactions_out,
                    get_transactions_all,
                    get_consecutive_no_transactions_out):
    ts_activity = add_feature(ts_activity)

In [121]:
# Apply filter_periode to the activity data. Per this notebook's stated
# objectives, this is meant to exclude the lockdown period and the stations
# not used by the general public (see filter_periode for the exact rules).
ts_activity = filter_periode(ts_activity)

In [122]:
ts_activity.shape

(14952568, 9)

In [124]:
# Per-station statistics of `transactions_all`, computed on rows whose
# `status` equals 1.
# Named aggregation (output_name=function) is used here: renaming the outputs
# through a {output_name: function} dict on a single selected column was
# deprecated and then removed in pandas 1.0 (it raises SpecificationError).
activity_stats = {
    'total_point': 'size',
    'mean': 'mean',
    'median': 'median',
    'std': 'std',
    '95%': lambda x: x.quantile(0.95),
    '98%': lambda x: x.quantile(0.98),
    '99%': lambda x: x.quantile(0.99),
    'max': 'max',
}
profile_station = (
    ts_activity[ts_activity['status'] == 1]
    .groupby('station_id')['transactions_all']
    .agg(**activity_stats)
    .reset_index()          # keep `station_id` as a regular column
    .sort_values('mean')
)

On classe les stations suivant leur moyenne d'activité (basée sur une fréquence de 10 minutes)

In [125]:
# Split the range of station means into 3 equal-width bins.
# The top label is spelled "high" so it matches the labels written by
# create_station_profilage_activity().
profile_station['profile_station_activity'] = \
    pd.cut(profile_station['mean'], 3, labels=["low", "medium", "high"])

In [126]:
profile_station = profile_station.round(2)

In [127]:
profile_station

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
180,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low
159,160,64496,0.0,0.0,0.07,0.0,0.0,0.0,5.0,low
160,161,64028,0.01,0.0,0.1,0.0,0.0,0.0,6.0,low
179,180,31672,0.01,0.0,0.12,0.0,0.0,0.0,8.0,low
91,92,65219,0.01,0.0,0.12,0.0,0.0,0.0,6.0,low
166,167,62437,0.01,0.0,0.14,0.0,0.0,0.0,9.0,low
167,168,50066,0.01,0.0,0.13,0.0,0.0,1.0,6.0,low
182,183,17277,0.01,0.0,0.14,0.0,0.0,1.0,5.0,low
80,81,65125,0.02,0.0,0.16,0.0,0.0,1.0,8.0,low
149,150,64702,0.02,0.0,0.16,0.0,0.0,1.0,9.0,low


In [128]:
profile_station.profile_station_activity.value_counts()

low       128
medium     50
hight       6
Name: profile_station_activity, dtype: int64

In [36]:
# Box plot of the mean activity of the stations, one box per activity profile
box_options = {
    'x': "profile_station_activity",
    'y': "mean",
    'points': 'all',
    'color': 'profile_station_activity',
    'hover_data': ['total_point', 'mean', '95%', 'std'],
    'labels': {'profile_station_activity' : 'Profile activité'},
    'boxmode': 'group',
}
fig = px.box(profile_station, **box_options)
fig.show()

In [49]:
# Export the (station id, activity profile) pairs as the reference file
profile_export = profile_station[['station_id', 'profile_station_activity']]
profile_export.to_csv(ROOT_DATA_REF+'station_profile.csv',
                      index=False, encoding='utf-8')

## Industrialisation

In [130]:
def create_station_profilage_activity():
    """
    Build the reference file that classifies stations by activity level.

    The activity time series is read, enriched with the transaction features,
    filtered with reader_utils.filter_periode(), then aggregated per station
    (rows with `status` == 1 only) on `transactions_all`. Stations are binned
    on their mean activity into 3 equal-width classes (low / medium / high).
    The result is written to `station_profile.csv` in ROOT_DATA_REF.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Examples
    --------

    create_station_profilage_activity()
    """

    # Read the activity file
    ts_activity = read_time_serie_activity()

    # Some features
    ts_activity = get_transactions_in(ts_activity)
    ts_activity = get_transactions_out(ts_activity)
    ts_activity = get_transactions_all(ts_activity)
    ts_activity = get_consecutive_no_transactions_out(ts_activity)

    # Keep only the periods / stations retained by filter_periode()
    ts_activity = filter_periode(ts_activity)

    # Per-station aggregation of the activity.
    # Named aggregation (output_name=function) replaces the former
    # {output_name: function} dict, whose renaming behaviour was removed in
    # pandas 1.0 (it raises SpecificationError there).
    activity_stats = {
        'total_point': 'size',
        'mean': 'mean',
        'median': 'median',
        'std': 'std',
        '95%': lambda x: x.quantile(0.95),
        '98%': lambda x: x.quantile(0.98),
        '99%': lambda x: x.quantile(0.99),
        'max': 'max',
    }
    profile_station = (
        ts_activity[ts_activity['status'] == 1]
        .groupby('station_id')['transactions_all']
        .agg(**activity_stats)
        .reset_index()          # keep `station_id` as a regular column
        .sort_values('mean')
    )

    # Classification into 3 activity levels (low / medium / high):
    # an integer `bins` makes pd.cut use equal-width bins over the mean range
    profile_station['profile_station_activity'] = \
        pd.cut(profile_station['mean'], 3, labels=["low", "medium", "high"])

    # Export
    profile_station.to_csv(ROOT_DATA_REF+'station_profile.csv',
                           index=False, encoding='utf-8')

In [131]:
def read_station_profile(file_name='station_profile.csv'):
    """
    Load the reference file that classifies stations by their activity and
    frequency of use.
    The file is looked up inside ROOT_DATA_REF.

    Parameters
    ----------
    file_name : str
        Name of the CSV file to read

    Returns
    -------
    station_profile : DataFrame
        Content of the CSV file

    Examples
    --------

    station_profile = read_station_profile()
    """
    profile_path = ROOT_DATA_REF+file_name
    return pd.read_csv(profile_path, sep=',')
    

In [162]:
# NOTE: these imports rebind `read_station_profile` and
# `create_station_profilage_activity`, so from here on the packaged versions
# are used instead of the notebook-level definitions written just above.
from vcub_keeper.reader.reader import read_station_profile
from vcub_keeper.create.creator import create_station_profilage_activity

In [156]:
create_station_profilage_activity()

In [157]:
station_profile = read_station_profile()

In [158]:
station_profile

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low
1,160,64496,0.002807,0.0,0.066428,0.0,0.0,0.0,5.0,low
2,161,64028,0.006937,0.0,0.100847,0.0,0.0,0.0,6.0,low
3,180,31672,0.007359,0.0,0.12123,0.0,0.0,0.0,8.0,low
4,92,65219,0.008589,0.0,0.118351,0.0,0.0,0.0,6.0,low
5,167,62437,0.009597,0.0,0.139767,0.0,0.0,0.0,9.0,low
6,168,50066,0.012568,0.0,0.132083,0.0,0.0,1.0,6.0,low
7,183,17277,0.012852,0.0,0.136782,0.0,0.0,1.0,5.0,low
8,81,65125,0.015591,0.0,0.163543,0.0,0.0,1.0,8.0,low
9,150,64702,0.015693,0.0,0.155258,0.0,0.0,1.0,9.0,low


### Analyse de l'activité des stations :

In [163]:
# Lecture du profile activité des stations
station_profile = read_station_profile()

# lecture des attributs des stations
stations = read_stations_attributes()

In [164]:
station_profile.profile_station_activity.value_counts()

low       128
medium     50
high        6
Name: profile_station_activity, dtype: int64

In [165]:
# Keep only the columns needed for the merge and the plot hover data
stations = stations[['station_id', 'COMMUNE', 'NOM']]

In [166]:
station_profile = station_profile.merge(stations, on='station_id', how='left')

In [167]:
station_profile

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity,COMMUNE,NOM
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low,,
1,160,64496,0.002807,0.0,0.066428,0.0,0.0,0.0,5.0,low,SAINT-AUBIN-DE-MEDOC,Eglise St Aubin
2,161,64028,0.006937,0.0,0.100847,0.0,0.0,0.0,6.0,low,LE TAILLAN-MEDOC,Le Taillan Mairie
3,180,31672,0.007359,0.0,0.12123,0.0,0.0,0.0,8.0,low,,
4,92,65219,0.008589,0.0,0.118351,0.0,0.0,0.0,6.0,low,SAINT-MEDARD-EN-JALLES,St Médard République
5,167,62437,0.009597,0.0,0.139767,0.0,0.0,0.0,9.0,low,SAINT-MEDARD-EN-JALLES,Centre commercial St Médard
6,168,50066,0.012568,0.0,0.132083,0.0,0.0,1.0,6.0,low,BLANQUEFORT,Mairie de Blanquefort
7,183,17277,0.012852,0.0,0.136782,0.0,0.0,1.0,5.0,low,,
8,81,65125,0.015591,0.0,0.163543,0.0,0.0,1.0,8.0,low,PESSAC,Morin Cazalet
9,150,64702,0.015693,0.0,0.155258,0.0,0.0,1.0,9.0,low,ARTIGUES-PRES-BORDEAUX,Artigues Feydeau


In [168]:
station_profile[station_profile.station_id >= 175].sort_values('station_id')

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity,COMMUNE,NOM
64,175,64701,0.144125,0.0,0.485897,1.0,1.0,2.0,14.0,low,,
82,176,59468,0.205985,0.0,0.583721,1.0,2.0,2.0,15.0,low,,
38,177,32005,0.069142,0.0,0.341783,1.0,1.0,1.0,11.0,low,,
65,178,58755,0.162384,0.0,0.537831,1.0,2.0,2.0,14.0,low,,
93,179,59095,0.234955,0.0,0.620703,1.0,2.0,2.0,14.0,low,,
3,180,31672,0.007359,0.0,0.12123,0.0,0.0,0.0,8.0,low,,
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low,,
18,182,17182,0.028234,0.0,0.234343,0.0,1.0,1.0,10.0,low,,
7,183,17277,0.012852,0.0,0.136782,0.0,0.0,1.0,5.0,low,,
60,251,10033,0.132715,0.0,0.563336,1.0,2.0,2.0,15.0,low,,


In [169]:
station_profile = station_profile.round(2)

In [170]:
# Box plot of the mean activity of the stations, one box per activity profile,
# with the station name and commune available on hover
hover_columns = ['station_id','NOM', 'total_point', 'mean', '95%', 'std', 'COMMUNE']
axis_labels = {'profile_station_activity' : 'Profile activité',
               'NOM' : 'Nom',
               'COMMUNE' : 'Commune'}
fig = px.box(station_profile,
             x="profile_station_activity",
             y="mean",
             points='all',
             color='profile_station_activity',
             hover_data=hover_columns,
             labels=axis_labels,
             boxmode='group')
fig.show()

In [154]:
station_profile

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity,COMMUNE,NOM
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low,,
1,160,64496,0.0,0.0,0.07,0.0,0.0,0.0,5.0,low,SAINT-AUBIN-DE-MEDOC,Eglise St Aubin
2,161,64028,0.01,0.0,0.1,0.0,0.0,0.0,6.0,low,LE TAILLAN-MEDOC,Le Taillan Mairie
3,180,31672,0.01,0.0,0.12,0.0,0.0,0.0,8.0,low,,
4,92,65219,0.01,0.0,0.12,0.0,0.0,0.0,6.0,low,SAINT-MEDARD-EN-JALLES,St Médard République
5,167,62437,0.01,0.0,0.14,0.0,0.0,0.0,9.0,low,SAINT-MEDARD-EN-JALLES,Centre commercial St Médard
6,168,50066,0.01,0.0,0.13,0.0,0.0,1.0,6.0,low,BLANQUEFORT,Mairie de Blanquefort
7,183,17277,0.01,0.0,0.14,0.0,0.0,1.0,5.0,low,,
8,81,65125,0.02,0.0,0.16,0.0,0.0,1.0,8.0,low,PESSAC,Morin Cazalet
9,150,64702,0.02,0.0,0.16,0.0,0.0,1.0,9.0,low,ARTIGUES-PRES-BORDEAUX,Artigues Feydeau


In [152]:
# Plot `transactions_all` for a single station
station_id = 174
plot_options = {
    'features_to_plot': ['transactions_all'],
    'return_data': False,
}
plot_station_activity(ts_activity, station_id=station_id, **plot_options)