In [1]:
from vcub_keeper.production.data import (get_data_from_api_by_station,
                                         transform_json_station_data_to_df)
from vcub_keeper.visualisation import *
from vcub_keeper.ml.cluster import train_cluster_station, predict_anomalies_station
from vcub_keeper.transform.features_factory import *
from vcub_keeper.ml.cluster_utils import load_model

# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Raise the number of rows pandas prints before truncating a displayed frame.
# NOTE(review): `pd` is never imported explicitly in this cell; presumably it
# leaks in through one of the star imports above - confirm.
pd.options.display.max_rows = 500

## Objectifs

- Faire des prédictions d'anomalies sur une station via les données de production

## Prédiction sur une station

In [2]:
# Station under study and the date window requested from the production API.
station_id = 59
start_date, stop_date = '2020-10-15', '2020-10-23'

In [3]:
# Download the raw JSON payload for the station over the requested window.
station_json = get_data_from_api_by_station(
    station_id=station_id,
    start_date=start_date,
    stop_date=stop_date,
)

# Convert the JSON payload into a DataFrame for the feature / model steps.
station_df = transform_json_station_data_to_df(station_json)

In [4]:
# Show which fields the first record of the payload's 'data' list carries.
station_json['data'][0].keys()

dict_keys(['available_bikes', 'available_stands', 'id', 'name', 'nb_stands', 'status', 'ts'])

In [5]:
# Run the project's feature helper on the station frame; the displayed tail
# below shows a `consecutive_no_transactions_out` column afterwards.
station_df = get_consecutive_no_transactions_out(station_df) 

In [6]:
# True when the most recent row has status == 0 (the all-stations loop at the
# end of the notebook reports that case as "inactive").
(station_df.tail(1)['status'] == 0).any()

False

In [7]:
# Display the last five rows of the station frame.
station_df.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
1063,59,2020-10-22 09:20:00,8.0,9.0,1.0,0.0,0.0,0.0,57
1064,59,2020-10-22 09:30:00,7.0,9.0,1.0,0.0,0.0,0.0,58
1065,59,2020-10-22 09:40:00,6.0,10.0,1.0,1.0,0.0,1.0,59
1066,59,2020-10-22 09:50:00,6.0,10.0,1.0,0.0,0.0,0.0,60
1067,59,2020-10-22 10:00:00,6.0,10.0,1.0,0.0,0.0,0.0,61


In [8]:
# Load the estimator associated with this station.
# NOTE(review): presumably a model fitted and persisted by a prior training run - confirm.
clf = load_model(station_id=station_id)

In [9]:
# Predict anomalies for a single station; the displayed tail of `station_pred`
# shows an `anomaly` column taking the values 1 and -1.
station_pred = predict_anomalies_station(data=station_df, clf=clf, station_id=station_id)

In [11]:
# Plot availability and predicted anomalies for the station.
# NOTE(review): executed as In [11], i.e. before the notebook-local
# `plot_station_anomalies` was defined (In [22]); presumably this used the
# version brought in by the `vcub_keeper.visualisation` star import - confirm.
plot_station_anomalies(data=station_df, clf=clf, station_id=station_id)

In [23]:
# Same plot call again.
# NOTE(review): executed as In [23], right after the notebook-local
# `plot_station_anomalies` (In [22], defined further down) - on a fresh
# top-to-bottom run this cell would not see that local definition yet.
plot_station_anomalies(data=station_df, clf=clf, station_id=station_id)

In [12]:
# Display the last five predicted rows, including the `anomaly` column.
station_pred.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,weekday,hours,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
1063,59,2020-10-22 09:20:00,8.0,9.0,1.0,0.0,0.0,0.0,57,3,9,0.433884,-0.900969,0.707107,-0.707107,1
1064,59,2020-10-22 09:30:00,7.0,9.0,1.0,0.0,0.0,0.0,58,3,9,0.433884,-0.900969,0.707107,-0.707107,1
1065,59,2020-10-22 09:40:00,6.0,10.0,1.0,1.0,0.0,1.0,59,3,9,0.433884,-0.900969,0.707107,-0.707107,1
1066,59,2020-10-22 09:50:00,6.0,10.0,1.0,0.0,0.0,0.0,60,3,9,0.433884,-0.900969,0.707107,-0.707107,1
1067,59,2020-10-22 10:00:00,6.0,10.0,1.0,0.0,0.0,0.0,61,3,10,0.433884,-0.900969,0.5,-0.866025,-1


In [20]:
# Bucket every observation by calendar hour ('YYYY-MM-DD HH'). Hourly rather
# than daily buckets are used so that each anomalous hour gets its own span.
station_pred['date_day'] = station_pred['date'].dt.strftime('%Y-%m-%d %H')

# First and last anomalous timestamp (anomaly == -1) inside each hourly bucket.
# A list of aggregations is used instead of the {'min': 'min', 'max': 'max'}
# dict: the dict-renaming form on a SeriesGroupBy was removed in pandas 1.0
# and raises SpecificationError there. `reset_index()` keeps `date_day` as a
# regular column, so the result still has the columns date_day / min / max.
grp = (
    station_pred[station_pred['anomaly'] == -1]
    .groupby('date_day')['date']
    .agg(['min', 'max'])
    .reset_index()
)


In [22]:
def plot_station_anomalies(data, clf, station_id,
                   start_date='',
                   end_date='',
                   return_data=False,
                   offline_plot=False):
    """
    Plot a station's available bikes over time and highlight predicted anomalies.

    The figure contains three traces (available bikes, consecutive periods
    without any bike taken on a hidden secondary axis, and the available-bikes
    curve restricted to anomalous points in red) plus one red rectangle per
    clock hour that contains at least one anomalous point.

    Parameters
    ----------
    data : pd.DataFrame
        Time series of Vcub station activity. Must contain at least the
        columns 'station_id', 'date' and 'available_bikes'.
    clf : scikit-learn Pipeline
        Estimator that has already been fitted.
    station_id : int
        ID of the station to plot.
    start_date : str [opt]
        Lower bound (inclusive) of the plotted period, yyyy-mm-dd.
        The empty string disables the filter.
    end_date : str [opt]
        Upper bound (inclusive) of the plotted period, yyyy-mm-dd.
        The empty string disables the filter.
    return_data : bool [opt]
        When True, return the predicted DataFrame restricted to the station
        and to the date bounds (if any).
    offline_plot : bool [opt]
        When False (default) the figure is rendered with `iplot`; otherwise it
        is exported with `offline.plot`.

    Returns
    -------
    data : pd.DataFrame
        Only returned when return_data is True. It is the prediction frame
        with an extra 'ano_hover_text' column (available bikes on anomalous
        rows, NaN elsewhere).

    Examples
    --------

    plot_station_anomalies(data=ts_activity, clf=clf, station_id=22)
    """

    # Keep only the requested station; copy so the caller's frame is untouched.
    data_station = data[data['station_id'] == station_id].copy()

    # Build the transaction features only when the caller has not done so yet.
    if 'consecutive_no_transactions_out' not in data_station.columns:
        data_station = get_transactions_in(data_station)
        data_station = get_transactions_out(data_station)
        data_station = get_transactions_all(data_station)
        data_station = get_consecutive_no_transactions_out(data_station)

    data_pred = predict_anomalies_station(data=data_station,
                                          clf=clf,
                                          station_id=station_id)

    # Optional date window (both bounds inclusive).
    if start_date != '':
        data_pred = data_pred[data_pred['date'] >= start_date]

    if end_date != '':
        data_pred = data_pred[data_pred['date'] <= end_date]

    # Rows flagged as anomalous by the model (-1 marks an anomaly).
    is_anomaly = data_pred['anomaly'] == -1

    # Available bikes on anomalous rows, NaN elsewhere, so that the red trace
    # is only drawn over anomalies. `assign` returns a new frame (no write on
    # a filtered slice, hence no SettingWithCopyWarning) and `Series.where`
    # avoids the `np.NaN` alias, which no longer exists in NumPy >= 2.0.
    data_pred = data_pred.assign(
        ano_hover_text=data_pred['available_bikes'].where(is_anomaly))

    # Axis 1: available bikes.
    trace = go.Scatter(x=data_pred['date'],
                       y=data_pred['available_bikes'],
                       mode='lines',
                       line={'width': 2},
                       name="Vélo disponible")

    # Axis 2 (hidden in the layout): consecutive periods without a bike taken.
    trace_ano = go.Scatter(x=data_pred['date'],
                           y=data_pred['consecutive_no_transactions_out'],
                           mode='lines',
                           line={'width': 1,
                                 'dash': 'dot',
                                 'color': 'rgba(189,189,189,1)'},
                           yaxis='y2',
                           name='Absence consécutive de prise de vélo')

    # Red overlay carrying the hover data of anomalous points.
    trace_ano2 = go.Scatter(x=data_pred['date'],
                            y=data_pred['ano_hover_text'],
                            mode='lines',
                            text='x',
                            connectgaps=False,
                            line={'width': 2,
                                  'color': 'red'},
                            name='anomaly')

    data_graph = [trace, trace_ano, trace_ano2]

    # One rectangle per clock hour containing anomalies, spanning from the
    # first to the last anomalous timestamp of that hour. The hourly key lives
    # in a scratch frame, so nothing temporary is added to `data_pred`.
    # A list of aggregations replaces the {'min': 'min', 'max': 'max'} dict,
    # which pandas >= 1.0 rejects with SpecificationError.
    anomalies = data_pred.loc[is_anomaly, ['date']].assign(
        date_hour=lambda frame: frame['date'].dt.strftime('%Y-%m-%d %H'))
    anomaly_spans = anomalies.groupby('date_hour')['date'].agg(['min', 'max'])

    # Rectangles go from 0 up to the highest bike count of the plotted period.
    max_value = data_pred['available_bikes'].max()
    shapes = [dict(type="rect",
                   xref="x",
                   yref="y",
                   x0=span_start,
                   y0=0,
                   x1=span_end,
                   y1=max_value,
                   fillcolor="red",
                   opacity=0.7,
                   layer="below",
                   line_width=0)
              for span_start, span_end in zip(anomaly_spans['min'],
                                              anomaly_spans['max'])]

    # Figure layout.
    layout = dict(
        title="Détection d'anomalies sur la stations N° " + str(station_id),
        showlegend=True,
        legend=dict(orientation="h",
                    yanchor="top",
                    xanchor="center",
                    y=1.1,
                    x=0.5
                   ),
        xaxis=dict(
                rangeslider=dict(
                    visible=True
                ),
                type='date',
                tickformat='%a %Y-%m-%d %H:%M',
        ),
        yaxis=dict(
            title='Valeurs'
        ),
        yaxis2={'overlaying': 'y',
                'side': 'right',
                'visible': False},
        template='plotly_white',
        hovermode='x',
        shapes=shapes
    )

    fig = dict(data=data_graph, layout=layout)
    if offline_plot is False:
        iplot(fig)
    else:
        offline.plot(fig)

    if return_data is True:
        return data_pred

In [24]:
# Display the per-hour anomaly spans (columns: date_day, min, max).
grp

Unnamed: 0,date_day,min,max
0,2020-10-14 06,2020-10-14 06:40:00,2020-10-14 06:50:00
1,2020-10-14 07,2020-10-14 07:00:00,2020-10-14 07:50:00
2,2020-10-14 08,2020-10-14 08:00:00,2020-10-14 08:20:00
3,2020-10-15 06,2020-10-15 06:40:00,2020-10-15 06:50:00
4,2020-10-20 08,2020-10-20 08:10:00,2020-10-20 08:20:00
5,2020-10-20 13,2020-10-20 13:40:00,2020-10-20 13:50:00
6,2020-10-20 14,2020-10-20 14:00:00,2020-10-20 14:10:00


In [18]:
# Formatting experiment: lowercase `%h` renders the abbreviated month name
# ("Oct" in the output below), not the hour - the hour directive is `%H`.
station_pred['date'].dt.strftime('%Y-%m-%d %h')

0       2020-10-13 Oct
1       2020-10-13 Oct
2       2020-10-13 Oct
3       2020-10-13 Oct
4       2020-10-13 Oct
             ...      
1107    2020-10-20 Oct
1108    2020-10-20 Oct
1109    2020-10-20 Oct
1110    2020-10-20 Oct
1111    2020-10-20 Oct
Name: date, Length: 1112, dtype: object

In [19]:
# Hourly bucket key: date followed by the two-digit hour, e.g. "2020-10-20 17".
station_pred['date'].dt.strftime('%Y-%m-%d %H')

0       2020-10-13 00
1       2020-10-13 00
2       2020-10-13 00
3       2020-10-13 00
4       2020-10-13 00
            ...      
1107    2020-10-20 16
1108    2020-10-20 16
1109    2020-10-20 17
1110    2020-10-20 17
1111    2020-10-20 17
Name: date, Length: 1112, dtype: object

In [110]:
from vcub_keeper.reader.reader import *

# Activity threshold tried out locally in this cell.
THRESHOLD_PROFILE_STATION = 0.3

# Load the per-station activity profile and show the row(s) matching the
# current `station_id`.
station_profile = read_station_profile()
station_profile.loc[station_profile['station_id'] == station_id]

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
177,174,62372,0.67,0.0,1.27,3.0,5.0,6.0,43.0,hight


In [111]:
# Display the station profile table (pandas truncates after the configured
# display.max_rows).
station_profile

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low
1,160,64496,0.0,0.0,0.07,0.0,0.0,0.0,5.0,low
2,161,64028,0.01,0.0,0.1,0.0,0.0,0.0,6.0,low
3,180,31672,0.01,0.0,0.12,0.0,0.0,0.0,8.0,low
4,92,65219,0.01,0.0,0.12,0.0,0.0,0.0,6.0,low
5,167,62437,0.01,0.0,0.14,0.0,0.0,0.0,9.0,low
6,168,50066,0.01,0.0,0.13,0.0,0.0,1.0,6.0,low
7,183,17277,0.01,0.0,0.14,0.0,0.0,1.0,5.0,low
8,81,65125,0.02,0.0,0.16,0.0,0.0,1.0,8.0,low
9,150,64702,0.02,0.0,0.16,0.0,0.0,1.0,9.0,low


## Prédiction sur toutes les stations

In [100]:
# Date window requested from the API for every station.
start_date, stop_date = '2020-10-15', '2020-10-23'

In [101]:
from vcub_keeper.reader.reader import *
from vcub_keeper.config import THRESHOLD_PROFILE_STATION

# Per-station activity profile (one row per station in the output shown above).
station_profile = read_station_profile()

# Keep the stations whose 'mean' value reaches the configured threshold.
stations_id_to_pred = (
    station_profile
    .loc[station_profile['mean'] >= THRESHOLD_PROFILE_STATION, 'station_id']
    .unique()
)

In [102]:
# Number of most recent observations inspected for anomalies.
# NOTE(review): rows appear to be 10 minutes apart in the outputs above, so
# 10 rows would cover roughly the last 100 minutes - confirm.
N_RECENT_OBSERVATIONS = 10

for station_id in stations_id_to_pred:
    # Download the raw payload for this station over the requested window.
    station_json = get_data_from_api_by_station(station_id=station_id,
                                                start_date=start_date,
                                                stop_date=stop_date)

    # Convert the payload to a DataFrame and add the feature used by the model.
    station_df = transform_json_station_data_to_df(station_json)
    station_df = get_consecutive_no_transactions_out(station_df)

    # Load the estimator associated with this station.
    clf = load_model(station_id=station_id)

    # Predict anomalies for this station.
    station_pred = predict_anomalies_station(data=station_df, clf=clf, station_id=station_id)

    # Report one status line per station. A recent anomaly takes precedence
    # over the "inactive" status (last row with status == 0).
    if (station_pred.tail(N_RECENT_OBSERVATIONS)['anomaly'] == -1).any():
        print('Recent Anomaly detected for station N° ' + str(station_id))
    elif (station_df.tail(1)['status'] == 0).any():
        print('Station N° ' + str(station_id) + " is inactive")
    else:
        print('Clean for station N° ' + str(station_id))

Clean for station N°124
Clean for station N°118
Clean for station N°111
Clean for station N°10
Clean for station N°62
Recent Anomly detected for station N° 66
Clean for station N°105
Clean for station N°130
Recent Anomly detected for station N° 112
Clean for station N°2
Clean for station N°68
Clean for station N°23
Clean for station N°11
Clean for station N°60
Clean for station N°120
Clean for station N°9
Clean for station N°3
Clean for station N°45
Clean for station N°136
Clean for station N°24
Station N° 138 is inactive
Clean for station N°21
Clean for station N°108
Clean for station N°16
Clean for station N°131
Clean for station N°36
Clean for station N°20
Clean for station N°110
Clean for station N°139
Clean for station N°6
Clean for station N°42
Clean for station N°125
Clean for station N°19
Clean for station N°135
Recent Anomly detected for station N° 59
Clean for station N°41
Clean for station N°28
Clean for station N°7
Clean for station N°109
Clean for station N°4
Clean for sta

In [103]:
# Number of stations that passed the activity threshold.
len(stations_id_to_pred)

69