In [14]:
from datetime import date, timedelta

import pandas as pd

from vcub_keeper.config import ROOT_DATA_REF, ROOT_MODEL, THRESHOLD_PROFILE_STATION
from vcub_keeper.ml.cluster import predict_anomalies_station
from vcub_keeper.ml.cluster_utils import load_model
from vcub_keeper.production.data import (
    get_data_from_api_bdx_by_station,
    get_data_from_api_by_station,
    transform_json_api_bdx_station_data_to_df,
    transform_json_station_data_to_df,
)
from vcub_keeper.reader.reader import read_station_profile
from vcub_keeper.transform.features_factory import get_consecutive_no_transactions_out
from vcub_keeper.visualisation import plot_station_anomalies, plot_station_anomalies_with_score

%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Objectifs

- Faire des prédictions d'anomalies sur une station via les données de production

## Prédiction sur une station

In [56]:
# Station and date window used by the single-station analysis.
station_id = 106
start_date = "2024-05-01"  # alternative start kept for reference: "2020-09-01"
stop_date = "2024-07-31"

In [57]:
# Oslandia API: query the station over the date window, then turn the JSON
# payload into a DataFrame.
station_json = get_data_from_api_by_station(
    station_id=station_id,
    start_date=start_date,
    stop_date=stop_date,
)
station_df = transform_json_station_data_to_df(station_json)

In [58]:
# Display the frame built from the Oslandia API response.
station_df

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
0,106,2024-05-01 00:10:00,22.0,25.0,1.0,0.0,0.0,0.0
1,106,2024-05-01 00:20:00,22.0,25.0,1.0,0.0,0.0,0.0
2,106,2024-05-01 00:30:00,22.0,25.0,1.0,0.0,0.0,0.0
3,106,2024-05-01 00:40:00,23.0,24.0,1.0,0.0,1.0,1.0
4,106,2024-05-01 00:50:00,23.0,24.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
13099,106,2024-07-30 23:20:00,12.0,19.0,0.0,0.0,0.0,0.0
13100,106,2024-07-30 23:30:00,12.0,19.0,0.0,0.0,0.0,0.0
13101,106,2024-07-30 23:40:00,12.0,19.0,0.0,0.0,0.0,0.0
13102,106,2024-07-30 23:50:00,12.0,19.0,0.0,0.0,0.0,0.0


In [59]:
# Open Data Bordeaux API: same date window, several stations in one call.
# The station under study is passed through `station_id` rather than a repeated
# literal, so the prediction cells that filter on `station_id` still find its
# rows if the parameter is changed.
# NOTE: this rebinds `station_json` / `station_df` from the Oslandia cell.
station_json = get_data_from_api_bdx_by_station(
    station_id=[station_id, 15, 60],
    start_date=start_date,
    stop_date=stop_date,
)
station_df = transform_json_api_bdx_station_data_to_df(station_json)

In [60]:
# Display the frame built from the Open Data Bordeaux API response.
station_df

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
0,15,2024-05-01 02:10:00+02:00,1.0,0.0,0.0,0.0,0.0,0.0
1,15,2024-05-01 02:20:00+02:00,1.0,0.0,0.0,0.0,0.0,0.0
2,15,2024-05-01 02:30:00+02:00,1.0,0.0,0.0,0.0,0.0,0.0
3,15,2024-05-01 02:40:00+02:00,1.0,0.0,0.0,0.0,0.0,0.0
4,15,2024-05-01 02:50:00+02:00,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
39307,106,2024-07-31 01:20:00+02:00,12.0,19.0,0.0,0.0,0.0,0.0
39308,106,2024-07-31 01:30:00+02:00,12.0,19.0,0.0,0.0,0.0,0.0
39309,106,2024-07-31 01:40:00+02:00,12.0,19.0,0.0,0.0,0.0,0.0
39310,106,2024-07-31 01:50:00+02:00,12.0,19.0,0.0,0.0,0.0,0.0


In [61]:
# Station ids present in the current `station_df`.
station_df["station_id"].unique()

array([ 15,  60, 106])

In [62]:
# Run the frame through the feature helper and rebind `station_df` to its result.
# Presumably this adds the `consecutive_no_transactions_out` column — TODO confirm.
station_df = get_consecutive_no_transactions_out(station_df)

In [63]:
# Load the model (`clf`) saved for `station_id` from the ROOT_MODEL directory.
clf = load_model(station_id=station_id, path_directory=ROOT_MODEL)

In [64]:
# Predict anomalies for a single station.
# The returned frame carries an `anomaly` column that later cells filter on (-1 / 1).
station_pred = predict_anomalies_station(data=station_df, clf=clf, station_id=station_id)

In [65]:
# First rows of the prediction frame for the station under study.
station_pred.head()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
26208,106,2024-05-01 02:10:00+02:00,25.0,23.0,1.0,0.0,1.0,0.0,0,2,2,2,1.224647e-16,-1.0,0.974928,-0.222521,0.5,0.866025,1
26209,106,2024-05-01 02:20:00+02:00,25.0,22.0,1.0,0.0,0.0,1.0,1,2,2,2,1.224647e-16,-1.0,0.974928,-0.222521,0.5,0.866025,1
26210,106,2024-05-01 02:30:00+02:00,26.0,22.0,1.0,0.0,1.0,0.0,0,2,2,2,1.224647e-16,-1.0,0.974928,-0.222521,0.5,0.866025,1
26211,106,2024-05-01 02:40:00+02:00,25.0,22.0,1.0,0.0,0.0,0.0,1,2,2,2,1.224647e-16,-1.0,0.974928,-0.222521,0.5,0.866025,1
26212,106,2024-05-01 02:50:00+02:00,25.0,22.0,1.0,0.0,0.0,0.0,2,2,2,2,1.224647e-16,-1.0,0.974928,-0.222521,0.5,0.866025,1


In [66]:
# Number of rows labelled -1 in `anomaly`, per quarter.
(
    station_pred.loc[station_pred["anomaly"] == -1]
    .groupby("quarter")["anomaly"]
    .count()
)

quarter
2    124
3     15
Name: anomaly, dtype: int64

In [None]:
# Plot the anomalies of `station_id`, using `station_df` and the loaded model.
plot_station_anomalies(data=station_df, clf=clf, station_id=station_id)

In [68]:
# Same station and model, using the plotting helper variant that includes the score.
plot_station_anomalies_with_score(data=station_df, clf=clf, station_id=station_id)

In [71]:
# How often each `status` value occurs for the station under study.
station_df.loc[station_df["station_id"] == station_id, "status"].value_counts()

status
1.0    9204
0.0    3900
Name: count, dtype: int64

In [72]:
# Read the per-station profile table from the reference data directory.
# (ROOT_DATA_REF and read_station_profile are imported in the notebook's
# import cell.)
station_profile = read_station_profile(path_directory=ROOT_DATA_REF)

# Profile row of the station under study.
station_profile.loc[station_profile["station_id"] == station_id]

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
182,106,64812,0.310639,0.0,0.462759,1.0,1.0,1.0,1.0,very high


In [73]:
# Display the station profile table.
station_profile

Unnamed: 0,station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
0,181,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,low
1,160,20033,0.003648,0.0,0.060293,0.0,0.0,0.0,1.0,low
2,161,26966,0.006755,0.0,0.081914,0.0,0.0,0.0,1.0,low
3,180,10663,0.006948,0.0,0.083067,0.0,0.0,0.0,1.0,low
4,167,31011,0.007261,0.0,0.084901,0.0,0.0,0.0,1.0,low
5,92,29083,0.007536,0.0,0.086485,0.0,0.0,0.0,1.0,low
6,81,42080,0.009654,0.0,0.097779,0.0,0.0,0.0,1.0,low
7,183,9627,0.01008,0.0,0.099897,0.0,0.0,0.78,1.0,low
8,72,54201,0.010429,0.0,0.101588,0.0,0.0,1.0,1.0,low
9,150,47232,0.011015,0.0,0.104373,0.0,0.0,1.0,1.0,low


## Prédiction sur toutes les stations

In [75]:
# Prediction window: from 8 days before today up to tomorrow, as YYYY-MM-DD
# strings for the API call.
# The intermediate dates are not stored under `start_date`, which already holds
# the string parameter of the single-station section; rebinding it to a `date`
# object would break those cells on re-run.
date_today = date.today()
start_date_str = (date_today - timedelta(days=8)).strftime("%Y-%m-%d")
end_date_str = (date_today + timedelta(days=1)).strftime("%Y-%m-%d")

# Fixed window kept for reference:
# start_date_str = "2020-10-16"
# end_date_str = "2020-10-24"

In [76]:
# Re-read the profiles so this section does not rely on the cell that first
# loaded them in the single-station section.
station_profile = read_station_profile(path_directory=ROOT_DATA_REF)

# Keep the stations whose `mean` profile value reaches the configured
# threshold; predictions are run for these stations only.
# To experiment with another cut-off, rebind THRESHOLD_PROFILE_STATION before
# this statement (e.g. 0.3).
stations_id_to_pred = station_profile.loc[
    station_profile["mean"] >= THRESHOLD_PROFILE_STATION, "station_id"
].unique()

In [77]:
# Fetch the recent data of every selected station in one call, convert it to a
# DataFrame and run it through the consecutive-no-transactions feature helper.
station_json = get_data_from_api_by_station(
    station_id=stations_id_to_pred,
    start_date=start_date_str,
    stop_date=end_date_str,
)
station_df = transform_json_station_data_to_df(station_json)
station_df = get_consecutive_no_transactions_out(station_df)

# NOTE: `station_id`, `clf` and `station_pred` are rebound on every iteration,
# so after the loop they refer to the last station processed.
for station_id in stations_id_to_pred:
    # Load the model saved for this station.
    clf = load_model(station_id=station_id, path_directory=ROOT_MODEL)

    # Predict anomalies for this station only.
    station_pred = predict_anomalies_station(data=station_df, clf=clf, station_id=station_id)

    # The checks below inspect the last rows of `station_pred`; this assumes
    # the rows are in chronological order — TODO confirm.
    if len(station_pred) == 0:
        print(f"No data for station N°{station_id}")
    elif (station_pred.tail(10)["anomaly"] == -1).any():
        # At least one -1 label among the last 10 rows.
        print(f"Recent Anomaly detected for station N° {station_id}")
    elif (station_pred.tail(1)["status"] == 0).any():
        # The last row reports status 0.
        print(f"Station N° {station_id} is inactive")
    else:
        print(f"Clean for station N°{station_id}")

Station N° 124 is inactive
Clean for station N°15
Clean for station N°60
Clean for station N°18
Clean for station N°10
Clean for station N°68
Clean for station N°130
Clean for station N°2
Station N° 105 is inactive
Clean for station N°120
Clean for station N°11
Clean for station N°110
Clean for station N°9
Clean for station N°23
Clean for station N°3
Station N° 16 is inactive
Clean for station N°20
Clean for station N°136
Clean for station N°42
Clean for station N°21
Clean for station N°131
Clean for station N°59
Clean for station N°45
Clean for station N°172
Station N° 6 is inactive
Clean for station N°24
Clean for station N°36
Clean for station N°108
Clean for station N°19
Clean for station N°125
Clean for station N°135
Clean for station N°37
Clean for station N°139
Clean for station N°99
Clean for station N°28
Clean for station N°57
Clean for station N°7
Clean for station N°98
No data for station_id 8
No data for station N°8
Clean for station N°41
Clean for station N°40
Clean for st

In [83]:
# `station_pred` is whatever the last iteration of the prediction loop left
# behind; show its most recent rows.
station_pred.tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
56667,106,2024-09-20 12:40:00,47,10,1,0.0,1.0,1.0,0,3,4,12,-1.0,-1.83697e-16,-0.433884,-0.900969,1.224647e-16,-1.0,1
56668,106,2024-09-20 12:50:00,52,5,1,0.0,5.0,5.0,0,3,4,12,-1.0,-1.83697e-16,-0.433884,-0.900969,1.224647e-16,-1.0,1
56669,106,2024-09-20 13:00:00,53,4,1,0.0,1.0,1.0,0,3,4,13,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.258819,-0.965926,1
56670,106,2024-09-20 13:10:00,53,4,1,1.0,1.0,2.0,0,3,4,13,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.258819,-0.965926,1
56671,106,2024-09-20 13:20:00,54,3,1,0.0,1.0,1.0,0,3,4,13,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.258819,-0.965926,1


In [84]:
# Count of each `anomaly` label in `station_pred` (the frame left by the last
# iteration of the prediction loop).
station_pred["anomaly"].value_counts()

anomaly
1    1232
Name: count, dtype: int64

In [85]:
# Rows labelled 1 in `anomaly` at the most recent timestamp of `station_pred`.
station_pred.loc[
    station_pred["anomaly"].eq(1)
    & station_pred["date"].eq(station_pred["date"].max())
].tail()

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
56671,106,2024-09-20 13:20:00,54,3,1,0.0,1.0,1.0,0,3,4,13,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.258819,-0.965926,1


In [86]:
# The 50 most recent rows labelled -1 in `anomaly`, newest first.
(
    station_pred.loc[station_pred["anomaly"] == -1]
    .sort_values("date", ascending=False)
    .head(50)
)

Unnamed: 0,station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly


In [82]:
# Load the model of station 106 explicitly instead of reusing whichever `clf`
# the prediction loop bound last, so the plot always uses the matching model
# regardless of loop order or cell execution order.
plot_station_anomalies_with_score(
    data=station_df,
    clf=load_model(station_id=106, path_directory=ROOT_MODEL),
    station_id=106,
)