In [51]:
from datetime import date, timedelta

import pandas as pd
import polars as pl

from vcub_keeper.config import ROOT_DATA_REF, ROOT_MODEL, THRESHOLD_PROFILE_STATION
from vcub_keeper.ml.cluster import predict_anomalies_station
from vcub_keeper.ml.cluster_utils import load_model
from vcub_keeper.production.data import (
    get_data_from_api_bdx_by_station,
    get_data_from_api_by_station,
    transform_json_api_bdx_station_data_to_df,
    transform_json_station_data_to_df,
)
from vcub_keeper.reader.reader import read_station_profile
from vcub_keeper.transform.features_factory import get_consecutive_no_transactions_out
from vcub_keeper.visualisation import plot_station_anomalies, plot_station_anomalies_with_score

%load_ext autoreload
%autoreload 2

pd.options.display.max_rows = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Objectifs

- Faire des prédictions d'anomalies sur une station via les données de production

## Prédiction sur une station

In [7]:
# Station under study and the date window requested from the APIs.
# Alternative start for an older window: start_date = "2020-09-01"
station_id = 106
start_date, stop_date = "2024-05-01", "2024-07-31"

In [8]:
# Oslandia API: raw JSON for the station and date window defined above.
station_json = get_data_from_api_by_station(
    station_id=station_id,
    start_date=start_date,
    stop_date=stop_date,
)

# The transform result is lazy; collect it into an eager DataFrame.
station_df = transform_json_station_data_to_df(station_json).collect()

In [9]:
# Preview the first rows of the collected station data.
station_df.head()

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
datetime[μs],i32,i64,i64,u8,i64,i64,i64
2024-05-01 00:10:00,106,22,25,1,0,0,0
2024-05-01 00:20:00,106,22,25,1,0,0,0
2024-05-01 00:30:00,106,22,25,1,0,0,0
2024-05-01 00:40:00,106,23,24,1,0,1,1
2024-05-01 00:50:00,106,23,24,1,0,0,0


In [10]:
# Number of rows per `status` value (missing statuses form their own group).
station_df.group_by("status").len()

status,len
u8,u32
0.0,3836
,125
1.0,9143


In [11]:
# Rows for which the API returned no status.
station_df.filter(pl.col("status").is_null())

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all
datetime[μs],i32,i64,i64,u8,i64,i64,i64
2024-05-18 00:10:00,106,,,,0,0,0
2024-05-18 00:20:00,106,,,,0,0,0
2024-05-18 00:30:00,106,,,,0,0,0
2024-05-18 00:40:00,106,,,,0,0,0
2024-05-18 00:50:00,106,,,,0,0,0
…,…,…,…,…,…,…,…
2024-07-28 03:10:00,106,,,,0,0,0
2024-07-28 03:30:00,106,,,,0,0,0
2024-07-28 08:10:00,106,,,,0,0,0
2024-07-28 22:50:00,106,,,,0,0,0


In [38]:
# Bordeaux Open Data API: same date window, several stations in a single request.
station_json = get_data_from_api_bdx_by_station(station_id=[106, 15, 60], start_date=start_date, stop_date=stop_date)

# The transform result is lazy; collect it into an eager DataFrame.
station_df = transform_json_api_bdx_station_data_to_df(station_json).collect()

In [39]:
# Number of rows per `status` value; shown through the notebook's rich display,
# like the same query on the Oslandia data earlier in the notebook.
station_df.group_by("status").len()

shape: (2, 2)
┌────────┬───────┐
│ status ┆ len   │
│ ---    ┆ ---   │
│ u8     ┆ u32   │
╞════════╪═══════╡
│ 1      ┆ 29470 │
│ 0      ┆ 9699  │
└────────┴───────┘


In [40]:
# Distinct station ids present in the fetched data.
station_df.select("station_id").unique()

station_id
i32
15
60
106


In [41]:
# Add the `consecutive_no_transactions_out` feature, then continue with a lazy frame.
# NOTE(review): this cell rebinds `station_df` from its own previous value, so re-running it
# applies the feature step to an already-processed frame — confirm that is harmless.
station_df = get_consecutive_no_transactions_out(station_df).lazy()

In [42]:
# Load the model stored for `station_id` under ROOT_MODEL.
clf = load_model(station_id=station_id, path_directory=ROOT_MODEL)

In [43]:
# Run the anomaly prediction for a single station.
station_pred = predict_anomalies_station(data=station_df, clf=clf, station_id=station_id)

In [44]:
# First rows of the prediction output (input columns plus time features and `anomaly`).
station_pred.head()

station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
i32,"datetime[μs, UTC]",i64,i64,u8,i64,i64,i64,i64,i8,i8,i8,f64,f64,f64,f64,f64,f64,i64
106,2024-05-01 00:10:00 UTC,25,23,1,0,1,0,0,2,3,0,1.2246e-16,-1.0,0.433884,-0.900969,0.0,1.0,1
106,2024-05-01 00:20:00 UTC,25,22,1,0,0,1,1,2,3,0,1.2246e-16,-1.0,0.433884,-0.900969,0.0,1.0,1
106,2024-05-01 00:30:00 UTC,26,22,1,0,1,0,0,2,3,0,1.2246e-16,-1.0,0.433884,-0.900969,0.0,1.0,1
106,2024-05-01 00:40:00 UTC,25,22,1,0,0,0,1,2,3,0,1.2246e-16,-1.0,0.433884,-0.900969,0.0,1.0,1
106,2024-05-01 00:50:00 UTC,25,22,1,0,0,0,2,2,3,0,1.2246e-16,-1.0,0.433884,-0.900969,0.0,1.0,1


In [45]:
# Per-quarter count of rows labelled 1.
# NOTE(review): the all-stations loop later in this notebook treats -1 as the anomaly flag,
# so this counts the rows that were NOT flagged — confirm that is the intent.
label_one_rows = station_pred.filter(pl.col("anomaly") == 1)
label_one_rows.group_by("quarter").agg(pl.count("anomaly").alias("count"))

quarter,count
i8,u32
3,4314
2,8547


In [None]:
# Visual check for `station_id`, using the lazy `station_df` and the model loaded above.
plot_station_anomalies(data=station_df, clf=clf, station_id=station_id)

In [None]:
# Same inputs as the previous plot, passed to the "with_score" variant of the helper.
plot_station_anomalies_with_score(data=station_df, clf=clf, station_id=station_id)

In [56]:
# Status distribution for the station under study; `station_df` is lazy here, hence collect().
station_df.filter(pl.col("station_id") == station_id).group_by("status").len().collect()

status,len
u8,u32
0,3900
1,9204


In [59]:
# Station profile table read from ROOT_DATA_REF
# (ROOT_DATA_REF and read_station_profile are imported in the top import cell).
station_profile = read_station_profile(path_directory=ROOT_DATA_REF)

# Prediction input restricted to the station under study.
station_df.filter(pl.col("station_id") == station_id).collect()

station_id,date,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out
i32,"datetime[μs, UTC]",i64,i64,u8,i64,i64,i64,i64
106,2024-05-01 00:10:00 UTC,25,23,1,0,1,0,0
106,2024-05-01 00:20:00 UTC,25,22,1,0,0,1,1
106,2024-05-01 00:30:00 UTC,26,22,1,0,1,0,0
106,2024-05-01 00:40:00 UTC,25,22,1,0,0,0,1
106,2024-05-01 00:50:00 UTC,25,22,1,0,0,0,2
…,…,…,…,…,…,…,…,…
106,2024-07-30 23:20:00 UTC,12,19,0,0,0,0,0
106,2024-07-30 23:30:00 UTC,12,19,0,0,0,0,0
106,2024-07-30 23:40:00 UTC,12,19,0,0,0,0,0
106,2024-07-30 23:50:00 UTC,12,19,0,0,0,0,0


In [60]:
# Display the station profile table.
station_profile

station_id,total_point,mean,median,std,95%,98%,99%,max,profile_station_activity
u16,i64,f64,f64,f64,f64,f64,f64,i64,str
181,5,0.0,0.0,0.0,0.0,0.0,0.0,0,"""low"""
160,13541,0.005613,0.0,0.074709,0.0,0.0,0.0,1,"""low"""
92,27366,0.008258,0.0,0.090501,0.0,0.0,0.0,1,"""low"""
161,22750,0.008264,0.0,0.090531,0.0,0.0,0.0,1,"""low"""
167,26927,0.008653,0.0,0.09262,0.0,0.0,0.0,1,"""low"""
…,…,…,…,…,…,…,…,…,…
65,64732,0.22088,0.0,0.414843,1.0,1.0,1.0,1,"""hight"""
22,64613,0.241608,0.0,0.428061,1.0,1.0,1.0,1,"""very high"""
123,58482,0.241698,0.0,0.428116,1.0,1.0,1.0,1,"""very high"""
39,64643,0.245564,0.0,0.430424,1.0,1.0,1.0,1,"""very high"""


## Prédiction sur toutes les stations

In [61]:
# `date` and `timedelta` are imported in the top import cell.

# NOTE(review): this value is overwritten by the `for station_id in ...` loop further down.
station_id = 102

# Rolling window: from 8 days before today up to tomorrow.
date_today = date.today()
end_date = date_today + timedelta(days=1)
start_date = date_today - timedelta(days=8)

# The API helper is called with "YYYY-MM-DD" strings.
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")

# Fixed window, kept for replaying a past period:
# start_date_str='2020-10-16'
# end_date_str='2020-10-24'

In [72]:
# ROOT_MODEL and THRESHOLD_PROFILE_STATION are imported in the top import cell.
# Manual override, disabled:
# THRESHOLD_PROFILE_STATION = 0.3

station_profile = read_station_profile(path_directory=ROOT_DATA_REF)

# Keep the stations whose profile `mean` reaches the threshold.
# `unique()` gives no ordering guarantee, so sort explicitly: the prediction loop
# (and whatever state it leaves behind) then runs in the same order on every execution.
stations_id_to_pred = (
    station_profile.filter(pl.col("mean") >= THRESHOLD_PROFILE_STATION)
    .select("station_id")
    .to_series()
    .unique()
    .sort()
    .to_numpy()
)
stations_id_to_pred

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  15,  16,
        18,  19,  20,  21,  22,  23,  24,  28,  36,  37,  39,  40,  41,
        42,  43,  44,  45,  54,  55,  56,  57,  58,  59,  60,  65,  68,
        98,  99, 100, 101, 102, 103, 104, 105, 106, 108, 109, 110, 120,
       123, 124, 125, 127, 130, 131, 133, 134, 135, 136, 139, 172, 174],
      dtype=uint16)

In [73]:
# Fetch the recent window for every selected station in a single API call.
station_json = get_data_from_api_by_station(
    station_id=stations_id_to_pred, start_date=start_date_str, stop_date=end_date_str
)

# Unlike the single-station section, the frame is not collected here.
station_df = transform_json_station_data_to_df(station_json)

station_df = get_consecutive_no_transactions_out(station_df)

# NOTE: the loop rebinds `station_id`, `clf` and `station_pred`; once it finishes they
# refer to the last station processed, which is what the inspection cells below look at.
for station_id in stations_id_to_pred:
    # Load the model stored for this station.
    clf = load_model(station_id=station_id, path_directory=ROOT_MODEL)

    # Run the prediction for this station only.
    station_pred = predict_anomalies_station(data=station_df, clf=clf, station_id=station_id)

    # Checks in priority order: no data at all, a -1 label within the last
    # 10 rows, then a last row whose status is 0.
    if len(station_pred) == 0:
        print(f"No data for station N°{station_id}")
    elif (station_pred.tail(10)["anomaly"] == -1).any():
        print(f"Recent Anomaly detected for station N° {station_id}")
    elif (station_pred.tail(1)["status"] == 0).any():
        print(f"Station N° {station_id} is inactive")
    else:
        print(f"Clean for station N°{station_id}")

Clean for station N°1
Clean for station N°2
Clean for station N°3
Clean for station N°4
Clean for station N°5
Station N° 6 is inactive
Clean for station N°7
No data for station_id 8
No data for station N°8
Clean for station N°9
Clean for station N°10
Clean for station N°11
Station N° 15 is inactive
Clean for station N°16
Clean for station N°18
Clean for station N°19
Clean for station N°20
Clean for station N°21
Clean for station N°22
Clean for station N°23
Clean for station N°24
Clean for station N°28
Clean for station N°36
Clean for station N°37
Clean for station N°39
Recent Anomly detected for station N° 40
Clean for station N°41
Clean for station N°42
Clean for station N°43
Clean for station N°44
Clean for station N°45
Clean for station N°54
Clean for station N°55
Clean for station N°56
Clean for station N°57
Clean for station N°58
Clean for station N°59
Clean for station N°60
Clean for station N°65
Clean for station N°68
Clean for station N°98
Station N° 99 is inactive
Station N° 1

In [74]:
# Last rows of `station_pred`, i.e. of the last station handled by the loop.
station_pred.tail()

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
datetime[μs],i32,i64,i64,u8,i64,i64,i64,i64,i8,i8,i8,f64,f64,f64,f64,f64,f64,i64
2024-10-18 10:40:00,174,30,13,1,0,0,0,8,4,5,10,-2.4493e-16,1.0,-0.974928,-0.222521,0.5,-0.866025,1
2024-10-18 10:50:00,174,30,13,1,1,1,2,0,4,5,10,-2.4493e-16,1.0,-0.974928,-0.222521,0.5,-0.866025,1
2024-10-18 11:00:00,174,34,9,1,0,4,4,0,4,5,11,-2.4493e-16,1.0,-0.974928,-0.222521,0.258819,-0.965926,1
2024-10-18 11:10:00,174,34,9,1,0,0,0,1,4,5,11,-2.4493e-16,1.0,-0.974928,-0.222521,0.258819,-0.965926,1
2024-10-18 11:20:00,174,34,9,1,0,0,0,2,4,5,11,-2.4493e-16,1.0,-0.974928,-0.222521,0.258819,-0.965926,1


In [75]:
# Number of rows per `anomaly` label.
station_pred.group_by("anomaly").len()

anomaly,len
i64,u32
-1,17
1,1203


In [76]:
# Rows labelled 1 at the most recent timestamp of `station_pred`.
# The latest date is computed as an expression over the frame instead of comparing the
# column against a one-cell DataFrame, which depends on implicit literal conversion.
station_pred.filter((pl.col("anomaly") == 1) & (pl.col("date") == pl.col("date").max()))

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
datetime[μs],i32,i64,i64,u8,i64,i64,i64,i64,i8,i8,i8,f64,f64,f64,f64,f64,f64,i64
2024-10-18 11:20:00,174,34,9,1,0,0,0,2,4,5,11,-2.4493e-16,1.0,-0.974928,-0.222521,0.258819,-0.965926,1


In [78]:
# Rows labelled -1 (the value the loop reports as an anomaly) for the last station processed.
station_pred.filter(pl.col("anomaly") == -1)

date,station_id,available_stands,available_bikes,status,transactions_in,transactions_out,transactions_all,consecutive_no_transactions_out,quarter,weekday,hours,Sin_quarter,Cos_quarter,Sin_weekday,Cos_weekday,Sin_hours,Cos_hours,anomaly
datetime[μs],i32,i64,i64,u8,i64,i64,i64,i64,i8,i8,i8,f64,f64,f64,f64,f64,f64,i64
2024-10-10 06:00:00,174,32,10,1,0,0,0,34,4,4,6,-2.4493e-16,1.0,-0.433884,-0.900969,1.0,6.1232e-17,-1
2024-10-10 06:10:00,174,32,9,1,0,0,1,35,4,4,6,-2.4493e-16,1.0,-0.433884,-0.900969,1.0,6.1232e-17,-1
2024-10-10 06:20:00,174,32,9,1,0,0,0,36,4,4,6,-2.4493e-16,1.0,-0.433884,-0.900969,1.0,6.1232e-17,-1
2024-10-10 06:30:00,174,32,9,1,0,0,0,37,4,4,6,-2.4493e-16,1.0,-0.433884,-0.900969,1.0,6.1232e-17,-1
2024-10-10 06:40:00,174,32,9,1,0,0,0,38,4,4,6,-2.4493e-16,1.0,-0.433884,-0.900969,1.0,6.1232e-17,-1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-10-11 06:30:00,174,35,6,1,0,0,0,44,4,5,6,-2.4493e-16,1.0,-0.974928,-0.222521,1.0,6.1232e-17,-1
2024-10-11 06:40:00,174,35,6,1,0,0,0,45,4,5,6,-2.4493e-16,1.0,-0.974928,-0.222521,1.0,6.1232e-17,-1
2024-10-11 06:50:00,174,35,6,1,0,0,0,46,4,5,6,-2.4493e-16,1.0,-0.974928,-0.222521,1.0,6.1232e-17,-1
2024-10-12 07:20:00,174,34,7,1,0,0,0,50,4,6,7,-2.4493e-16,1.0,-0.781831,0.62349,0.965926,-0.258819,-1


In [80]:
# Load the model for the plotted station explicitly instead of reusing whatever `clf`
# the prediction loop left behind (which only matches when 174 was its last iteration).
plot_station_id = 174
plot_clf = load_model(station_id=plot_station_id, path_directory=ROOT_MODEL)
plot_station_anomalies_with_score(data=station_df, clf=plot_clf, station_id=plot_station_id)