In [None]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
import os

# Root of the project, relative to the notebook's working directory.
FOLDER_PATH = '..'
# Base directory holding one sub-folder per transport mode and aggregation.
base = Path(FOLDER_PATH) / "data" / "PT" / "pt_data"

agg = "15min"  # aggregation step used in the folder and file names
modes = ["subway", "tramway", "bus"]

dfs = {}  # mode -> DataFrame

# Iterate over `modes` (single source of truth) instead of repeating the list.
for mode in modes:
    # Files are laid out as <base>/<mode>_indiv_<agg>/<mode>_indiv_<agg>.csv
    stem = f"{mode}_indiv_{agg}"
    csv_path = base / stem / f"{stem}.csv"
    df = pd.read_csv(csv_path, index_col=0, low_memory=False)
    if 'VAL_DATE' in df.columns:
        # Timestamps are stored as a regular column: parse that column.
        # NOTE(review): the index is left as read from the first CSV column in
        # this branch; time-based operations such as resample() would need
        # VAL_DATE to be set as the index — confirm whether this layout occurs.
        df['VAL_DATE'] = pd.to_datetime(df['VAL_DATE'])
    else:
        # No VAL_DATE column: the first CSV column (the index) is parsed as datetimes.
        df.index = pd.to_datetime(df.index)
    dfs[mode] = df

In [31]:
# Inspect the column labels of the subway frame.
# NOTE(review): the recorded output already lists 'Flow', a column that is only
# created by a later cell, so this cell was last executed out of order.
dfs["subway"].columns

Index(['AMP', 'BEL', 'BRO', 'CHA', 'COR', 'CPA', 'CRO', 'CUI', 'CUS', 'DEB',
       'FLA', 'FOC', 'VAI', 'VEN', 'GAR', 'GOR', 'BLA', 'GRA', 'GIL', 'HEN',
       'HOT', 'MAC', 'LAE', 'BON', 'MAS', 'MER', 'LUM', 'OGA', 'PRY', 'PAR',
       'PER', 'GUI', 'JAU', 'REP', 'SAN', 'SAX', 'GER', 'VMY', 'SOI', 'JEA',
       'Flow'],
      dtype='object')

In [None]:
# Work on the subway frame. This is the same object as dfs["subway"] (no copy),
# so the 'Flow' column added below is visible through both names.
df_subway = dfs["subway"]

# Network-wide flow per time step = row-wise sum over the station columns.
# 'Flow' itself is excluded so that re-running this cell does not add the
# previous total into the new one (the cell is now idempotent).
flow_inputs = df_subway.columns[df_subway.columns != "Flow"]
df_subway["Flow"] = pd.to_numeric(
    df_subway[flow_inputs].sum(axis=1), errors="coerce"
)
df_subway.head()

Unnamed: 0_level_0,AMP,BEL,BRO,CHA,COR,CPA,CRO,CUI,CUS,DEB,...,GUI,JAU,REP,SAN,SAX,GER,VMY,SOI,JEA,Flow
VAL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-01 00:00:00,20.0,164.0,10.0,59.0,88.0,5.0,20.0,1.0,5.0,21.0,...,13.0,16.0,12.0,49.0,68.0,8.0,23.0,3.0,59.0,1343.0
2019-11-01 00:15:00,6.0,82.0,11.0,30.0,43.0,3.0,8.0,0.0,2.0,3.0,...,7.0,7.0,3.0,11.0,33.0,2.0,16.0,0.0,38.0,618.0
2019-11-01 00:30:00,0.0,4.0,0.0,0.0,2.0,2.0,6.0,0.0,1.0,0.0,...,0.0,0.0,3.0,5.0,2.0,0.0,3.0,1.0,0.0,76.0
2019-11-01 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2019-11-01 04:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0


In [33]:
# Station columns present in the frame (every column except the 'Flow' total)
existing_stations = set(df_subway.columns).difference({"Flow"})


def _only_existing(codes):
    """Return the codes that are columns of the frame, keeping their given order."""
    return [code for code in codes if code in existing_stations]


# Raw TCL station codes per line; codes absent from the frame are dropped
ligne_A = _only_existing(
    ['SOI', 'BON', 'CUS', 'FLA', 'GRA', 'REP', 'CHA', 'MAS', 'FOC', 'HOT', 'COR', 'BEL', 'AMP', 'PER']
)
ligne_B = _only_existing(
    ['CHA', 'BRO', 'PAR', 'JEU', 'SAX', 'GAM', 'GAR', 'JEH', 'OUL', 'OUC']
)
ligne_C = _only_existing(
    ['HOT', 'CRO', 'HEN', 'CUS', 'CUI']
)
ligne_D = _only_existing(
    ['GVA', 'VAL', 'GOR', 'GRA', 'SAX', 'GAM', 'GUE', 'SIE', 'GAR', 'MON', 'PAR', 'MER', 'GRE', 'VAI', 'VEN']
)

#ligne_A, ligne_B, ligne_C, ligne_D


In [44]:
# Display the column labels of df_subway (the station codes plus the 'Flow' total).
df_subway.columns

Index(['AMP', 'BEL', 'BRO', 'CHA', 'COR', 'CPA', 'CRO', 'CUI', 'CUS', 'DEB',
       'FLA', 'FOC', 'VAI', 'VEN', 'GAR', 'GOR', 'BLA', 'GRA', 'GIL', 'HEN',
       'HOT', 'MAC', 'LAE', 'BON', 'MAS', 'MER', 'LUM', 'OGA', 'PRY', 'PAR',
       'PER', 'GUI', 'JAU', 'REP', 'SAN', 'SAX', 'GER', 'VMY', 'SOI', 'JEA',
       'Flow'],
      dtype='object')

In [71]:
# 1. resample() needs a datetime-like index: check it explicitly and fail with
#    a clear message (merely evaluating `df_subway.index` verified nothing).
if not isinstance(
    df_subway.index, (pd.DatetimeIndex, pd.TimedeltaIndex, pd.PeriodIndex)
):
    raise TypeError(
        "df_subway must have a datetime-like index to be resampled, "
        f"got {type(df_subway.index).__name__}"
    )

# 2. Network total per day. Only the station columns are summed: 'Flow' is
#    already the row-wise total of the stations, so including it would count
#    every value twice.
network_cols = df_subway.columns[df_subway.columns != "Flow"]
daily = df_subway[network_cols].resample("D").sum().sum(axis=1)

# 3. Statistics over the whole period
mu = daily.mean()
sigma = daily.std()

# 4. Z-score of each daily total
z = (daily - mu) / sigma

# 5. Anomaly detection: keep the days whose |z| exceeds the threshold
seuil = 3
alerts = daily[z.abs() > seuil]

alerts


Series([], Freq: D, dtype: float64)

In [72]:
# Station columns are selected by name rather than by position, so the result
# does not depend on 'Flow' being the last column of the frame.
station_cols = df_subway.columns[df_subway.columns != "Flow"]

# Daily total per station.
daily_station = (
    df_subway[station_cols]
    .resample("D").sum()
    .fillna(0)  # any remaining NaN is replaced by 0 instead of being dropped
)

# Z-score of each day relative to that station's own mean and standard deviation
z_station = (daily_station - daily_station.mean()) / daily_station.std()

# Boolean-mask indexing keeps the full shape: every cell with |z| <= 3 becomes
# NaN, so days without any anomaly show up as all-NaN rows.
anomalies_station = z_station[z_station.abs() > 3]
anomalies_station


Unnamed: 0_level_0,AMP,BEL,BRO,CHA,COR,CPA,CRO,CUI,CUS,DEB,...,PER,GUI,JAU,REP,SAN,SAX,GER,VMY,SOI,JEA
VAL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-01,,,,,,,,,,,...,,,,,,,,,,
2019-11-02,,,,,,,,,,,...,,,,,,,,,,
2019-11-03,,,,,,,,,,,...,,,,,,,,,,
2019-11-04,,,,,,,,,,,...,,,,,,,,,,
2019-11-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-26,,,,,,,,,,,...,,,,,,,,,,
2020-03-27,,,,,,,,,,,...,,,,,,,,,,
2020-03-28,,,,,,,,,,,...,,,,,,,,,,
2020-03-29,,,,,,,,,,,...,,,,,,,,,,


In [70]:
# Station columns are selected by name rather than by position, so the result
# does not depend on 'Flow' being the last column of the frame.
station_cols = df_subway.columns[df_subway.columns != "Flow"]

# Daily total per station (any remaining NaN replaced by 0).
daily_station = df_subway[station_cols].resample("D").sum().fillna(0)

# Z-score of each day relative to that station's own mean and standard deviation
z_station = (daily_station - daily_station.mean()) / daily_station.std()

# Keep only the z-scores with |z| > 3 (other cells become NaN), then drop the
# days on which no station is flagged.
anomalies_station = z_station[z_station.abs() > 3].dropna(how="all")
anomalies_station



Unnamed: 0_level_0,AMP,BEL,BRO,CHA,COR,CPA,CRO,CUI,CUS,DEB,...,PER,GUI,JAU,REP,SAN,SAX,GER,VMY,SOI,JEA
VAL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-05,,,,,,,,,,,...,,,,,,,,,,3.364265
2019-12-06,,3.429106,,,,3.886452,,,,,...,,,,,,,,,,4.91113
2019-12-07,,3.566712,,,,5.703853,,,,,...,,,,,,3.924851,,,,5.402272
