# Imports

In [None]:
import os
from datetime import datetime
from pathlib import Path

import graph_tool as gt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
from graph_tool.all import Graph, graph_draw
from sqlalchemy import create_engine

# DB

In [None]:
# Database connection URL, read from the DB_URL_RPC environment variable
# (raises KeyError when the variable is not set).
DATABASE_URL = os.environ["DB_URL_RPC"]

In [None]:
# Directory that receives every exported figure (HTML and SVG).
OUTPUT_PATH = Path("outputs")
# Create it up front so the write_html / write_image calls further down do not
# fail on a fresh checkout where the directory does not exist yet.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.absolute()

In [None]:
# SQLAlchemy engine passed as `connection` to every pl.read_database call below.
db_engine = create_engine(DATABASE_URL)

# Paramètres

In [None]:
# AOM names treated as running an incentive scheme: `is_aom_expr` (section
# "Comparaison par AOM") labels rows whose aom_name is in this list
# "Avec incitation" and every other row "Sans incitation".
aom_with_incentives = [
    "Montpellier Méditerranée Métropole",
    "Métropole Rouen Normandie",
    "Métropole d'Aix-Marseille-Provence",
    "Nantes Métropole",
    "Métropole Nice Côte d'Azur",
    "CA Lorient Agglomération",
    "SM Artois Mobilités",
]

# AOM names treated as having no incentive scheme.
# NOTE(review): "Métropole Nice Côte d'Azur" is listed both here and in
# aom_with_incentives above; it cannot belong to both groups — confirm which
# list is correct.
aom_without_incentives = [
    "Bordeaux Métropole",
    "Dijon Métropole",
    "Eurométropole de Strasbourg",
    "Brest Métropole",
    "Métropole Européenne de Lille",
    "Metz Métropole",
    "Métropole du Grand Nancy",
    "Orléans Métropole",
    "Métropole Nice Côte d'Azur",
    "Rennes Métropole",
    "Saint-Etienne Métropole",
    "Métropole Toulon-Provence-Méditerranée",
]

# Subset of AOMs kept by the `.is_in(selected_aoms)` filters of the per-AOM
# comparisons (three from each list above).
selected_aoms = [
    "Montpellier Méditerranée Métropole",
    "Métropole Rouen Normandie",
    "SM Artois Mobilités",
    "Bordeaux Métropole",
    "Dijon Métropole",
    "Rennes Métropole",
]

# Couleurs

In [None]:
# Colour assigned to each cohort key ("2022" reference cohort, then the
# quarterly CEE cohorts "t<quarter>_<two-digit year>").
cohortes_color_mapping = {
    "2022": "#f39c12",
    "t1_23": "#d7e1ed",
    "t2_23": "#89a6c7",
    "t3_23": "#3E6DA1",
    "t4_23": "#1a334e",
    "t1_24": "rgba(113, 88, 226,1.0)",
}

In [None]:
# Colour assigned to each operator name in the figures below. The last two
# entries are aggregate labels rather than operator names ("Tout opérateur" is
# the total row added in the CEE bonus comparison).
color_mapping = {
    "BlaBlaCar Daily": "#f8c291",
    "Ecov": "#82ccdd",
    "FranceCovoit": "rgba(61, 193, 211,1.0)",
    "Karos": "#e55039",
    "Klaxit": "#f6b93b",
    "La Roue Verte": "#4a69bd",
    "MOOVANCE": "#3c6382",
    "Mobicoop": "#78e08f",
    "Mov'ici": "rgba(247, 143, 179,1.0)",
    "OuestGo": "#b71540",
    "Pass Pass Covoiturage": "#3d3d3d",
    "YNSTANT": "#cd84f1",
    "Tout opérateur": "rgba(200, 214, 229,1.0)",
    "Autres opérateurs": "black",
}

# Requêtes de construction des cohortes


Table pour les cohortes CEE

```sql
create table luis.cee_drivers as (select
	i."uuid",
	min(ca."_id"::text) as id_first_cee,
	min(c.datetime) as date_first_cee,
	min(ca.operator_id) as id_operateur_cee
from carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner join cee.cee_applications ca on
	c."_id" = ca.carpool_id
where ca.journey_type = 'short'
and not ca.is_specific
group by 1)
```

---

```sql
create table luis.cee_drivers_v2 as (
with cee_trip as (
	select
		ca."_id"::text as id_first_cee,
		c.identity_id as identity_id_cee,
		c.datetime as date_first_cee,
		ca.operator_id as id_operateur_cee
	from cee.cee_applications ca
	inner join carpool.carpools c  on c._id = ca.carpool_id
	where ca.journey_type = 'short'
	and not ca.is_specific
)
select
	i."uuid",
	ct.*,
	o."name" as nom_operateur
from cee_trip ct inner join carpool.identities i on ct.identity_id_cee=i._id
inner join "operator".operators o on o._id = ct.id_operateur_cee
)

```

---

```sql
create table luis.cee_drivers_v4 as (
with trips as (
select
	i."uuid",
	c.trip_id,
	c.datetime,
	c.operator_id,
	p.aom,
	p.l_aom
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i."_id"
left join geo.perimeters p on
	c.start_geo_code = p.arr
where
	is_driver
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
 ),
trips_aom_agg as (
select
		t."uuid",
		t.aom,
		max(t.l_aom) as l_aom,
		count(distinct trip_id)
from
	trips t
group by
	1,
	2
),
trips_aom_ranked as (
select
		*,
		row_number() over (partition by t."uuid"
order by
	count desc) as "rank"
from
	trips_aom_agg t
),
cee_trip as (
select
		ca."_id"::text as id_first_cee,
		c.identity_id as identity_id_cee,
		c.datetime as date_first_cee,
		ca.operator_id as cee_operator_id,
		p.aom as cee_aom_id,
		p.l_aom as cee_aom_name
from
	cee.cee_applications ca
inner join carpool.carpools c on
	c._id = ca.carpool_id
inner join geo.perimeters p on
	p.arr = c.start_geo_code
	and p.year = geo.get_latest_millesime()
where
	ca.journey_type = 'short'
	and not ca.is_specific
)
select
	i."uuid",
	max(ct.id_first_cee) as id_first_cee,
	max(ct.date_first_cee) as date_first_cee,
	case
		when max(ct.date_first_cee) between '2023-01-01' and '2023-03-31' then 't1_23'
		when max(ct.date_first_cee) between '2023-04-01' and '2023-06-30' then 't2_23'
		when max(ct.date_first_cee) between '2023-07-01' and '2023-09-30' then 't3_23'
		when max(ct.date_first_cee) between '2023-10-01' and '2023-12-31' then 't4_23'
		when max(ct.date_first_cee) between '2024-01-01' and '2024-03-31' then 't1_24'
	end as cohorte,
	max(ct.cee_operator_id) as cee_operator_id,
	max(o."name") as cee_operator_name,
	max(ct.cee_aom_id) as cee_aom_id,
	max(ct.cee_aom_name) cee_aom_name,
	count(distinct ft.trip_id) filter (
where
	date_first_cee > ft.datetime) as num_trips_before_cee,
	count(distinct ft.trip_id) filter (
where
	date_first_cee <= ft.datetime) as num_trips_after_cee,
	count(distinct ft.trip_id) filter (
where
	date_first_cee <= ft.datetime
	and ct.cee_operator_id = ft.operator_id) as num_trips_after_cee_same_operator,
	min(ft.datetime) as date_first_trip,
	max(ta.aom) as max_aom_id,
	max(ta.l_aom) as max_aom_name
from
	cee_trip ct
inner join carpool.identities i on
	ct.identity_id_cee = i._id
inner join "operator".operators o on
	o._id = ct.cee_operator_id
left join trips ft on
	i."uuid" = ft.uuid
left join trips_aom_ranked ta on
	i."uuid" = ta."uuid"
	and ta."rank" = 1
group by
	1
)
```


Table pour la cohorte 2022

```sql
create table luis.cohorte_2022_v2 as (
with driver_trips as (
select
	i.uuid,
	p.aom,
	max(p.l_aom) as l_aom,
	min(c.datetime) as date_first_trip,
	count(distinct trip_id) as num_trips
from
	carpool.carpools c
left join carpool.identities i on
	c.identity_id = i._id
left join geo.perimeters p on
	c.start_geo_code = p.arr
	and p.year = geo.get_latest_millesime()
where
	is_driver
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1,
	2),
driver_trips_ranked as (
select
		*,
		row_number () over (partition by "uuid"
order by
	num_trips desc nulls last) as rn_aom,
		row_number () over (partition by "uuid"
order by
	date_first_trip asc nulls last) as rn_date
from
	driver_trips
)
select
	"uuid",
	min(date_first_trip) filter (
	where rn_date = 1) as date_first_trip,
	max(aom) filter (
	where rn_aom = 1) as aom_id,
	max(l_aom) filter (
	where rn_aom = 1) as aom_name
from
	driver_trips_ranked
group by
	1
having
	min(date_first_trip) filter (
	where rn_date = 1) between '2022-01-01' and '2022-09-30'
)
```


# Statistiques cohorte 2022

In [None]:
# Load the 2022 reference cohort (one row per driver uuid) from the
# luis.cohorte_2022_v2 table; uuid is cast to text on the SQL side.
df_drivers_2022 = pl.read_database(
    """
select 
	uuid::text,
    date_first_trip,
    aom_id,
    aom_name
from luis.cohorte_2022_v2
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 cohort frame.
df_drivers_2022.describe()

In [None]:
# Share (%) of the 2022 cohort among all 2022 drivers.
# The denominator was obtained with a query run outside this notebook.
total_drivers_2022 = 250596
cohort_2022_driver_count = df_drivers_2022.select(pl.col("uuid").n_unique())
100 * cohort_2022_driver_count / total_drivers_2022

# Statistiques CEE globales


## Nombre de CEE par mois et opérateur

In [None]:
# Number of CEE applications per calendar month and operator, restricted to
# journey_type = 'short' and non-specific applications. The operator name is
# resolved through a left join, so it is null for operator ids missing from
# "operator".operators.
df_cee_by_operator_month = pl.read_database(
    """
select 
	date_trunc('month',datetime) as "month",
	operator_id,
	max(o.name) as operator_name,
	count(*) as num_cee
from cee.cee_applications ca 
left join "operator".operators o on ca.operator_id = o."_id" 
where journey_type = 'short'
and not is_specific 
group by 1,2
order by 1,2
""",
    connection=db_engine,
)

In [None]:
# Stacked monthly bar chart of the number of CEE applications, one colour per
# operator, displayed and then exported as HTML and SVG.
cee_by_month_labels = {
    "month": "Mois",
    "num_cee": "Nombre de CEE",
    "operator_name": "Opérateur :",
}
fig_cee_by_operator_and_month = px.bar(
    df_cee_by_operator_month,
    x="month",
    y="num_cee",
    color="operator_name",
    color_discrete_map=color_mapping,
    text="num_cee",
    text_auto=".1s",
    labels=cee_by_month_labels,
    title="Evolution du nombre de CEE par mois et par opérateurs",
    template="simple_white",
)
# The plotly update_* methods return the figure, so the styling calls chain.
(
    fig_cee_by_operator_and_month.update_layout(legend_orientation="h", height=700)
    .update_yaxes(range=[0, 45_000], side="right", showgrid=True)
    .update_xaxes(title=None)
    .show()
)
fig_cee_by_operator_and_month.write_html(OUTPUT_PATH / "fig_stats_cee_mensuelles.html")
fig_cee_by_operator_and_month.write_image(
    OUTPUT_PATH / "fig_stats_cee_mensuelles.svg",
    format="svg",
    width=1280,
    height=720,
)

## Part des trajets CEE vs trajets non CEE

In [None]:
# Per week since 2023-01-01: number of distinct trips (total_trips) and number
# of distinct trips having a participant whose uuid is linked to a CEE
# application dated at or before the trip (trip_cee).
# NOTE(review): unlike the other CEE queries of this notebook, the cee_drivers
# CTE does not filter on journey_type = 'short' / not is_specific, and the
# outer query has no is_driver filter although the resulting figure is titled
# "conducteurs CEE" — confirm this is intended.
df_cee_trips_by_week = pl.read_database(
    """
with cee_drivers as (
select
	i."uuid",
  c.datetime
from
	cee.cee_applications ca
inner join carpool.carpools c on
	ca.carpool_id = c._id
inner join carpool.identities i on
	c.identity_id = i._id
)
select 
	date_trunc('week',c.datetime) as "week",
	count(distinct c.trip_id) filter (where cd."uuid" is not null) as trip_cee,
	count(distinct c.trip_id) as total_trips
from carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id 
left join cee_drivers cd on cd."uuid" = i."uuid" and cd.datetime<=c.datetime
where c.datetime >= '2023-01-01'
group by 1
""",
    connection=db_engine,
)

In [None]:
# Display the weekly trip counts loaded above.
df_cee_trips_by_week

In [None]:
# Weekly share (%) of trips with / without a CEE participant for 2023-2024,
# reshaped to long format (one row per week and trip type) for the area chart.
weeks_2023_2024 = pl.col("week").dt.year().is_in([2023, 2024])
df_cee_trips_by_week_melted = (
    df_cee_trips_by_week.filter(weeks_2023_2024)
    .with_columns(
        avec_cee=100 * pl.col("trip_cee") / pl.col("total_trips"),
        sans_cee=100 * (1 - pl.col("trip_cee") / pl.col("total_trips")),
    )
    .melt(
        id_vars="week",
        value_vars=["avec_cee", "sans_cee"],
        variable_name="type",
        value_name="value",
    )
)
df_cee_trips_by_week_melted

In [None]:
# Stacked area chart of the weekly share of trips by type, exported as HTML
# and SVG and then displayed as the cell output.
trips_by_type_labels = {
    "value": "% des trajets",
    "week": "Date du trajet",
    "type": "Type de trajet",
}
fig_trips_by_week_and_type = px.area(
    df_cee_trips_by_week_melted,
    x="week",
    y="value",
    color="type",
    labels=trips_by_type_labels,
    title="Évolution de la part des trajets fait par des conducteurs CEE",
    template="simple_white",
)
# The plotly update_* methods return the figure, so the styling calls chain.
(
    fig_trips_by_week_and_type.update_layout(legend_orientation="h", height=500)
    .update_xaxes(title=None)
    .update_yaxes(
        range=[0, 105],
        tickvals=[0, 25, 50, 75, 100],
        side="right",
        showgrid=True,
        gridcolor="black",
        griddash="dot",
        gridwidth=1,
    )
)
fig_trips_by_week_and_type.write_html(OUTPUT_PATH / "fig_trajets_par_type_semaine.html")
fig_trips_by_week_and_type.write_image(
    OUTPUT_PATH / "fig_trajets_par_type_semaine.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_trips_by_week_and_type

# Statistiques cohortes CEE


In [None]:
# Load every CEE driver that falls in one of the quarterly cohorts (rows with a
# null cohorte are excluded) from the luis.cee_drivers_v4 table.
df_cohortes_cee = pl.read_database(
    """
select
	*
from
	luis.cee_drivers_v4
where cohorte is not null
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the CEE cohorts frame.
df_cohortes_cee.describe()

In [None]:
# Number of drivers in each CEE cohort, in chronological order.
# The sort key turns "t1_23" into "23t1" (year first, then quarter) so that a
# plain string sort is chronological.
cohort_chronological_key = pl.col("cohorte").str.split("_").list.reverse().list.join("")
(
    df_cohortes_cee.group_by("cohorte")
    .agg(nombre_de_conducteurs=pl.len())
    .sort(cohort_chronological_key)
)

In [None]:
# Share (%) of the cohort CEE drivers among all beneficiaries of a
# short-distance CEE (hard-coded denominator).
total_short_distance_cee_beneficiaries = 448766
unique_cee_driver_count = df_cohortes_cee.select(
    pl.col("uuid").map_elements(str, return_dtype=pl.String).n_unique()
)
100 * unique_cee_driver_count / total_short_distance_cee_beneficiaries

## Part des opérateurs par cohorte CEE

In [None]:
# Per cohort and operator: number of drivers (kept in the "uuid" column) and
# the operator's share (%) of the cohort, rounded to two decimals.
operator_share_of_cohort = (
    100 * pl.col("uuid") / pl.col("uuid").sum().over("cohorte")
).round(2)
df_cohortes_cee_by_operator = (
    df_cohortes_cee.group_by(["cohorte", "cee_operator_name"])
    .agg(pl.col("uuid").len())
    .with_columns(share=operator_share_of_cohort)
    .sort(["cohorte", "cee_operator_name"], descending=[False, True])
)

In [None]:
# Stacked bar chart: for each cohort, the share of its drivers per operator.
# One trace is built per operator, in alphabetical order.

# Chronological sort key: "t1_23" -> "23t1" (year first, then quarter). This is
# the same key used by the other cohort tables of this notebook; sorting on the
# fully reversed string only works while all years share the same tens digit.
cohort_chronological_key = pl.col("cohorte").str.split("_").list.reverse().list.join("")

# Human-readable x-axis label of each cohort key.
cohort_display_names = {
    "t1_23": "CEE T1 2023",
    "t2_23": "CEE T2 2023",
    "t3_23": "CEE T3 2023",
    "t4_23": "CEE T4 2023",
    "t1_24": "CEE T1 2024",
}

traces = []
for operator in sorted(
    df_cohortes_cee_by_operator["cee_operator_name"].unique().to_list()
):
    data = df_cohortes_cee_by_operator.filter(
        pl.col("cee_operator_name") == operator
    ).sort(cohort_chronological_key)
    trace = go.Bar(
        x=data["cohorte"].replace(cohort_display_names),
        y=data["share"],
        # Only label segments of at least 1% to keep the chart readable.
        text=data["share"].map_elements(
            lambda x: f"{x:.0f}%" if x >= 1 else "", return_dtype=pl.String
        ),
        hovertemplate="%{fullData.name} représente %{y:.2f}% des conducteurs de la cohorte %{x}<extra></extra>",
        name=operator,
        # Operators without a dedicated colour fall back to black.
        marker_color=color_mapping.get(operator, "black"),
    )
    traces.append(trace)
fig_cohortes_stats = go.Figure(traces)
fig_cohortes_stats.update_layout(
    barmode="stack", plot_bgcolor="white", legend_orientation="h", height=600
)
fig_cohortes_stats.update_yaxes(
    range=[0, 115],
    showgrid=True,
    gridcolor="black",
    griddash="dot",
    layer="above traces",
    showline=True,
    title="Part des conducteurs (%)",
)
fig_cohortes_stats.update_xaxes(
    zeroline=True, zerolinecolor="black", zerolinewidth=0.5, title="Cohorte"
)
fig_cohortes_stats.write_html(OUTPUT_PATH / "stats_cohortes_cee.html")
fig_cohortes_stats.write_image(
    OUTPUT_PATH / "stats_cohortes_cee.svg", format="svg", width=1280, height=720
)
fig_cohortes_stats.show()

## Evolution de la part de CEE par opérateur

In [None]:
# Weekly share (%) of cohort drivers per operator, based on the week of their
# date_first_cee; plotted as a stacked area chart and exported.
weekly_operator_share = (
    100 * pl.col("num_drivers") / pl.col("num_drivers").sum().over("date_first_cee")
)
df_cee_operator_share_by_week = (
    df_cohortes_cee.group_by(
        [pl.col("date_first_cee").dt.truncate("1w"), "cee_operator_name"]
    )
    .agg(num_drivers=pl.col("uuid").count())
    .with_columns(share=weekly_operator_share)
    .sort(pl.col("date_first_cee", "cee_operator_name"))
)
fig_cee_opeator_share_by_week = px.area(
    df_cee_operator_share_by_week,
    x="date_first_cee",
    y="share",
    color="cee_operator_name",
    color_discrete_map=color_mapping,
    labels={
        "date_first_cee": "Date du CEE",
        "share": "Part des conducteurs (%)",
        "cee_operator_name": "Opérateur",
    },
    title="Évolution de la part des CEE octroyés par opérateur<br><sub>Les données sont aggrégées hebdomadairement.",
    template="simple_white",
)
# The plotly update_* methods return the figure, so the styling calls chain.
(
    fig_cee_opeator_share_by_week.update_layout(
        legend_orientation="h", height=700, legend_title=None
    )
    .update_xaxes(title=None)
    .update_yaxes(showgrid=True, gridcolor="black", griddash="dot", side="right")
)
fig_cee_opeator_share_by_week.write_html(OUTPUT_PATH / "part_cee_par_ope.html")
fig_cee_opeator_share_by_week.write_image(
    OUTPUT_PATH / "part_cee_par_ope.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_cee_opeator_share_by_week

In [None]:
# Show the absolute path of the directory the figures are exported to.
OUTPUT_PATH.absolute()

## Nombre de CEE simple vs bonus


In [None]:
# Per operator: number of cohort CEE drivers (num_cee) and number of them with
# at least 10 trips after their CEE with the same operator (num_cee_bonus).
# Reads luis.cee_drivers_v4 — the table documented above and used by every
# other cell — instead of the undocumented cee_drivers_v3, so that all
# statistics describe the same population.
# NOTE(review): revert to v3 if that older table was queried on purpose.
df_cee_by_operator = pl.read_database(
    """
select
	cee_operator_name,
	count(*) as num_cee,
	count(*) filter (where num_trips_after_cee_same_operator>=10) as num_cee_bonus
from
	luis.cee_drivers_v4 cdv 
where cohorte is not null
  group by 1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-operator CEE counts.
df_cee_by_operator.describe()

### Part des conducteurs CEE qui ont obtenu leur bonus sur l'ensemble des conducteurs CEE :

In [None]:
# Overall fraction (0-1) of cohort CEE drivers who reached the bonus, all
# operators together.
df_cee_by_operator["num_cee_bonus"].sum() / df_cee_by_operator["num_cee"].sum()

### Comparaison par opérateur

In [None]:
# Per-operator bonus rate, plus a "Tout opérateur" row aggregating all kept
# operators ("Picholines" is excluded).
#
# This cell rebinds df_cee_by_operator, so it must be safe to re-run: any
# "Tout opérateur" row left by a previous execution is dropped first, otherwise
# it would be counted in the new total and a duplicate row would be appended.
df_cee_by_operator = df_cee_by_operator.filter(
    (pl.col("cee_operator_name") != "Picholines")
    & (pl.col("cee_operator_name") != "Tout opérateur")
).with_columns(
    (pl.col("num_cee_bonus") / pl.col("num_cee")).alias("share_cee_bonus")
)

df_cee_by_operator = pl.concat(
    [
        df_cee_by_operator,
        # Single-row frame holding the totals over the operators kept above.
        df_cee_by_operator.select(
            pl.col("num_cee_bonus").sum(),
            pl.col("num_cee").sum(),
            (pl.col("num_cee_bonus").sum() / pl.col("num_cee").sum()).alias(
                "share_cee_bonus"
            ),
            pl.lit("Tout opérateur").alias("cee_operator_name"),
        ),
    ],
    # Columns are matched by name, so the different column order is fine.
    how="diagonal_relaxed",
)

df_cee_by_operator

In [None]:
# Stacked bars per operator: share of drivers who got the CEE bonus (plain
# bars, bottom) and share who did not (dotted bars, top).
operator_axis = df_cee_by_operator["cee_operator_name"]
operator_colors = df_cee_by_operator["cee_operator_name"].replace(color_mapping)
bonus_share_pct = df_cee_by_operator["share_cee_bonus"] * 100

trace_cee_with_bonus = go.Bar(
    name="Prime CEE avec bonus",
    x=operator_axis,
    y=bonus_share_pct,
    text=bonus_share_pct,
    texttemplate="%{text:.2f}%",
    textposition="inside",
    marker_color=operator_colors,
    hovertemplate="%{y:.2f}% des conducteurs <i>%{x}</i> ont touché la prime CEE <b>avec</b> le bonus<extra></extra>",
)
trace_cee_without_bonus = go.Bar(
    name="Prime CEE sans bonus",
    x=operator_axis,
    y=(1 - df_cee_by_operator["share_cee_bonus"]) * 100,
    marker_color=operator_colors,
    marker_pattern_shape=".",
    marker_pattern_size=4,
    hovertemplate="%{y:.2f}% des conducteurs <i>%{x}</i> ont touché la prime CEE <b>sans</b> le bonus<extra></extra>",
)

fig_cee_stats = go.Figure([trace_cee_with_bonus, trace_cee_without_bonus])
# The plotly update_* methods return the figure, so the styling calls chain.
(
    fig_cee_stats.update_layout(
        barmode="stack",
        plot_bgcolor="white",
        title="Quelle part des conducteurs a touché la prime CEE et son bonus ?<br><sub>Les barres en pointillés représentent la part des conducteurs n'ayant touché que la première partie de la prime.</sub>",
        showlegend=False,
    )
    .update_yaxes(
        range=[0, 105],
        showgrid=True,
        gridcolor="black",
        griddash="dot",
        layer="above traces",
        showline=True,
        title="Part des conducteurs (%)",
        tickvals=[0, 25, 50, 75, 100],
        ticksuffix="%",
    )
    .update_xaxes(zeroline=True, zerolinecolor="black", zerolinewidth=1)
)
fig_cee_stats.write_html(OUTPUT_PATH / "stats_cee_bonus.html")
fig_cee_stats.write_image(
    OUTPUT_PATH / "stats_cee_bonus.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_cee_stats.show()

# Nombre de trajets effectués


## Requêtes


### 2022

In [None]:
# 2022 reference cohort: per driver uuid, number of distinct trips made as a
# driver (status 'ok') during the 23 weeks following the driver's first trip,
# with the dates of the first and last trip of that window.
df_trips_by_driver = pl.read_database(
    """
select
	i.uuid,
  max(aom_name) as aom_name,
	count(distinct trip_id) as num_trajets,
    min(c.datetime) as date_premier_trajet,
    max(c.datetime) as date_dernier_trajet
from
	carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '23 weeks'
and ft.date_first_trip<='2022-09-30'
and is_driver
and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-driver trip counts (2022 cohort).
df_trips_by_driver.describe()

### CEE

In [None]:
# CEE cohorts: per driver uuid, number of distinct trips made as a driver
# (status 'ok') during the 23 weeks following date_first_cee, together with the
# driver's cohort, CEE operator name and max_aom_name (returned as aom_name).
df_trips_by_driver_cohortes = pl.read_database(
    """
with cohortes as (select 
	*
from luis.cee_drivers_v4 cd)
select 
	ch.uuid,
	count(distinct trip_id) as num_trajets,
	min(c.datetime) as date_premier_trajet,
	max(c.datetime) as date_dernier_trajet,
	max(ch.date_first_cee) as date_premier_cee,
	max(ch.cohorte) as cohorte,
  max(ch.cee_operator_name) as cee_operator_name,
  max(ch.max_aom_name) as aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join cohortes ch on
	i."uuid" = ch.uuid
where
	c.datetime BETWEEN ch.date_first_cee AND ch.date_first_cee+ INTERVAL '23 weeks'
	and ch.cohorte is not null
	and c.is_driver
    and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by 1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-driver trip counts (CEE cohorts).
df_trips_by_driver_cohortes.describe()

In [None]:
# Total number of trips per CEE cohort, in chronological order.
# The sort key turns "t1_23" into "23t1" (year first, then quarter), the same
# key as the other cohort tables of this notebook; sorting on the fully
# reversed string only works while all years share the same tens digit.
df_trips_by_driver_cohortes.group_by("cohorte").agg(pl.col("num_trajets").sum()).sort(
    pl.col("cohorte").str.split("_").list.reverse().list.join("")
)

### AOM

In [None]:
# 2022 reference cohort, per-AOM view: for each driver uuid, number of distinct
# trips during the 12 weeks following the first trip, and the "eligible" AOM
# name. A trip yields an eligible AOM only when it starts in one of the six
# selected AOMs AND is made with an operator id accepted for that AOM at that
# date (CASE expression); drivers with no such trip are dropped by the HAVING
# clause.
# NOTE(review): the window here is 12 weeks whereas df_trips_by_driver uses
# 23 weeks, and there is no is_driver / status filter — confirm both are
# intended.
# NOTE(review): the join on geo.perimeters has no
# `p.year = geo.get_latest_millesime()` condition, unlike the documented table
# definitions, so l_aom may come from any perimeter year.
df_trips_by_driver_aom = pl.read_database(
    """
with trips as (
select
	i.uuid,
	ft.aom_name as max_aom_name,
	c.operator_id,
	c.trip_id,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '12 weeks'
)
select 
	"uuid",
	max(max_aom_name) as max_aom_name,
	max(eligible_aom_name) as aom_name,
	count(distinct trip_id) as num_trips
from
	trips
group by
	1
having max(eligible_aom_name) is not null
  """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-driver, per-AOM trip counts (2022 cohort).
df_trips_by_driver_aom.describe()

In [None]:
# CEE cohorts, per-AOM view: same logic as the 2022 per-AOM query, but the
# 12-week window starts at date_first_cee and only drivers with a non-null
# cohort are kept. Drivers without any trip matching the AOM/operator rules of
# the CASE expression are dropped by the HAVING clause.
# NOTE(review): the window here is 12 weeks whereas
# df_trips_by_driver_cohortes uses 23 weeks, and there is no is_driver / status
# filter — confirm both are intended.
# NOTE(review): the join on geo.perimeters has no
# `p.year = geo.get_latest_millesime()` condition, unlike the documented table
# definitions, so l_aom may come from any perimeter year.
df_trips_by_driver_cee_aom = pl.read_database(
    """
with trips as (
select
	i.uuid,
	ch.max_aom_name,
	ch.cohorte,
	c.trip_id,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01'
		and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01'
		and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
		when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
		when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01'
		and c.operator_id = 3 then 'Métropole Rouen Normandie'
		when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01'
		and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole', 'Dijon Métropole', 'Rennes Métropole')
		and c.operator_id in (3, 4, 9) then l_aom
		else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 ch on
	ch.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
where
	c.datetime between ch.date_first_cee and ch.date_first_cee + interval '12 weeks'
)
select 
	"uuid",
	max(max_aom_name) as max_aom_name,
	max(eligible_aom_name) as aom_name,
	max(cohorte) as cohorte,
	count(distinct trip_id) as num_trips
from
	trips
where
	cohorte is not null
group by
	1
having
	max(eligible_aom_name) is not null
  """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-driver, per-AOM trip counts (CEE cohorts).
df_trips_by_driver_cee_aom.describe()

In [None]:
# Number of CEE cohort drivers in each of the selected AOMs.
df_trips_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms)).group_by(
    "aom_name"
).len()

## Moyenne par cohorte

In [None]:
# Mean number of trips per driver in the 2022 reference cohort.
df_trips_by_driver.select(pl.col("num_trajets").mean().alias("moyenne_nombre_trajets"))

In [None]:
# Mean number of trips per driver in each CEE cohort, in chronological order
# (the sort key turns "t1_23" into "23t1": year first, then quarter).
df_trips_by_driver_cohortes.group_by("cohorte").agg(
    pl.col("num_trajets").mean().alias("moyenne_nombre_trajets")
).sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))

## Visualisation 2022

In [None]:
# Intentionally empty: df_trips_by_drivers_2022_agg is only built in the next
# cell, so displaying it here raised NameError on "Restart Kernel -> Run All".
# Inspect the frame after running the next cell instead.

In [None]:
def format_breakpoint(breakpoint: float):
    """Return the display label of a histogram bin given its upper breakpoint.

    Breakpoints up to 5 are single-value bins and are shown as that integer,
    the open-ended last bin (infinite breakpoint) is shown as "Plus de 55",
    and any other breakpoint ``b`` is shown as the range ``"<b-4>-<b>"``
    (5-wide bins).
    """
    if breakpoint <= 5:
        return str(int(breakpoint))
    if breakpoint == np.inf:
        return "Plus de 55"
    lower_bound = breakpoint - 4
    return f"{lower_bound:.0f}-{breakpoint:.0f}"


# Histogram of the number of trips per driver for the 2022 cohort: single-value
# bins for 1-4 trips, then bin edges every 5 trips up to 55, plus an open-ended
# last bin. "cat" is the display label of each bin and "share" its percentage
# of drivers formatted as text (e.g. "12.34%").
df_trips_by_drivers_2022_agg = (
    df_trips_by_driver.get_column("num_trajets")
    .hist([1, 2, 3, 4] + list(range(5, 56, 5)), include_breakpoint=True)
    .with_columns(
        pl.col("breakpoint")
        .map_elements(format_breakpoint, return_dtype=pl.String)
        .alias("cat"),
        # The inner alias gives the formatted column its name ("share").
        pl.format(
            "{}%",
            (100 * pl.col("count") / pl.col("count").sum()).alias("share").round(2),
        ),
    )
)

# NOTE(review): the title and file names say "3 mois" / "3m", but
# df_trips_by_driver is built over a 23-week window — confirm which is right.
fig_trips_by_drivers_2022 = px.bar(
    df_trips_by_drivers_2022_agg,
    x="cat",
    y="count",
    text="share",
    template="simple_white",
    title="Distribution du nombre de trajets effectués par conducteurs<br><sub>Cohorte 2022 - Historique de 3 mois",
)
fig_trips_by_drivers_2022.update_xaxes(title="Nombre de trajets effectués")
fig_trips_by_drivers_2022.update_yaxes(title="Nombre de conducteurs")
fig_trips_by_drivers_2022.write_html(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_2022_3m.html"
)
fig_trips_by_drivers_2022.write_image(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_2022_3m.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_trips_by_drivers_2022.show()

## Comparaison 2022 vs CEE


In [None]:
def preprocess_trips_by_driver_df(df: pl.DataFrame, bins: list[int]) -> pl.DataFrame:
    """Bin the ``num_trajets`` column of ``df`` and compute each bin's share.

    Args:
        df: Frame with a ``num_trajets`` column (number of trips per driver).
        bins: Bin edges passed to ``Series.hist``.

    Returns:
        The histogram frame produced by ``hist(..., include_breakpoint=True)``
        where ``breakpoint`` is turned into a text label (the infinite last
        breakpoint becomes ``"<bins[-1] + 1>+"`` and a ``".0"`` suffix is
        stripped, e.g. ``"10.0"`` -> ``"10"``), plus a ``share`` column holding
        the percentage of rows falling in each bin.
    """
    return (
        df.get_column("num_trajets")
        .hist(bins, include_breakpoint=True)
        .with_columns(
            pl.col("breakpoint")
            .cast(pl.String)
            .replace(np.inf, f"{bins[-1]+1}+")
            # Raw string: "\." is an invalid escape sequence in a normal string
            # literal (a warning today, an error in future Python versions).
            # The pattern itself is unchanged.
            .str.replace(r"(\.0)", ""),
            (100 * pl.col("count") / pl.col("count").sum()).alias("share"),
        )
    )

In [None]:
# Grouped histogram comparing the distribution of the number of trips per
# driver between the 2022 reference cohort and each quarterly CEE cohort.
bins = list(range(100))

# (cohort key, legend label, bar colour) of each CEE cohort, oldest first.
# Driving the configuration from this table replaces five copy-pasted dicts.
cee_cohort_styles = [
    ("t1_23", "CEE T1 2023", "#d7e1ed"),
    ("t2_23", "CEE T2 2023", "#89a6c7"),
    ("t3_23", "CEE T3 2023", "#3E6DA1"),
    ("t4_23", "CEE T4 2023", "#1a334e"),
    ("t1_24", "CEE T1 2024", "rgba(113, 88, 226,1.0)"),
]

# The reference cohort comes first, then one entry per CEE cohort.
plot_configs = [
    {
        "data": preprocess_trips_by_driver_df(df_trips_by_driver, bins),
        "name": "Référence 2022",
        "color": "#f39c12",
    }
]
for cohort_key, cohort_label, cohort_color in cee_cohort_styles:
    plot_configs.append(
        {
            "data": preprocess_trips_by_driver_df(
                df_trips_by_driver_cohortes.filter(pl.col("cohorte") == cohort_key),
                bins,
            ),
            "name": cohort_label,
            "color": cohort_color,
        }
    )

traces = []
for config in plot_configs:
    data = config["data"]
    trace = go.Bar(
        x=data["breakpoint"],
        y=data["share"],
        marker_color=config["color"],
        name=config["name"],
        hovertemplate="%{y:.2f}% des conducteurs ont fait %{x} trajets",
    )
    traces.append(trace)

# NOTE(review): the title and file names say "6 mois" / "6m", but both input
# frames are built over a 23-week window — confirm the wording.
fig_trips_by_drivers_multi = go.Figure(traces)
fig_trips_by_drivers_multi.update_layout(
    barmode="group",
    bargroupgap=0.2,
    plot_bgcolor="white",
    legend_title="Cohorte :",
    title="Distribution du nombre de trajets effectués pour chaque cohorte (historique de 6 mois)",
)
fig_trips_by_drivers_multi.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Part des conducteurs (%)",
)
fig_trips_by_drivers_multi.update_xaxes(title="Nombre de trajets effectués")
# Shade the 10-trip bin, labelled "BONUS CEE".
fig_trips_by_drivers_multi.add_vrect(
    x0=9.5,
    x1=10.5,
    fillcolor="#7f8c8d",
    opacity=0.25,
    line_width=0,
    annotation_text="BONUS CEE",
    annotation_position="top left",
    annotation_textangle=-90,
    annotation_font_size=10,
)
fig_trips_by_drivers_multi.show()
fig_trips_by_drivers_multi.write_html(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_multi_6m.html"
)
fig_trips_by_drivers_multi.write_image(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_multi_6m.svg",
    format="svg",
    width=1280,
    height=720,
)

## Comparaison par opérateur


In [None]:
# Print the mean number of trips per cohort and operator as an ASCII markdown
# table (up to 60 rows), sorted chronologically by cohort and then by operator
# name. print() is used on purpose so the table comes out as plain text.
with pl.Config(tbl_rows=60, tbl_formatting="ASCII_MARKDOWN"):
    print(
        df_trips_by_driver_cohortes.group_by(["cohorte", "cee_operator_name"])
        .agg(pl.col("num_trajets").mean().alias("moyenne_nombre_trajets"))
        .sort(
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "cee_operator_name",
        )
    )

In [None]:
# Line chart of the mean number of trips per driver, by cohort (x axis, in
# chronological order) and operator (one line per operator).
# NOTE(review): the subtitle says "3 mois", but df_trips_by_driver_cohortes is
# built over a 23-week window — confirm the wording.
fig_num_trips_by_cohort_operator = px.line(
    df_trips_by_driver_cohortes.group_by(["cohorte", "cee_operator_name"])
    .agg(pl.col("num_trajets").mean().alias("moyenne_nombre_trajets"))
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "cee_operator_name",
    ),
    x="cohorte",
    y="moyenne_nombre_trajets",
    color="cee_operator_name",
    markers=True,
    color_discrete_map=color_mapping,
    template="simple_white",
    labels={
        "moyenne_nombre_trajets": "Nombre moyen de trajets effectués",
        "cohorte": "Cohorte",
        "cee_operator_name": "Opérateur",
    },
    title="Comparaison du nombre de trajets effectués par cohorte et pour chaque opérateur<br><sub>Historique de trajets de 3 mois</sub>",
)

fig_num_trips_by_cohort_operator.update_traces(
    marker_size=10, line_width=0.5, line_dash="dot"
)
fig_num_trips_by_cohort_operator.write_html(
    OUTPUT_PATH / "stats_num_trajets_par_op.html"
)
fig_num_trips_by_cohort_operator.write_image(
    OUTPUT_PATH / "stats_num_trajets_par_op.svg", format="svg", width=1280, height=720
)
# The height is set after the exports, so it only affects the figure displayed
# as this cell's output (update_layout returns the figure).
fig_num_trips_by_cohort_operator.update_layout(height=800)

In [None]:
# Grouped histogram comparing, for each CEE operator, the distribution of the
# number of trips per driver against the 2022 reference cohort.
bins = list(range(48))


ref_data = preprocess_trips_by_driver_df(df_trips_by_driver, bins)
traces = [
    go.Bar(
        # The histogram column is named "breakpoint" (see
        # preprocess_trips_by_driver_df), not "break_point".
        x=ref_data["breakpoint"],
        y=ref_data["share"],
        name="Cohorte 2022",
        hovertemplate="%{y:.2f}% des conducteurs ont fait %{x} trajets",
        marker_color="#f39c12",
        marker_pattern_shape="x",
        marker_pattern_size=12,
        marker_pattern_fgcolor="black",
        marker_pattern_fgopacity=1,
    )
]

# Operators shown by default; the others start hidden ("legendonly").
enabled_traces = ["YNSTANT", "BlaBlaCar Daily", "MOOVANCE", "Klaxit"]
# The operator column of df_trips_by_driver_cohortes is "cee_operator_name".
# Sorting makes the trace / legend order deterministic across runs.
operator_names = sorted(
    df_trips_by_driver_cohortes["cee_operator_name"].unique().drop_nulls().to_list()
)
for operateur in operator_names:
    data = preprocess_trips_by_driver_df(
        df_trips_by_driver_cohortes.filter(pl.col("cee_operator_name") == operateur),
        bins,
    )
    trace = go.Bar(
        x=data["breakpoint"],
        y=data["share"],
        text=operateur,
        name=operateur,
        hovertemplate="%{y:.2f}% des conducteurs ont fait %{x} trajets",
        # Operators without a dedicated colour fall back to black instead of
        # raising KeyError (same convention as the cohort share chart).
        marker_color=color_mapping.get(operateur, "black"),
        visible=True if operateur in enabled_traces else "legendonly",
    )
    traces.append(trace)

fig_trips_by_drivers_multi_op = go.Figure(traces)
fig_trips_by_drivers_multi_op.update_layout(
    barmode="group",
    bargroupgap=0.2,
    plot_bgcolor="white",
    legend_title="Cohorte :",
    title="Distribution du nombre de trajets effectués pour chaque cohorte",
    template="seaborn",
)
fig_trips_by_drivers_multi_op.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Part des conducteurs (%)",
)
# Only the first 10 trip counts are shown by default.
fig_trips_by_drivers_multi_op.update_xaxes(
    title="Nombre de trajets effectués", range=[0.5, 10.5]
)
# Shade the 10-trip bin, labelled "BONUS CEE".
fig_trips_by_drivers_multi_op.add_vrect(
    x0=9.5,
    x1=10.5,
    fillcolor="#7f8c8d",
    opacity=0.25,
    line_width=0,
    annotation_text="BONUS CEE",
    annotation_position="top left",
    annotation_textangle=-90,
    annotation_font_size=10,
)
fig_trips_by_drivers_multi_op.show()
fig_trips_by_drivers_multi_op.write_html(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_multi_op.html"
)
fig_trips_by_drivers_multi_op.write_image(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_multi_op.svg",
    format="svg",
    width=1280,
    height=720,
)

## Comparaison par AOM


In [None]:
# Reusable polars expression adding an "aom_incentive_status" column:
# "Avec incitation" when aom_name is in aom_with_incentives, otherwise
# "Sans incitation" (this includes any AOM absent from both lists;
# aom_without_incentives is not consulted).
is_aom_expr = (
    pl.when(pl.col("aom_name").is_in(aom_with_incentives))
    .then(pl.lit("Avec incitation"))
    .otherwise(pl.lit("Sans incitation"))
    .alias("aom_incentive_status")
)

### CEE


In [None]:
# CEE cohorts (all together): mean number of trips and number of drivers for
# each selected AOM, sorted by incentive status and then by AOM name.
(
    df_trips_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by(["aom_name"])
    .agg(pl.col("num_trips").mean().alias("moyenne_nombre_trajets"), pl.len())
    .with_columns(is_aom_expr)
    .sort(
        [
            pl.col("aom_incentive_status"),
            pl.col("aom_name"),
        ]
    )
)

In [None]:
# Mean number of trips and row count per (cohort, selected AOM).
# The cohort sort key turns "t1_23" into "23t1" so rows order by year first,
# then by quarter.
df_trips_by_driver_cee_aom_agg = (
    df_trips_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("cohorte", "aom_name")
    .agg(
        [
            pl.col("num_trips").mean().alias("moyenne_nombre_trajets"),
            pl.len(),
        ]
    )
    .with_columns(is_aom_expr)
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)

In [None]:
# Display the per-cohort / per-AOM aggregate built in the previous cell.
df_trips_by_driver_cee_aom_agg

In [None]:
# Mean number of trips per (cohort, incentive status) over all AOMs present in
# the frame (no `selected_aoms` filter here).
df_trips_by_driver_cee_aom_agg_incentives = (
    df_trips_by_driver_cee_aom.with_columns(is_aom_expr)
    .group_by(["cohorte", "aom_incentive_status"])
    .agg(pl.col("num_trips").mean())
    .sort(
        [
            # Same chronological key as the other cells ("t1_23" -> "23t1").
            # A plain character reversal also reverses the year digits, which
            # mis-orders years as soon as the last digit wraps (e.g. 29 vs 30).
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "aom_incentive_status",
        ]
    )
)
df_trips_by_driver_cee_aom_agg_incentives

In [None]:
# Row count and mean number of trips per selected AOM (all CEE cohorts pooled),
# ordered by incentive status, then AOM name.
(
    df_trips_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        [
            pl.len(),
            pl.col("num_trips").mean(),
        ]
    )
    .with_columns(is_aom_expr)
    .sort("aom_incentive_status", "aom_name")
)

### 2022


In [None]:
# 2022 reference: mean number of trips and row count per selected AOM, tagged
# with a literal "2022" cohort so it lines up with the CEE aggregates.
df_trips_by_driver_aom_agg = (
    df_trips_by_driver_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        [
            pl.col("num_trips").mean().alias("moyenne_nombre_trajets"),
            pl.len(),
        ]
    )
    .with_columns(
        [
            is_aom_expr,
            pl.lit("2022").alias("cohorte"),
        ]
    )
    .sort(
        # The cohort key is constant here; it is kept to mirror the CEE cells.
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)
df_trips_by_driver_aom_agg

In [None]:
# 2022 reference: mean number of trips per incentive status (all AOMs in the
# frame, no `selected_aoms` filter).
df_trips_by_driver_aom_agg_incentives = (
    df_trips_by_driver_aom
    .with_columns([is_aom_expr])
    .group_by(["aom_incentive_status"])
    .agg([pl.col("num_trips").mean()])
    .sort(["aom_incentive_status"])
)

### Visualisation


In [None]:
# Grouped bars of the mean number of trips per driver, by cohort and incentive
# status. The 2022 reference frame gets a literal "2022" cohort and is stacked
# with the CEE cohorts; "diagonal" concat aligns columns by name.
fig_trips_by_drivers_multi_aom_status = px.bar(
    pl.concat(
        [
            df_trips_by_driver_aom_agg_incentives.with_columns(
                pl.lit("2022").alias("cohorte")
            ),
            df_trips_by_driver_cee_aom_agg_incentives,
        ],
        how="diagonal",
    ).with_columns(pl.col("num_trips").round(2)),
    x="cohorte",
    y="num_trips",
    text="num_trips",
    color="aom_incentive_status",
    barmode="group",
    template="simple_white",
    labels={
        "aom_incentive_status": "Type d'AOM",
        "num_trips": "Nombre de trajets moyens par conducteur",
        "cohorte": "Cohorte",
    },
    # The subtitle must be closed with </sub>; the original opened a second
    # <sub> instead, leaving the tag unbalanced.
    title="Comparaison du nombre de trajets moyens par conducteur<br><sub>Par type d'AOM et cohorte</sub>",
    color_discrete_map={
        "Avec incitation": "rgba(39, 174, 96,1.0)",
        "Sans incitation": "rgba(47, 54, 64,1.0)",
    },
)
fig_trips_by_drivers_multi_aom_status.show()

# Export both an interactive (HTML) and a static (SVG) version.
fig_trips_by_drivers_multi_aom_status.write_html(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_multi_aom_status.html"
)
fig_trips_by_drivers_multi_aom_status.write_image(
    OUTPUT_PATH / "histo_trajets_par_conducteurs_multi_aom_status.svg",
    format="svg",
    width=1280,
    height=720,
)

# Nombre de semaines d'activité


## Requêtes


### 2022

In [None]:
# 2022 reference cohort: one row per driver uuid with the number of distinct
# trips and the number of distinct calendar weeks (date_trunc('week')) that
# contain at least one trip. Only driver-side carpools with status 'ok' are
# kept, within 23 weeks after each driver's `date_first_trip`.
df_activity_weeks_by_driver = pl.read_database(
    """
select
	i.uuid,
  max(aom_name) as aom_name,
	count(distinct trip_id) as num_trajets,
	count(distinct date_trunc('week',c.datetime)) as num_semaines_activité
from
	carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '23 WEEKS'
and is_driver
and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 per-driver activity frame (sanity check).
df_activity_weeks_by_driver.describe()

### CEE

In [None]:
# CEE cohorts: one row per driver uuid with distinct trip count, distinct
# active-week count, first/last trip timestamps in the window, and the driver's
# CEE attributes (first CEE date, operator name, cohort).
# The window is 23 weeks starting at `date_first_cee`; only driver-side
# carpools with status 'ok' and a non-null cohort are kept.
df_activity_weeks_by_driver_cohortes = pl.read_database(
    """
with cohortes as (select 
	*
from luis.cee_drivers_v4 cd)
select 
	ch.uuid,
  max(max_aom_name) as aom_name,
	count(distinct trip_id) as num_trajets,
	count(distinct date_trunc('week',c.datetime)) as num_semaines_activité,
    min(c.datetime) as date_premier_trajet,
    max(c.datetime) as date_dernier_trajet,
	max(ch.date_first_cee) as date_premier_cee,
  max(ch.cee_operator_name) as cee_operator_name,
	max(ch.cohorte) as cohorte
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join cohortes ch on
	i."uuid" = ch.uuid
where
	c.datetime BETWEEN ch.date_first_cee AND ch.date_first_cee+ INTERVAL '23 WEEKS'
    and ch.cohorte is not null
	and c.is_driver
    and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by 1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the CEE per-driver activity frame (sanity check).
df_activity_weeks_by_driver_cohortes.describe()

### AOM

In [None]:
# 2022 reference cohort, AOM view: one row per driver uuid with the number of
# distinct active weeks in the 12 weeks following `date_first_trip`.
# The CASE expression labels each carpool row with an "eligible" AOM when the
# (l_aom, operator_id, date) combination matches one of the listed rules;
# drivers with no eligible row at all are dropped by the HAVING clause.
# NOTE(review): `l_aom` is unqualified; presumably it comes from
# geo.perimeters (joined on the start geo code) - confirm.
# NOTE(review): unlike the 23-week queries above, this one has no `is_driver`
# or status = 'ok' filter, and the week count includes non-eligible rows of a
# kept driver - confirm this is intended.
df_activity_weeks_by_driver_aom = pl.read_database(
    """
with trips as (
select
	i.uuid,
	ft.aom_name as max_aom_name,
	c.operator_id,
	c.trip_id,
  c.datetime,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '12 WEEKS'
)
select 
	"uuid",
	max(max_aom_name) as max_aom_name,
	max(eligible_aom_name) as aom_name,
	count(distinct date_trunc('week',datetime)) as num_semaines_activité
from
	trips
group by
	1
having max(eligible_aom_name) is not null
  """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 per-driver / AOM activity frame (sanity check).
df_activity_weeks_by_driver_aom.describe()

In [None]:
# Number of rows per eligible AOM in the 2022 frame. The original cell grouped
# by an empty column name and never aggregated, so it only produced a GroupBy
# object; a per-AOM count is the presumable intent - confirm.
df_activity_weeks_by_driver_aom.group_by("aom_name").len().sort("aom_name")

In [None]:
# CEE cohorts, AOM view: one row per driver uuid with its cohort and the number
# of distinct active weeks in the 12 weeks following `date_first_cee`.
# The CASE expression labels each carpool row with an "eligible" AOM using the
# same (l_aom, operator_id, date) rules as the 2022 AOM query; rows with a null
# cohort are excluded and drivers with no eligible row are dropped by HAVING.
# NOTE(review): no `is_driver` / status = 'ok' filter here, unlike the 23-week
# CEE query - confirm this is intended.
df_activity_weeks_by_driver_cee_aom = pl.read_database(
    """
with trips as (
select
	i.uuid,
	ch.max_aom_name,
  ch.cohorte,
	c.operator_id,
	c.trip_id,
  c.datetime,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 ch on
	ch.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
where
	c.datetime BETWEEN ch.date_first_cee AND ch.date_first_cee+ INTERVAL '12 WEEKS'
)
select 
	"uuid",
	max(max_aom_name) as max_aom_name,
	max(eligible_aom_name) as aom_name,
  max(cohorte) as cohorte,
	count(distinct date_trunc('week',datetime)) as num_semaines_activité
from
	trips
where cohorte is not null
group by
	1
having max(eligible_aom_name) is not null
  """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the CEE per-driver / AOM activity frame (sanity check).
df_activity_weeks_by_driver_cee_aom.describe()

## Moyennes

In [None]:
# Mean and median number of active weeks per driver, 2022 reference cohort.
df_activity_weeks_by_driver.select(
    [
        pl.col("num_semaines_activité")
        .mean()
        .alias("moyenne_nombre_semaines_activité"),
        pl.col("num_semaines_activité")
        .median()
        .alias("mediane_nombre_semaines_activité"),
    ]
)

In [None]:
# Mean and median number of active weeks per driver, all CEE cohorts pooled.
df_activity_weeks_by_driver_cohortes.select(
    [
        pl.col("num_semaines_activité")
        .mean()
        .alias("moyenne_nombre_semaines_activité"),
        pl.col("num_semaines_activité")
        .median()
        .alias("mediane_nombre_semaines_activité"),
    ]
)

In [None]:
# Mean number of active weeks per CEE cohort, rounded to one decimal and
# ordered chronologically ("t1_23" sorts as "23t1").
(
    df_activity_weeks_by_driver_cohortes
    .group_by("cohorte")
    .agg(
        [
            pl.col("num_semaines_activité")
            .mean()
            .round(1)
            .alias("moyenne_nombre_semaines_activité")
        ]
    )
    .sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))
)

## Comparaison 2022 vs 2023 CEE


In [None]:
def preprocess_activity_week_by_driver_df(
    df: pl.DataFrame, bins: list[int]
) -> pl.DataFrame:
    """Bin the number of active weeks and compute each bin's share.

    Args:
        df: Frame holding a ``num_semaines_activité`` column.
        bins: Bin edges handed to ``Series.hist``.

    Returns:
        The frame returned by ``Series.hist(..., include_breakpoint=True)``
        with ``breakpoint`` rewritten as a string label and an extra ``share``
        column: each bin's count as a percentage of the total count.
    """
    return (
        df.get_column("num_semaines_activité")
        .hist(bins, include_breakpoint=True)
        .with_columns(
            pl.col("breakpoint")
            .cast(pl.String)
            # An infinite breakpoint is relabelled with the value just past the
            # last edge (presumably the open-ended upper bin - confirm against
            # the installed polars version).
            .replace(np.inf, f"{bins[-1]+1}")
            # Raw string: "\." is an invalid escape sequence in a plain literal.
            # Anchored so only a trailing ".0" is removed ("10.0" -> "10").
            .str.replace(r"\.0$", ""),
            (100 * pl.col("count") / pl.col("count").sum()).alias("share"),
        )
    )

In [None]:
# Histogram of active weeks per driver: 2022 reference vs. each CEE cohort.
bins = list(range(13))

# The reference cohort comes first; each CEE cohort follows, described by
# (cohort value in the data, legend name, bar colour).
plot_configs = [
    {
        "data": preprocess_activity_week_by_driver_df(
            df_activity_weeks_by_driver, bins
        ),
        "name": "Référence 2022",
        "color": "#f39c12",
    },
    *(
        {
            "data": preprocess_activity_week_by_driver_df(
                df_activity_weeks_by_driver_cohortes.filter(
                    pl.col("cohorte") == cohort_key
                ),
                bins,
            ),
            "name": legend_name,
            "color": bar_color,
        }
        for cohort_key, legend_name, bar_color in [
            ("t1_23", "CEE T1 2023", "#d7e1ed"),
            ("t2_23", "CEE T2 2023", "#89a6c7"),
            ("t3_23", "CEE T3 2023", "#3E6DA1"),
            ("t4_23", "CEE T4 2023", "#1a334e"),
            ("t1_24", "CEE T1 2024", "rgba(113, 88, 226,1.0)"),
        ]
    ),
]

# One bar trace per cohort.
traces = []
for config in plot_configs:
    data = config["data"]
    trace = go.Bar(
        name=config["name"],
        marker_color=config["color"],
        x=data["breakpoint"],
        y=data["share"],
        hovertemplate="%{y:.2f}% des conducteurs ont %{x} semaine(s) d'activité",
    )
    traces.append(trace)

fig_activity_weeks_drivers_multi = go.Figure(traces)
fig_activity_weeks_drivers_multi.update_layout(
    title=(
        "Nombre de semaines d'activité par conducteur<br>"
        "<sub>Historique de 6 mois.<br>Une semaine d'activité est définie comme une semaine où le conducteur a effectué au moins un trajet</sub>"
    ),
    legend_title="Cohorte :",
    plot_bgcolor="white",
    barmode="group",
    bargroupgap=0.2,
)
fig_activity_weeks_drivers_multi.update_yaxes(
    title="Part des conducteurs (%)",
    showgrid=True,
    gridcolor="gray",
    gridwidth=1,
    griddash="dashdot",
)
fig_activity_weeks_drivers_multi.update_xaxes(title="Nombre de semaines d'activité")
fig_activity_weeks_drivers_multi.show()

# Export both an interactive (HTML) and a static (SVG) version.
fig_activity_weeks_drivers_multi.write_html(
    OUTPUT_PATH / "histo_semaines_activites_par_conducteurs_multi.html"
)
fig_activity_weeks_drivers_multi.write_image(
    OUTPUT_PATH / "histo_semaines_activites_par_conducteurs_multi.svg",
    format="svg",
    width=1280,
    height=720,
)

## Comparaison opérateurs


In [None]:
# Mean number of active weeks per (cohort, CEE operator), ordered
# chronologically by cohort and then by operator name.
df_activity_weeks_by_driver_cohortes_op = (
    df_activity_weeks_by_driver_cohortes
    .group_by("cohorte", "cee_operator_name")
    .agg(
        [
            pl.col("num_semaines_activité")
            .mean()
            .alias("moyenne_nombre_semaines_activité")
        ]
    )
    .sort(
        [
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "cee_operator_name",
        ]
    )
)
df_activity_weeks_by_driver_cohortes_op

In [None]:
# Copy the per-cohort / per-operator table to the system clipboard.
df_activity_weeks_by_driver_cohortes_op.write_clipboard()

In [None]:
# Mean number of active weeks per cohort, one line per CEE operator.
fig_num_activity_weeks_by_cohort_operator = px.line(
    df_activity_weeks_by_driver_cohortes_op,
    x="cohorte",
    y="moyenne_nombre_semaines_activité",
    color="cee_operator_name",
    markers=True,
    color_discrete_map=color_mapping,
    template="simple_white",
    labels={
        "moyenne_nombre_semaines_activité": "Nombre moyen de semaines d'activité",
        "cohorte": "Cohorte",
        "cee_operator_name": "Opérateur",
    },
    # The underlying frame comes from the 23-week query, which the histogram of
    # the same data labels "Historique de 6 mois"; the subtitle previously said
    # "3 mois".
    title="Comparaison du nombre de semaines d'activités par cohorte et pour chaque opérateur<br><sub>Historique de 6 mois</sub>",
)

# Emphasise the markers; the thin dotted lines only hint at the trend.
fig_num_activity_weeks_by_cohort_operator.update_traces(
    marker_size=10, line_width=0.5, line_dash="dot"
)
# Export before the height tweak below, which only affects the inline display.
fig_num_activity_weeks_by_cohort_operator.write_html(
    OUTPUT_PATH / "stats_num_semaines_activites_par_op.html"
)
fig_num_activity_weeks_by_cohort_operator.write_image(
    OUTPUT_PATH / "stats_num_semaines_activites_par_op.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_num_activity_weeks_by_cohort_operator.update_layout(height=800)

## Comparaison par AOM


### CEE


In [None]:
# Mean number of active weeks (2 decimals) and row count per selected AOM,
# all CEE cohorts pooled, ordered by incentive status then AOM name.
(
    df_activity_weeks_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        [
            pl.col("num_semaines_activité")
            .mean()
            .round(2)
            .alias("moyenne_nombre_semaine_activite"),
            pl.len(),
        ]
    )
    .with_columns(is_aom_expr)
    .sort("aom_incentive_status", "aom_name")
)

In [None]:
# Mean number of active weeks per (cohort, selected AOM), ordered
# chronologically by cohort, then by incentive status and AOM name.
df_activity_weeks_by_driver_cee_aom_agg = (
    df_activity_weeks_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("cohorte", "aom_name")
    .agg(
        [
            pl.col("num_semaines_activité")
            .mean()
            .alias("moyenne_nombre_semaine_activite")
        ]
    )
    .with_columns(is_aom_expr)
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)
df_activity_weeks_by_driver_cee_aom_agg

In [None]:
# Mean number of active weeks per (cohort, incentive status) over all AOMs
# present in the frame (no `selected_aoms` filter here).
df_activity_weeks_by_driver_cee_aom_agg_incentives = (
    df_activity_weeks_by_driver_cee_aom.with_columns(is_aom_expr)
    .group_by(["cohorte", "aom_incentive_status"])
    .agg(pl.col("num_semaines_activité").mean())
    .sort(
        [
            # Same chronological key as the other cells ("t1_23" -> "23t1").
            # A plain character reversal also reverses the year digits, which
            # mis-orders years as soon as the last digit wraps (e.g. 29 vs 30).
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "aom_incentive_status",
        ]
    )
)
df_activity_weeks_by_driver_cee_aom_agg_incentives

### 2022


In [None]:
# 2022 reference: mean number of active weeks (2 decimals) and row count per
# selected AOM, tagged with a literal "2022" cohort.
df_activity_weeks_by_driver_aom_agg = (
    df_activity_weeks_by_driver_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        [
            pl.col("num_semaines_activité")
            .mean()
            .round(2)
            .alias("moyenne_nombre_semaine_activite"),
            pl.len(),
        ]
    )
    .with_columns(
        [
            is_aom_expr,
            pl.lit("2022").alias("cohorte"),
        ]
    )
    .sort(
        # The cohort key is constant here; it is kept to mirror the CEE cells.
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)
df_activity_weeks_by_driver_aom_agg

In [None]:
# 2022 reference: mean number of active weeks per incentive status (all AOMs
# in the frame, no `selected_aoms` filter).
df_activity_weeks_by_driver_aom_agg_incentives = (
    df_activity_weeks_by_driver_aom
    .with_columns([is_aom_expr])
    .group_by(["aom_incentive_status"])
    .agg([pl.col("num_semaines_activité").mean()])
    .sort(["aom_incentive_status"])
)
df_activity_weeks_by_driver_aom_agg_incentives

### Visualisation


In [None]:
# Grouped bars of the mean number of active weeks per driver, by cohort and
# incentive status. The 2022 reference frame gets a literal "2022" cohort and
# is stacked with the CEE cohorts; "diagonal" concat aligns columns by name.
fig_activity_weeks_by_drivers_multi_aom_status = px.bar(
    pl.concat(
        [
            df_activity_weeks_by_driver_aom_agg_incentives.with_columns(
                pl.lit("2022").alias("cohorte")
            ),
            df_activity_weeks_by_driver_cee_aom_agg_incentives,
        ],
        how="diagonal",
    ).with_columns(pl.col("num_semaines_activité").round(2)),
    x="cohorte",
    y="num_semaines_activité",
    text="num_semaines_activité",
    color="aom_incentive_status",
    barmode="group",
    template="simple_white",
    labels={
        "aom_incentive_status": "Type d'AOM",
        "num_semaines_activité": "Nombre moyen de semaines d'activité",
        "cohorte": "Cohorte",
    },
    # The subtitle must be closed with </sub>; the original opened a second
    # <sub> instead, leaving the tag unbalanced.
    title="Comparaison du nombre moyen de semaines d'activité par conducteur<br><sub>Par type d'AOM et cohorte</sub>",
    color_discrete_map={
        "Avec incitation": "rgba(39, 174, 96,1.0)",
        "Sans incitation": "rgba(47, 54, 64,1.0)",
    },
)
fig_activity_weeks_by_drivers_multi_aom_status.show()
# Export both an interactive (HTML) and a static (SVG) version.
fig_activity_weeks_by_drivers_multi_aom_status.write_html(
    OUTPUT_PATH / "histo_semaines_activite_par_conducteurs_multi_aom_status.html"
)
fig_activity_weeks_by_drivers_multi_aom_status.write_image(
    OUTPUT_PATH / "histo_semaines_activite_par_conducteurs_multi_aom_status.svg",
    format="svg",
    width=1280,
    height=720,
)

# Distance


## Requêtes


### 2022

In [None]:
# 2022 reference cohort: one row per trip_id with its (max) distance and AOM
# name. Only driver-side carpools with status 'ok' are kept, within 23 weeks
# after each driver's `date_first_trip`.
# NOTE(review): later cells divide `distance` by 1000 and label it km, so it
# is presumably stored in metres - confirm.
df_distance_by_trips = pl.read_database(
    """
select
	trip_id,
  max(aom_name) as aom_name,
	max(distance) as distance
from
	carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '23 weeks'
and is_driver
and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 per-trip distance frame (sanity check).
df_distance_by_trips.describe()

On filtre les trajets > 100km :

In [None]:
# Same statistics restricted to trips with distance strictly below 100_000.
df_distance_by_trips.filter(pl.col("distance") < 100_000).describe()

Part des trajets >100km :

In [None]:
# Percentage of 2022 trips whose distance exceeds 100_000: the boolean mask is
# summed (True counts as 1) and divided by the total row count.
df_distance_by_trips.select(
    [
        (
            100 * (pl.col("distance") > 100_000).sum() / pl.len()
        ).alias("Part des trajets >100km sur l'ensemble des trajets (%)")
    ]
)

### CEE

In [None]:
# CEE cohorts: one row per trip_id with its (max) distance, CEE operator name,
# cohort and AOM name. When `distance` is null the query falls back to
# meta->>'calc_distance' (cast to int). Only driver-side carpools with status
# 'ok' and a non-null cohort are kept, within 23 weeks after `date_first_cee`.
df_distance_by_trips_cohortes = pl.read_database(
    """
with cohortes as (select 
	*
from luis.cee_drivers_v4 cd)
select
	trip_id,
	max(coalesce(distance,cast(meta->>'calc_distance' as int))) as distance,
  max(ft.cee_operator_name) as cee_operator_name,
  max(ft.cohorte) as cohorte,
  max(max_aom_name) as aom_name
from
	carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner JOIN cohortes ft ON ft.uuid=i.uuid
WHERE c.datetime BETWEEN ft.date_first_cee AND ft.date_first_cee + INTERVAL '23 weeks'
and is_driver
and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
and ft.cohorte is not null
group by
	1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the CEE per-trip distance frame (sanity check).
df_distance_by_trips_cohortes.describe()

### AOM

In [None]:
# 2022 reference cohort, AOM view: one row per trip_id with its eligible AOM
# and (max) distance, within 12 weeks after `date_first_trip`.
# The CASE expression labels each carpool row with an "eligible" AOM using
# (l_aom, operator_id, date) rules; rows without an eligible AOM are dropped.
# NOTE(review): no `is_driver` / status = 'ok' filter here, unlike the 23-week
# distance query - confirm this is intended.
df_distance_by_trips_aom = pl.read_database(
    """
with trips as (
select
	i.uuid,
	ft.aom_name as max_aom_name,
	c.operator_id,
	c.trip_id,
  c.distance,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '12 weeks'
)
select 
	trip_id,
	max(eligible_aom_name) as aom_name,
	max(distance) as distance
from
	trips
where eligible_aom_name is not null
group by
	1
  """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 per-trip / AOM distance frame (sanity check).
df_distance_by_trips_aom.describe()

In [None]:
# CEE cohorts, AOM view: one row per trip_id with its eligible AOM, cohort and
# (max) distance, within 12 weeks after `date_first_cee`.
# Same eligibility CASE as the 2022 AOM distance query; rows without an
# eligible AOM or without a cohort are dropped.
# NOTE(review): no `is_driver` / status = 'ok' filter here, unlike the 23-week
# CEE distance query - confirm this is intended.
df_distance_by_trips_cee_aom = pl.read_database(
    """
with trips as (
select
	i.uuid,
	ft.max_aom_name,
  ft.cohorte,
	c.operator_id,
	c.trip_id,
  c.distance,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_cee AND ft.date_first_cee + INTERVAL '12 weeks'

)
select 
	trip_id,
	max(eligible_aom_name) as aom_name,
  max(cohorte) as cohorte,
	max(distance) as distance
from
	trips
where eligible_aom_name is not null
and cohorte is not null
group by
	1
  """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the CEE per-trip / AOM distance frame (sanity check).
df_distance_by_trips_cee_aom.describe()

## Moyennes

### 2022

In [None]:
# 2022 statistics restricted to trips with distance strictly below 100_000
# (same computation as in the "Requêtes" section).
df_distance_by_trips.filter(pl.col("distance") < 100_000).describe()

Part des trajets >100km :

In [None]:
# Percentage of 2022 trips whose distance exceeds 100_000: the boolean mask is
# summed (True counts as 1) and divided by the total row count.
df_distance_by_trips.select(
    [
        (
            100 * (pl.col("distance") > 100_000).sum() / pl.len()
        ).alias("Part des trajets >100km sur l'ensemble des trajets (%)")
    ]
)

### CEE

In [None]:
# Mean distance (value / 1000, one decimal) per CEE cohort, ordered
# chronologically ("t1_23" sorts as "23t1").
(
    df_distance_by_trips_cohortes
    .group_by("cohorte")
    .agg(
        [
            (pl.col("distance") / 1000)
            .mean()
            .round(1)
            .alias("moyenne_distance")
        ]
    )
    .sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))
)

On filtre les trajets > 100km :

In [None]:
# CEE statistics restricted to trips with distance strictly below 100_000.
df_distance_by_trips_cohortes.filter(pl.col("distance") < 100_000).describe()

In [None]:
# CEE statistics for the excluded tail: trips with distance above 100_000.
df_distance_by_trips_cohortes.filter(pl.col("distance") > 100_000).describe()

In [None]:
# Mean distance (value / 1000, one decimal) per CEE cohort after dropping trips
# at or above 100_000, ordered chronologically.
(
    df_distance_by_trips_cohortes
    .filter(pl.col("distance") < 100_000)
    .group_by("cohorte")
    .agg(
        [
            (pl.col("distance") / 1000)
            .mean()
            .round(1)
            .alias("moyenne_distance")
        ]
    )
    .sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))
)

Part des trajets >100km :

In [None]:
# Percentage of CEE trips whose distance exceeds 100_000: the boolean mask is
# summed (True counts as 1) and divided by the total row count.
df_distance_by_trips_cohortes.select(
    [
        (
            100 * (pl.col("distance") > 100_000).sum() / pl.len()
        ).alias("Part des trajets >100km sur l'ensemble des trajets (%)")
    ]
)

## Comparaison 2022 vs 2023 CEE


In [None]:
# Overlaid distance histograms: 2022 reference vs. each CEE cohort.
# The reference cohort comes first; each CEE cohort follows, described by
# (cohort value in the data, legend name, bar colour).
plot_configs = [
    {
        "data": df_distance_by_trips,
        "name": "Référence 2022",
        "color": "#f39c12",
    },
    *(
        {
            "data": df_distance_by_trips_cohortes.filter(
                pl.col("cohorte") == cohort_key
            ),
            "name": legend_name,
            "color": bar_color,
        }
        for cohort_key, legend_name, bar_color in [
            ("t1_23", "CEE T1 2023", "#d7e1ed"),
            ("t2_23", "CEE T2 2023", "#89a6c7"),
            ("t3_23", "CEE T3 2023", "#3E6DA1"),
            ("t4_23", "CEE T4 2023", "#1a334e"),
            ("t1_24", "CEE T1 2024", "rgba(113, 88, 226,1.0)"),
        ]
    ),
]

# One histogram trace per cohort; only three are shown by default, the others
# stay available from the legend.
traces = []
for config in plot_configs:
    data = config["data"]
    if config["name"] in ["Référence 2022", "CEE T4 2023", "CEE T1 2024"]:
        initial_visibility = True
    else:
        initial_visibility = "legendonly"
    trace = go.Histogram(
        name=config["name"],
        x=data["distance"] / 1000,
        histfunc="count",
        histnorm="percent",
        xbins_start=0,
        xbins_size=0.2,
        marker_color=config["color"],
        marker_opacity=0.5,
        hovertemplate="%{y:.2f}% des trajets font %{x} km",
        visible=initial_visibility,
    )
    traces.append(trace)

fig_distance_multi = go.Figure(traces)
fig_distance_multi.update_layout(
    title="Distribution des distances réalisées pour les trajets de chaque cohorte",
    legend_title="Cohorte :",
    plot_bgcolor="white",
    barmode="overlay",
)
fig_distance_multi.update_yaxes(
    title="Part des trajets (%)",
    showgrid=True,
    gridcolor="gray",
    gridwidth=1,
    griddash="dashdot",
)
fig_distance_multi.update_xaxes(range=[0, 70], title="Distance réalisée (km)")
fig_distance_multi.show()

# Export both an interactive (HTML) and a static (SVG) version.
fig_distance_multi.write_html(OUTPUT_PATH / "histo_distances_multi.html")
fig_distance_multi.write_image(
    OUTPUT_PATH / "histo_distances_multi.svg",
    format="svg",
    width=1920,
    height=1080,
)

## Comparaison opérateurs


In [None]:
# Mean distance (value / 1000) per (cohort, CEE operator), ordered
# chronologically by cohort and then by operator name.
df_distances_by_cohortes_op = (
    df_distance_by_trips_cohortes
    .group_by("cohorte", "cee_operator_name")
    .agg([(pl.col("distance") / 1000).mean().alias("distance_moyenne")])
    .sort(
        [
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "cee_operator_name",
        ]
    )
)
# Raise the row limit temporarily so the whole table is printed.
with pl.Config(tbl_rows=600):
    print(df_distances_by_cohortes_op)

In [None]:
# Copy the per-cohort / per-operator distance table to the system clipboard.
df_distances_by_cohortes_op.write_clipboard()

In [None]:
# Mean trip distance per cohort, one line per CEE operator.
fig_distance_operator = px.line(
    df_distances_by_cohortes_op,
    x="cohorte",
    y="distance_moyenne",
    color="cee_operator_name",
    markers=True,
    color_discrete_map=color_mapping,
    template="simple_white",
    labels={
        "distance_moyenne": "Distance moyenne (km)",
        "cohorte": "Cohorte",
        "cee_operator_name": "Opérateur",
    },
    # The underlying frame comes from the 23-week query, which this notebook
    # labels "Historique de 6 mois" elsewhere; the subtitle previously said
    # "3 mois".
    title="Comparaison de la distance moyenne par cohorte et pour chaque opérateur<br><sub>Historique de 6 mois</sub>",
)

# Emphasise the markers; the thin dotted lines only hint at the trend.
fig_distance_operator.update_traces(marker_size=11, line_width=0.5, line_dash="dot")
# Export before the height tweak below, which only affects the inline display.
fig_distance_operator.write_html(OUTPUT_PATH / "stats_distances_par_op.html")
fig_distance_operator.write_image(
    OUTPUT_PATH / "stats_distances_par_op.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_distance_operator.update_layout(height=800)

## Comparaison par AOM


### CEE


In [None]:
# Mean distance (value / 1000, 2 decimals) and row count per selected AOM,
# all CEE cohorts pooled, ordered by incentive status then AOM name.
(
    df_distance_by_trips_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        [
            (pl.col("distance") / 1000)
            .mean()
            .round(2)
            .alias("distance_moyenne"),
            pl.len(),
        ]
    )
    .with_columns(is_aom_expr)
    .sort("aom_incentive_status", "aom_name")
)

In [None]:
# Mean distance (value / 1000) per (cohort, selected AOM), ordered
# chronologically by cohort, then by incentive status and AOM name.
df_distance_by_trips_cee_aom_agg = (
    df_distance_by_trips_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("cohorte", "aom_name")
    .agg([(pl.col("distance") / 1000).mean().alias("distance_moyenne")])
    .with_columns(is_aom_expr)
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)
df_distance_by_trips_cee_aom_agg

In [None]:
# Mean distance (value / 1000) per (cohort, incentive status) over all AOMs
# present in the frame (no `selected_aoms` filter here).
df_distance_by_trips_cee_aom_agg_incentives = (
    df_distance_by_trips_cee_aom.with_columns(is_aom_expr)
    .group_by(["cohorte", "aom_incentive_status"])
    .agg((pl.col("distance") / 1000).mean())
    .sort(
        [
            # Same chronological key as the other cells ("t1_23" -> "23t1").
            # A plain character reversal also reverses the year digits, which
            # mis-orders years as soon as the last digit wraps (e.g. 29 vs 30).
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "aom_incentive_status",
        ]
    )
)
df_distance_by_trips_cee_aom_agg_incentives

### 2022


In [None]:
# 2022 reference: mean distance (value / 1000, 2 decimals) and row count per
# selected AOM, tagged with a literal "2022" cohort.
df_distance_by_trips_aom_agg = (
    df_distance_by_trips_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        [
            (pl.col("distance") / 1000)
            .mean()
            .round(2)
            .alias("distance_moyenne"),
            pl.len(),
        ]
    )
    .with_columns(
        [
            is_aom_expr,
            pl.lit("2022").alias("cohorte"),
        ]
    )
    .sort(
        # The cohort key is constant here; it is kept to mirror the CEE cells.
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)
df_distance_by_trips_aom_agg

In [None]:
# 2022 reference: mean distance (value / 1000) per incentive status (all AOMs
# in the frame, no `selected_aoms` filter).
df_distance_by_trips_aom_agg_incentives = (
    df_distance_by_trips_aom
    .with_columns([is_aom_expr])
    .group_by(["aom_incentive_status"])
    .agg([(pl.col("distance") / 1000).mean()])
    .sort(["aom_incentive_status"])
)
df_distance_by_trips_aom_agg_incentives

### Visualisation


In [None]:
# Grouped bars of the mean trip distance, by cohort and incentive status.
# The 2022 reference frame gets a literal "2022" cohort and is stacked with the
# CEE cohorts; "diagonal" concat aligns columns by name.
fig_distance_by_trips_multi_aom_status = px.bar(
    pl.concat(
        [
            df_distance_by_trips_aom_agg_incentives.with_columns(
                pl.lit("2022").alias("cohorte")
            ),
            df_distance_by_trips_cee_aom_agg_incentives,
        ],
        how="diagonal",
    ).with_columns(pl.col("distance").round(2)),
    x="cohorte",
    y="distance",
    text="distance",
    color="aom_incentive_status",
    barmode="group",
    template="simple_white",
    labels={
        "aom_incentive_status": "Type d'AOM",
        "distance": "Distance moyenne par trajet (km)",
        "cohorte": "Cohorte",
    },
    # The subtitle must be closed with </sub>; the original opened a second
    # <sub> instead, leaving the tag unbalanced.
    title="Comparaison de la distance moyenne par trajet<br><sub>Par type d'AOM et cohorte</sub>",
    color_discrete_map={
        "Avec incitation": "rgba(39, 174, 96,1.0)",
        "Sans incitation": "rgba(47, 54, 64,1.0)",
    },
)
fig_distance_by_trips_multi_aom_status.show()
# Export both an interactive (HTML) and a static (SVG) version.
fig_distance_by_trips_multi_aom_status.write_html(
    OUTPUT_PATH / "histo_distance_multi_aom_status.html"
)
fig_distance_by_trips_multi_aom_status.write_image(
    OUTPUT_PATH / "histo_distance_multi_aom_status.svg",
    format="svg",
    width=1280,
    height=720,
)

# Heures de départ


## Requêtes


In [None]:
# 2022 reference cohort: one row per trip_id with the day of week ('dow') and
# hour of its earliest carpool datetime, converted to Europe/Paris.
# Only driver-side carpools with status 'ok' are kept, within 12 weeks after
# each driver's `date_first_trip`.
df_departure_dow_hour_by_trips = pl.read_database(
    """
SELECT 
	trip_id,
	date_part('dow',min(datetime at time zone 'Europe/Paris')) as jour,
	date_part('hour',min(datetime at time zone 'Europe/Paris')) as heure
from
	carpool.carpools c
    left join carpool.identities i on c.identity_id = i._id
    inner JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
    WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '12 WEEKS'
    and is_driver
    and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
 """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 departure day/hour frame (sanity check).
df_departure_dow_hour_by_trips.describe()

In [None]:
# CEE cohorts: one row per trip_id with the day of week ('dow') and hour of its
# earliest carpool datetime (Europe/Paris), plus the driver's cohort.
# Only driver-side carpools with status 'ok' are kept, within 12 weeks after
# `date_first_cee`.
# NOTE(review): unlike the other CEE queries, rows with a null cohort are not
# filtered out here - confirm downstream code handles them.
df_departure_dow_hour_by_trips_cohortes = pl.read_database(
    """
SELECT 
	trip_id,
	date_part('dow',min(datetime at time zone 'Europe/Paris')) as jour,
	date_part('hour',min(datetime at time zone 'Europe/Paris')) as heure,
  max(ft.cohorte) as cohorte
from
	carpool.carpools c
    left join carpool.identities i on c.identity_id = i._id
    inner JOIN luis.cee_drivers_v4 ft ON ft.uuid=i.uuid
    WHERE c.datetime BETWEEN ft.date_first_cee AND ft.date_first_cee + INTERVAL '12 WEEKS'
    and ft.uuid is not null
    and is_driver
    and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the CEE departure day/hour frame (sanity check).
df_departure_dow_hour_by_trips_cohortes.describe()

## Comparaison


In [None]:
def preprocess_departure_dow_hour_by_trips_df(
    departure_dow_hour_by_trips_df: pl.DataFrame,
) -> pl.DataFrame:
    """Aggregate trips into a (weekday, hour) share table for the heatmaps.

    Args:
        departure_dow_hour_by_trips_df: one row per trip with columns
            ``trip_id``, ``jour`` (0 = Sunday ... 6 = Saturday) and ``heure``.

    Returns:
        One row per (``jour``, ``heure``) pair with:
        - ``share``: percentage of all input trips departing in that slot,
        - ``jour_str``: French weekday name.
        Rows are sorted Sunday first down to Monday last.
    """
    mapping_jours = {
        "1": "Lundi",
        "2": "Mardi",
        "3": "Mercredi",
        "4": "Jeudi",
        "5": "Vendredi",
        "6": "Samedi",
        "0": "Dimanche",
    }
    total_trips = departure_dow_hour_by_trips_df.shape[0]
    # Monday-based weekday index (Monday = 0 ... Sunday = 6). Adding 6 instead
    # of subtracting 1 keeps the operand non-negative, so the result does not
    # depend on how the modulo operator treats negative numbers.
    monday_based_index = (pl.col("jour") + 6) % 7

    return (
        departure_dow_hour_by_trips_df.with_columns(
            pl.col("jour").cast(pl.Int8), pl.col("heure").cast(pl.Int8)
        )
        .group_by(["jour", "heure"])
        .agg((100 * pl.col("trip_id").count() / total_trips).alias("share"))
        .with_columns(
            pl.col("jour").cast(pl.String).replace(mapping_jours).alias("jour_str")
        )
        .sort(monday_based_index, descending=True)
    )

In [None]:
# Heatmap of departure slots (weekday x hour) for the 2022 reference cohort.
data = preprocess_departure_dow_hour_by_trips_df(df_departure_dow_hour_by_trips)
# Only label cells holding more than 0.5% of the trips, to keep the map readable.
data = data.with_columns(
    pl.when(pl.col("share") > 0.5)
    .then(pl.format("{}%", pl.col("share").round(2).cast(pl.String)))
    .otherwise(pl.lit(""))
    .alias("text")
)
trace = go.Heatmap(
    x=data["heure"],
    y=data["jour_str"],
    z=data["share"],
    # `texttemplate` reads from the trace's `text` array, so the labels
    # computed above must be passed explicitly.
    text=data["text"],
    texttemplate="%{text}",
    xgap=2,
    ygap=2,
    # `share` is expressed in percent: the colour scale saturates at 1%.
    zmin=0,
    zmax=1,
    autocolorscale=False,
    colorscale=[[0, "#ffffff"], [1, "#832804"]],
    hovertemplate="%{z:.2f}% des trajets ont lieu le jour %{y} de %{x}H00 à %{x}H59<extra></extra>",
    colorbar_title="Part des trajets",
    colorbar_ticksuffix="%",
)
fig_dow_hours_trips = go.Figure([trace])
fig_dow_hours_trips.update_layout(
    plot_bgcolor="white", title="Cohorte 2022 - Quand ont lieu les départs de trajets ?"
)
fig_dow_hours_trips.update_xaxes(dtick=1, title="Heure de la journée")
fig_dow_hours_trips.update_yaxes(dtick=1, title="Jour de la semaine")
fig_dow_hours_trips.show()

In [None]:
# Heatmap of departure slots (weekday x hour) for the CEE T4 2023 cohort.
data = preprocess_departure_dow_hour_by_trips_df(
    df_departure_dow_hour_by_trips_cohortes.filter(pl.col("cohorte") == "t4_23")
)
# Only label cells holding more than 0.5% of the trips; the "%" suffix matches
# the labels of the 2022 reference heatmap.
data = data.with_columns(
    pl.when(pl.col("share") > 0.5)
    .then(pl.format("{}%", pl.col("share").round(2).cast(pl.String)))
    .otherwise(pl.lit(""))
    .alias("text")
)
trace = go.Heatmap(
    x=data["heure"],
    y=data["jour_str"],
    z=data["share"],
    # `texttemplate` reads from the trace's `text` array, so the labels
    # computed above must be passed explicitly.
    text=data["text"],
    texttemplate="%{text}",
    xgap=2,
    ygap=2,
    # `share` is expressed in percent: the colour scale saturates at 1%.
    zmin=0,
    zmax=1,
    autocolorscale=False,
    colorscale=[[0, "#ffffff"], [1, "#832804"]],
    hovertemplate="%{z:.2f}% des trajets ont lieu le jour %{y} de %{x}H00 à %{x}H59<extra></extra>",
    colorbar_title="Part des trajets",
    colorbar_ticksuffix="%",
)
fig_dow_hours_trips_cee = go.Figure([trace])
fig_dow_hours_trips_cee.update_layout(
    plot_bgcolor="white",
    title="Cohorte CEE T4 23 - Quand ont lieu les départs de trajets ?",
)
fig_dow_hours_trips_cee.update_xaxes(dtick=1, title="Heure de la journée")
fig_dow_hours_trips_cee.update_yaxes(dtick=1, title="Jour de la semaine")
fig_dow_hours_trips_cee.show()

# Passagers


## Requêtes


### 2022

In [None]:
# Seats per trip for the 2022 reference cohort (one row per trip_id).
# `count` sums `seats` over every carpool row of the trip; the trip is kept when
# at least one of its participants belongs to the cohort and the trip took place
# within 23 weeks after that participant's first trip.
# The status filter lives in WHERE: inside the LEFT JOIN's ON clause it only
# restricted which rows matched the cohort, and non-'ok' rows were still summed.
df_passengers_by_trips = pl.read_database(
    """
SELECT 
	c.trip_id,
    max(c.datetime) as date_trajet,
    max(ft.date_first_trip) as date_first_trip,
    sum(seats) as "count",
    max(operator_id) as operator_id,
    max(ft.uuid::text) as uuid_cohorte,
    max(aom_name) as aom_name
from
	carpool.carpools c
    left join carpool.identities i on c.identity_id = i._id
    left JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
WHERE
    status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
Having
   max(c.datetime)  BETWEEN  max(ft.date_first_trip) AND  max(ft.date_first_trip) + INTERVAL '23 weeks'
   and max(ft.uuid::text) is not null
""",
    connection=db_engine,
)

In [None]:
df_passengers_by_trips.describe()

### CEE

In [None]:
# Seats per trip for the CEE cohorts (one row per trip_id), with the operator,
# AOM and cohort label of the matching CEE driver. A trip is kept when it took
# place within 23 weeks after the driver's first CEE trip.
# The status filter lives in WHERE: inside the LEFT JOIN's ON clause it only
# restricted which rows matched the cohort, and non-'ok' rows were still summed.
df_passengers_by_trips_cohortes = pl.read_database(
    """
with cohortes as (select 
	*
from luis.cee_drivers_v4 cd)
SELECT 
	  c.trip_id,
    max(c.datetime) as date_trajet,
    max(ft.date_first_cee) as date_first_trip,
    sum(seats) as "count",
    max(cee_operator_name) as cee_operator_name,
    max(ft.max_aom_name) as aom_name,
    max(ft.uuid::text) as uuid_cohorte,
    max(ft.cohorte) as cohorte
from
	carpool.carpools c
    left join carpool.identities i on c.identity_id = i._id
    left JOIN cohortes ft ON ft.uuid=i.uuid
WHERE
    status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
Having
   max(c.datetime)  BETWEEN  max(ft.date_first_cee) AND  max(ft.date_first_cee) + INTERVAL '23 weeks'
   and max(ft.uuid::text) is not null
   and max(ft.cohorte) is not null
""",
    connection=db_engine,
)

In [None]:
df_passengers_by_trips_cohortes.describe()

### AOM

In [None]:
# Passengers per trip for 2022-cohort drivers attached to one of the studied AOMs.
#
# CTEs:
#   drivers_trips  - every carpool row of a cohort member, tagged with the AOM
#                    when the (AOM, date, operator) combination is eligible.
#   drivers        - one row per driver: first-trip date and eligible AOM.
#   trips_seats    - one row per 2022 trip: driver uuid, start time, seats summed
#                    over all carpool rows of the trip.
# Output: one row per trip (trip_id, uuid, aom_name, passagers), restricted to the
# 12 weeks following the driver's first trip.
#
# The previous version of this query was a copy of the distance query and
# returned a `distance` column, while every downstream cell aggregates
# `pl.col("passagers")`.
# NOTE(review): per-trip grain chosen to match `df_passengers_by_trips`
# (sum(seats) per trip) - confirm this is the intended definition.
df_passengers_by_trips_aom = pl.read_database(
    """
with drivers_trips as (
select
	i.uuid,
  	ft.date_first_trip as date_first_trip,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
),
drivers as (
select 
	"uuid",
	min(date_first_trip) as date_first_trip,
	max(eligible_aom_name) as eligible_aom_name
from
	drivers_trips
group by 1
),
trips_seats as (
select 
	c.trip_id,
	max(i."uuid"::text) filter (where is_driver) as "uuid",
  	min(c.datetime) as datetime,
	sum(seats) as passagers
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
where date_part('year',c.datetime)=2022
group by
	1)
select 
	b.trip_id,
	a.uuid::text as uuid,
	a.eligible_aom_name as aom_name,
	b.passagers::int as passagers
from
	drivers a
inner join trips_seats b on
	a."uuid"::text = b."uuid"
where 
  b.datetime between a.date_first_trip and a.date_first_trip + interval '12 weeks' 
  and a.eligible_aom_name is not null
  """,
    connection=db_engine,
)

In [None]:
df_passengers_by_trips_aom.describe()

In [None]:
# Passengers per trip for CEE-cohort drivers attached to one of the studied AOMs.
#
# CTEs:
#   drivers_trips  - every carpool row of a CEE driver, tagged with the AOM when
#                    the (AOM, date, operator) combination is eligible.
#   drivers        - one row per driver: first CEE date, cohort, eligible AOM.
#   trips_seats    - one row per 2023/2024 trip: driver uuid, start time, seats
#                    summed over all carpool rows of the trip.
# Output: one row per trip (trip_id, uuid, aom_name, cohorte, passagers),
# restricted to the 12 weeks following the driver's first CEE trip.
#
# The previous version of this query was a copy of the distance query and
# returned a `distance` column, while every downstream cell aggregates
# `pl.col("passagers")`.
# NOTE(review): per-trip grain chosen to match `df_passengers_by_trips_cohortes`
# (sum(seats) per trip) - confirm this is the intended definition.
df_passengers_by_trips_cee_aom = pl.read_database(
    """
with drivers_trips as (
select
	i.uuid,
  ft.date_first_cee as date_first_cee,
  ft.cohorte,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
),
drivers as (
select 
	"uuid",
	min(date_first_cee) as date_first_cee,
  max(cohorte) as cohorte,
	max(eligible_aom_name) as eligible_aom_name
from
	drivers_trips
group by 1
),
trips_seats as (
select 
	c.trip_id,
	max(i."uuid"::text) filter (where is_driver) as "uuid",
  	min(c.datetime) as datetime,
	sum(seats) as passagers
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
where date_part('year',c.datetime) in (2023,2024)
group by
	1)
select 
	b.trip_id,
	a.uuid::text as uuid,
	a.eligible_aom_name as aom_name,
	a.cohorte as cohorte,
	b.passagers::int as passagers
from
	drivers a
inner join trips_seats b on
	a."uuid"::text = b."uuid"
where 
  b.datetime between a.date_first_cee and a.date_first_cee + interval '12 weeks' 
  and a.eligible_aom_name is not null
  """,
    connection=db_engine,
)

In [None]:
df_passengers_by_trips_cee_aom.describe()

## Moyennes

In [None]:
# Mean seats per trip (`count`) for each CEE cohort. The sort key rewrites
# "t1_23" as "23t1" so cohorts are ordered by year first, then by quarter.
df_passengers_by_trips_cohortes.group_by("cohorte").agg(
    pl.col("count").mean().round(2)
).sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))

## Visualisation


In [None]:
def preprocess_passengers_by_trips_df(
    df: pl.DataFrame, bins: list[int]
) -> pl.DataFrame:
    """Bin the per-trip seat counts and express each bin as a share of trips.

    Args:
        df: frame with a numeric ``count`` column (seats per trip).
        bins: bin edges handed to ``Series.hist``.

    Returns:
        The histogram frame produced by ``Series.hist`` with:
        - ``breakpoint`` rewritten as a string label (an infinite upper edge
          is labelled ``bins[-1] + 1`` and a ".0" suffix is stripped),
        - ``share``: percentage of trips falling in each bin.
    """
    return (
        df.get_column("count")
        .hist(bins, include_breakpoint=True)
        .with_columns(
            pl.col("breakpoint")
            .cast(pl.String)
            .replace(np.inf, f"{bins[-1]+1}")
            # Raw string: "\." is an invalid escape sequence in a plain
            # literal and triggers a SyntaxWarning on recent Python versions.
            .str.replace(r"(\.0)", ""),
            (100 * pl.col("count") / pl.col("count").sum()).alias("share"),
        )
    )

In [None]:
# Grouped bar chart: distribution of the number of seats per trip, one bar
# series per cohort (2022 reference + each CEE quarter).
bins = list(range(5))

# (source frame, legend label, bar colour) for each cohort, in legend order.
cohort_sources = [
    (df_passengers_by_trips, "Référence 2022", "#f39c12"),
    (
        df_passengers_by_trips_cohortes.filter(pl.col("cohorte") == "t1_23"),
        "CEE T1 2023",
        "#d7e1ed",
    ),
    (
        df_passengers_by_trips_cohortes.filter(pl.col("cohorte") == "t2_23"),
        "CEE T2 2023",
        "#89a6c7",
    ),
    (
        df_passengers_by_trips_cohortes.filter(pl.col("cohorte") == "t3_23"),
        "CEE T3 2023",
        "#3E6DA1",
    ),
    (
        df_passengers_by_trips_cohortes.filter(pl.col("cohorte") == "t4_23"),
        "CEE T4 2023",
        "#1a334e",
    ),
    (
        df_passengers_by_trips_cohortes.filter(pl.col("cohorte") == "t1_24"),
        "CEE T1 2024",
        "rgba(113, 88, 226,1.0)",
    ),
]
plot_configs = [
    {
        "data": preprocess_passengers_by_trips_df(cohort_df, bins),
        "name": cohort_label,
        "color": cohort_color,
    }
    for cohort_df, cohort_label, cohort_color in cohort_sources
]

traces = []
for config in plot_configs:
    data = config["data"]
    trace = go.Bar(
        x=data["breakpoint"],
        y=data["share"],
        marker_color=config["color"],
        name=config["name"],
        hovertemplate="%{y:.2f}% des trajets ont %{x} passager(s)",
    )
    traces.append(trace)

fig_passagers_multi = go.Figure(traces)
fig_passagers_multi.update_layout(
    barmode="group",
    bargroupgap=0.2,
    plot_bgcolor="white",
    legend_title="Cohorte :",
    # Typos fixed in the displayed title ("y-a t'il", "coviturage").
    title="Combien y a-t-il de passagers dans un trajet de covoiturage ?",
)
fig_passagers_multi.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Part des trajets (%)",
)
fig_passagers_multi.update_xaxes(title="Nombre de passagers", range=[0.5, 5.5])
fig_passagers_multi.show()
fig_passagers_multi.write_html(OUTPUT_PATH / "histo_passagers_multi.html")
fig_passagers_multi.write_image(
    OUTPUT_PATH / "histo_passagers_multi.svg", format="svg", width=1280, height=720
)

## Comparaison opérateurs


In [None]:
# Mean seats per trip for every (cohort, CEE operator) pair, ordered by cohort
# (year then quarter, via the "t1_23" -> "23t1" sort key) and operator name.
df_passagers_by_cohortes_op = (
    df_passengers_by_trips_cohortes.group_by(["cohorte", "cee_operator_name"])
    .agg(pl.col("count").mean().alias("nombre_passagers_moyen"))
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "cee_operator_name",
    )
)
# Raise the row limit so the whole table is printed instead of being truncated.
with pl.Config(tbl_rows=600):
    print(df_passagers_by_cohortes_op)

In [None]:
df_passagers_by_cohortes_op.write_clipboard()

In [None]:
# Line chart of the mean seats per trip by cohort, one line per CEE operator,
# coloured with the shared operator palette (`color_mapping`).
fig_passagers_operator = px.line(
    df_passagers_by_cohortes_op,
    x="cohorte",
    y="nombre_passagers_moyen",
    color="cee_operator_name",
    markers=True,
    color_discrete_map=color_mapping,
    template="simple_white",
    labels={
        "nombre_passagers_moyen": "Nombre moyen de passagers",
        "cohorte": "Cohorte",
        "cee_operator_name": "Opérateur",
    },
    title="Comparaison du nombre moyen de passagers par cohorte et pour chaque opérateur<br><sub>Le conducteur est compté comme passager.</sub>",
)

fig_passagers_operator.update_traces(marker_size=11, line_width=0.5, line_dash="dot")
fig_passagers_operator.write_html(OUTPUT_PATH / "stats_passagers_par_op.html")
fig_passagers_operator.write_image(
    OUTPUT_PATH / "stats_passagers_par_op.svg",
    format="svg",
    width=1280,
    height=720,
)
# Applied after the exports, so the taller layout only affects the figure
# displayed as this cell's output, not the files written above.
fig_passagers_operator.update_layout(height=800)

## Comparaison par AOM


### CEE


In [None]:
# Mean of the `passagers` column and row count per AOM (all CEE cohorts pooled),
# restricted to `selected_aoms`.
# `is_aom_expr` is defined in another cell; the sort below relies on it adding
# an `aom_incentive_status` column.
(
    df_passengers_by_trips_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by(["aom_name"])
    .agg(
        pl.col("passagers").mean().alias("nombre_moyen_passagers").round(2),
        pl.len(),
    )
    .with_columns(is_aom_expr)
    .sort(
        [
            pl.col("aom_incentive_status"),
            pl.col("aom_name"),
        ]
    )
)

In [None]:
# Mean of the `passagers` column per (cohort, AOM), restricted to
# `selected_aoms`, ordered by cohort (year then quarter), incentive status and
# AOM name. `is_aom_expr` (defined in another cell) supplies
# `aom_incentive_status`.
df_passengers_by_trips_cee_aom_agg = (
    df_passengers_by_trips_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by(["cohorte", "aom_name"])
    .agg(pl.col("passagers").mean().alias("nombre_moyen_passagers"))
    .with_columns(is_aom_expr)
    .sort(
        [
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            pl.col("aom_incentive_status"),
            pl.col("aom_name"),
        ]
    )
)
df_passengers_by_trips_cee_aom_agg

In [None]:
# Mean of the `passagers` column per (cohort, incentive status) for the CEE
# cohorts. `is_aom_expr` (defined in another cell) supplies
# `aom_incentive_status`.
df_passengers_by_trips_cee_aom_agg_incentives = (
    df_passengers_by_trips_cee_aom.with_columns(is_aom_expr)
    .group_by(["cohorte", "aom_incentive_status"])
    .agg(pl.col("passagers").mean().alias("nombre_moyen_passagers"))
    .sort(
        [
            # Same chronological key as the other cohort tables
            # ("t1_23" -> "23t1"): reversing the raw string character by
            # character would also reverse the digits of the year.
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "aom_incentive_status",
        ]
    )
)
df_passengers_by_trips_cee_aom_agg_incentives

### 2022


In [None]:
# 2022 reference cohort: mean of the `passagers` column and row count per AOM,
# restricted to `selected_aoms`. A constant "2022" cohort label is added so the
# table has the same columns as its CEE counterpart; `is_aom_expr` (defined in
# another cell) supplies `aom_incentive_status`.
df_passengers_by_trips_aom_agg = (
    df_passengers_by_trips_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by(["aom_name"])
    .agg(
        pl.col("passagers").mean().alias("nombre_moyen_passagers").round(2),
        pl.len(),
    )
    .with_columns(is_aom_expr, pl.lit("2022").alias("cohorte"))
    .sort(
        [
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            pl.col("aom_incentive_status"),
            pl.col("aom_name"),
        ]
    )
)
df_passengers_by_trips_aom_agg

In [None]:
# 2022 reference cohort: mean of the `passagers` column per incentive status.
# `is_aom_expr` (defined in another cell) supplies `aom_incentive_status`.
df_passengers_by_trips_aom_agg_incentives = (
    df_passengers_by_trips_aom.with_columns(is_aom_expr)
    .group_by("aom_incentive_status")
    .agg(pl.col("passagers").mean().alias("nombre_moyen_passagers"))
    .sort("aom_incentive_status")
)
df_passengers_by_trips_aom_agg_incentives

### Visualisation


In [None]:
# Grouped bar chart of the mean passenger count by cohort, split by AOM
# incentive status. The 2022 table gets a constant "2022" cohort label so it
# can be stacked with the CEE table (diagonal concat tolerates differing
# column order).
fig_passengers_by_trips_multi_aom_status = px.bar(
    pl.concat(
        [
            df_passengers_by_trips_aom_agg_incentives.with_columns(
                pl.lit("2022").alias("cohorte")
            ),
            df_passengers_by_trips_cee_aom_agg_incentives,
        ],
        how="diagonal",
    ).with_columns(pl.col("nombre_moyen_passagers").round(2)),
    x="cohorte",
    y="nombre_moyen_passagers",
    text="nombre_moyen_passagers",
    color="aom_incentive_status",
    barmode="group",
    template="simple_white",
    labels={
        "aom_incentive_status": "Type d'AOM",
        "nombre_moyen_passagers": "Nombre moyen de passagers",
        "cohorte": "Cohorte",
    },
    # The subtitle tag is now properly closed (`</sub>` instead of a second `<sub>`).
    title="Comparaison du nombre moyen de passagers<br><sub>Par type d'AOM et cohorte</sub>",
    color_discrete_map={
        "Avec incitation": "rgba(39, 174, 96,1.0)",
        "Sans incitation": "rgba(47, 54, 64,1.0)",
    },
)
fig_passengers_by_trips_multi_aom_status.show()
fig_passengers_by_trips_multi_aom_status.write_html(
    OUTPUT_PATH / "histo_passagers_multi_aom_status.html"
)
fig_passengers_by_trips_multi_aom_status.write_image(
    OUTPUT_PATH / "histo_passagers_multi_aom_status.svg",
    format="svg",
    width=1280,
    height=720,
)

# Trajets intracommunaux


## Requêtes


### 2022

In [None]:
# Start and end geo codes of every trip driven by a 2022-cohort member within
# 23 weeks after their first trip (driver rows with status 'ok' only).
# One row per trip_id, with the cohort table's `aom_name`.
df_communes_by_trips = pl.read_database(
    """
select
	trip_id,
  max(ft.aom_name) as aom_name,
	min(start_geo_code) as start_commune,
	min(end_geo_code) as end_commune
from
	carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '23 weeks'
and is_driver
and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
""",
    connection=db_engine,
)

In [None]:
df_communes_by_trips.describe()

In [None]:
# Quick check: percentage of 2022 trips whose start and end geo codes are equal
# (True) versus different (False).
df_communes_by_trips.select(
    (pl.col("start_commune") == pl.col("end_commune")).value_counts()
).unnest("start_commune").with_columns(100 * pl.col("count") / pl.col("count").sum())

### CEE

In [None]:
# Start and end geo codes of every trip driven by a CEE driver within 23 weeks
# after their first CEE trip (driver rows with status 'ok', cohort not null).
# One row per trip_id, with the driver's CEE operator, AOM and cohort label.
df_communes_by_trips_cohortes = pl.read_database(
    """
with cohortes as (select 
	*
from luis.cee_drivers_v4 cd)
select
	trip_id,
	min(start_geo_code) as start_commune,
	min(end_geo_code) as end_commune,
  max(ft.cee_operator_name) as cee_operator_name,
  max(ft.max_aom_name) as aom_name,
  max(ft.cohorte) as cohorte
from
	carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner JOIN cohortes ft ON ft.uuid=i.uuid
WHERE c.datetime BETWEEN ft.date_first_cee AND ft.date_first_cee + INTERVAL '23 weeks'
and is_driver
and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
and ft.cohorte is not null
group by
	1
""",
    connection=db_engine,
)

In [None]:
df_communes_by_trips_cohortes.describe()

## Comparaison


In [None]:
def preprocess_communes_by_trips(df: pl.DataFrame) -> float:
    """Return the percentage of trips that start and end in the same commune.

    Args:
        df: one row per trip with ``start_commune`` and ``end_commune`` columns.

    Returns:
        Share of rows where both codes are equal, in percent of all rows.
        Rows where the comparison is null count in the denominator only.
        Returns ``0.0`` for an empty frame.
    """
    total_trips = len(df)
    if total_trips == 0:
        # Nothing to measure: avoid dividing by zero on an empty cohort.
        return 0.0

    # Summing the boolean comparison counts the True rows directly. Unlike
    # filtering a value_counts() table, this also works when no trip at all
    # is intra-communal (the filtered table would be empty and `.item()`
    # would raise).
    intra_trips = df.select(
        (pl.col("start_commune") == pl.col("end_commune")).sum()
    ).item()

    return 100 * intra_trips / total_trips

In [None]:
# One bar per cohort: share of trips starting and ending in the same commune.
# (source frame, bar label, bar colour) for each cohort, in display order.
cohort_sources = [
    (df_communes_by_trips, "Référence 2022", "#f39c12"),
    (
        df_communes_by_trips_cohortes.filter(pl.col("cohorte") == "t1_23"),
        "CEE T1 2023",
        "#d7e1ed",
    ),
    (
        df_communes_by_trips_cohortes.filter(pl.col("cohorte") == "t2_23"),
        "CEE T2 2023",
        "#89a6c7",
    ),
    (
        df_communes_by_trips_cohortes.filter(pl.col("cohorte") == "t3_23"),
        "CEE T3 2023",
        "#3E6DA1",
    ),
    (
        df_communes_by_trips_cohortes.filter(pl.col("cohorte") == "t4_23"),
        "CEE T4 2023",
        "#1a334e",
    ),
    (
        df_communes_by_trips_cohortes.filter(pl.col("cohorte") == "t1_24"),
        "CEE T1 2024",
        "rgba(113, 88, 226,1.0)",
    ),
]
plot_configs = [
    {
        "data": preprocess_communes_by_trips(cohort_df),
        "name": cohort_label,
        "color": cohort_color,
    }
    for cohort_df, cohort_label, cohort_color in cohort_sources
]

traces = []
for config in plot_configs:
    data = config["data"]
    trace = go.Bar(
        x=[config["name"]],
        y=[data],
        text=[f"{data:.2f}%"],
        textposition="inside",
        textfont_size=15,
        marker_color=config["color"],
        hovertemplate="%{y:.2f}% des trajets sont intra-communaux<extra></extra>",
    )
    traces.append(trace)

fig_communes_multi = go.Figure(traces)
fig_communes_multi.update_layout(
    barmode="overlay",
    plot_bgcolor="white",
    showlegend=False,
    title="Part des trajets dont le départ et la destination est la même commune",
    margin_t=80,
)
fig_communes_multi.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Part des trajets intracommunaux (%)",
)
fig_communes_multi.update_xaxes(title="Cohorte")
fig_communes_multi.show()
fig_communes_multi.write_html(OUTPUT_PATH / "histo_communes_multi.html")
fig_communes_multi.write_image(OUTPUT_PATH / "histo_communes_multi.svg", format="svg")

## Comparaison opérateurs


In [None]:
# Share (in %) of intra-communal trips for every (cohort, CEE operator) pair,
# ordered by cohort (year then quarter, via the "t1_23" -> "23t1" sort key) and
# operator name.
df_trips_intracommunaux_by_cohortes_op = (
    df_communes_by_trips_cohortes.group_by(["cohorte", "cee_operator_name"])
    .agg(
        (
            100 * (pl.col("start_commune") == pl.col("end_commune")).sum() / pl.len()
        ).alias("share_intra")
    )
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "cee_operator_name",
    )
)
# Raise the row limit so the whole table is printed instead of being truncated.
with pl.Config(tbl_rows=600):
    print(df_trips_intracommunaux_by_cohortes_op)

In [None]:
df_trips_intracommunaux_by_cohortes_op.write_clipboard()

In [None]:
# Line chart of the intra-communal trip share by cohort, one line per CEE
# operator, coloured with the shared operator palette (`color_mapping`).
fig_trips_intracommunaux_operator = px.line(
    df_trips_intracommunaux_by_cohortes_op,
    x="cohorte",
    y="share_intra",
    color="cee_operator_name",
    markers=True,
    color_discrete_map=color_mapping,
    template="simple_white",
    labels={
        "share_intra": "Part du nombre de trajets intracommunaux (%)",
        "cohorte": "Cohorte",
        "cee_operator_name": "Opérateur",
    },
    title="Comparaison de la part de trajets intracommunaux par cohorte et pour chaque opérateur",
)

fig_trips_intracommunaux_operator.update_traces(
    marker_size=11, line_width=0.5, line_dash="dot"
)
fig_trips_intracommunaux_operator.write_html(
    OUTPUT_PATH / "stats_trajets_intracommunaux_par_op.html"
)
fig_trips_intracommunaux_operator.write_image(
    OUTPUT_PATH / "stats_trajets_intracommunaux_par_op.svg",
    format="svg",
    width=1280,
    height=720,
)
# Applied after the exports, so the taller layout only affects the figure
# displayed as this cell's output, not the files written above.
fig_trips_intracommunaux_operator.update_layout(height=800)

## Comparaison par AOM


In [None]:
# 2022 reference cohort: share (in %) of intra-communal trips per AOM,
# restricted to `selected_aoms`.
df_intra_by_trips_aom = (
    df_communes_by_trips.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by(["aom_name"])
    .agg(
        (
            100 * (pl.col("start_commune") == pl.col("end_commune")).sum() / pl.len()
        ).alias("share_intra")
    )
    .sort(
        "aom_name",
    )
)
df_intra_by_trips_aom

In [None]:
# CEE cohorts: share (in %) of intra-communal trips per (cohort, AOM),
# restricted to `selected_aoms` and ordered by cohort (year then quarter).
df_intra_by_trips_cee_aom = (
    df_communes_by_trips_cohortes.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by(["cohorte", "aom_name"])
    .agg(
        (
            100 * (pl.col("start_commune") == pl.col("end_commune")).sum() / pl.len()
        ).alias("share_intra")
    )
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_name",
    )
)
df_intra_by_trips_cee_aom

In [None]:
# Grouped bar chart of the intra-communal trip share by cohort, one bar per
# AOM. The 2022 table gets a constant "2022" cohort label so it can be stacked
# with the CEE table (diagonal concat tolerates differing column order).
fig_intra_multi_aom = px.bar(
    pl.concat(
        [
            df_intra_by_trips_aom.with_columns(pl.lit("2022").alias("cohorte")),
            df_intra_by_trips_cee_aom,
        ],
        how="diagonal",
    ),
    x="cohorte",
    y="share_intra",
    color="aom_name",
    barmode="group",
    template="simple_white",
    labels={
        "aom_name": "AOM",
        # Unit spacing fixed: "(%)" instead of "( %)".
        "share_intra": "Part des trajets intra-communaux (%)",
        "cohorte": "Cohorte",
    },
    # The subtitle tag is now properly closed (`</sub>` instead of a second `<sub>`).
    title="Comparaison de la part des trajets intra-communaux<br><sub>Par AOM et cohorte</sub>",
)
fig_intra_multi_aom.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
)

fig_intra_multi_aom.show()
fig_intra_multi_aom.write_html(OUTPUT_PATH / "histo_intra_multi_aom.html")
fig_intra_multi_aom.write_image(
    OUTPUT_PATH / "histo_intra_multi_aom.svg",
    format="svg",
    width=1280,
    height=720,
)

# Churn/Utilisation/Attrition


## Requêtes


### 2022

In [None]:
# Weekly activity grid for the 2022 reference cohort.
#
# CTEs:
#   template        - one row per (driver, week) for the 52 weeks starting at the
#                     week of the driver's first trip (Europe/Paris weeks); only
#                     drivers whose first trip is before 2022-08-01.
#   trips           - distinct (driver, week) pairs with at least one 'ok' driver
#                     row within 51 weeks after the first trip.
#   aggregated_data - template left-joined to trips: `had_trip` is true when the
#                     driver was active that week.
# Output adds `num_semaine`, the 1-based rank of the week for each driver.
df_weeks_by_driver = pl.read_database(
    """
with "template" as (
select 
	*,
	generate_series(date_trunc('week',c.date_first_trip at time zone 'Europe/Paris'),
	date_trunc('week',c.date_first_trip at time zone 'Europe/Paris' + interval '51 weeks'),
	interval '1 weeks') as semaine
from
	luis.cohorte_2022_v2 c
  where c.date_first_trip < '2022-08-01'
  ),

trips as (
select
	ft.uuid,
	date_trunc('week',
	c.datetime at time zone 'Europe/Paris') as semaine
from
	carpool.carpools c
inner join carpool.identities i on
c.identity_id = i._id
inner JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
where
c.datetime at time zone 'Europe/Paris' between ft.date_first_trip and ft.date_first_trip + interval '51 weeks'
and is_driver
and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	 1,2),

aggregated_data as (select 
	t.uuid::text,
	t.semaine,
  max(t.aom_name) as aom_name,
	count(tr.semaine)>0 had_trip
from
	"template" t
left join trips tr on
	t.uuid = tr.uuid
	and t.semaine = tr.semaine
group by
	1,2)

select 
	*,
	row_number() over (partition by uuid order by semaine) as num_semaine
from aggregated_data
order by 1,2
""",
    connection=db_engine,
)

In [None]:
df_weeks_by_driver.describe()

### CEE

In [None]:
# Weekly activity grid for the CEE cohorts.
#
# CTEs:
#   template        - one row per (driver, week) for the 52 weeks starting at the
#                     week of the driver's first CEE trip (Europe/Paris weeks).
#   trips           - distinct (driver, week) pairs with at least one 'ok' driver
#                     row within 51 weeks after the first CEE trip.
#   aggregated_data - template left-joined to trips: `had_trip` is true when the
#                     driver was active that week.
# Output adds `num_semaine`, the 1-based rank of the week for each driver.
#
# Changes compared with the previous version of the query:
# - trip weeks are truncated in Europe/Paris time, like the template weeks they
#   are joined to (and like the 2022 reference query);
# - `trips` joins the cohort table itself instead of the 52-rows-per-driver
#   template; the result is the same after GROUP BY, without the fan-out.
df_weeks_by_driver_cohortes = pl.read_database(
    """
with "template" as (
select
	*,
  generate_series(date_trunc('week',cd.date_first_cee at time zone 'Europe/Paris'),
	                date_trunc('week',cd.date_first_cee at time zone 'Europe/Paris' + interval '51 weeks'),
                  interval '1 weeks') as semaine
from luis.cee_drivers_v4 cd
  ),

trips as (
select
	ft.uuid,
	date_trunc('week',
	c.datetime at time zone 'Europe/Paris') as semaine
from
	carpool.carpools c
inner join carpool.identities i on
c.identity_id = i._id
inner JOIN luis.cee_drivers_v4 ft ON ft.uuid=i.uuid
where
c.datetime between ft.date_first_cee and ft.date_first_cee + interval '51 weeks'
and is_driver
and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
and ft.cohorte is not null
group by
	 1,2),

aggregated_data as (select 
	t.uuid::text,
	t.semaine,
  max(t.max_aom_name) as aom_name,
	count(tr.semaine)>0 had_trip,
  max(t.cee_operator_name) as cee_operator_name,
  max(t.cohorte) as cohorte
from
	"template" t
left join trips tr on
	t.uuid = tr.uuid
	and t.semaine = tr.semaine
group by
	1,2)

select 
	*,
	row_number() over (partition by uuid order by semaine) as num_semaine
from aggregated_data
order by 1,2
""",
    connection=db_engine,
)

In [None]:
df_weeks_by_driver_cohortes.describe()

### AOM

In [None]:
# Weekly activity grid for 2022-cohort drivers attached to one of the studied AOMs.
#
# CTEs:
#   driver_trips    - carpool rows of cohort members within 19 weeks after their
#                     first trip, tagged with the AOM when the (AOM, date,
#                     operator) combination is eligible.
#   drivers         - one row per driver having an eligible AOM.
#   template        - one row per (driver, week) from the first-trip week to 19
#                     weeks later; only first trips before 2022-08-01.
#   trips           - distinct (driver, week) pairs with at least one 'ok' driver row.
#   aggregated_data - template left-joined to trips: `had_trip` is true when the
#                     driver was active that week.
# Output adds `num_semaine`, the 1-based rank of the week for each driver.
#
# `aom_name` is read from the template side of the left join: taken from `trips`
# it was NULL for every week without a trip, which removed inactive weeks from
# any per-AOM grouping.
df_weeks_by_trips_aom = pl.read_database(
    """
with driver_trips as (
select
	i.uuid,
	ft.date_first_trip,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '19 weeks'
),
drivers as (
select 
	"uuid",
	max(eligible_aom_name) as eligible_aom_name,
	max(date_first_trip) as date_first_trip
from
	driver_trips
group by
	1
having max(eligible_aom_name) is not null
),
"template" as (
select 
	*,
	generate_series(date_trunc('week',c.date_first_trip at time zone 'Europe/Paris'),
	date_trunc('week',c.date_first_trip at time zone 'Europe/Paris' + interval '19 weeks'),
	interval '1 weeks') as semaine
from
	drivers c
  where c.date_first_trip < '2022-08-01'
  ),
trips as (
select
	ft.uuid,
	date_trunc('week',
	c.datetime at time zone 'Europe/Paris') as semaine,
	max(ft.eligible_aom_name) as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
c.identity_id = i._id
inner JOIN drivers ft ON ft.uuid=i.uuid
where
c.datetime at time zone 'Europe/Paris' between ft.date_first_trip and ft.date_first_trip + interval '19 weeks'
and is_driver
and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by 1,2
),
aggregated_data as (select 
	t.uuid::text,
	t.semaine,
  max(t.eligible_aom_name) as aom_name,
	count(tr.semaine)>0 had_trip
from
	"template" t
left join trips tr on
	t.uuid = tr.uuid
	and t.semaine = tr.semaine
group by
	1,2)
select 
	*,
	row_number() over (partition by uuid order by semaine) as num_semaine
from aggregated_data
order by 1,2
  """,
    connection=db_engine,
)

In [None]:
df_weeks_by_trips_aom.describe()

In [None]:
# Weekly activity grid for CEE-cohort drivers attached to one of the studied AOMs.
#
# CTEs:
#   driver_trips    - carpool rows of CEE drivers within 19 weeks after their
#                     first CEE trip, tagged with the AOM when the (AOM, date,
#                     operator) combination is eligible.
#   drivers         - one row per driver having an eligible AOM and a cohort.
#   template        - one row per (driver, week) from the first-CEE week to 19
#                     weeks later; only first CEE trips before 2024-03-01.
#   trips           - distinct (driver, week) pairs with at least one 'ok' driver row.
#   aggregated_data - template left-joined to trips: `had_trip` is true when the
#                     driver was active that week.
# Output adds `num_semaine`, the 1-based rank of the week for each driver.
#
# `aom_name` and `cohorte` are read from the template side of the left join:
# taken from `trips` they were NULL for every week without a trip, which
# removed inactive weeks from any per-AOM or per-cohort grouping.
df_weeks_by_trips_cee_aom = pl.read_database(
    """
with driver_trips as (
select
	i.uuid,
	ft.date_first_cee,
  ft.cohorte,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_cee AND ft.date_first_cee + INTERVAL '19 weeks'
),
drivers as (
select 
	"uuid",
	max(eligible_aom_name) as eligible_aom_name,
	max(date_first_cee) as date_first_cee,
  max(cohorte) as cohorte
from
	driver_trips
group by
	1
having max(eligible_aom_name) is not null
and max(cohorte) is not null
),
"template" as (
select 
	*,
	generate_series(date_trunc('week',c.date_first_cee at time zone 'Europe/Paris'),
	date_trunc('week',c.date_first_cee at time zone 'Europe/Paris' + interval '19 weeks'),
	interval '1 weeks') as semaine
from
	drivers c
  where c.date_first_cee < '2024-03-01'
  ),
trips as (
select
	ft.uuid,
	date_trunc('week',
	c.datetime at time zone 'Europe/Paris') as semaine,
	max(ft.eligible_aom_name) as eligible_aom_name,
  max(ft.cohorte) as cohorte
from
	carpool.carpools c
inner join carpool.identities i on
c.identity_id = i._id
inner JOIN drivers ft ON ft.uuid=i.uuid
where
c.datetime at time zone 'Europe/Paris' between ft.date_first_cee and ft.date_first_cee + interval '19 weeks'
and is_driver
and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by 1,2
),
aggregated_data as (select 
	t.uuid::text,
	t.semaine,
  	max(t.eligible_aom_name) as aom_name,
    max(t.cohorte) as cohorte,
	count(tr.semaine)>0 had_trip
from
	"template" t
left join trips tr on
	t.uuid = tr.uuid
	and t.semaine = tr.semaine
group by
	1,2)
select 
	*,
	row_number() over (partition by uuid order by semaine) as num_semaine
from aggregated_data
order by 1,2

  """,
    connection=db_engine,
)

In [None]:
# Sanity check: summary statistics of the frame loaded by the query above.
df_weeks_by_trips_cee_aom.describe()

## Comparaison


In [None]:
def preprocess_week_by_driver_df(df: pl.DataFrame) -> pl.DataFrame:
    """Compute, for each week number, the share of drivers who had a trip.

    Args:
        df: Frame with one row per (driver, week), providing the columns
            ``uuid``, ``num_semaine`` and ``had_trip``.

    Returns:
        A frame sorted by ``num_semaine`` with a ``share`` column: the number
        of rows with ``had_trip`` set that week, as a percentage of the number
        of distinct ``uuid`` values in ``df``.
    """
    # The denominator is the whole population of ``df``, not the weekly one.
    total_drivers = df["uuid"].n_unique()
    weekly_share = (100 * pl.col("had_trip").sum() / total_drivers).alias("share")
    return df.group_by(["num_semaine"]).agg(weekly_share).sort("num_semaine")

In [None]:
# Legend label of each CEE cohort identifier, in chronological order.
cee_cohort_labels = {
    "t1_23": "CEE T1 2023",
    "t2_23": "CEE T2 2023",
    "t3_23": "CEE T3 2023",
    "t4_23": "CEE T4 2023",
    "t1_24": "CEE T1 2024",
}

# One attrition curve per cohort: the 2022 reference first, then every CEE cohort.
# Colours are read from the shared cohort palette instead of being repeated here.
plot_configs = [
    {
        "data": preprocess_week_by_driver_df(df_weeks_by_driver),
        "name": "Référence 2022",
        "color": cohortes_color_mapping["2022"],
    },
    *(
        {
            "data": preprocess_week_by_driver_df(
                df_weeks_by_driver_cohortes.filter(pl.col("cohorte") == cohorte)
            ),
            "name": label,
            "color": cohortes_color_mapping[cohorte],
        }
        for cohorte, label in cee_cohort_labels.items()
    ),
]
traces = [
    go.Scatter(
        x=config["data"]["num_semaine"],
        y=config["data"]["share"],
        marker_color=config["color"],
        hovertemplate="%{y:.2f}% des conducteurs ont été actifs %{x} semaine(s)",
        name=config["name"],
        # No config sets "visible" today, so every curve is shown by default.
        visible=config.get("visible", True),
    )
    for config in plot_configs
]

fig_weeks_by_driver_multi = go.Figure(traces)
fig_weeks_by_driver_multi.update_layout(
    plot_bgcolor="white",
    # "cohorte" is feminine in French: "Quelle cohorte", not "Quel cohorte".
    title="Courbes d'attrition des conducteurs<br>Quelle cohorte a réussi à retenir les conducteurs le plus longtemps ?",
    hovermode="x unified",
)
fig_weeks_by_driver_multi.update_yaxes(
    showgrid=True,
    griddash="dot",
    gridwidth=1,
    gridcolor="gray",
    title="Part des conducteurs (%)",
    zeroline=True,
    zerolinecolor="black",
    tickvals=[0, 10, 20, 40, 60, 80, 100],
)
fig_weeks_by_driver_multi.update_xaxes(
    title="Numéro de la semaine",
    range=[0, 52],
    dtick=1,
    tickprefix="Semaine ",
    # The prefix is declared but hidden on the axis ticks.
    showtickprefix="none",
)

# Annotation of the "elbow" of the curves (drop between week 1 and week 2).
fig_weeks_by_driver_multi.add_annotation(
    x=2.5,
    y=58.5,
    ay=63,
    ayref="y",
    ax=3,
    axref="x",
    text="Entre 50% et 60% des utilisateurs perdus en semaine 2",
    xanchor="left",
    showarrow=True,
    arrowhead=1,
    font_color="rgba(44, 62, 80,1.0)",
    bgcolor="white",
    font_size=14,
    borderpad=3,
)
# Dashed ellipse drawn around the week-2 points, in data coordinates.
fig_weeks_by_driver_multi.add_shape(
    type="circle",
    xref="x",
    yref="y",
    x0=1.5,
    y0=32,
    x1=2.5,
    y1=60,
    line_dash="dash",
)
fig_weeks_by_driver_multi.show()
fig_weeks_by_driver_multi.write_html(OUTPUT_PATH / "num_semaines_multi.html")
fig_weeks_by_driver_multi.write_image(
    OUTPUT_PATH / "num_semaines_multi.svg", format="svg", width=1920, height=1080
)

## Comparaison opérateurs


In [None]:
# Attrition per CEE operator: share of each operator's drivers active in a given week.
# Rows without a cohort are discarded first.
drivers_per_operator = (
    pl.col("uuid").n_unique().over("cee_operator_name").alias("num_drivers")
)
share_of_operator_drivers = (100 * pl.col("had_trip") / pl.col("num_drivers")).alias(
    "share"
)
df_drivers_per_week_op = (
    df_weeks_by_driver_cohortes.filter(pl.col("cohorte").is_not_null())
    .with_columns(drivers_per_operator)
    .group_by(["cee_operator_name", "num_semaine"])
    # had_trip becomes the number of active drivers of the operator that week.
    .agg(pl.col("had_trip").sum(), pl.col("num_drivers").max())
    .with_columns(share_of_operator_drivers)
    .sort(["cee_operator_name", "num_semaine"])
)
df_drivers_per_week_op

In [None]:
# Attrition curve per operator, all cohorts pooled. The "Picholines" operator is left out.
churn_without_picholines = df_drivers_per_week_op.filter(
    pl.col("cee_operator_name") != "Picholines"
)
churn_op_labels = {
    "num_semaine": "Semaine",
    "share": "% des conducteurs retenus",
    "cee_operator_name": "Opérateur",
}
fig_churn_op = px.line(
    churn_without_picholines,
    x="num_semaine",
    y="share",
    color="cee_operator_name",
    color_discrete_map=color_mapping,
    template="simple_white",
    title="Courbe d'attrition par opérateur<br><sub>Toutes cohortes confondues</sub>",
    labels=churn_op_labels,
)
# Export both a static and an interactive version of the figure.
fig_churn_op.write_image(
    OUTPUT_PATH / "stats_churn_par_op.svg", format="svg", width=1280, height=720
)
fig_churn_op.write_html(OUTPUT_PATH / "stats_churn_par_op.html")
fig_churn_op

## Comparaison par AOM


### CEE


In [None]:
# Attrition curve of the CEE drivers for each selected AOM.
(
    df_weeks_by_trips_cee_aom.with_columns(
        # aom_name is only filled on weeks with a trip (it comes from the left
        # join on trips), so propagate it to the following weeks of the SAME
        # driver. Scoping the fill to "uuid" stops a leading null from
        # inheriting the AOM of the previous driver in the frame.
        pl.col("aom_name").forward_fill().over("uuid")
    )
    .filter(pl.col("aom_name").is_in(selected_aoms))
    # Denominator: number of drivers of each AOM, not of the whole frame,
    # so that every curve is expressed as a share of its own population.
    .with_columns(pl.col("uuid").n_unique().over("aom_name").alias("num_drivers"))
    .group_by(["aom_name", "num_semaine"])
    .agg((100 * pl.col("had_trip").sum() / pl.col("num_drivers").max()).alias("share"))
    .with_columns(is_aom_expr)
    .sort([pl.col("aom_incentive_status"), pl.col("aom_name"), pl.col("num_semaine")])
)

In [None]:
# Weekly attrition of the CEE drivers, grouped by AOM incentive status.
df_weeks_by_drivers_cee_aom_agg_incentives = (
    df_weeks_by_trips_cee_aom.with_columns(
        # aom_name is null on weeks without a trip: fill it from the previous
        # weeks of the same driver only, never from another driver's rows.
        pl.col("aom_name").forward_fill().over("uuid")
    )
    .with_columns(
        is_aom_expr,
    )
    .with_columns(
        # Population of each incentive status, used as the denominator below.
        pl.col("uuid").n_unique().over("aom_incentive_status").alias("total_uuid")
    )
    .group_by(["aom_incentive_status", "num_semaine"])
    .agg(
        # Despite its name, "num_drivers" holds a percentage of total_uuid.
        (100 * pl.col("had_trip").sum() / pl.col("total_uuid").max()).alias(
            "num_drivers"
        ),
        pl.col("total_uuid").max(),
    )
    .sort(["aom_incentive_status", "num_semaine"])
)
df_weeks_by_drivers_cee_aom_agg_incentives

### 2022


In [None]:
# Weekly attrition of the 2022 reference drivers, grouped by AOM incentive status.
df_weeks_by_drivers_aom_agg_incentives = (
    df_weeks_by_trips_aom.with_columns(
        # Fill missing aom_name values from the previous rows of the same
        # driver only, so that an AOM never leaks from one driver to the next.
        pl.col("aom_name").forward_fill().over("uuid")
    )
    .with_columns(
        is_aom_expr,
    )
    .with_columns(
        # Population of each incentive status, used as the denominator below.
        pl.col("uuid").n_unique().over("aom_incentive_status").alias("total_uuid")
    )
    .group_by(["aom_incentive_status", "num_semaine"])
    .agg(
        # Despite its name, "num_drivers" holds a percentage of total_uuid.
        (100 * pl.col("had_trip").sum() / pl.col("total_uuid").max()).alias(
            "num_drivers"
        ),
        pl.col("total_uuid").max(),
    )
    .sort(["aom_incentive_status", "num_semaine"])
)
df_weeks_by_drivers_aom_agg_incentives

### Visualisation


In [None]:
# Compare the attrition of the 2022 reference drivers with the CEE drivers:
# line colour encodes the cohort, line dash encodes the AOM incentive status.
fig_weeks_by_driver_multi_aom_status = px.line(
    pl.concat(
        [
            df_weeks_by_drivers_aom_agg_incentives.with_columns(
                pl.lit("2022").alias("cohorte")
            ),
            df_weeks_by_drivers_cee_aom_agg_incentives.with_columns(
                pl.lit("CEE").alias("cohorte")
            ),
        ],
        how="diagonal",
    ).with_columns(pl.col("num_drivers").round(2)),
    x="num_semaine",
    y="num_drivers",
    line_dash="aom_incentive_status",
    color="cohorte",
    template="simple_white",
    labels={
        "aom_incentive_status": "Type d'AOM",
        "num_drivers": "% des conducteurs",
        "cohorte": "Cohorte",
        "num_semaine": "Semaine n°",
    },
    # The subtitle tag is now properly closed (it used to end with "<sub>").
    title="Comparaison de l'attrition entre les conducteurs 2022 et les conducteurs CEE<br><sub>Un conducteur est compté présent une semaine durant laquelle il covoiture.</sub>",
    # The map must be keyed on the values of the `color` column ("cohorte").
    # The previous keys were incentive statuses, which never matched, so the
    # map was silently ignored. 2022 keeps the reference colour of the notebook.
    color_discrete_map={
        "2022": "#f39c12",
        "CEE": "#3E6DA1",
    },
)
fig_weeks_by_driver_multi_aom_status.update_yaxes(
    showgrid=True,
    griddash="dot",
    gridwidth=1,
    gridcolor="gray",
    dtick=10,
)
fig_weeks_by_driver_multi_aom_status.show()
fig_weeks_by_driver_multi_aom_status.write_html(
    OUTPUT_PATH / "fig_churn_aom_status.html"
)
fig_weeks_by_driver_multi_aom_status.write_image(
    OUTPUT_PATH / "fig_churn_aom_status.svg",
    format="svg",
    width=1280,
    height=720,
)

## Application au nombre de trajets effectués

### 2022

In [None]:
# Number of distinct trips made as a driver (status 'ok') by each driver of the
# 2022 cohort table, within the 23 weeks following their first trip. Only drivers
# whose first trip is on or before 2022-07-30 are kept.
# NOTE(review): the downstream cells divide by 24 to get a weekly mean while the
# window here is '23 weeks' — confirm the two are meant to match.
df_trips_by_driver_6m = pl.read_database(
    """
select
	i.uuid,
	count(distinct trip_id) as num_trajets
from
	carpool.carpools c
inner join carpool.identities i on c.identity_id = i._id
inner JOIN luis.cohorte_2022_v2 ft ON ft.uuid=i.uuid
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '23 weeks'
and ft.date_first_trip<='2022-07-30'
and is_driver
and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
""",
    connection=db_engine,
)

In [None]:
# Mean number of trips per driver over the window of the query above.
df_trips_by_driver_6m.select(pl.col("num_trajets").mean())

In [None]:
# Attrition curve of the 2022 reference drivers: for each week number, the
# percentage of all distinct drivers who had a trip that week. This is the
# computation implemented by preprocess_week_by_driver_df.
df_churn_2022 = preprocess_week_by_driver_df(df_weeks_by_driver)
df_churn_2022

In [None]:
# Estimate the mean number of trips per 2022 driver beyond the observed window.
mean_trips_2022 = df_trips_by_driver_6m.select(
    pl.col("num_trajets").mean().alias("mean_trips"),
    (pl.col("num_trajets").mean() / 24).alias("mean_trips_by_week"),
)
# Attrition shares from week 25 onwards, i.e. after the observed window.
churn_2022_from_week_25 = df_churn_2022.filter(pl.col("num_semaine") >= 25)

# Naive estimate: twice the observed mean.
naive_estimate_2022 = (pl.col("mean_trips").max() * 2).alias(
    "naive_estimation_mean_trips"
)
# Churn-aware estimate: observed mean plus, for every later week, the mean
# weekly trips weighted by the share of drivers still active that week.
churn_estimate_2022 = (
    pl.col("mean_trips").first()
    + ((pl.col("share") / 100) * pl.col("mean_trips_by_week")).sum()
).alias("churn_estimatation_mean_trips")

mean_trips_2022.join(churn_2022_from_week_25, how="cross").select(
    naive_estimate_2022, churn_estimate_2022
)

### CEE

In [None]:
# Number of distinct trips made as a driver (status 'ok') by each CEE driver
# with a non-null cohort, within the 23 weeks following date_first_cee.
# The cohort of the driver is returned alongside the count.
df_trips_by_driver_cohortes_6m = pl.read_database(
    """
with cohortes as (select 
	*
from luis.cee_drivers_v4 cd)
select 
	ch.uuid,
    max(ch.cohorte) as cohorte,
	count(distinct trip_id) as num_trajets
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join cohortes ch on
	i."uuid" = ch.uuid
where
	c.datetime BETWEEN ch.date_first_cee AND ch.date_first_cee+ INTERVAL '23 weeks'
	and ch.cohorte is not null
	and c.is_driver
    and status=cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by 1
""",
    connection=db_engine,
)

In [None]:
# Attrition curve of all CEE drivers that belong to a cohort, every cohort
# pooled: weekly percentage of active drivers over the distinct drivers kept.
df_churn_cohortes = preprocess_week_by_driver_df(
    df_weeks_by_driver_cohortes.filter(pl.col("cohorte").is_not_null())
)

### Moyenne 2023

In [None]:
# Restrict to the cohorts whose identifier contains "23", then compute the
# number of drivers, their mean number of trips and the mean weekly trips
# (number of trips divided by 24).
trips_by_driver_2023 = df_trips_by_driver_cohortes_6m.filter(
    pl.col("cohorte").str.contains("23")
)
df_trips_by_driver_2023_6m_agg = trips_by_driver_2023.select(
    pl.len(),
    pl.col("num_trajets").mean().alias("mean_trips"),
    (pl.col("num_trajets") / (24)).mean().alias("mean_trips_by_week"),
)
df_trips_by_driver_2023_6m_agg

In [None]:
# Attrition shares of the CEE drivers from week 25 onwards.
churn_cohortes_from_week_25 = df_churn_cohortes.filter(pl.col("num_semaine") >= 25)

# Naive estimate: twice the observed mean number of trips.
naive_estimate_2023 = (
    (pl.col("mean_trips").max() * 2).alias("naive_estimation_mean_trips").round(1)
)
# Churn-aware estimate: observed mean plus the mean weekly trips weighted by
# the share of drivers still active on each later week.
churn_estimate_2023 = (
    (
        pl.col("mean_trips").max()
        + ((pl.col("share") / 100) * pl.col("mean_trips_by_week")).sum()
    )
    .alias("churn_estimatation_mean_trips")
    .round(1)
)

df_trips_by_driver_2023_6m_agg.join(churn_cohortes_from_week_25, how="cross").select(
    naive_estimate_2023, churn_estimate_2023
)

### Stats par cohortes

In [None]:
# Same statistics as for 2023 as a whole, but computed per cohort.
per_cohort_trip_stats = [
    pl.len(),
    pl.col("num_trajets").mean().alias("mean_trips"),
    (pl.col("num_trajets") / (24)).mean().alias("mean_trips_by_week"),
]
df_trips_by_driver_cohortes_6m_agg = df_trips_by_driver_cohortes_6m.group_by(
    "cohorte"
).agg(*per_cohort_trip_stats)
# Sorting on the reversed identifier puts the year digits before the quarter.
df_trips_by_driver_cohortes_6m_agg.sort(pl.col("cohorte").str.reverse())

In [None]:
# Per-cohort estimates. Every cohort is paired with the same pooled attrition
# curve (df_churn_cohortes), restricted to weeks 25 and later.
pooled_churn_from_week_25 = df_churn_cohortes.filter(pl.col("num_semaine") >= 25)

naive_estimate_by_cohorte = (
    (pl.col("mean_trips").max() * 2).alias("naive_estimation_mean_trips").round(1)
)
churn_estimate_by_cohorte = (
    (
        pl.col("mean_trips").max()
        + ((pl.col("share") / 100) * pl.col("mean_trips_by_week")).sum()
    )
    .alias("churn_estimatation_mean_trips")
    .round(1)
)

(
    df_trips_by_driver_cohortes_6m_agg.join(pooled_churn_from_week_25, how="cross")
    .group_by("cohorte")
    .agg(naive_estimate_by_cohorte, churn_estimate_by_cohorte)
    # Sorting on the reversed identifier puts the year digits before the quarter.
    .sort(pl.col("cohorte").str.reverse())
)

# Primo-conducteurs


## Requêtes


### 2022

In [None]:
# Every row of the CEE drivers table that has a cohort assigned.
# NOTE(review): this cell sits under a "2022" heading although it reads the CEE
# drivers table — confirm the heading.
df_primo_drivers = pl.read_database(
    """
select
  *
from luis.cee_drivers_v4 cdv 
where cdv.cohorte is not null
  """,
    connection=db_engine,
)

In [None]:
# Number of CEE drivers for each value of num_trips_before_cee.
df_primo_drivers.group_by("num_trips_before_cee").len().sort("num_trips_before_cee")

### CEE

In [None]:
# One row per CEE driver (with a cohort) who has at least one eligible trip in
# the 24 weeks following date_first_cee: number of trips before the CEE, AOM
# name and cohort. A trip is eligible when its l_aom / operator_id / date match
# one of the CASE branches below; max() keeps a single AOM name per driver.
# NOTE(review): unlike the other queries, this one does not filter on
# is_driver or on the 'ok' status — confirm this is intended.
df_primo_drivers_aom = pl.read_database(
    """
with trips as (
select
	i.uuid,
	ch.num_trips_before_cee,
  ch.cohorte,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 ch on
	ch.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
where
	c.datetime BETWEEN ch.date_first_cee AND ch.date_first_cee+ INTERVAL '24 weeks'
)
select 
	"uuid",
	max(num_trips_before_cee) as num_trips_before_cee,
	max(eligible_aom_name) as aom_name,
  max(cohorte) as cohorte
from
	trips
where cohorte is not null
group by
	1
having max(eligible_aom_name) is not null
  """,
    connection=db_engine,
)

## Visualisation


In [None]:
def preprocess_trips_pre_cee_by_driver_df(
    df: pl.DataFrame, bins: list[int]
) -> pl.DataFrame:
    """Build the distribution of the number of trips made before the CEE.

    Args:
        df: Frame with a ``num_trips_before_cee`` column, one row per driver.
        bins: Bin edges passed to the polars histogram.

    Returns:
        The histogram frame, where ``breakpoint`` is turned into a text label
        and ``share`` is the percentage of drivers falling in each bin.
    """
    df_agg = (
        df.get_column("num_trips_before_cee")
        .hist(bins, include_breakpoint=True)
        .with_columns(
            pl.col("breakpoint")
            .cast(pl.String)
            # Presumably the last, open-ended bin has an infinite breakpoint:
            # it is labelled "<last edge + 1>+" instead.
            .replace(np.inf, f"{bins[-1]+1}+")
            # Drop the ".0" left by the float-to-string cast. The pattern is a
            # raw string because "\." is an invalid escape in a normal literal.
            .str.replace(r"(\.0)", ""),
            (100 * pl.col("count") / pl.col("count").sum()).alias("share"),
        )
    )
    return df_agg

In [None]:
bins = list(range(6))

# Legend label of each CEE cohort identifier, in chronological order.
cee_cohort_labels = {
    "t1_23": "CEE T1 2023",
    "t2_23": "CEE T2 2023",
    "t3_23": "CEE T3 2023",
    "t4_23": "CEE T4 2023",
    # This last entry used to repeat t4_23 / "CEE T4 2023" with the T1 2024 colour.
    "t1_24": "CEE T1 2024",
}

# One bar series per CEE cohort; colours are read from the shared cohort palette.
plot_configs = [
    {
        "data": preprocess_trips_pre_cee_by_driver_df(
            df_primo_drivers.filter(pl.col("cohorte") == cohorte), bins
        ),
        "name": label,
        "color": cohortes_color_mapping[cohorte],
    }
    for cohorte, label in cee_cohort_labels.items()
]
traces = [
    go.Bar(
        x=config["data"]["breakpoint"],
        y=config["data"]["share"],
        marker_color=config["color"],
        # x is a number of trips here, not a number of active weeks.
        hovertemplate="%{y:.2f}% des conducteurs ont effectué %{x} trajet(s) avant la CEE",
        name=config["name"],
    )
    for config in plot_configs
]

# NOTE(review): this reuses the name of the attrition figure built earlier and
# overwrites it; the name is kept because later cells may rely on it.
fig_weeks_by_driver_multi = go.Figure(traces)
fig_weeks_by_driver_multi.update_layout(
    plot_bgcolor="white",
    barmode="group",
    title="Nombre de trajets effectués pré-CEE",
)
fig_weeks_by_driver_multi.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Part des conducteurs (%)",
)
fig_weeks_by_driver_multi.update_xaxes(title="Nombre de trajets effectués avant la CEE")
fig_weeks_by_driver_multi.show()
fig_weeks_by_driver_multi.write_html(OUTPUT_PATH / "num_trajets_pre_cee_multi.html")
fig_weeks_by_driver_multi.write_image(
    OUTPUT_PATH / "num_trajets_pre_cee_multi.svg", format="svg", width=1280, height=720
)

## Comparaison par AOM


In [None]:
# Share of first-time drivers (no trip before the CEE) per cohort and per AOM
# incentive status, restricted to the selected AOMs.
is_primo_conducteur = (pl.col("num_trips_before_cee") == 0).alias("primo_conducteur")
group_total = (
    pl.col("len").sum().over(["cohorte", "aom_incentive_status"]).alias("total")
)
# "t1_23" -> "23t1": sorts the cohorts by year first, then by quarter.
cohorte_sort_key = pl.col("cohorte").str.split("_").list.reverse().list.join("")

df_primo_drivers_cee_aom = (
    df_primo_drivers_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .with_columns(is_aom_expr)
    .group_by(["cohorte", "aom_incentive_status", is_primo_conducteur])
    .agg(pl.len())
    .with_columns(group_total)
    .with_columns((100 * pl.col("len") / pl.col("total")).alias("share"))
    .sort(cohorte_sort_key, "aom_incentive_status")
)

In [None]:
# Share of drivers who had already carpooled before their CEE, per cohort,
# split by AOM incentive status. Only the non first-time drivers are plotted.
fig_primo_drivers_aom = px.bar(
    df_primo_drivers_cee_aom.filter(pl.col("primo_conducteur").not_()),
    x="cohorte",
    y="share",
    color="aom_incentive_status",
    barmode="group",
    title="Quelle est la part des conducteurs ayant déjà covoituré avant leur CEE ?",
    labels={
        "share": "% des conducteurs<br> ayant déjà covoituré avant leur CEE",
        "cohorte": "Cohorte",
        # The legend is driven by aom_incentive_status (the colour column), so
        # this is the column to label; "aom_name" is not used by this figure.
        "aom_incentive_status": "Type d'AOM",
    },
    template="simple_white",
    color_discrete_map={
        "Avec incitation": "rgba(39, 174, 96,1.0)",
        "Sans incitation": "rgba(47, 54, 64,1.0)",
    },
)


fig_primo_drivers_aom.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
)

fig_primo_drivers_aom.show()
fig_primo_drivers_aom.write_image(
    OUTPUT_PATH / "fig_primo_conducteurs_par_aom_status.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_primo_drivers_aom.write_html(
    OUTPUT_PATH / "fig_primo_conducteurs_par_aom_status.html"
)

# Distance économisée


## Requêtes


### 2022

In [None]:
# Distance saved per driver of the 2022 cohort table.
# The distance is multiplied by the number of seats.
# Step 1 (drivers_trips): distinct (driver, trip) pairs for 'ok' trips made as a
# driver within 23 weeks of the first trip, for first trips before 2022-08-01.
# Step 2 (trips_distance): for each of those trips, sum of distance * seats over
# every carpools row sharing the trip_id (distance falls back to calc_distance).
# Step 3: total per driver, returned as a float in the "distance" column.
# NOTE(review): the downstream cells divide by 1000 and label the result in km,
# so the unit is presumably metres — confirm.
df_distance_eco_by_driver = pl.read_database(
    """
with drivers_trips as (
select 
		i.uuid,
		trip_id
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 cdv on
	i."uuid" = cdv."uuid"
where
	c.datetime between cdv.date_first_trip and cdv.date_first_trip + interval '23 weeks'
  and cdv.date_first_trip < '2022-08-01'
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
	and is_driver
group by
		1,
	2
),
trips_distance as (
select 
	c.trip_id,
	sum(coalesce (distance,(c.meta->>'calc_distance')::int) * seats) as distance_passagers
from
	carpool.carpools c
where
	c.trip_id in (
	select
		trip_id
	from
		drivers_trips)
group by
	1)
select 
	a.uuid,
	sum(b.distance_passagers)::float as distance
from
	drivers_trips a
left join trips_distance b on
	a.trip_id = b.trip_id
group by
	a.uuid
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the distance saved per 2022 driver.
df_distance_eco_by_driver.describe()

### CEE

In [None]:
# Distance saved per CEE driver (with a cohort), following the same three steps
# as the 2022 query: distinct (driver, trip) pairs, sum of distance * seats per
# trip, then total per driver. The cohort, first CEE date and CEE operator name
# of the driver are carried along.
# NOTE(review): the window here is '11 weeks' after date_first_cee, while the
# 2022 reference query uses '23 weeks' and the figures built from both frames
# are titled "historique de 5 mois" — confirm the windows are meant to differ.
df_distance_eco_by_driver_cee = pl.read_database(
    """
with drivers_trips as (
select
	i.uuid,
	trip_id,
	max(cdv.cohorte) as cohorte,
	max(cdv.date_first_cee) as date_first_cee,
	max(cdv.cee_operator_name) as cee_operator_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 cdv on
	i."uuid" = cdv."uuid"
where
	c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '11 weeks'
	and cdv.cohorte is not null
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
	and is_driver
group by
		1,
	2
),
trips_distance as (
select 
	c.trip_id,
	sum(coalesce (distance,(c.meta->>'calc_distance')::int) * seats) as distance_passagers
from
	carpool.carpools c
where
	c.trip_id in (
	select
		trip_id
	from
		drivers_trips)
group by
	1)
select 
	a.uuid,
	max(a.cohorte) as cohorte,
	max(a.date_first_cee) as date_first_cee,
	max(a.cee_operator_name) as cee_operator_name,
	sum(b.distance_passagers)::float as distance
from
	drivers_trips a
left join trips_distance b on
	a.trip_id = b.trip_id
group by
	a.uuid
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the distance saved per CEE driver.
df_distance_eco_by_driver_cee.describe()

### AOM

In [None]:
# Weekly activity of the 2022 drivers who have at least one eligible AOM trip in
# the 19 weeks after their first trip (first trip before 2022-08-01).
#   driver_trips    : tags each trip with an eligible AOM name (CASE on l_aom,
#                     operator_id and date), or null.
#   drivers         : one row per driver, keeping those with a non-null AOM.
#   template        : one row per driver and per week of the 19-week window.
#   trips           : weeks in which the driver had an 'ok' trip as a driver.
#   aggregated_data : template left-joined to trips; had_trip tells whether the
#                     week had a trip, eligible_aom_name is null otherwise.
# The final select adds num_semaine, the rank of the week for each driver.
# NOTE(review): despite the variable name, the result has no distance column
# (uuid, semaine, eligible_aom_name, had_trip, num_semaine) — confirm this is
# the intended query.
df_distance_eco_by_driver_aom = pl.read_database(
    """
with driver_trips as (
select
	i.uuid,
	ft.date_first_trip,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_trip AND ft.date_first_trip + INTERVAL '19 weeks'
),
drivers as (
select 
	"uuid",
	max(eligible_aom_name) as eligible_aom_name,
	max(date_first_trip) as date_first_trip
from
	driver_trips
group by
	1
having max(eligible_aom_name) is not null
),
"template" as (
select 
	*,
	generate_series(date_trunc('week',c.date_first_trip at time zone 'Europe/Paris'),
	date_trunc('week',c.date_first_trip at time zone 'Europe/Paris' + interval '19 weeks'),
	interval '1 weeks') as semaine
from
	drivers c
  where c.date_first_trip < '2022-08-01'
  ),
trips as (
select
	ft.uuid,
	date_trunc('week',
	c.datetime at time zone 'Europe/Paris') as semaine,
	max(ft.eligible_aom_name) as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
c.identity_id = i._id
inner JOIN drivers ft ON ft.uuid=i.uuid
where
c.datetime at time zone 'Europe/Paris' between ft.date_first_trip and ft.date_first_trip + interval '19 weeks'
and is_driver
and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by 1,2
),
aggregated_data as (select 
	t.uuid::text,
	t.semaine,
  	max(tr.eligible_aom_name) as eligible_aom_name,
	count(tr.semaine)>0 had_trip
from
	"template" t
left join trips tr on
	t.uuid = tr.uuid
	and t.semaine = tr.semaine
group by
	1,2)
select 
	*,
	row_number() over (partition by uuid order by semaine) as num_semaine
from aggregated_data
order by 1,2
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-AOM frame of the 2022 drivers.
df_distance_eco_by_driver_aom.describe()

In [None]:
# Number of rows per AOM. The query names the column "eligible_aom_name";
# grouping on "aom_name" raised a ColumnNotFoundError.
df_distance_eco_by_driver_aom.group_by(pl.col("eligible_aom_name")).len()

In [None]:
# Weekly activity of the CEE drivers who have at least one eligible AOM trip in
# the 19 weeks after date_first_cee (first CEE before 2024-03-01).
#   driver_trips    : tags each trip with an eligible AOM name (CASE on l_aom,
#                     operator_id and date), or null.
#   drivers         : one row per driver, keeping those with a non-null AOM.
#   template        : one row per driver and per week of the 19-week window.
#   trips           : weeks in which the driver had an 'ok' trip as a driver.
#   aggregated_data : template left-joined to trips; had_trip tells whether the
#                     week had a trip, eligible_aom_name and cohorte are null
#                     otherwise.
# The final select adds num_semaine, the rank of the week for each driver.
# NOTE(review): despite the variable name, the result has no distance column
# (uuid, semaine, eligible_aom_name, cohorte, had_trip, num_semaine) — confirm
# this is the intended query.
df_distance_eco_by_driver_cee_aom = pl.read_database(
    """
with driver_trips as (
select
	i.uuid,
	ft.date_first_cee,
  ft.cohorte,
	case
		when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime <= '2024-04-01' and c.operator_id = 3 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'Montpellier Méditerranée Métropole'
		and c.datetime > '2024-04-01' and c.operator_id = 9 then 'Montpellier Méditerranée Métropole'
    when l_aom = 'SM Artois Mobilités'
		and c.operator_id = 9 then 'SM Artois Mobilités'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime <= '2023-10-01' and c.operator_id = 3 then 'Métropole Rouen Normandie'
    when l_aom = 'Métropole Rouen Normandie'
		and c.datetime > '2023-10-01' and c.operator_id = 9 then 'Métropole Rouen Normandie'
		when l_aom in ('Bordeaux Métropole','Dijon Métropole','Rennes Métropole') and c.operator_id in (3,4,9) then l_aom
    else null
	end as eligible_aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 ft on
	ft.uuid = i.uuid
left join geo.perimeters p on
	c.start_geo_code = p.arr
WHERE c.datetime BETWEEN ft.date_first_cee AND ft.date_first_cee + INTERVAL '19 weeks'
),
drivers as (
select 
	"uuid",
	max(eligible_aom_name) as eligible_aom_name,
  max(cohorte) as cohorte,
	max(date_first_cee) as date_first_cee
from
	driver_trips
group by
	1
having max(eligible_aom_name) is not null
),
"template" as (
select 
	*,
	generate_series(date_trunc('week',c.date_first_cee at time zone 'Europe/Paris'),
	date_trunc('week',c.date_first_cee at time zone 'Europe/Paris' + interval '19 weeks'),
	interval '1 weeks') as semaine
from
	drivers c
  where c.date_first_cee < '2024-03-01'
  ),
trips as (
select
	ft.uuid,
	date_trunc('week',
	c.datetime at time zone 'Europe/Paris') as semaine,
	max(ft.eligible_aom_name) as eligible_aom_name,
  max(ft.cohorte) as cohorte
from
	carpool.carpools c
inner join carpool.identities i on
c.identity_id = i._id
inner JOIN drivers ft ON ft.uuid=i.uuid
where
c.datetime at time zone 'Europe/Paris' between ft.date_first_cee and ft.date_first_cee + interval '19 weeks'
and is_driver
and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by 1,2
),
aggregated_data as (select 
	t.uuid::text,
	t.semaine,
  	max(tr.eligible_aom_name) as eligible_aom_name,
    max(tr.cohorte) as cohorte,
	count(tr.semaine)>0 had_trip
from
	"template" t
left join trips tr on
	t.uuid = tr.uuid
	and t.semaine = tr.semaine
group by
	1,2)
select 
	*,
	row_number() over (partition by uuid order by semaine) as num_semaine
from aggregated_data
order by 1,2
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-AOM frame of the CEE drivers.
df_distance_eco_by_driver_cee_aom.describe()

In [None]:
# Number of rows per AOM. The query names the column "eligible_aom_name";
# grouping on "aom_name" raised a ColumnNotFoundError.
df_distance_eco_by_driver_cee_aom.group_by(pl.col("eligible_aom_name")).len()

## Stats


In [None]:
# Summary statistics of the distance saved per 2022 driver (same call as in the query section).
df_distance_eco_by_driver.describe()

In [None]:
# Mean distance saved per 2022 driver, divided by 1000 (presumably metres -> km; confirm the unit).
df_distance_eco_by_driver.select(pl.col("distance").mean() / 1000)

In [None]:
# Summary statistics of the distance saved per CEE driver (same call as in the query section).
df_distance_eco_by_driver_cee.describe()

In [None]:
# Mean distance saved per CEE driver, divided by 1000 (presumably metres -> km; confirm the unit).
df_distance_eco_by_driver_cee.select(pl.col("distance").mean() / 1000)

## Visualisation


### Distribution des distances économisées

In [None]:
# Histogram of the distance saved per driver, one series per cohort.
# The 2022 reference comes first, followed by the CEE cohorts in chronological order.
cee_cohort_names = {
    "t1_23": "CEE T1 2023",
    "t2_23": "CEE T2 2023",
    "t3_23": "CEE T3 2023",
    "t4_23": "CEE T4 2023",
    "t1_24": "CEE T1 2024",
}
plot_configs = [
    {
        "data": df_distance_eco_by_driver,
        "name": "Référence 2022",
        "color": cohortes_color_mapping["2022"],
    }
] + [
    {
        "data": df_distance_eco_by_driver_cee.filter(pl.col("cohorte") == cohorte),
        "name": cohort_name,
        "color": cohortes_color_mapping[cohorte],
    }
    for cohorte, cohort_name in cee_cohort_names.items()
]

# Only these series are drawn at first; the others can be toggled from the legend.
shown_by_default = ["Référence 2022", "CEE T4 2023", "CEE T1 2024"]
traces = [
    go.Histogram(
        x=config["data"]["distance"] / 1000,
        histfunc="count",
        histnorm="percent",
        xbins_size=1,
        xbins_start=0,
        marker_color=config["color"],
        marker_opacity=0.5,
        name=config["name"],
        hovertemplate="%{y:.2f}% des conducteurs ont économisés %{x} km",
        visible=True if config["name"] in shown_by_default else "legendonly",
    )
    for config in plot_configs
]

fig_distance_eco_multi = go.Figure(traces)
fig_distance_eco_multi.update_layout(
    barmode="overlay",
    plot_bgcolor="white",
    legend_title="Cohorte :",
    title="Distribution des distances économisés par les conducteurs de chaque cohorte (historique de 5 mois)",
)
fig_distance_eco_multi.update_xaxes(title="Distance économisée (km)", range=[0, 150])
fig_distance_eco_multi.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Part des conducteurs (%)",
)
fig_distance_eco_multi.show()
fig_distance_eco_multi.write_html(OUTPUT_PATH / "histo_distances_eco_multi_5m.html")
fig_distance_eco_multi.write_image(
    OUTPUT_PATH / "histo_distances_eco_multi_5m.svg",
    format="svg",
    width=1280,
    height=720,
)

In [None]:
# Mean distance saved per 2022 driver, divided by 1000, as a plain Python number.
df_distance_eco_by_driver["distance"].mean() / 1000

### Moyenne par cohorte

In [None]:
# Mean distance saved per driver (divided by 1000), one bar per cohort.
cee_cohort_names = {
    "t1_23": "CEE T1 2023",
    "t2_23": "CEE T2 2023",
    "t3_23": "CEE T3 2023",
    "t4_23": "CEE T4 2023",
    "t1_24": "CEE T1 2024",
}
plot_configs = [
    {
        "data": df_distance_eco_by_driver["distance"].mean() / 1000,
        "name": "Référence 2022",
        "color": cohortes_color_mapping["2022"],
    }
] + [
    {
        "data": df_distance_eco_by_driver_cee.filter(pl.col("cohorte") == cohorte)[
            "distance"
        ].mean()
        / 1000,
        "name": cohort_name,
        "color": cohortes_color_mapping[cohorte],
    }
    for cohorte, cohort_name in cee_cohort_names.items()
]

# Each cohort is its own single-bar trace so that it gets a legend entry.
traces = [
    go.Bar(
        y=[config["data"]],
        x=[config["name"]],
        text=[f"{config['data']:.2f}km"],
        marker_color=config["color"],
        name=config["name"],
        hovertemplate="Les conducteurs de la cohorte %{x} ont économisés en moyenne %{y}km",
    )
    for config in plot_configs
]

fig_distance_eco_moyenne_multi = go.Figure(traces)
fig_distance_eco_moyenne_multi.update_layout(
    barmode="overlay",
    plot_bgcolor="white",
    legend_title="Cohorte :",
    title="Quelle distance un conducteur de chaque cohorte économise-t-il en moyenne (historique de 5 mois)?",
)
fig_distance_eco_moyenne_multi.update_xaxes(title="Cohorte")
fig_distance_eco_moyenne_multi.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Distance moyenne <br>économisée par conducteur (km)",
)
fig_distance_eco_moyenne_multi.show()
fig_distance_eco_moyenne_multi.write_html(
    OUTPUT_PATH / "distances_eco_moyenne_multi_5m.html"
)
fig_distance_eco_moyenne_multi.write_image(
    OUTPUT_PATH / "distances_eco_moyenne_multi_5m.svg",
    format="svg",
    width=1280,
    height=720,
)

### Médiane par cohorte

In [None]:
# Median distance saved per driver (divided by 1000), one bar per cohort.
cee_cohort_names = {
    "t1_23": "CEE T1 2023",
    "t2_23": "CEE T2 2023",
    "t3_23": "CEE T3 2023",
    "t4_23": "CEE T4 2023",
    "t1_24": "CEE T1 2024",
}
plot_configs = [
    {
        "data": df_distance_eco_by_driver["distance"].median() / 1000,
        "name": "Référence 2022",
        "color": cohortes_color_mapping["2022"],
    }
] + [
    {
        "data": df_distance_eco_by_driver_cee.filter(pl.col("cohorte") == cohorte)[
            "distance"
        ].median()
        / 1000,
        "name": cohort_name,
        "color": cohortes_color_mapping[cohorte],
    }
    for cohorte, cohort_name in cee_cohort_names.items()
]

# Each cohort is its own single-bar trace so that it gets a legend entry.
traces = [
    go.Bar(
        y=[config["data"]],
        x=[config["name"]],
        text=[f"{config['data']:.2f}km"],
        marker_color=config["color"],
        name=config["name"],
        hovertemplate="Les conducteurs de la cohorte %{x} ont économisés une médiane de %{y}km",
    )
    for config in plot_configs
]

fig_distance_eco_mediane_multi = go.Figure(traces)
fig_distance_eco_mediane_multi.update_layout(
    barmode="overlay",
    plot_bgcolor="white",
    legend_title="Cohorte :",
    title="Quelle distance médiane un conducteur de chaque cohorte économise-t-il ?<br><sub>Historique de 5 mois</sub>",
)
fig_distance_eco_mediane_multi.update_xaxes(title="Cohorte")
fig_distance_eco_mediane_multi.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Distance médiane <br>économisée par conducteur (km)",
)
fig_distance_eco_mediane_multi.show()
fig_distance_eco_mediane_multi.write_html(
    OUTPUT_PATH / "distances_eco_mediane_multi_5m.html"
)
fig_distance_eco_mediane_multi.write_image(
    OUTPUT_PATH / "distances_eco_mediane_multi_5m.svg",
    format="svg",
    width=1280,
    height=720,
)

### Par cohortes hebdomadaires

In [None]:
# Weekly cohorts: each driver is bucketed on date_first_cee truncated to the week,
# then the mean distance (divided by 1000 and rounded to the unit) is computed per bucket.
df_distance_eco_by_driver_cee_agg_by_week = df_distance_eco_by_driver_cee.group_by(
    pl.col("date_first_cee").dt.truncate("1w")
).agg(distance=(pl.col("distance") / 1000).mean().round(0))

# Display only: the stored frame itself is left in group_by order.
df_distance_eco_by_driver_cee_agg_by_week.sort("date_first_cee")

In [None]:
# Bar chart of the mean saved distance per weekly CEE cohort (2023 onwards only).
# NOTE(review): the subtitle mentions a three-month history while other exports in
# this section are suffixed "5m" — confirm which window the input frame really uses.
fig_distance_eco_by_week = px.bar(
    df_distance_eco_by_driver_cee_agg_by_week.filter(
        pl.col("date_first_cee").dt.year() >= 2023
    ),
    x="date_first_cee",
    y="distance",
    template="simple_white",
    # Fixed spelling: "hebdomdaire" -> "hebdomadaire".
    title="Distance économisée moyenne par cohorte hebdomadaire<br>"
    "<sub>Historique de trois mois pour chaque conducteur, chaque conducteur est affecté à la semaine d'obtention de son CEE.</sub>",
    labels={"date_first_cee": "Semaine", "distance": "Distance économisée (km)"},
)
fig_distance_eco_by_week.update_traces(marker_color="#c0b490")
fig_distance_eco_by_week.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
    title="Distance moyenne<br>économisée (km)",
    # Fixed tick positions; values above 600 would be drawn without a tick label.
    tickvals=[0, 100, 200, 300, 400, 500, 600],
)
fig_distance_eco_by_week.update_xaxes(title=None)

# Export as interactive HTML and as a 1280x720 SVG, then display.
fig_distance_eco_by_week.write_html(OUTPUT_PATH / "distances_eco_cee_hebdo.html")
fig_distance_eco_by_week.write_image(
    OUTPUT_PATH / "distances_eco_cee_hebdo.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_distance_eco_by_week.show()

### Comparaison opérateurs


In [None]:
# Mean saved distance (distance / 1000) for every (cohort, CEE operator) pair.
# The sort key turns "t1_23" into "23t1" so that cohorts are ordered by year first,
# then by quarter; operators are ordered alphabetically within a cohort.
df_saved_distance_by_cohortes_op = (
    df_distance_eco_by_driver_cee.group_by("cohorte", "cee_operator_name")
    .agg(distance_economisee_moyenne=(pl.col("distance") / 1000).mean())
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "cee_operator_name",
    )
)

# Temporarily raise the row limit so the whole table is printed.
with pl.Config(tbl_rows=600):
    print(df_saved_distance_by_cohortes_op)

In [None]:
# Copy the per-cohort / per-operator table to the system clipboard
# (side effect only; requires a clipboard to be available on the machine running the kernel).
df_saved_distance_by_cohortes_op.write_clipboard()

In [None]:
# Line chart comparing the mean saved distance per cohort, one line per CEE operator.
# Operator colours come from the notebook-wide color_mapping.
fig_saved_distance_operator = px.line(
    df_saved_distance_by_cohortes_op,
    x="cohorte",
    y="distance_economisee_moyenne",
    color="cee_operator_name",
    markers=True,
    color_discrete_map=color_mapping,
    template="simple_white",
    labels={
        # Fixed spelling: "moyene" -> "moyenne".
        "distance_economisee_moyenne": "Distance moyenne économisée (km)",
        "cohorte": "Cohorte",
        "cee_operator_name": "Opérateur",
    },
    title="Comparaison de la distance moyenne économisée par cohorte et pour chaque opérateur",
)

# Emphasise the markers; the thin dotted line only hints at the trend between cohorts.
fig_saved_distance_operator.update_traces(
    marker_size=11, line_width=0.5, line_dash="dot"
)
fig_saved_distance_operator.write_html(OUTPUT_PATH / "stats_distance_eco_par_op.html")
fig_saved_distance_operator.write_image(
    OUTPUT_PATH / "stats_distance_eco_par_op.svg",
    format="svg",
    width=1280,
    height=720,
)
# The height is set after the exports, so it only affects the inline display
# (update_layout returns the figure, which is rendered as the cell output).
fig_saved_distance_operator.update_layout(height=800)

## Comparaison par AOM


### CEE


In [None]:
# CEE drivers, restricted to the AOMs listed in selected_aoms: mean distance
# (distance / 1000, rounded to 2 decimals) and number of rows per AOM.
# is_aom_expr is defined elsewhere in the notebook; it is expected to provide the
# "aom_incentive_status" column used by the sort below.
(
    df_distance_eco_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        (pl.col("distance") / 1000).mean().round(2).alias("distance_eco"),
        pl.len(),
    )
    .with_columns(is_aom_expr)
    .sort("aom_incentive_status", "aom_name")
)

In [None]:
# Same aggregation as the previous cell, broken down by cohort as well.
# Rows are ordered by cohort (year then quarter, via the "23t1"-style key),
# then by incentive status and AOM name.
df_distance_eco_by_driver_cee_aom_agg = (
    df_distance_eco_by_driver_cee_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("cohorte", "aom_name")
    .agg(distance_eco=(pl.col("distance") / 1000).mean())
    .with_columns(is_aom_expr)
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)
df_distance_eco_by_driver_cee_aom_agg

In [None]:
# Mean saved distance (distance / 1000) of CEE drivers per cohort and AOM incentive
# status, computed over every AOM (no selected_aoms filter here).
# is_aom_expr is defined elsewhere; it is expected to add "aom_incentive_status".
df_distance_eco_by_driver_cee_aom_agg_incentives = (
    df_distance_eco_by_driver_cee_aom.with_columns(is_aom_expr)
    .group_by(["cohorte", "aom_incentive_status"])
    .agg((pl.col("distance") / 1000).mean().alias("distance_eco"))
    .sort(
        [
            # Same chronological key as the neighbouring cells ("t1_23" -> "23t1").
            # The previous character-wise str.reverse() ("t1_23" -> "32_1t") only
            # sorted correctly while all years shared the same tens digit.
            pl.col("cohorte").str.split("_").list.reverse().list.join(""),
            "aom_incentive_status",
        ]
    )
)
df_distance_eco_by_driver_cee_aom_agg_incentives

### 2022


In [None]:
# 2022 reference drivers, restricted to selected_aoms: mean distance
# (distance / 1000, rounded to 2 decimals) and row count per AOM.
# A constant "cohorte" column set to "2022" is added so the frame lines up with
# the CEE frames; the cohort sort key is therefore constant here.
df_distance_eco_by_driver_aom_agg = (
    df_distance_eco_by_driver_aom.filter(pl.col("aom_name").is_in(selected_aoms))
    .group_by("aom_name")
    .agg(
        (pl.col("distance") / 1000).mean().round(2).alias("distance_eco"),
        pl.len(),
    )
    .with_columns(is_aom_expr, cohorte=pl.lit("2022"))
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "aom_incentive_status",
        "aom_name",
    )
)
df_distance_eco_by_driver_aom_agg

In [None]:
# 2022 reference drivers: mean saved distance (distance / 1000) per AOM incentive
# status, over every AOM present in the frame.
df_distance_eco_by_driver_aom_agg_incentives = (
    df_distance_eco_by_driver_aom.with_columns(is_aom_expr)
    .group_by("aom_incentive_status")
    .agg(distance_eco=(pl.col("distance") / 1000).mean())
    .sort("aom_incentive_status")
)
df_distance_eco_by_driver_aom_agg_incentives

### Visualisation


In [None]:
# Grouped bar chart: mean saved distance per cohort, split by AOM incentive status.
# The 2022 reference frame gets a constant "cohorte" column and is stacked on top of
# the CEE frame; the bars follow the row order of the concatenated frame.
fig_distance_eco_by_driver_multi_aom_status = px.bar(
    pl.concat(
        [
            df_distance_eco_by_driver_aom_agg_incentives.with_columns(
                pl.lit("2022").alias("cohorte")
            ),
            df_distance_eco_by_driver_cee_aom_agg_incentives,
        ],
        # "diagonal" aligns columns by name, whatever their order in each frame.
        how="diagonal",
    ).with_columns(pl.col("distance_eco").round(0)),
    x="cohorte",
    y="distance_eco",
    text="distance_eco",
    color="aom_incentive_status",
    barmode="group",
    template="simple_white",
    labels={
        "aom_incentive_status": "Type d'AOM",
        "distance_eco": "Distance moyenne économisée (km)",
        "cohorte": "Cohorte",
    },
    # Fixed markup: the subtitle was closed with "<sub>" instead of "</sub>".
    title="Comparaison de la distance moyenne économisée<br><sub>Par type d'AOM et cohorte</sub>",
    color_discrete_map={
        "Avec incitation": "rgba(39, 174, 96,1.0)",
        "Sans incitation": "rgba(47, 54, 64,1.0)",
    },
)
fig_distance_eco_by_driver_multi_aom_status.show()

# Export as interactive HTML and as a 1280x720 SVG.
fig_distance_eco_by_driver_multi_aom_status.write_html(
    OUTPUT_PATH / "histo_distance_eco_multi_aom_status.html"
)
fig_distance_eco_by_driver_multi_aom_status.write_image(
    OUTPUT_PATH / "histo_distance_eco_multi_aom_status.svg",
    format="svg",
    width=1280,
    height=720,
)

# KwhCumac

## Requêtes

## 2022

In [None]:
# Saved distance for the 2022 cohort (table luis.cohorte_2022_v2).
#   - drivers_trips: distinct (uuid, trip_id) of driver carpools with status 'ok',
#     dated between the driver's first trip and first trip + 103 weeks, for drivers
#     whose first trip is before 2022-08-01.
#   - trips_distance: for those trips, sum of distance * seats over the carpool rows,
#     grouped by trip and by date_part('month', datetime) (month number, no year).
#   - final select: one row per (uuid, month) with the number of distinct trips and
#     the summed distance.
# NOTE(review): the result has one row per (uuid, month) over a 103-week window,
# whereas the cells below divide by 25 weeks and use pl.len() as a driver count, and
# the CEE query uses a 23-week window with one row per uuid — confirm this is intended.
df_distance_eco = pl.read_database(
    """
with drivers_trips as (
select 
	i.uuid,
	trip_id
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cohorte_2022_v2 cdv on
	i."uuid" = cdv."uuid"
where
	c.datetime between cdv.date_first_trip and cdv.date_first_trip + interval '103 weeks'
  and cdv.date_first_trip < '2022-08-01'
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
	and is_driver
group by
	1,
	2
),
trips_distance as (
select 
	c.trip_id,
    date_part('month',c.datetime) as month,
	sum(coalesce (distance,(c.meta->>'calc_distance')::int) * seats) as distance_passagers
from
	carpool.carpools c
where
	c.trip_id in (
	select
		trip_id
	from
		drivers_trips)
group by
	1,2)
select 
	a.uuid,
    b.month,
    count(distinct a.trip_id) as num_trajets,
	sum(b.distance_passagers)::float as distance
from
	drivers_trips a
left join trips_distance b on
	a.trip_id = b.trip_id
group by
	1,2
    """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 query result (sanity check).
df_distance_eco.describe()

In [None]:
# Churn curve for the 2022 cohort. Both preprocess_week_by_driver_df and
# df_weeks_by_driver are defined earlier in the notebook; the cells below read the
# "num_semaine" and "share" columns of the result.
df_churn = preprocess_week_by_driver_df(df_weeks_by_driver)
df_churn

In [None]:
# One-row summary of the 2022 result: row count, mean/median of distance / 1000,
# and the same statistics spread over 25 weeks (distance / (25 * 1000)).
df_distance_eco_agg = df_distance_eco.select(
    pl.len(),
    mean_distance=(pl.col("distance") / 1000).mean(),
    median_distance=(pl.col("distance") / 1000).median(),
    mean_distance_by_week=(pl.col("distance") / (25 * 1000)).mean(),
    median_distance_by_week=(pl.col("distance") / (25 * 1000)).median(),
)
df_distance_eco_agg

In [None]:
# Same one-row summary as df_distance_eco_agg, keeping only rows with at least
# 10 trips (num_trajets >= 10).
df_distance_eco_agg_sup_10 = df_distance_eco.filter(pl.col("num_trajets") >= 10).select(
    pl.len(),
    mean_distance=(pl.col("distance") / 1000).mean(),
    median_distance=(pl.col("distance") / 1000).median(),
    mean_distance_by_week=(pl.col("distance") / (25 * 1000)).mean(),
    median_distance_by_week=(pl.col("distance") / (25 * 1000)).median(),
)
df_distance_eco_agg_sup_10

In [None]:
# Yearly estimations for the 2022 cohort, as a single row.
#   - naive: the observed mean/median distance multiplied by 2.
#   - churn: the observed value plus, for every churn row with num_semaine >= 25,
#     share / 100 times the per-week distance.
# The one-row summary is cross-joined onto the churn rows so both can be combined
# in the same expressions. Each "*_distance" estimation is then multiplied by
# 0.9 * 0.647 and stored under the matching "*_kwhcumac" name.
df_distance_eco_estimations = (
    df_churn.filter(pl.col("num_semaine") >= 25)
    .join(df_distance_eco_agg, how="cross")
    .select(
        cohorte=pl.lit("2022"),
        num_drivers=pl.col("len").max(),
        naive_estimation_mean_distance=pl.col("mean_distance").max() * 2,
        naive_estimation_median_distance=pl.col("median_distance").max() * 2,
        churn_estimatation_mean_distance=(
            pl.col("mean_distance").first()
            + ((pl.col("share") / 100) * pl.col("mean_distance_by_week")).sum()
        ),
        churn_estimation_median_distance=(
            pl.col("median_distance").first()
            + ((pl.col("share") / 100) * pl.col("median_distance_by_week")).sum()
        ),
    )
    .with_columns(
        **{
            distance_col.replace("distance", "kwhcumac"): pl.col(distance_col)
            * 0.9
            * 0.647
            for distance_col in [
                "naive_estimation_mean_distance",
                "naive_estimation_median_distance",
                "churn_estimatation_mean_distance",
                "churn_estimation_median_distance",
            ]
        }
    )
)
df_distance_eco_estimations

In [None]:
# Same yearly estimations as df_distance_eco_estimations, computed from the
# "at least 10 trips" summary (df_distance_eco_agg_sup_10) and labelled accordingly.
df_distance_eco_estimations_sup_10 = (
    df_churn.filter(pl.col("num_semaine") >= 25)
    .join(df_distance_eco_agg_sup_10, how="cross")
    .select(
        cohorte=pl.lit("2022 plus de 10 trajets"),
        num_drivers=pl.col("len").max(),
        naive_estimation_mean_distance=pl.col("mean_distance").max() * 2,
        naive_estimation_median_distance=pl.col("median_distance").max() * 2,
        churn_estimatation_mean_distance=(
            pl.col("mean_distance").first()
            + ((pl.col("share") / 100) * pl.col("mean_distance_by_week")).sum()
        ),
        churn_estimation_median_distance=(
            pl.col("median_distance").first()
            + ((pl.col("share") / 100) * pl.col("median_distance_by_week")).sum()
        ),
    )
    .with_columns(
        # distance -> kWh cumac: same 0.9 * 0.647 factor as in the other estimation cells.
        **{
            distance_col.replace("distance", "kwhcumac"): pl.col(distance_col)
            * 0.9
            * 0.647
            for distance_col in [
                "naive_estimation_mean_distance",
                "naive_estimation_median_distance",
                "churn_estimatation_mean_distance",
                "churn_estimation_median_distance",
            ]
        }
    )
)
df_distance_eco_estimations_sup_10

In [None]:
# Display the one-row 2022 summary again (inspection only).
df_distance_eco_agg

In [None]:
# Scratch check: churn shares (as fractions) multiplied by the per-week distances.
# NOTE(review): this multiplies two DataFrames of different shapes (all churn rows x
# 1 column versus 1 row x 2 columns) — confirm this cell runs on a fresh kernel, or
# remove it.
(df_churn.select(pl.col("share")) / 100) * df_distance_eco_agg.select(
    pl.col("mean_distance_by_week"), pl.col("median_distance_by_week")
)

In [None]:
# Summary statistics restricted to rows whose distance / 1000 is below 5000.
df_distance_eco.filter((pl.col("distance") / 1000) < 5000).describe()

In [None]:
# Histogram of distance / 1000 with bin edges every 5 units from 0 to 4995.
# Series.hist returns a frame whose "category" and "count" columns are plotted here.
px.bar(
    (df_distance_eco["distance"] / 1000).hist(bins=list(range(0, 5000, 5))),
    x="category",
    y="count",
)

## CEE

In [None]:
# Saved distance for CEE drivers (table luis.cee_drivers_v4).
#   - drivers_trips: distinct (uuid, trip_id) of driver carpools with status 'ok',
#     dated between date_first_cee and date_first_cee + 23 weeks, for drivers with a
#     non-null cohort and a first CEE before 2024-01-01.
#   - trips_distance: per trip, sum of distance * seats over the carpool rows.
#   - final select: one row per uuid with its cohort, number of distinct trips,
#     num_trips_after_cee and the summed distance.
# NOTE(review): the window is 23 weeks while the cells below divide by 25 weeks —
# confirm the two are meant to differ.
df_distance_eco_cee = pl.read_database(
    """
with drivers_trips as (
select 
	i.uuid,
	trip_id,
    max(cohorte) as cohorte,
    max(num_trips_after_cee) as num_trips_after_cee
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 cdv on
	i."uuid" = cdv."uuid"
where
	c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '23 weeks'
    and cdv.date_first_cee < '2024-01-01'
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
	and is_driver
    and cohorte is not null
group by
	1,
	2
),
trips_distance as (
select 
	c.trip_id,
	sum(coalesce (distance,(c.meta->>'calc_distance')::int) * seats) as distance_passagers
from
	carpool.carpools c
where
	c.trip_id in (
	select
		trip_id
	from
		drivers_trips)
group by
	1)
select 
	a.uuid,
    max(cohorte) as cohorte,
    count(distinct a.trip_id) as num_trajets,
    max(num_trips_after_cee) as num_trips_after_cee,
	sum(b.distance_passagers)::float as distance
from
	drivers_trips a
left join trips_distance b on
	a.trip_id = b.trip_id
group by
	a.uuid
    """,
    connection=db_engine,
)

In [None]:
# Summary statistics of the CEE query result (sanity check).
df_distance_eco_cee.describe()

### Courbe de churn

In [None]:
# Churn curve for CEE drivers, built only from the t1_23 and t2_23 cohorts.
# preprocess_week_by_driver_df and df_weeks_by_driver_cohortes are defined earlier
# in the notebook; the cells below read "num_semaine" and "share" from the result.
df_churn_cee = preprocess_week_by_driver_df(
    df_weeks_by_driver_cohortes.filter(pl.col("cohorte").is_in(["t1_23", "t2_23"]))
)
df_churn_cee

### Stats 2023

In [None]:
# One-row summary over all CEE drivers: row count, mean/median of distance / 1000,
# and the same statistics spread over 25 weeks.
df_distance_eco_2023_agg = df_distance_eco_cee.select(
    pl.len(),
    mean_distance=(pl.col("distance") / 1000).mean(),
    median_distance=(pl.col("distance") / 1000).median(),
    mean_distance_by_week=(pl.col("distance") / (25 * 1000)).mean(),
    median_distance_by_week=(pl.col("distance") / (25 * 1000)).median(),
)
df_distance_eco_2023_agg

### Stats par cohortes

In [None]:
# Same summary as above, one row per CEE cohort.
df_distance_eco_cee_agg = df_distance_eco_cee.group_by("cohorte").agg(
    pl.len(),
    mean_distance=(pl.col("distance") / 1000).mean(),
    median_distance=(pl.col("distance") / 1000).median(),
    mean_distance_by_week=(pl.col("distance") / (25 * 1000)).mean(),
    median_distance_by_week=(pl.col("distance") / (25 * 1000)).median(),
)
# Display only: ordered on the character-reversed cohort label; the stored frame
# keeps the group_by order.
df_distance_eco_cee_agg.sort(pl.col("cohorte").str.reverse())

In [None]:
# Yearly estimations per CEE cohort.
#   - naive: observed mean/median distance multiplied by 2.
#   - churn: observed value plus, for every churn row with num_semaine >= 25,
#     share / 100 times the per-week distance.
# The per-cohort summary is cross-joined onto the churn rows, then re-aggregated per
# cohort. Every estimation is rounded to one decimal, and each "*_distance" column
# is multiplied by 0.9 * 0.647 to give the matching "*_kwhcumac" column.
df_distance_eco_estimations_cee = (
    df_churn_cee.filter(pl.col("num_semaine") >= 25)
    .join(
        df_distance_eco_cee_agg,
        how="cross",
    )
    .group_by("cohorte")
    .agg(
        pl.col("len").max().alias("num_drivers"),
        pl.col("mean_distance").max().alias("mean_distance"),
        (pl.col("mean_distance").max() * 2)
        .alias("naive_estimation_mean_distance")
        .round(1),
        (pl.col("median_distance").max() * 2)
        .alias("naive_estimation_median_distance")
        .round(1),
        (
            pl.col("mean_distance").first()
            + ((pl.col("share") / 100) * pl.col("mean_distance_by_week")).sum()
        )
        .alias("churn_estimatation_mean_distance")
        .round(1),
        (
            pl.col("median_distance").first()
            + ((pl.col("share") / 100) * pl.col("median_distance_by_week")).sum()
        )
        .alias("churn_estimation_median_distance")
        .round(1),
    )
    .with_columns(
        (pl.col(c) * 0.9 * 0.647).round(1).alias(c.replace("distance", "kwhcumac"))
        for c in [
            "naive_estimation_mean_distance",
            "naive_estimation_median_distance",
            "churn_estimatation_mean_distance",
            "churn_estimation_median_distance",
        ]
    )
    # group_by does not guarantee any row order. Sort the stored frame (not only the
    # displayed copy) so the cohort order in the figure built from it is the same on
    # every run. Key: "t1_23" -> "23t1", i.e. year first, then quarter.
    .sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))
)
df_distance_eco_estimations_cee

In [None]:
# Per-cohort summary split on whether the driver has at least 10 trips after the CEE
# (with_bonus = num_trips_after_cee >= 10). All statistics are rounded to one decimal.
df_distance_eco_cee_agg_by_bonus = df_distance_eco_cee.group_by(
    "cohorte",
    with_bonus=pl.col("num_trips_after_cee") >= 10,
).agg(
    pl.len(),
    mean_distance=(pl.col("distance") / 1000).mean().round(1),
    median_distance=(pl.col("distance") / 1000).median().round(1),
    mean_distance_by_week=(pl.col("distance") / (25 * 1000)).mean().round(1),
    median_distance_by_week=(pl.col("distance") / (25 * 1000)).median().round(1),
)
# Display only: ordered on the character-reversed cohort label.
df_distance_eco_cee_agg_by_bonus.sort(pl.col("cohorte").str.reverse())

In [None]:
# Yearly estimations per CEE cohort and bonus status (with_bonus), following the
# same naive (x2) and churn-weighted logic as the per-cohort estimations.
# Every estimation is rounded to one decimal, and each "*_distance" column is
# multiplied by 0.9 * 0.647 to give the matching "*_kwhcumac" column.
df_distance_eco_estimations_cee_bonus = (
    df_churn_cee.filter(pl.col("num_semaine") >= 25)
    .join(
        df_distance_eco_cee_agg_by_bonus,
        how="cross",
    )
    .group_by(["cohorte", "with_bonus"])
    .agg(
        pl.col("len").max().alias("num_drivers"),
        pl.col("mean_distance").max().alias("mean_distance").round(1),
        (pl.col("mean_distance").max() * 2)
        .alias("naive_estimation_mean_distance")
        .round(1),
        (pl.col("median_distance").max() * 2)
        .alias("naive_estimation_median_distance")
        .round(1),
        (
            pl.col("mean_distance").max()
            + ((pl.col("share") / 100) * pl.col("mean_distance_by_week")).sum()
        )
        .alias("churn_estimatation_mean_distance")
        .round(1),
        (
            pl.col("median_distance").max()
            + ((pl.col("share") / 100) * pl.col("median_distance_by_week")).sum()
        )
        .alias("churn_estimation_median_distance")
        .round(1),
    )
    .with_columns(
        (pl.col(c) * 0.9 * 0.647).round(1).alias(c.replace("distance", "kwhcumac"))
        for c in [
            "naive_estimation_mean_distance",
            "naive_estimation_median_distance",
            "churn_estimatation_mean_distance",
            "churn_estimation_median_distance",
        ]
    )
    # group_by does not guarantee any row order. Sort the stored frame so the bar
    # order in the figure built from it is reproducible: cohort (year, then quarter)
    # first, then bonus status.
    .sort(
        pl.col("cohorte").str.split("_").list.reverse().list.join(""),
        "with_bonus",
    )
)
df_distance_eco_estimations_cee_bonus

## Visualisation

In [None]:
# Grouped bar chart of the four yearly kWh cumac estimations, per cohort.
# The four estimation frames (2022, 2022 with 10+ trips, CEE, CEE by bonus status)
# are stacked with a diagonal concat, so columns missing from a frame become null.
# The kwhcumac columns are renamed to their French legend labels before plotting.
# NOTE(review): the CEE and CEE-by-bonus frames share the same "cohorte" labels, so
# several rows land on the same x category — confirm this is the intended rendering.
fig_kwhcumac = px.bar(
    pl.concat(
        [
            df_distance_eco_estimations,
            df_distance_eco_estimations_sup_10,
            df_distance_eco_estimations_cee,
            df_distance_eco_estimations_cee_bonus,
        ],
        how="diagonal",
    ).rename(
        {
            "naive_estimation_mean_kwhcumac": "Estimation annuelle naïve (moyenne)",
            "naive_estimation_median_kwhcumac": "Estimation annuelle naïve (médiane)",
            "churn_estimatation_mean_kwhcumac": "Estimation annuelle modèle churn (moyenne)",
            "churn_estimation_median_kwhcumac": "Estimation annuelle modèle churn (médiane)",
        }
    ),
    x="cohorte",
    y=[
        "Estimation annuelle naïve (moyenne)",
        "Estimation annuelle naïve (médiane)",
        "Estimation annuelle modèle churn (moyenne)",
        "Estimation annuelle modèle churn (médiane)",
    ],
    text="value",
    text_auto=".0f",
    barmode="group",
    template="simple_white",
    labels={"variable": "métrique", "value": "kWhCumac"},
    title="Combien d'énergie a permis d'économiser un conducteur ?<br><sub>Par cohorte, estimation annuelle basée sur un historique de trajets de 6 mois</sub>",
).update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
)
fig_kwhcumac.show()

# Export as interactive HTML and as a 1280x720 SVG.
fig_kwhcumac.write_html(OUTPUT_PATH / "histo_kwh_cumac.html")
fig_kwhcumac.write_image(
    OUTPUT_PATH / "histo_kwh_cumac.svg",
    format="svg",
    width=1280,
    height=720,
)

## Analyse de l'effet de seuil

In [None]:
# Display the per-driver CEE frame used as input of the threshold analysis.
df_distance_eco_cee

In [None]:
# Threshold analysis: for each minimum number of trips (0, 10, ..., 100), keep the
# CEE drivers with num_trajets >= threshold and compute the usual one-row summary.
# The thresholds are cumulative: each subset contains the subsets of higher thresholds.
thresholds = range(0, 101, 10)
stats_dfs = [
    df_distance_eco_cee.filter(pl.col("num_trajets") >= min_trips).select(
        pl.lit(min_trips).alias("threshold"),
        pl.len(),
        mean_distance=(pl.col("distance") / 1000).mean(),
        median_distance=(pl.col("distance") / 1000).median(),
        mean_distance_by_week=(pl.col("distance") / (25 * 1000)).mean(),
        median_distance_by_week=(pl.col("distance") / (25 * 1000)).median(),
    )
    for min_trips in thresholds
]

# One row per threshold, in increasing threshold order.
cummulative_distance_eco_cee_df = pl.concat(stats_dfs)
cummulative_distance_eco_cee_df

In [None]:
# Yearly estimations per trip-count threshold.
# The per-threshold summary is cross-joined onto the CEE churn rows with
# num_semaine >= 25, then re-aggregated per threshold:
#   - naive: observed mean/median distance multiplied by 2;
#   - churn: observed value plus the sum of share / 100 times the per-week distance.
# Each "*_distance" estimation is multiplied by 0.9 * 0.647 to give "*_kwhcumac", and
# share_drivers is the row count of the threshold as a percentage of the largest one.
cummulative_distance_eco_cee_df_agg = (
    cummulative_distance_eco_cee_df.join(
        df_churn_cee.filter(pl.col("num_semaine") >= 25),
        how="cross",
    )
    .group_by("threshold")
    .agg(
        pl.col("mean_distance").max(),
        pl.col("median_distance").max(),
        pl.col("len").max().alias("num_drivers"),
        (pl.col("mean_distance").max() * 2).alias("naive_estimation_mean_distance"),
        (pl.col("median_distance").max() * 2).alias("naive_estimation_median_distance"),
        (
            pl.col("mean_distance").first()
            + ((pl.col("share") / 100) * pl.col("mean_distance_by_week")).sum()
        ).alias("churn_estimatation_mean_distance"),
        (
            pl.col("median_distance").first()
            + ((pl.col("share") / 100) * pl.col("median_distance_by_week")).sum()
        ).alias("churn_estimation_median_distance"),
    )
    .with_columns(
        *[
            (pl.col(c) * 0.9 * 0.647).alias(c.replace("distance", "kwhcumac"))
            for c in [
                "naive_estimation_mean_distance",
                "naive_estimation_median_distance",
                "churn_estimatation_mean_distance",
                "churn_estimation_median_distance",
            ]
        ],
        (100 * pl.col("num_drivers") / pl.col("num_drivers").max()).alias(
            "share_drivers"
        )
    )
    # group_by does not guarantee any row order, and the figure built from this frame
    # uses a categorical x axis that follows row order: sort by threshold explicitly.
    .sort("threshold")
)
cummulative_distance_eco_cee_df_agg

In [None]:
# Threshold-effect chart: yearly kWh cumac estimations (bars, left axis) and share of
# drivers kept (line, hidden right axis) for each minimum number of trips.

# The x axis is categorical and follows row order, so sort by threshold here rather
# than relying on the order of the aggregated frame.
stats_by_threshold = cummulative_distance_eco_cee_df_agg.sort("threshold")

# Category labels "0+", "10+", ...; pl.format keeps the name of its first column.
index = stats_by_threshold.select(pl.format("{}+", pl.col("threshold")))[
    "threshold"
].to_list()


# Fix: the bars previously plotted the "*_distance" columns (km) although the title,
# the axis and the output file all refer to kWh cumac. Use the "*_kwhcumac" columns.
bar_naive = go.Bar(
    x=index,
    y=stats_by_threshold["naive_estimation_mean_kwhcumac"].to_list(),
    texttemplate="%{y:.0f}",
    name="Estimation moyenne naïve",
    marker_color="rgba(189, 195, 199,1.0)",
    textposition="outside",
)

bar_churn = go.Bar(
    x=index,
    y=stats_by_threshold["churn_estimatation_mean_kwhcumac"].to_list(),
    texttemplate="%{y:.0f}",
    name="Estimation moyenne par le modèle de churn",
    marker_color="rgba(41, 128, 185,1.0)",
    textposition="outside",
)

# Share of drivers (in %) still included at each threshold, on the secondary axis.
line_num_drivers = go.Scatter(
    x=index,
    y=stats_by_threshold["share_drivers"].to_list(),
    line_color="rgba(0, 184, 148,1.0)",
    mode="markers+lines+text",
    texttemplate="%{y:.0f}%",
    textposition="top center",
    textfont_weight="bold",
    name="Part des conducteurs (%)",
    yaxis="y2",
)

fig_kwhcumac_cummulative = go.Figure([bar_naive, bar_churn, line_num_drivers])

fig_kwhcumac_cummulative.update_xaxes(
    type="category",
    linecolor="black",
    title="Nombre de trajets effectués (effectifs cummulés)",
)
fig_kwhcumac_cummulative.update_layout(
    title="kWhcumac économisés par conducteurs<br> en fonction du nombre minimum de trajets effectués",
    barmode="group",
    yaxis=dict(
        # Fixed spelling: "conduceur" -> "conducteur".
        title="kWhcumac économisé<br> par conducteur par an",
        linecolor="black",
        showgrid=False,
        griddash="dot",
        gridwidth=1,
        gridcolor="gray",
        tickcolor="black",
        ticks="outside",
        range=[0, 10_000],
    ),
    # Hidden percentage axis; the 0-120 range leaves room for the labels above 100 %.
    yaxis2=dict(
        anchor="x", overlaying="y", side="right", visible=False, range=[0, 120]
    ),
    legend_orientation="h",
    legend_y=-0.3,
    plot_bgcolor="white",
)

fig_kwhcumac_cummulative.show()

# Export as interactive HTML and as a 1280x720 SVG.
fig_kwhcumac_cummulative.write_html(OUTPUT_PATH / "fig_kwh_cumac_cummulative.html")
fig_kwhcumac_cummulative.write_image(
    OUTPUT_PATH / "fig_kwh_cumac_cummulative.svg",
    format="svg",
    width=1280,
    height=720,
)

# KwhCumac mensuel

## Requêtes

### 2022

In [None]:
# Monthly saved distance for drivers whose first trip is on or after 2022-01-01
# (table carpool_v2.carpools).
#   - first_trip: first start_datetime per driver_operator_user_id, computed over
#     trips that are at least two years old.
#   - template: one row per driver and per month, from the month of the first trip
#     to the month of first trip + 24 months (both bounds included).
#   - final select: left join of the carpools onto that month template, so months
#     without any trip are kept with null distance columns.
# NOTE(review): the query depends on now(), so its result changes with the run date.
# NOTE(review): "mois" is truncated in Europe/Paris time while the join truncates
# start_datetime without an explicit time zone — confirm both use the same zone.
df_distance_eco_by_month = pl.read_database(
    """
with first_trip as (
select
		driver_operator_user_id,
		min(start_datetime) as first_trip_datetime
from
	carpool_v2.carpools c
where
	c.start_datetime <= now() - interval '2 years'
group by
	1
having
	min(start_datetime)>= '2022-01-01'
),
template as (
select
	*,
	generate_series(date_trunc('month',
	first_trip_datetime at time zone 'Europe/Paris'),
	date_trunc('month',
	first_trip_datetime at time zone 'Europe/Paris' + interval '24 month'),
	interval '1 month') as mois
from
	first_trip
)
select
	t.*,
	c.distance,
	c.passenger_seats,
	c.distance * c.passenger_seats as distance_eco
from
	template t
left join carpool_v2.carpools c on
	t.driver_operator_user_id = c.driver_operator_user_id
	and t.mois = date_trunc('month',
	start_datetime)::timestamp
    """,
    connection=db_engine,
)

In [None]:
# Mean saved distance per driver for each month since the first trip.
#   1. Sum distance_eco per (driver, calendar month).
#   2. Number the months of each driver (num_mois) by ranking "mois" within the driver.
#   3. Average distance_eco / 1000 over drivers for each month number.
#   4. Multiply by 0.9 * 0.647 to get the kwhcumac column.
df_distance_eco_by_month_agg = (
    df_distance_eco_by_month.group_by("driver_operator_user_id", "mois")
    .agg(pl.col("distance_eco").sum())
    .with_columns(
        num_mois=pl.col("mois")
        .rank()
        .over("driver_operator_user_id", order_by="mois")
    )
    .group_by("num_mois")
    .agg((pl.col("distance_eco") / 1000).mean())
    .with_columns(kwhcumac=pl.col("distance_eco") * 0.9 * 0.647)
    .sort("num_mois")
)
df_distance_eco_by_month_agg

In [None]:
# Quick look at the mean saved distance per month number (not exported).
px.line(
    df_distance_eco_by_month_agg,
    x="num_mois",
    y="distance_eco",
    markers=True,
    template="simple_white",
)

In [None]:
# Variant of the monthly aggregation where the total is divided by the number of
# drivers still active in the month rather than by all drivers:
#   - distance_eco: total of distance_eco / 1000 for the month number;
#   - remaining_drivers: distinct drivers with a non-zero distance_eco that month;
#   - total_drivers: distinct drivers present for that month number;
#   - kwhcumac: distance_eco / remaining_drivers * 0.9 * 0.647.
(
    df_distance_eco_by_month.group_by(["driver_operator_user_id", "mois"])
    .agg(pl.col("distance_eco").sum())
    .with_columns(
        pl.col("mois")
        .rank()
        .over("driver_operator_user_id", order_by="mois")
        .alias("num_mois")
    )
    .group_by("num_mois")
    .agg(
        (pl.col("distance_eco") / 1000).sum(),
        pl.col("driver_operator_user_id")
        .filter(pl.col("distance_eco") != 0)
        .n_unique()
        .alias("remaining_drivers"),
        # Renamed from "total_driver" to match the identical aggregation plotted in
        # the next cell.
        pl.col("driver_operator_user_id").n_unique().alias("total_drivers"),
    )
    .with_columns(
        ((pl.col("distance_eco") / pl.col("remaining_drivers")) * 0.9 * 0.647).alias(
            "kwhcumac"
        ),
    )
    .sort("num_mois")
)

In [None]:
# Line chart of kwhcumac per still-active driver, by month number since the first trip.
# The frame is rebuilt inline: monthly totals of distance_eco / 1000 divided by the
# number of drivers with a non-zero distance that month, times 0.9 * 0.647.
px.line(
    (
        df_distance_eco_by_month.group_by("driver_operator_user_id", "mois")
        .agg(pl.col("distance_eco").sum())
        .with_columns(
            num_mois=pl.col("mois")
            .rank()
            .over("driver_operator_user_id", order_by="mois")
        )
        .group_by("num_mois")
        .agg(
            (pl.col("distance_eco") / 1000).sum(),
            remaining_drivers=pl.col("driver_operator_user_id")
            .filter(pl.col("distance_eco") != 0)
            .n_unique(),
            total_drivers=pl.col("driver_operator_user_id").n_unique(),
        )
        .with_columns(
            kwhcumac=(pl.col("distance_eco") / pl.col("remaining_drivers"))
            * 0.9
            * 0.647
        )
        .sort("num_mois")
    ),
    x="num_mois",
    y="kwhcumac",
    template="simple_white",
)

### CEE

In [None]:
# Monthly saved distance for CEE drivers (carpool_v2.carpools joined to
# cee.cee_applications on operator_journey_id).
#   - first_trip: first start_datetime per driver among CEE-linked carpools starting
#     on or before 2023-06-01, kept only when that first date is >= 2023-01-01.
#   - template: one row per driver and per month, from the month of the first CEE
#     trip to the month of first CEE trip + 24 months (both bounds included).
#   - final select: left join of all the driver's carpools onto that month template.
# NOTE(review): "mois" is truncated in Europe/Paris time while the join truncates
# start_datetime without an explicit time zone — confirm both use the same zone.
df_distance_eco_cee_by_month = pl.read_database(
    """
with first_trip as (
select
		driver_operator_user_id,
		min(start_datetime) as first_cee_datetime
from
	carpool_v2.carpools c
inner join cee.cee_applications ca on ca.operator_journey_id=c.operator_journey_id
where
	c.start_datetime <= '2023-06-01'
group by
	1
having
	min(start_datetime)>= '2023-01-01'
),
template as (
select
	*,
	generate_series(date_trunc('month',
	first_cee_datetime at time zone 'Europe/Paris'),
	date_trunc('month',
	first_cee_datetime at time zone 'Europe/Paris' + interval '24 month'),
	interval '1 month') as mois
from
	first_trip
)
select
	t.*,
	c.distance,
	c.passenger_seats,
	c.distance * c.passenger_seats as distance_eco
from
	template t
left join carpool_v2.carpools c on
	t.driver_operator_user_id = c.driver_operator_user_id
	and t.mois = date_trunc('month',
	start_datetime)::timestamp
    """,
    connection=db_engine,
)

In [None]:
# Mean saved distance per driver, by CEE cohort and month number.
#   1. Tag each row with its cohort from first_cee_datetime: before 2023-04-01 is
#      "t1_23", before 2023-07-01 is "t2_23", anything later stays null.
#   2. Sum distance_eco per (driver, calendar month), carrying the cohort along.
#   3. Number the months of each driver (num_mois).
#   4. Average distance_eco / 1000 per (cohort, month number) and derive kwhcumac
#      with the 0.9 * 0.647 factor.
df_distance_eco_cee_by_month_agg = (
    df_distance_eco_cee_by_month.with_columns(
        cohorte=pl.when(pl.col("first_cee_datetime") < datetime(2023, 4, 1))
        .then(pl.lit("t1_23"))
        .when(pl.col("first_cee_datetime") < datetime(2023, 7, 1))
        .then(pl.lit("t2_23"))
    )
    .group_by("driver_operator_user_id", "mois")
    .agg(pl.col("distance_eco").sum(), pl.col("cohorte").max())
    .with_columns(
        num_mois=pl.col("mois")
        .rank()
        .over("driver_operator_user_id", order_by="mois")
    )
    .group_by("cohorte", "num_mois")
    .agg((pl.col("distance_eco") / 1000).mean())
    .with_columns(kwhcumac=pl.col("distance_eco") * 0.9 * 0.647)
    .sort("num_mois")
)
df_distance_eco_cee_by_month_agg

## Visualisation

In [None]:
# Line chart of the monthly kWh cumac per driver, one line per cohort
# (2022 reference plus the CEE cohorts), by month number since the first trip.
fig_kwhcumac_mensuel = px.line(
    pl.concat(
        [
            df_distance_eco_by_month_agg.with_columns(pl.lit("2022").alias("cohorte")),
            df_distance_eco_cee_by_month_agg,
        ],
        # "diagonal" aligns columns by name, whatever their order in each frame.
        how="diagonal",
    )
    .with_columns(pl.col("kwhcumac").round(0))
    .sort("num_mois"),
    x="num_mois",
    y="kwhcumac",
    color="cohorte",
    template="simple_white",
    labels={"num_mois": "Mois n°", "cohorte": "Cohorte", "kwhcumac": "kWh cumac"},
    # Fixed spelling: "khwcumac" -> "kWhcumac".
    title="Comparaison de kWhcumac économisé au fil des mois par les différentes cohortes",
    # Notebook-wide cohort colours, with t1_23 overridden for this figure.
    color_discrete_map={**cohortes_color_mapping, "t1_23": "#82c4ad"},
)

fig_kwhcumac_mensuel.update_traces(textposition="top center")
fig_kwhcumac_mensuel.update_layout(
    hovermode="x unified",
)
fig_kwhcumac_mensuel.update_yaxes(
    showgrid=True,
    griddash="dashdot",
    gridwidth=1,
    gridcolor="gray",
)

# Fix: export only once the figure is fully styled. The HTML export used to run
# before the layout/axis updates, so it lacked the grid and the unified hover that
# the SVG export had.
fig_kwhcumac_mensuel.write_html(OUTPUT_PATH / "fig_kwhcumac_mensuel_multi.html")
fig_kwhcumac_mensuel.write_image(
    OUTPUT_PATH / "fig_kwhcumac_mensuel_multi.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_kwhcumac_mensuel.show()

# Mouvements


In [None]:
# Drop-off positions of driver trips for the 2022 cohort (luis.cohorte_2022_v2).
#   - trips: one row per (uuid, trip_id) for driver carpools with status 'ok' whose
#     start_geo_code belongs to the perimeter with aom = '246700488', keeping the
#     max datetime and max end_position of the trip.
#   - final select: trips within 12 weeks after the driver's first trip, with the
#     end position split into longitude / latitude.
df_positions = pl.read_database(
    """
with trips as (
	select
		i."uuid",
		c.trip_id,
		max(c.datetime) as datetime,
		max(end_position::geometry) as end_position
	from carpool.carpools c 
	inner join carpool.identities i on c.identity_id = i._id
	left join geo.perimeters p on c.start_geo_code = p.arr
	where p.aom = '246700488'
	and is_driver
	and status::text = 'ok'
	group by 1,2
)
select
	cv."uuid",
	trip_id,
	t.datetime,
	st_x(end_position::geometry) as longitude,
	st_y(end_position::geometry) as latitude
from luis.cohorte_2022_v2 cv
inner join trips t on cv."uuid" = t."uuid"
where t.datetime between cv.date_first_trip and cv.date_first_trip + interval '12 weeks'
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the 2022 drop-off positions (sanity check).
df_positions.describe()

In [None]:
# Number of trips ending at each distinct (longitude, latitude) point.
df_positions.group_by(["longitude", "latitude"]).len()

In [None]:
# Drop-off positions of driver trips for CEE drivers of cohort 't4_23'
# (luis.cee_drivers_v4), using the same trip selection as the 2022 query:
# driver carpools with status 'ok' starting in the perimeter with aom = '246700488',
# restricted to the 12 weeks following date_first_cee.
df_positions_cee = pl.read_database(
    """
with trips as (
	select
		i."uuid",
		c.trip_id,
		max(c.datetime) as datetime,
		max(end_position::geometry) as end_position
	from carpool.carpools c 
	inner join carpool.identities i on c.identity_id = i._id
	left join geo.perimeters p on c.start_geo_code = p.arr
	where p.aom = '246700488'
	and is_driver
	and status::text = 'ok'
	group by 1,2
)
select
	cv."uuid",
	trip_id,
	t.datetime,
	st_x(end_position::geometry) as longitude,
	st_y(end_position::geometry) as latitude
from luis.cee_drivers_v4 cv
inner join trips t on cv."uuid" = t."uuid"
where t.datetime between cv.date_first_cee and cv.date_first_cee + interval '12 weeks'
and cohorte='t4_23'
""",
    connection=db_engine,
)

In [None]:
df_positions_cee.describe()

In [None]:
df_positions_cee.group_by(["longitude", "latitude"]).len()

In [None]:
# Density map of trip end points for the 2022 cohort.
# Title fix: "Métropole" takes no circumflex in French.
px.density_mapbox(
    df_positions,
    lat="latitude",
    lon="longitude",
    radius=4,
    opacity=0.7,
    center=dict(lat=48.44, lon=7.75),
    zoom=7.6,
    mapbox_style="open-street-map",
    height=800,
    width=900,
    title="Points de chute des trajets au départ de l'AOM Strasbourg Métropole - Cohorte 2022",
)

In [None]:
# Density map of trip end points for the CEE cohort.
# Title fix: df_positions_cee is built with `cohorte='t4_23'`, so the cohort
# shown is T4 2023 (the title previously said T3 2023). Also fixes the
# spelling of "Métropole".
px.density_mapbox(
    df_positions_cee,
    lat="latitude",
    lon="longitude",
    radius=4,
    opacity=0.7,
    center=dict(lat=48.44, lon=7.75),
    zoom=7.6,
    mapbox_style="open-street-map",
    height=800,
    width=900,
    title="Points de chute des trajets au départ de l'AOM Strasbourg Métropole - Cohorte CEE T4 2023",
)

# Territoires

In [None]:
df_insee = pl.read_excel(
    "insee_zones_urbaines.xlsx",
    sheet_name="Composition_communale",
    read_options={"header_row": 0},
    engine="calamine",
)
df_insee.describe()

## Requêtes

### 2022

In [None]:
df_drivers_territories = pl.read_database(
    """
select
	cdv."uuid"::text,
	array_agg(c.start_geo_code) as start_geo_codes,
	array_agg(c.end_geo_code) as end_geo_codes
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i."_id"
inner join luis.cohorte_2022_v2 cdv on
	cdv."uuid" = i."uuid"
where
	c.datetime between cdv.date_first_trip and cdv.date_first_trip + interval '11 weeks'
	and is_driver
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
    and cdv.date_first_trip <= '2022-09-30'
group by
	1
    """,
    connection=db_engine,
)

In [None]:
df_drivers_territories.describe()

### CEE

In [None]:
df_drivers_territories_cee = pl.read_database(
    """
select
	cdv."uuid"::text,
	max(cdv.cohorte) as cohorte,
	array_agg(c.start_geo_code) as start_geo_codes,
	array_agg(c.end_geo_code) as end_geo_codes,
	max(cdv.max_aom_name) as aom_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i."_id"
inner join luis.cee_drivers_v4 cdv on
	cdv."uuid" = i."uuid"
where
	cdv.cohorte is not null
	and c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '11 weeks'
	and is_driver
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
    """,
    connection=db_engine,
)

In [None]:
df_drivers_territories_cee.describe()

## Visualisation

In [None]:
def preprocess_territories_df(df: pl.DataFrame) -> pl.DataFrame:
    """Compute the share of drivers per commune type (``TYPE_COMMUNE_UU``).

    Each driver is assigned the geo code that appears most often among the
    start and end codes of their trips. That code is then joined to the
    notebook-level ``df_insee`` frame (on ``CODGEO``) to obtain the commune
    type, and drivers are counted per type (and per cohort when available).

    Args:
        df: One row per driver with the columns ``uuid``, ``start_geo_codes``
            and ``end_geo_codes`` (list columns). An optional ``cohorte``
            column switches the aggregation to a per-cohort breakdown.

    Returns:
        A frame with ``TYPE_COMMUNE_UU`` (and ``cohorte`` when present in
        ``df``), ``num_drivers``, ``total_drivers``, ``share`` (percentage)
        and ``share_fmt`` (percentage rounded to one decimal, with a ``%``).

    Notes:
        Ties between equally frequent geo codes are broken deterministically
        (smallest code first, null codes last) so that re-running the notebook
        yields the same figures. Drivers whose dominant code has no match in
        ``df_insee`` are dropped by the inner join.
    """
    has_cohorte = "cohorte" in df.columns

    agg_exprs = [pl.len()]
    group_by_exprs = ["TYPE_COMMUNE_UU"]
    if has_cohorte:
        # Carry the cohort through the per-(driver, geo code) aggregation.
        agg_exprs.append(
            pl.col("cohorte").max(),
        )
        group_by_exprs.insert(0, "cohorte")

    df_agg = (
        df.with_columns(
            pl.col("start_geo_codes")
            .list.concat(pl.col("end_geo_codes"))
            .alias("all_geo_codes")
        )
        .explode("all_geo_codes")
        .group_by(["uuid", "all_geo_codes"])
        .agg(agg_exprs)
        # Deterministic replacement for the former unseeded random rank:
        # order each driver's codes by frequency (desc) then by code, and keep
        # the first row per driver.
        .sort(
            ["uuid", "len", "all_geo_codes"],
            descending=[False, True, False],
            nulls_last=True,
        )
        .unique(subset="uuid", keep="first", maintain_order=True)
        .join(df_insee, left_on="all_geo_codes", right_on="CODGEO")
        .group_by(group_by_exprs)
        .agg(pl.len().alias("num_drivers"))
    )

    # Denominator of the share: per cohort when available, global otherwise.
    if has_cohorte:
        df_agg = df_agg.with_columns(
            pl.col("num_drivers").sum().over(["cohorte"]).alias("total_drivers")
        )
    else:
        df_agg = df_agg.with_columns(pl.col("num_drivers").sum().alias("total_drivers"))

    df_agg = df_agg.with_columns(
        (100 * pl.col("num_drivers") / pl.col("total_drivers")).alias("share")
    ).with_columns(pl.format("{}%", pl.col("share").round(1)).alias("share_fmt"))

    return df_agg

In [None]:
df_drivers_territories_agg = preprocess_territories_df(df_drivers_territories)
df_drivers_territories_agg

In [None]:
df_drivers_territories_cee_agg = preprocess_territories_df(df_drivers_territories_cee)
df_drivers_territories_cee_agg

In [None]:
# Bar chart of the territorial split (share of drivers per commune type) for
# every cohort: the 2022 aggregate is tagged with a literal "2022" cohort and
# stacked on top of the CEE aggregate ("diagonal" concat tolerates differing
# column sets/order).
fig_territories_by_cohorte = px.bar(
    pl.concat(
        [
            df_drivers_territories_agg.with_columns(pl.lit("2022").alias("cohorte")),
            df_drivers_territories_cee_agg,
        ],
        how="diagonal",
    # Sort on the reversed label (e.g. "t1_23" -> "32_1t") so that the year
    # suffix is compared before the quarter.
    ).sort(pl.col("cohorte").str.reverse()),
    x="cohorte",
    y="share",
    text="share_fmt",
    color="TYPE_COMMUNE_UU",
    template="simple_white",
    color_discrete_map={
        "Hors unité urbaine": "rgba(163, 203, 56,1.0)",
        "Unité urbaine": "rgba(52, 73, 94,1.0)",
    },
    title="Répartition territoriale des conducteurs<br><sub>Historique de trajets de 3 mois</sub>",
    labels={"share": "% des conducteurs", "TYPE_COMMUNE_UU": "", "cohorte": "Cohorte"},
)
fig_territories_by_cohorte.update_traces(textposition="outside")
# Y range goes beyond 100 to leave headroom for the labels drawn outside bars.
fig_territories_by_cohorte.update_yaxes(range=[0, 130])

# Export to HTML and SVG, then display.
fig_territories_by_cohorte.write_html(
    OUTPUT_PATH / "fig_territoires_cohortes_multi.html"
)
fig_territories_by_cohorte.write_image(
    OUTPUT_PATH / "fig_territoires_cohortes_multi.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_territories_by_cohorte.show()

# Analyse des conducteurs fidélisés


## Requêtes

### Stats

In [None]:
df_trips_cee_loyal = pl.read_database(
    """
with trips as (
  select
    cdv.uuid,
    date_trunc('week',
    c.datetime) as semaine,
    c.trip_id,
    cdv.max_aom_name,
    cdv.cee_operator_name,
    cdv.cohorte
  from
    carpool.carpools c
    inner join carpool.identities i on
    c.identity_id = i."_id"
    inner join luis.cee_drivers_v4 cdv on cdv."uuid" = i."uuid"
  where
    c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '19 weeks'
    and is_driver
    and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
    and cdv.cohorte is not null
)
select
  uuid::text,
  count(distinct trip_id) as num_trips,
  count(distinct semaine) as num_semaines,
  max(max_aom_name) as aom_name,
  max(cee_operator_name) as cee_operator_name,
  max(cohorte) as cohorte
from
  trips tr
group by
1
having count(distinct semaine)>=16
""",
    connection=db_engine,
)

In [None]:
df_trips_cee_loyal.describe()

### Distance économisée

In [None]:
df_distance_eco_by_loyal_drivers_cee = pl.read_database(
    """
with drivers_trips as (
select
	i.uuid,
	trip_id,
    max(date_trunc('week',
    c.datetime)) as semaine,
	max(cdv.cohorte) as cohorte,
	max(cdv.date_first_cee) as date_first_cee,
	max(cdv.cee_operator_name) as cee_operator_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 cdv on
	i."uuid" = cdv."uuid"
where
	c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '23 weeks'
    and cdv.date_first_cee < now() - interval '23 weeks'
	and cdv.cohorte is not null
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
	and is_driver
group by
		1,
	2
),
trips_distance as (
select 
	c.trip_id,
	sum(coalesce (distance,(c.meta->>'calc_distance')::int) * seats) as distance_passagers
from
	carpool.carpools c
where
	c.trip_id in (
	select
		trip_id
	from
		drivers_trips)
group by
	1)
select 
	a.uuid,
	max(a.cohorte) as cohorte,
    count(distinct semaine) as num_semaines,
	max(a.date_first_cee) as date_first_cee,
	max(a.cee_operator_name) as cee_operator_name,
	sum(b.distance_passagers)::float as distance
from
	drivers_trips a
left join trips_distance b on
	a.trip_id = b.trip_id
group by
	a.uuid
having count(distinct semaine)>=16
""",
    connection=db_engine,
)

In [None]:
df_distance_eco_by_loyal_drivers_cee

In [None]:
df_distance_eco_by_loyal_drivers_cee.select((pl.col("distance") / 1000).mean())

### Trajets

In [None]:
# Trips of "loyal" CEE drivers, one row per (driver uuid, trip_id).
# - `drivers` CTE: drivers with validated driver trips on at least 16 distinct
#   weeks during the 23 weeks following their first CEE trip, restricted to
#   drivers whose first CEE trip is at least 23 weeks old.
# - Outer query: every validated driver trip of those drivers over the same
#   23-week window, with the passenger distance (distance * seats) per trip.
# The CTE window was previously 24 weeks, which did not match the 23-week
# window used for the trips and for the eligibility filter; both now use 23.
df_trips_by_loyal_drivers_cee = pl.read_database(
    """
with drivers as (
select
	i.uuid,
	max(cdv.cohorte) as cohorte,
	max(cdv.date_first_cee) as date_first_cee,
	max(cdv.cee_operator_name) as cee_operator_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join luis.cee_drivers_v4 cdv on
	i."uuid" = cdv."uuid"
where
	c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '23 weeks'
    and cdv.date_first_cee < now() - interval '23 weeks'
	and cdv.cohorte is not null
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
	and is_driver
group by
	1
having count(distinct date_trunc('week',c.datetime))>=16
)
select
	i.uuid::text,
	trip_id,
    max(date_trunc('week',
    c.datetime)) as semaine,
	max(cdv.cohorte) as cohorte,
	max(cdv.date_first_cee) as date_first_cee,
	max(cdv.cee_operator_name) as cee_operator_name,
    sum(coalesce (distance,(c.meta->>'calc_distance')::int) * seats) as distance_passagers
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i._id
inner join drivers cdv on
	i."uuid" = cdv."uuid"
where
	c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '23 weeks'
    and cdv.date_first_cee < now() - interval '23 weeks'
	and cdv.cohorte is not null
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
	and is_driver
group by
	1,
	2

""",
    connection=db_engine,
)

## Nombre moyen de trajets effectués par semaine

In [None]:
df_trips_by_loyal_drivers_cee.describe()

In [None]:
# Average number of distinct trips per driver for each "n-th active week":
# 1) count the distinct trips per (driver, calendar week),
# 2) number each driver's weeks by ranking them within the driver,
# 3) average the weekly trip counts across drivers for every week number.
df_mean_trips_by_week_loyal_drivers_cee = (
    df_trips_by_loyal_drivers_cee.group_by("uuid", "semaine")
    .agg(num_trips=pl.col("trip_id").n_unique())
    .with_columns(num_semaine=pl.col("semaine").rank().over("uuid"))
    .sort("uuid", "num_semaine")
    .group_by("num_semaine")
    .agg(pl.mean("num_trips"))
)
df_mean_trips_by_week_loyal_drivers_cee.sort("num_semaine")

In [None]:
# Bar chart of the average number of trips per week for loyal drivers.
fig_mean_trips_by_week_loyal_drivers = px.bar(
    df_mean_trips_by_week_loyal_drivers_cee,
    x="num_semaine",
    y="num_trips",
    text="num_trips",
    text_auto=".1f",
    labels={"num_semaine": "Semaine", "num_trips": "Nombre de trajets moyen effectués"},
    template="simple_white",
    # The subtitle tag was closed with "</br>" instead of "</sub>".
    title="Evolution de la moyenne du nombre de trajets effectués par semaine pour les conducteurs fidèles"
    "<br><sub>Historique de trajets de 6 mois</sub>",
)
fig_mean_trips_by_week_loyal_drivers.update_xaxes(dtick=1, title="Semaine n°")
fig_mean_trips_by_week_loyal_drivers.update_yaxes(
    showgrid=True, griddash="dashdot", gridwidth=1, gridcolor="gray"
)

# Export to HTML and SVG, then display.
fig_mean_trips_by_week_loyal_drivers.write_html(
    OUTPUT_PATH / "histo_moyenne_trajets_semaine_utilisateurs_fideles.html"
)
fig_mean_trips_by_week_loyal_drivers.write_image(
    OUTPUT_PATH / "histo_moyenne_trajets_semaine_utilisateurs_fideles.svg",
    format="svg",
    width=1280,
    height=720,
)
fig_mean_trips_by_week_loyal_drivers.show()

## Nombre de conducteurs fidèles par opérateur

In [None]:
fig_loyal_drivers_by_operator = px.bar(
    df_trips_cee_loyal.group_by(pl.col("cee_operator_name"))
    .len()
    .sort("cee_operator_name"),
    x="cee_operator_name",
    y="len",
    text="len",
    color="cee_operator_name",
    color_discrete_map=color_mapping,
    title="Distribution des conducteurs fidèles par opérateur",
    labels={"len": "Nombre de conducteurs", "cee_operator_name": "Opérateur"},
)
fig_loyal_drivers_by_operator.show()
fig_loyal_drivers_by_operator.write_html(
    OUTPUT_PATH / "histo_utilisateurs_fideles_par_operateurs.html"
)
fig_loyal_drivers_by_operator.write_image(
    OUTPUT_PATH / "histo_utilisateurs_fideles_par_operateurs.svg",
    format="svg",
    width=1280,
    height=720,
)

## Nombre de trajets


In [None]:
(
    df_trips_cee_loyal.group_by("cohorte")
    .agg(pl.col("num_trips").mean().round(2))
    .sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))
)

In [None]:
fig_num_trips_loyal_driver_operator = px.bar(
    df_trips_cee_loyal.group_by("cee_operator_name")
    .agg(pl.col("num_trips").mean().round(2))
    .sort(pl.col("cee_operator_name")),
    x="cee_operator_name",
    color="cee_operator_name",
    y="num_trips",
    text="num_trips",
    color_discrete_map=color_mapping,
    title="Nombre de trajets effectués par les conducteurs fidèles de chaque opérateur<br><sub>Historique de 6 mois</sub>",
    labels={"num_trips": "Nombre de trajets moyens", "cee_operator_name": "Opérateur"},
)
fig_num_trips_loyal_driver_operator.show()
fig_num_trips_loyal_driver_operator.write_html(
    OUTPUT_PATH / "histo_trajets_utilisateurs_fideles_par_operateurs.html"
)
fig_num_trips_loyal_driver_operator.write_image(
    OUTPUT_PATH / "histo_trajets_utilisateurs_fideles_par_operateurs.svg",
    format="svg",
    width=1280,
    height=720,
)

## Distance


In [None]:
df_distances_cee_loyal = pl.read_database(
    """
with trips as (
  select
    cdv.uuid,
    c.trip_id,
    max(date_trunc('week',
    c.datetime)) as semaine,
    max(c.distance) as distance,
    max(cdv.max_aom_name) as max_aom_name,
    max(cdv.cee_operator_name) as cee_operator_name,
    max(cdv.cohorte) as cohorte
  from
    carpool.carpools c
    inner join carpool.identities i on
    c.identity_id = i."_id"
    inner join luis.cee_drivers_v4 cdv on cdv."uuid" = i."uuid"
  where
    c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '23 weeks'
    and is_driver
    and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
    and cdv.cohorte is not null
  group by 1,2
)
select
  uuid::text,
  count(distinct trip_id) as num_trips,
  count(distinct semaine) as num_semaines,
  avg(distance)::float as distance,
  max(max_aom_name) as aom_name,
  max(cee_operator_name) as cee_operator_name,
  max(cohorte) as cohorte
from
  trips tr
group by
1
having count(distinct semaine)>=16
""",
    connection=db_engine,
)

In [None]:
df_distances_cee_loyal.describe()

In [None]:
(
    df_distances_cee_loyal.group_by("cohorte")
    .agg((pl.col("distance") / 1000).mean().round(2))
    .sort(pl.col("cohorte").str.split("_").list.reverse().list.join(""))
)

In [None]:
fig_distance_loyal_driver_operator = px.bar(
    (
        df_distances_cee_loyal.group_by("cee_operator_name")
        .agg((pl.col("distance") / 1000).mean().round(2))
        .sort(pl.col("cee_operator_name"))
    ),
    x="cee_operator_name",
    color="cee_operator_name",
    y="distance",
    text="distance",
    color_discrete_map=color_mapping,
    title="Distances moyennes effectuées par les conducteurs fidèles de chaque opérateur<br><sub> Historique de 6 mois.</sub>",
    labels={"distance": "Distance moyenne (km)", "cee_operator_name": "Opérateur"},
)
fig_distance_loyal_driver_operator.show()
fig_distance_loyal_driver_operator.write_html(
    OUTPUT_PATH / "histo_distance_utilisateurs_fideles_par_operateurs.html"
)
fig_distance_loyal_driver_operator.write_image(
    OUTPUT_PATH / "histo_distance_utilisateurs_fideles_par_operateurs.svg",
    format="svg",
    width=1280,
    height=720,
)

## Nombre de passagers moyen


In [None]:
# Seats booked per trip for trips involving a loyal CEE driver.
# - `loyal_drivers` CTE: drivers with validated driver trips on at least 16
#   distinct weeks within 23 weeks after their first CEE trip. The `is_driver`
#   and `status = 'ok'` filters were missing here, so passenger rows and
#   non-validated rows counted towards loyalty, unlike in the other
#   loyal-driver queries of this notebook.
# - Outer query: for every validated trip since 2023-01-01, sums the seats over
#   all rows of the trip and keeps the trips where at least one participant
#   matches a loyal driver.
# NOTE(review): the outer query is not limited to the 23-week window nor to
# rows where the loyal driver is the driver -- confirm this is intended.
df_passagers_cee_loyal = pl.read_database(
    """
with loyal_drivers as (
select
	cdv."uuid",
	max(cdv.date_first_cee) as date_first_cee,
    max(cdv.cohorte) as cohorte,
	max(cdv.cee_operator_name) as cee_operator_name
from
	carpool.carpools c
inner join carpool.identities i on
	c.identity_id = i."_id"
inner join luis.cee_drivers_v4 cdv on
	cdv."uuid" = i."uuid"
where 
	cdv.cohorte is not null
	and c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '23 weeks'
	and is_driver
	and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
	1
having
	count(distinct date_trunc('week', c.datetime))>= 16
)
select
  c.trip_id,
  sum(seats) as seats,
  max(ld.cohorte) as cohorte,
  max(ld.cee_operator_name) as cee_operator_name,
  max(ld.uuid::text) as ld_uuid
from
carpool.carpools c
  inner join carpool.identities i on
  c.identity_id = i."_id"
  left join loyal_drivers ld on ld."uuid" = i."uuid"
where
  status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
  and c.datetime >= '2023-01-01'
group by 1
having 
  max(ld.uuid::text) is not null
""",
    connection=db_engine,
)

In [None]:
df_passagers_cee_loyal

In [None]:
df_passagers_cee_loyal.describe()

In [None]:
# Distribution of the number of passengers per trip (trips with 0 seats are
# excluded), as a percentage of trips.
fig_passengers_loyal_drivers = px.bar(
    df_passagers_cee_loyal.filter(pl.col("seats") != 0)["seats"]
    .hist(bins=[1, 2, 3], include_breakpoint=True)
    .with_columns(
        # Turn the bin upper bounds into labels: "inf" -> "4+", "1.0" -> "1".
        # Raw string: "\." is an invalid escape sequence in a normal literal.
        pl.col("breakpoint").cast(str).str.replace("inf", "4+").str.replace(r"\.0", ""),
        (100 * pl.col("count") / pl.col("count").sum()).round(2).alias("share"),
    ),
    x="breakpoint",
    y="share",
    text="share",
    template="simple_white",
    labels={"breakpoint": "Nombre de passagers", "share": "% des trajets"},
    title="Distribution du nombre de passagers pour les trajets effectués par les conducteurs fidèles"
    "<br><sub>Historique de 6 mois</sub>",
)
fig_passengers_loyal_drivers.show()
fig_passengers_loyal_drivers.write_html(
    OUTPUT_PATH / "histo_passagers_utilisateurs_fideles.html"
)
fig_passengers_loyal_drivers.write_image(
    OUTPUT_PATH / "histo_passagers_utilisateurs_fideles.svg",
    format="svg",
    width=1280,
    height=720,
)

## Trajets pre-CEE


In [None]:
df_trips_before_cee_loyal = pl.read_database(
    """
with loyal_drivers as (
select
cdv."uuid",
max(cdv.date_first_cee) as date_first_cee,
max(cdv.cee_operator_name) as cee_operator_name,
max(cdv.cohorte) as cohorte,
max(cdv.num_trips_before_cee) as num_trips_before_cee,
max(cdv.cee_aom_name) as cee_aom_name
from
carpool.carpools c
inner join carpool.identities i on
c.identity_id = i."_id"
inner join luis.cee_drivers_v4 cdv on
cdv."uuid" = i."uuid"
where
cdv.cohorte is not null
and c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '19 weeks'
and is_driver
and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
1
having
count(distinct date_trunc('week', c.datetime))>= 16
and max(cdv.cohorte) is not null)
select * from loyal_drivers
""",
    connection=db_engine,
)

In [None]:
df_trips_before_cee_loyal

In [None]:
df_trips_before_cee_loyal.group_by(pl.col("num_trips_before_cee") > 0).agg(
    pl.len() / len(df_trips_before_cee_loyal)
)

In [None]:
# Share of loyal drivers per number of pre-CEE trips (tabular preview).
df_trips_before_cee_loyal["num_trips_before_cee"].hist(bins=[0, 1, 2, 3]).with_columns(
    (pl.col("count") / pl.col("count").sum()).round(2).alias("share"),
    # Raw string: "\." is an invalid escape sequence in a normal literal.
    pl.col("breakpoint").cast(str).str.replace("inf", "4+").str.replace(r"\.0", ""),
)

In [None]:
# Distribution of the number of pre-CEE trips among loyal drivers, in percent.
fig_trips_pre_cee_loyal_drivers = px.bar(
    df_trips_before_cee_loyal["num_trips_before_cee"]
    .hist(bins=[0, 1, 2, 3])
    .with_columns(
        (100 * pl.col("count") / pl.col("count").sum()).round(2).alias("share"),
        # Turn the bin upper bounds into labels: "inf" -> "4+", "1.0" -> "1".
        # Raw string: "\." is an invalid escape sequence in a normal literal.
        pl.col("breakpoint").cast(str).str.replace("inf", "4+").str.replace(r"\.0", ""),
    ),
    x="breakpoint",
    y="share",
    text="share",
    template="simple_white",
    labels={"breakpoint": "Nombre de trajets pre-CEE", "share": "% des conducteurs"},
    title="Distribution du nombre de trajets pre CEE effectués par les conducteurs fidèles",
)
fig_trips_pre_cee_loyal_drivers.show()
fig_trips_pre_cee_loyal_drivers.write_html(
    OUTPUT_PATH / "histo_trajets_pre_cee_utilisateurs_fideles.html"
)
fig_trips_pre_cee_loyal_drivers.write_image(
    OUTPUT_PATH / "histo_trajets_pre_cee_utilisateurs_fideles.svg",
    format="svg",
    width=1280,
    height=720,
)

## AOM


In [None]:
df_aom_loyal = pl.read_database(
    """
select
cdv."uuid",
max(cdv.cohorte) as cohorte,
max(cdv.max_aom_name) as aom_name
from
  carpool.carpools c
  inner join carpool.identities i on
  c.identity_id = i."_id"
  inner join luis.cee_drivers_v4 cdv on
  cdv."uuid" = i."uuid"
where
  cdv.cohorte is not null
  and c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '19 weeks'
  and is_driver
  and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
1
having
  count(distinct date_trunc('week', c.datetime))>= 16
  and max(cdv.cohorte) is not null
""",
    connection=db_engine,
)

In [None]:
# Classify each loyal driver's AOM ("Avec incitation", "Sans incitation" or
# "Autre"), then count the drivers per class and express it as a percentage.
# The first matching `when` wins, so an AOM listed in both reference lists is
# labelled "Avec incitation".
df_aom_loyal_agg = (
    df_aom_loyal.with_columns(
        aom_type=pl.when(pl.col("aom_name").is_in(aom_with_incentives))
        .then(pl.lit("Avec incitation"))
        .when(pl.col("aom_name").is_in(aom_without_incentives))
        .then(pl.lit("Sans incitation"))
        .otherwise(pl.lit("Autre"))
    )
    .group_by("aom_type")
    .agg(pl.len())
    .with_columns(share=(100 * pl.col("len") / pl.col("len").sum()).round(2))
)
df_aom_loyal_agg

In [None]:
fig_aom_loyal_drivers = px.bar(
    df_aom_loyal_agg.sort("aom_type"),
    orientation="h",
    y="aom_type",
    color="aom_type",
    x="share",
    text="share",
    hover_data="len",
    labels={
        "aom_type": "Type d'AOM",
        "len": "Nombre de conducteurs",
        "share": "% des conducteurs",
    },
    title="Dans quel type d'AOM se situent les conducteurs fidèles ?",
    color_discrete_map={
        "Avec incitation": "rgba(39, 174, 96,1.0)",
        "Sans incitation": "rgba(47, 54, 64,1.0)",
        "Autre": "gray",
    },
)
fig_aom_loyal_drivers.show()
fig_aom_loyal_drivers.write_html(OUTPUT_PATH / "histo_aom_utilisateurs_fideles.html")
fig_aom_loyal_drivers.write_image(
    OUTPUT_PATH / "histo_aom_utilisateurs_fideles.svg",
    format="svg",
    width=1280,
    height=720,
)

## Rural vs Urbain


In [None]:
df_insee = pl.read_excel(
    "insee_zones_urbaines.xlsx",
    sheet_name="Composition_communale",
    read_options={"header_row": 0},
    engine="calamine",
)
df_insee.describe()

In [None]:
df_insee.filter(pl.col("TYPE_COMMUNE_UU") == "Unité urbaine").group_by("LIBUU2020").agg(
    pl.col("CODGEO").n_unique().alias("num_com")
).sort("num_com", descending=True)

In [None]:
df_communes = pl.read_csv(
    Path() / "v_commune_2024.csv",
    schema_overrides={"COM": pl.String, "DEP": pl.String, "REG": pl.String},
)
df_communes.describe()

In [None]:
df_trips_geo_loyal = pl.read_database(
    """
select
    cdv."uuid"::text,
    max(cdv.cohorte) as cohorte,
    array_agg(c.start_geo_code) as start_geo_codes,
    array_agg(c.end_geo_code) as end_geo_codes,
    max(cdv.max_aom_name) as aom_name
from
  carpool.carpools c
  inner join carpool.identities i on
  c.identity_id = i."_id"
  inner join luis.cee_drivers_v4 cdv on
  cdv."uuid" = i."uuid"
where
  cdv.cohorte is not null
  and c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '23 weeks'
  and is_driver
  and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
group by
1
having
  count(distinct date_trunc('week', c.datetime))>= 16
  and max(cdv.cohorte) is not null
""",
    connection=db_engine,
)

In [None]:
df_trips_geo_loyal.describe()

In [None]:
# Dominant geo code of each loyal driver, enriched with the INSEE attributes.
# For every driver, start and end geo codes are pooled and counted; the most
# frequent code is kept (rank == 1) and joined to df_insee on CODGEO.
# Ties are now broken deterministically (smallest code first, nulls last)
# instead of through an unseeded random rank, so re-runs give the same result.
df_trips_geo_loyal_agg = (
    df_trips_geo_loyal.with_columns(
        pl.col("start_geo_codes")
        .list.concat(pl.col("end_geo_codes"))
        .alias("all_geo_codes")
    )
    .explode("all_geo_codes")
    .group_by(["uuid", "all_geo_codes"])
    .agg(pl.len())
    # Total order per driver: frequency descending, then geo code ascending.
    .sort(
        ["uuid", "len", "all_geo_codes"],
        descending=[False, True, False],
        nulls_last=True,
    )
    # Row number within each driver, following the sort order above.
    .with_columns((pl.int_range(pl.len()).over("uuid") + 1).alias("rank"))
    .filter(pl.col("rank") == 1)
    .join(df_insee, left_on="all_geo_codes", right_on="CODGEO")
)
df_trips_geo_loyal_agg

In [None]:
df_trips_geo_loyal_agg["TYPE_COMMUNE_UU"].unique()

In [None]:
fig_territories = px.pie(
    df_trips_geo_loyal_agg.group_by("TYPE_COMMUNE_UU").len(),
    names="TYPE_COMMUNE_UU",
    color="TYPE_COMMUNE_UU",
    values="len",
    title="Dans quel type de territoire évoluent les conducteurs fidèles ?<br><sub>Historique de trajets de 6 mois.</sub>",
    color_discrete_map={
        "Hors unité urbaine": "rgba(163, 203, 56,1.0)",
        "Unité urbaine": "rgba(52, 73, 94,1.0)",
    },
)
fig_territories.show()
fig_territories.write_html(OUTPUT_PATH / "fig_territoires_utilisateurs_fideles.html")
fig_territories.write_image(
    OUTPUT_PATH / "fig_territoires_utilisateurs_fideles.svg",
    format="svg",
    width=1280,
    height=720,
)

In [None]:
df_trips_geo_loyal

### Analyse du TOP 30 des Unités Urbaines

In [None]:
df_trips_geo_loyal.explode(["start_geo_codes", "end_geo_codes"])

In [None]:
# Origin/destination matrix of loyal drivers' trips at urban-unit level.
# Each (start, end) geo code pair is labelled with its INSEE urban unit
# (LIBUU2020); when no urban unit matches, the commune name is used, with
# numbered districts ("Paris 12", ...) collapsed onto the city name. Codes
# matching neither table are labelled "Etranger".
# Fixes:
# - the "Bordeaux" label was misspelled "Bordereaux";
# - the percentage had no alias and overwrote `num_trips`; it is now stored in
#   a separate `share` column, as in the commune-level aggregate.
df_trips_geo_loyal_uu_agg = (
    df_trips_geo_loyal.explode(["start_geo_codes", "end_geo_codes"])
    .join(
        df_insee.select(
            pl.col("CODGEO"),
            pl.col("LIBUU2020").alias("uu_name_start"),
            pl.col("LIBGEO"),
        ),
        left_on="start_geo_codes",
        right_on="CODGEO",
        how="left",
    )
    .join(
        df_insee.select(pl.col("CODGEO"), pl.col("LIBUU2020").alias("uu_name_end")),
        left_on="end_geo_codes",
        right_on="CODGEO",
        how="left",
    )
    .join(
        df_communes.select(pl.col("COM"), pl.col("LIBELLE").alias("city_name_start")),
        left_on="start_geo_codes",
        right_on="COM",
        how="left",
    )
    .join(
        df_communes.select(pl.col("COM"), pl.col("LIBELLE").alias("city_name_end")),
        left_on="end_geo_codes",
        right_on="COM",
        how="left",
    )
    .with_columns(
        # Start label.
        pl.when(
            pl.col("uu_name_start").is_null()
            & pl.col("city_name_start").str.contains("Paris [0-9]+")
        )
        .then(pl.lit("Paris"))
        .when(
            pl.col("uu_name_start").is_null()
            & pl.col("city_name_start").str.contains("Lyon [0-9]+")
        )
        .then(pl.lit("Lyon"))
        .when(
            pl.col("uu_name_start").is_null()
            & pl.col("city_name_start").str.contains("Marseille [0-9]+")
        )
        .then(pl.lit("Marseille"))
        .when(
            pl.col("uu_name_start").is_null()
            & pl.col("city_name_start").str.contains("Bordeaux [0-9]+")
        )
        .then(pl.lit("Bordeaux"))
        .otherwise(pl.coalesce(pl.col("uu_name_start"), pl.col("city_name_start")))
        .alias("uu_name_start")
        .fill_null("Etranger"),
        # End label (same rules).
        pl.when(
            pl.col("uu_name_end").is_null()
            & pl.col("city_name_end").str.contains("Paris [0-9]+")
        )
        .then(pl.lit("Paris"))
        .when(
            pl.col("uu_name_end").is_null()
            & pl.col("city_name_end").str.contains("Lyon [0-9]+")
        )
        .then(pl.lit("Lyon"))
        .when(
            pl.col("uu_name_end").is_null()
            & pl.col("city_name_end").str.contains("Marseille [0-9]+")
        )
        .then(pl.lit("Marseille"))
        .when(
            pl.col("uu_name_end").is_null()
            & pl.col("city_name_end").str.contains("Bordeaux [0-9]+")
        )
        .then(pl.lit("Bordeaux"))
        .otherwise(pl.coalesce(pl.col("uu_name_end"), pl.col("city_name_end")))
        .alias("uu_name_end")
        .fill_null("Etranger"),
    )
    .group_by(["uu_name_start", "uu_name_end"])
    .agg(pl.len().alias("num_trips"))
    .with_columns(
        (100 * pl.col("num_trips") / pl.col("num_trips").sum()).round(2).alias("share")
    )
    .sort("num_trips", descending=True)
)
df_trips_geo_loyal_uu_agg

In [None]:
df_trips_geo_loyal_uu_agg.write_clipboard()

### Analyse du TOP 30 par communes

In [None]:
# Origin/destination matrix of loyal drivers' trips at commune level.
# Commune names come from df_communes; numbered districts ("Paris 12", ...)
# are collapsed onto the city name and unmatched codes become "Etranger".
# Fix: the "Bordeaux" label was misspelled "Bordereaux".
df_trips_geo_loyal_agg_com = (
    df_trips_geo_loyal.explode(["start_geo_codes", "end_geo_codes"])
    .join(
        df_insee.select(
            pl.col("CODGEO"),
            pl.col("LIBUU2020").alias("uu_name_start"),
            pl.col("LIBGEO"),
        ),
        left_on="start_geo_codes",
        right_on="CODGEO",
        how="left",
    )
    .join(
        df_insee.select(pl.col("CODGEO"), pl.col("LIBUU2020").alias("uu_name_end")),
        left_on="end_geo_codes",
        right_on="CODGEO",
        how="left",
    )
    .join(
        df_communes.select(
            pl.col("COM").alias("code_commune_start"),
            pl.col("LIBELLE").alias("city_name_start"),
        ),
        left_on="start_geo_codes",
        right_on="code_commune_start",
        how="left",
    )
    .join(
        df_communes.select(
            pl.col("COM").alias("code_commune_end"),
            pl.col("LIBELLE").alias("city_name_end"),
        ),
        left_on="end_geo_codes",
        right_on="code_commune_end",
        how="left",
    )
    .with_columns(
        # Start label.
        pl.when(pl.col("city_name_start").str.contains("Paris [0-9]+"))
        .then(pl.lit("Paris"))
        .when(pl.col("city_name_start").str.contains("Lyon [0-9]+"))
        .then(pl.lit("Lyon"))
        .when(pl.col("city_name_start").str.contains("Marseille [0-9]+"))
        .then(pl.lit("Marseille"))
        .when(pl.col("city_name_start").str.contains("Bordeaux [0-9]+"))
        .then(pl.lit("Bordeaux"))
        .otherwise("city_name_start")
        .alias("city_name_start")
        .fill_null("Etranger"),
        # End label (same rules).
        pl.when(pl.col("city_name_end").str.contains("Paris [0-9]+"))
        .then(pl.lit("Paris"))
        .when(pl.col("city_name_end").str.contains("Lyon [0-9]+"))
        .then(pl.lit("Lyon"))
        .when(pl.col("city_name_end").str.contains("Marseille [0-9]+"))
        .then(pl.lit("Marseille"))
        .when(pl.col("city_name_end").str.contains("Bordeaux [0-9]+"))
        .then(pl.lit("Bordeaux"))
        .otherwise("city_name_end")
        .alias("city_name_end")
        .fill_null("Etranger"),
    )
    .group_by(["city_name_start", "city_name_end"])
    .agg(
        pl.len().alias("num_trips"),
        pl.col("start_geo_codes").max(),
        pl.col("end_geo_codes").max(),
    )
    .with_columns(
        (100 * pl.col("num_trips") / pl.col("num_trips").sum()).round(2).alias("share")
    )
    .sort("num_trips", descending=True)
)
df_trips_geo_loyal_agg_com

In [None]:
df_trips_geo_loyal_agg_com.write_clipboard()

## Nombre de trajets effectués par rapport aux autres


In [None]:
df_trips_loyal = pl.read_database(
    """
select
    cdv."uuid"::text,
    max(cdv.cohorte) as cohorte,
    max(cdv.max_aom_name) as aom_name,
    count(distinct date_trunc('week', c.datetime))>= 16 as is_loyal,
    count(distinct trip_id) as num_trips
from
  carpool.carpools c
  inner join carpool.identities i on
  c.identity_id = i."_id"
  inner join luis.cee_drivers_v4 cdv on
  cdv."uuid" = i."uuid"
where
  cdv.cohorte is not null
  and c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '19 weeks'
  and is_driver
  and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
  and cdv.cohorte is not null
group by
1
""",
    connection=db_engine,
)

In [None]:
df_trips_loyal.describe()

In [None]:
df_trips_loyal["is_loyal"].value_counts()

In [None]:
# Pie chart: share of all trips made by loyal vs. non-loyal drivers.
fig_share_trips_loyal_drivers = px.pie(
    df_trips_loyal.group_by("is_loyal")
    .agg(pl.col("num_trips").sum() / df_trips_loyal["num_trips"].sum())
    # group_by() gives no ordering guarantee, while `names` below is
    # positional: sort on the boolean (False, then True) so the labels always
    # match their slice.
    .sort("is_loyal"),
    color="is_loyal",
    names=["Conducteur non fidèle", "Conducteur fidèle"],
    color_discrete_map={
        True: "rgba(16, 172, 132,1.0)",
        False: "rgba(34, 47, 62,1.0)",
    },
    values="num_trips",
    # Subtitle typos fixed ("coducteurs", "pourant").
    # NOTE(review): the 4% / 24% figures are hard-coded -- re-check them
    # whenever the underlying data changes.
    title="Part des trajets réalisés par les conducteurs fidèles par rapport aux autres conducteurs<br>"
    "<sub>Les conducteurs fidèles représentent 4% de l'ensemble des conducteurs et pourtant ils font presque 24% des trajets.</sub>",
)
fig_share_trips_loyal_drivers.update_traces({"pull": [0.1, 0]})
fig_share_trips_loyal_drivers.show()
fig_share_trips_loyal_drivers.write_html(
    OUTPUT_PATH / "fig_part_trajets_utilisateurs_fideles.html"
)
fig_share_trips_loyal_drivers.write_image(
    OUTPUT_PATH / "fig_part_trajets_utilisateurs_fideles.svg",
    format="svg",
    width=1280,
    height=720,
)

# Analyse des équipages


## Nombre de passagers différents

In [None]:
# For every driver found in luis.cohorte_2022_v2, collect the distinct users
# they carried during the 12 weeks following their first trip.
# `unnest(e.users)` yields one row per user of a crew; rows where the user is
# the driver themself are dropped so only other users remain.
df_passengers = pl.read_database(
    """
with "data" as (select 
	driver_uuid,
	"user",
    datetime
from luis.equipages e, unnest(e.users) as "user"
where driver_uuid != "user")
select 
	driver_uuid,
	array_agg(distinct "user") as distinct_users_transported
from "data" d
inner join luis.cohorte_2022_v2 cv on d.driver_uuid = cv."uuid"::text
WHERE d.datetime BETWEEN cv.date_first_trip AND cv.date_first_trip + INTERVAL '12 weeks'
group by 1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-driver passenger lists (2022 cohort table).
df_passengers.describe()

In [None]:
# Single-row frame: mean number of distinct users carried per driver.
df_passengers_agg = df_passengers.select(
    pl.col("distinct_users_transported")
    .list.n_unique()
    .mean()
    .alias("num_distinct_users_transported")
)

In [None]:
# Same extraction as `df_passengers`, but for drivers listed in
# luis.cee_drivers_v4 with a non-null cohort: distinct users carried during the
# 12 weeks following `date_first_cee`, plus the driver's cohort label.
df_passengers_cee = pl.read_database(
    """
with "data" as (select 
	driver_uuid,
	"user",
    datetime
from luis.equipages e, unnest(e.users) as "user"
where driver_uuid != "user")
select 
	driver_uuid,
	array_agg(distinct "user") as distinct_users_transported,
    max(cv.cohorte) as cohorte
from "data" d
inner join luis.cee_drivers_v4 cv on d.driver_uuid = cv."uuid"::text
where cv.cohorte is not null
and d.datetime BETWEEN cv.date_first_cee AND cv.date_first_cee + INTERVAL '12 weeks'
group by 1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-driver passenger lists for the CEE cohorts.
df_passengers_cee.describe()

In [None]:
# Mean number of distinct users carried per driver, one row per CEE cohort.
# Sorting on the reversed label ("t1_23" -> "32_1t") orders the cohorts by
# year first, then by quarter.
mean_distinct_users_transported = (
    pl.col("distinct_users_transported")
    .list.n_unique()
    .mean()
    .alias("num_distinct_users_transported")
)
df_passengers_cee_agg = (
    df_passengers_cee.group_by("cohorte")
    .agg(mean_distinct_users_transported)
    .sort(pl.col("cohorte").str.reverse())
)

In [None]:
# Distinct users carried by "loyal" CEE drivers.
# - `trips`: one row per (driver, trip) with status 'ok', made as a driver
#   within 19 weeks of `date_first_cee`, tagged with its calendar week.
# - `drivers`: keeps drivers whose trips span at least 16 distinct weeks.
# - `data`: one row per (driver, other user) from luis.equipages.
# The final select collects, for each loyal driver, the distinct users carried
# during the 12 weeks following `date_first_cee`.
df_passengers_loyal_drivers = pl.read_database(
    """
with trips as (
  select
    cdv.uuid,
    c.trip_id,
    max(date_trunc('week',
    c.datetime)) as semaine,
    min(cdv.date_first_cee) as date_first_cee
  from
    carpool.carpools c
    inner join carpool.identities i on
    c.identity_id = i."_id"
    inner join luis.cee_drivers_v4 cdv on cdv."uuid" = i."uuid"
  where
    c.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '19 weeks'
    and is_driver
    and status = cast('ok' as covoiturage_production.carpool.carpool_status_enum)
    and cdv.cohorte is not null
  group by 1,2
),
drivers as (
select
  uuid::text,
  min(date_first_cee) as date_first_cee
from
  trips tr
group by
1
having count(distinct semaine)>=16),
"data" as (select 
	driver_uuid,
	"user",
    datetime
from luis.equipages e, unnest(e.users) as "user"
where driver_uuid != "user")
select 
	driver_uuid,
	array_agg(distinct "user") as distinct_users_transported
from "data" d
inner join drivers cv on d.driver_uuid = cv."uuid"::text
where d.datetime BETWEEN cv.date_first_cee AND cv.date_first_cee + INTERVAL '12 weeks'
group by 1
""",
    connection=db_engine,
)

In [None]:
# Summary statistics of the per-driver passenger lists for loyal drivers.
df_passengers_loyal_drivers.describe()

In [None]:
# Single-row frame: mean number of distinct users carried per loyal driver.
df_passengers_loyal_drivers_agg = df_passengers_loyal_drivers.select(
    pl.col("distinct_users_transported")
    .list.n_unique()
    .mean()
    .alias("num_distinct_users_transported")
)

In [None]:
# Stack the three averages (2022 cohort, CEE cohorts, loyal drivers) into one
# frame. The single-row frames carry no `cohorte` column, so one is added
# before the diagonal concat.
df_distinct_transported_users = pl.concat(
    [
        df_passengers_agg.with_columns(cohorte=pl.lit("2022")),
        df_passengers_cee_agg,
        df_passengers_loyal_drivers_agg.with_columns(
            cohorte=pl.lit("Conducteurs fidèles")
        ),
    ],
    how="diagonal",
)
df_distinct_transported_users = df_distinct_transported_users.with_columns(
    pl.col("num_distinct_users_transported").round(1)
)

# Cohort palette extended with a dedicated colour for the loyal-driver bar.
distinct_users_colors = {
    **cohortes_color_mapping,
    "Conducteurs fidèles": "rgba(39, 174, 96,1.0)",
}

fig_distinct_transported_users = px.bar(
    df_distinct_transported_users,
    x="cohorte",
    y="num_distinct_users_transported",
    text="num_distinct_users_transported",
    color="cohorte",
    template="simple_white",
    title="En moyenne, combien d'utilisateurs différents un conducteur transporte t'il ?",
    labels={
        "num_distinct_users_transported": "Nombre moyen d'utilisateurs différents transportés",
        "cohorte": "Cohorte",
    },
    color_discrete_map=distinct_users_colors,
)
fig_distinct_transported_users.update_yaxes(
    title="Nombre d'utilisateurs transportés <br>(moyenne)"
)
fig_distinct_transported_users.show()

# Export the figure both as interactive HTML and as a static SVG.
fig_distinct_transported_users.write_html(
    OUTPUT_PATH / "fig_num_passagers_distincts_cohortes.html"
)
fig_distinct_transported_users.write_image(
    OUTPUT_PATH / "fig_num_passagers_distincts_cohortes.svg",
    format="svg",
    width=1280,
    height=720,
)

## Conducteurs CEE ayant covoituré avec un autre conducteur CEE

In [None]:
# One row per CEE driver with a flag telling whether they appeared as a
# passenger of another CEE driver.
# - `data`: one row per (driver, other user) from luis.equipages.
# - `data_only_cee`: keeps the rows whose driver is a CEE driver and whose date
#   falls within 12 weeks of that driver's `date_first_cee`.
# The left join can match several rows for the same driver (one per trip taken
# as a passenger), so the result is collapsed with `bool_or` + `group by` to
# avoid counting such drivers several times in the downstream share.
# Drivers without a cohort are excluded, as in the other cohort queries.
df_cee_drivers_passengers_cee = pl.read_database(
    """
with "data" as (
select
	"driver_uuid",
	"passenger_uuid",
	datetime
from
	luis.equipages e,
	unnest(e.users) as "passenger_uuid"
where
	driver_uuid != "passenger_uuid"
),
data_only_cee as (
	select 
	d.*
from "data" d
inner join luis.cee_drivers_v4 cdv on d."driver_uuid" = cdv."uuid"::text
where d.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '12 weeks'
)
select 
	cdv."uuid",
	cdv.date_first_cee,
	cdv.cohorte,
	bool_or(d."passenger_uuid" is not null) as has_been_passenger
from
	luis.cee_drivers_v4 cdv
left join "data_only_cee" d on
	cdv."uuid"::text = d."passenger_uuid" and cdv."uuid"::text != d."driver_uuid"
where
	cdv.cohorte is not null
group by
	1, 2, 3
""",
    connection=db_engine,
)

In [None]:
# Percentage of rows flagged `has_been_passenger`, per cohort.
# Reads `df_cee_drivers_passengers_cee`, the frame loaded by the previous cell;
# the former name `df_cee_drivers_passengers` is not defined in this section
# and broke a fresh Restart & Run All.
# Sorting on the reversed label ("t1_23" -> "32_1t") orders the cohorts by
# year first, then by quarter.
df_cee_drivers_passengers_agg = (
    df_cee_drivers_passengers_cee.group_by("cohorte")
    .agg((100 * pl.col("has_been_passenger").sum() / pl.len()).alias("share"))
    .sort(pl.col("cohorte").str.reverse())
)
df_cee_drivers_passengers_agg

In [None]:
# Bar chart of the share of CEE drivers who were also passengers of another
# CEE driver, one bar per cohort. `share_fmt` is the bar label ("12.3%").
# The title previously duplicated the one of the "distinct users transported"
# chart and did not describe this metric.
fig_cee_drivers_also_passengers = px.bar(
    df_cee_drivers_passengers_agg.with_columns(
        pl.format("{}%", pl.col("share").round(1)).alias("share_fmt")
    ),
    x="cohorte",
    y="share",
    text="share_fmt",
    color="cohorte",
    template="simple_white",
    labels={
        "share": "Part des conducteurs ayant été passagers d'un autre conducteur CEE",
        "cohorte": "Cohorte",
    },
    title="Part des conducteurs CEE ayant été passagers d'un autre conducteur CEE, par cohorte",
    color_discrete_map=cohortes_color_mapping,
)
fig_cee_drivers_also_passengers.update_yaxes(
    title="% des conducteurs ayant été passagers <br>d'un autre conducteur CEE"
)
fig_cee_drivers_also_passengers.show()
# Export the figure both as interactive HTML and as a static SVG.
fig_cee_drivers_also_passengers.write_html(
    OUTPUT_PATH / "fig_cee_passagers_cohortes.html"
)
fig_cee_drivers_also_passengers.write_image(
    OUTPUT_PATH / "fig_cee_passagers_cohortes.svg",
    format="svg",
    width=1280,
    height=720,
)

## Analyse en réseau

In [None]:
# Edge list for the network analysis: one row per (driver, other user, datetime)
# from luis.equipages, restricted to drivers listed in luis.cee_drivers_v4 with
# a non-null cohort and to the 12 weeks following their `date_first_cee`.
# The driver's cohort label is attached to every row.
df_links_cee = pl.read_database(
    """
with "data" as (
select
		"driver_uuid",
		"passenger_uuid",
		datetime
from
		luis.equipages e,
		unnest(e.users) as "passenger_uuid"
where
		driver_uuid != "passenger_uuid"
	)
	select 
		d.*,
		cdv.cohorte
from
	"data" d
inner join luis.cee_drivers_v4 cdv on
	d."driver_uuid" = cdv."uuid"::text
where
	d.datetime between cdv.date_first_cee and cdv.date_first_cee + interval '12 weeks'
	and cohorte is not null
""",
    connection=db_engine,
)

In [None]:
# Copies the first 10 rows to the system clipboard.
# NOTE(review): this is a side effect on the local machine and presumably
# fails on a headless kernel - consider removing it before sharing.
df_links_cee.head(10).write_clipboard()

In [None]:
# Summary statistics of the driver -> passenger link table.
df_links_cee.describe()

In [None]:
# Number of link rows for every (driver, passenger) pair, restricted to drivers
# of the "t1_24" cohort.
is_t1_24_cohort = pl.col("cohorte") == "t1_24"
df_links_cee_agg = (
    df_links_cee.filter(is_t1_24_cohort)
    .group_by("driver_uuid", "passenger_uuid")
    .agg(pl.len())
)

In [None]:
# Preview of the first 10 aggregated (driver, passenger) pairs.
df_links_cee_agg.head(10)

In [None]:
# One dict per (driver, passenger) pair across all cohorts, with the number of
# link rows under the "len" key.
df_link_counts = df_links_cee.group_by("driver_uuid", "passenger_uuid").agg(pl.len())
link_dicts = df_link_counts.to_dicts()

In [None]:
# Flatten each record into a tuple of its values, keeping the key order.
link_tuples = list(map(lambda link: tuple(link.values()), link_dicts))

In [None]:
# Undirected graph built from the link tuples: the first two fields are the
# endpoints (hashed, since they are not integer indices) and the third field
# is stored in a "weight" edge property of type double.
edge_property_specs = [("weight", "double")]
g = Graph(
    link_tuples,
    directed=False,
    hashed=True,
    eprops=edge_property_specs,
)
g

In [None]:
# Fit a block model on `g` with graph-tool's description-length minimisation,
# printing the multilevel MCMC progress.
# NOTE(review): only `g` is passed, so the "weight" edge property does not
# appear to be used by the model; no RNG seed is set either, so the partition
# presumably varies between runs - confirm whether that is intended.
state = gt.inference.minimize_blockmodel_dl(g, multilevel_mcmc_args={"verbose": True})

In [None]:
# Number of blocks in the fitted partition.
state.get_B()

In [None]:
# Draw the graph coloured by inferred block.
# No cell in this section creates a "pos" vertex property on `g`, so
# `g.vp.pos` raised on a fresh kernel; compute an SFDP layout once and store
# it on the graph if it is missing.
if "pos" not in g.vp:
    g.vp["pos"] = gt.draw.sfdp_layout(g)
state.draw(pos=g.vp["pos"])

In [None]:
# Plain drawing of the link graph on a 1280x720 canvas.
graph_draw(
    g,
    output_size=(1280, 720),
)