In [None]:
import os
from pathlib import Path

import plotly.express as px
import plotly.graph_objects as go
import polars as pl
from sqlalchemy import create_engine

# DB

In [None]:
# Connection string handed to SQLAlchemy; read from the DB_URL_RPC environment
# variable so nothing sensitive is stored in the notebook (KeyError if it is unset).
DATABASE_URL = os.environ["DB_URL_RPC"]

In [None]:
# Directory that receives the exported figures (relative to the working directory).
OUTPUT_PATH = Path("outputs")
# Create it up front: the figure exports at the end of the notebook write into this
# directory and raise FileNotFoundError on a fresh checkout where it does not exist.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
# Last expression of the cell: show the resolved location.
OUTPUT_PATH.absolute()

In [None]:
# SQLAlchemy engine built from DATABASE_URL; it is the `connection` argument of the
# pl.read_database calls in this notebook.
db_engine = create_engine(DATABASE_URL)

# Création de la cohorte

```sql
-- Builds the cohort table read by the analysis queries of this notebook
-- (one row per driver_operator_user_id).
create table luis.drivers_incentived_cohort as (
-- first_trips: drivers whose earliest trip since 2023-01-01 started between
-- 2024-03-01 and three months before the query is run.
with first_trips as (
select 
		c.driver_operator_user_id,
		array_agg(distinct c.driver_identity_key) as identity_keys, 
		array_agg(distinct c.operator_id) as operators_ids,
		min(c.start_datetime) as first_trip_datetime
from
	carpool_v2.carpools c
where
	c.start_datetime between '2023-01-01' and now() - interval '3 months'
group by
	1
having
	min(c.start_datetime) between '2024-03-01' and now() - interval '3 months'
),
-- incentived_drivers: per driver, the distinct SIRETs found in operator_incentives
-- for their trips, and the NAF codes of those SIRETs. The left join on
-- company.companies leaves a NULL NAF code when the SIRET has no match there.
incentived_drivers as (
select 
		c.driver_operator_user_id,
		array_agg(distinct oi.siret) as incentives_sirets,
		array_agg(distinct co.company_naf_code) as incentives_naf_codes
from
	carpool_v2.carpools c
inner join carpool_v2.operator_incentives oi on
	c._id = oi.carpool_id
left join company.companies co on
	oi.siret = co.siret
where
	c.start_datetime between '2024-03-01' and now() - interval '3 months'
group by
	1
),
-- cee_drivers: per driver, the start of the earliest trip matched to a CEE
-- application with journey_type = 'short' and is_specific false.
cee_drivers as (
select
	 	driver_operator_user_id,
	 	min(c.start_datetime) as cee_trip_datetime
from
	carpool_v2.carpools c
inner join cee.cee_applications ca on
	c.operator_journey_id = ca.operator_journey_id
	and ca.journey_type = 'short'
	and not ca.is_specific
where
	c.start_datetime between '2024-03-01' and now() - interval '3 months'
group by
	1
)
-- Left joins keep every driver of first_trips: the incentive columns and
-- cee_trip_datetime are NULL for drivers absent from the corresponding CTE.
select 
	ft.*,
	id.incentives_sirets,
	id.incentives_naf_codes,
	cd.cee_trip_datetime
from
	first_trips ft
left join incentived_drivers id on
	ft.driver_operator_user_id = id.driver_operator_user_id
left join cee_drivers cd on
	cd.driver_operator_user_id = ft.driver_operator_user_id)
```

# Analyses

In [None]:
# True when every entry of the driver's `incentives_naf_codes` list is None.
# NOTE(review): the notebook labels this case "Entreprise" — confirm that a missing
# NAF code is the intended marker for a company-funded incentive.
is_company_incentive_expr = pl.col("incentives_naf_codes").map_elements(
    lambda naf_codes: all(code is None for code in naf_codes),
    return_dtype=pl.Boolean,
)

# Whether the driver has a CEE trip date recorded in the cohort table.
_has_cee_trip = pl.col("cee_trip_datetime").is_not_null()
_no_cee_trip = pl.col("cee_trip_datetime").is_null()

# Cohort label: CEE / no CEE crossed with company / non-company incentives.
# No `otherwise` branch: rows matching none of the four conditions get a null label.
cohorte_cat_expr = (
    pl.when(_has_cee_trip & is_company_incentive_expr)
    .then(pl.lit("CEE avec incitations Entreprise"))
    .when(_has_cee_trip & ~is_company_incentive_expr)
    .then(pl.lit("CEE avec incitations Collectivité OU Opérateur"))
    .when(_no_cee_trip & is_company_incentive_expr)
    .then(pl.lit("Incitations Entreprise"))
    .when(_no_cee_trip & ~is_company_incentive_expr)
    .then(pl.lit("Incitations Collectivité OU Opérateurs"))
    .alias("cohorte_cat")
)

## Nombre de trajets

In [None]:
# Per driver: number of distinct trips started within three months of the first trip.
# The min() aggregates carry the cohort attributes through the GROUP BY.
trips_by_driver_sql = """
select 
	dic.driver_operator_user_id,
    min(dic.incentives_sirets) as incentives_sirets,
    min(dic.incentives_naf_codes) as incentives_naf_codes,
    min(dic.cee_trip_datetime) as cee_trip_datetime,
	count(distinct coalesce(operator_trip_id,operator_journey_id)) as num_trips
from 
	luis.drivers_incentived_cohort dic
left join carpool_v2.carpools c on
	dic.driver_operator_user_id = c.driver_operator_user_id
	and (c.start_datetime between dic.first_trip_datetime and dic.first_trip_datetime + interval '3 month')
group by
	1
    """
df_trips_by_driver = pl.read_database(trips_by_driver_sql, connection=db_engine)

In [None]:
# Add the `cohorte_cat` label column to the per-driver trip counts.
df_trips_by_driver = df_trips_by_driver.with_columns(cohorte_cat_expr)

In [None]:
# Number of rows (one per driver) in each cohort category.
df_trips_by_driver["cohorte_cat"].value_counts()

In [None]:
# Mean number of trips per driver, for each cohort category.
df_trips_by_driver.group_by("cohorte_cat").agg(pl.mean("num_trips"))

## Distance moyenne

In [None]:
# Per driver: average `distance` of the trips started within three months of the
# first trip. The min() aggregates carry the cohort attributes through the GROUP BY.
distance_by_driver_sql = """
select 
	dic.driver_operator_user_id,
    min(dic.incentives_sirets) as incentives_sirets,
    min(dic.incentives_naf_codes) as incentives_naf_codes,
    min(dic.cee_trip_datetime) as cee_trip_datetime,
	avg(c.distance)::float as distance
from 
	luis.drivers_incentived_cohort dic
left join carpool_v2.carpools c on
	dic.driver_operator_user_id = c.driver_operator_user_id
	and (c.start_datetime between dic.first_trip_datetime and dic.first_trip_datetime + interval '3 month')
group by
	1
    """
df_distance_by_driver = pl.read_database(distance_by_driver_sql, connection=db_engine)

In [None]:
# Display the query result for a visual sanity check.
df_distance_by_driver

In [None]:
# Add the `cohorte_cat` label column to the per-driver average distances.
df_distance_by_driver = df_distance_by_driver.with_columns(
    cohorte_cat_expr
)

In [None]:
# Mean of the per-driver average distance for each cohort, divided by 1000 and
# rounded to 2 decimals (presumably metres -> km; confirm the unit of `distance`).
_scaled_distance = pl.col("distance") / 1000
df_distance_by_driver.group_by("cohorte_cat").agg(_scaled_distance.mean().round(2))

## Passagers

In [None]:
# Per driver: average number of passenger seats per trip, over the trips started
# within three months of the first trip.
#   - `trips` CTE: one row per (driver, trip), seats summed over the trip's rows.
#   - outer query: average of those per-trip sums for each driver.
# `trip_id` is projected once only; a second identical projection would make any
# reference to `trip_id` ambiguous.
df_passengers_by_driver = pl.read_database(
    """
with trips as (
select 
	dic.driver_operator_user_id,
    coalesce(c.operator_trip_id,c.operator_journey_id) as trip_id,
    min(dic.incentives_sirets) as incentives_sirets,
    min(dic.incentives_naf_codes) as incentives_naf_codes,
    min(dic.cee_trip_datetime) as cee_trip_datetime,
	sum(c.passenger_seats) as num_passagers
from 
	luis.drivers_incentived_cohort dic
left join carpool_v2.carpools c on
	dic.driver_operator_user_id = c.driver_operator_user_id
	and (c.start_datetime between dic.first_trip_datetime and dic.first_trip_datetime + interval '3 month')
group by
	1,2)
select 
    driver_operator_user_id,
    min(incentives_sirets) as incentives_sirets,
    min(incentives_naf_codes) as incentives_naf_codes,
    min(cee_trip_datetime) as cee_trip_datetime,
    avg(num_passagers)::float as num_passagers
from trips
group by 1
    """,
    connection=db_engine,
)
df_passengers_by_driver

In [None]:
# Add the `cohorte_cat` label column to the per-driver passenger averages.
df_passengers_by_driver = df_passengers_by_driver.with_columns(
    cohorte_cat_expr
)

In [None]:
# Mean of the per-driver average passenger count for each cohort, rounded to 2 decimals.
df_passengers_by_driver.group_by("cohorte_cat").agg(pl.mean("num_passagers").round(2))

## Churn

In [None]:
# One row per (driver, week) over the 12 weeks starting at the week of the first trip:
#   - "template": the weekly grid generated for every cohort driver (Europe/Paris weeks);
#   - trips: the weeks in which the driver had a trip in the 3 months after the first one;
#   - aggregated_data: grid left-joined to trips, had_trip is true when a match exists;
#   - final select: numbers each driver's weeks in chronological order as num_semaine.
churn_by_driver_sql = """
with "template" as (
select 
	*,
	generate_series(date_trunc('week',dic.first_trip_datetime at time zone 'Europe/Paris'),
	date_trunc('week',dic.first_trip_datetime at time zone 'Europe/Paris' + interval '11 weeks'),
	interval '1 weeks') as semaine
from
	luis.drivers_incentived_cohort dic
  ),

trips as (
select
	dic.driver_operator_user_id,
	date_trunc('week',
	c.start_datetime at time zone 'Europe/Paris') as semaine
from
	carpool_v2.carpools c
inner JOIN luis.drivers_incentived_cohort dic on
	dic.driver_operator_user_id = c.driver_operator_user_id
	and (c.start_datetime between dic.first_trip_datetime and dic.first_trip_datetime + interval '3 month')
group by
	 1,2),
aggregated_data as (
select 
	t.driver_operator_user_id::text,
	t.semaine,
    min(t.incentives_sirets) as incentives_sirets,
    min(t.incentives_naf_codes) as incentives_naf_codes,
    min(t.cee_trip_datetime) as cee_trip_datetime,
	count(tr.semaine)>0 had_trip
from
	"template" t
left join trips tr on
	t.driver_operator_user_id = tr.driver_operator_user_id
	and t.semaine = tr.semaine
group by
	1,2)

select 
	*,
	row_number() over (partition by driver_operator_user_id order by semaine) as num_semaine
from aggregated_data
order by 1,2
"""
df_churn_by_driver = pl.read_database(churn_by_driver_sql, connection=db_engine)
df_churn_by_driver

In [None]:
# Add the `cohorte_cat` label column to the weekly activity rows.
df_churn_by_driver = df_churn_by_driver.with_columns(
    cohorte_cat_expr
)

In [None]:
# Display the labelled frame for a visual sanity check.
df_churn_by_driver

In [None]:
def preprocess_week_by_driver_df(df: pl.DataFrame) -> pl.DataFrame:
    """Compute, per week number, the percentage of drivers flagged as having a trip.

    Args:
        df: Frame with the columns `driver_operator_user_id`, `num_semaine` and
            the boolean `had_trip` (one row per driver and week).

    Returns:
        A frame sorted by `num_semaine` with a `share` column equal to
        100 * (number of rows where `had_trip` is true in that week) divided by
        the number of distinct `driver_operator_user_id` values in `df`.
    """
    # The denominator is taken over the whole input frame, not per week.
    n_drivers = df["driver_operator_user_id"].n_unique()
    weekly_share = (100 * pl.col("had_trip").sum() / n_drivers).alias("share")
    return df.group_by(["num_semaine"]).agg(weekly_share).sort("num_semaine")

In [None]:
# Distinct cohort labels present in the data; the plot configuration filters on
# these exact strings.
df_churn_by_driver["cohorte_cat"].unique().to_list()

In [None]:
# (cohort label, colour, dash style) for each curve. Colour separates the
# "Entreprise" cohorts from the "Collectivité OU Opérateur" ones; dash style
# separates the CEE cohorts (solid) from the others (dash).
cohort_styles = [
    ("CEE avec incitations Entreprise", "#d60036", "solid"),
    ("CEE avec incitations Collectivité OU Opérateur", "#0057ba", "solid"),
    ("Incitations Entreprise", "#d60036", "dash"),
    ("Incitations Collectivité OU Opérateurs", "#0057ba", "dash"),
]

# One config per cohort: weekly activity share computed on that cohort's rows only.
plot_configs = [
    {
        "data": preprocess_week_by_driver_df(
            df_churn_by_driver.filter(pl.col("cohorte_cat") == cohort_label)
        ),
        "name": cohort_label,
        "color": cohort_color,
        "line_dash": cohort_dash,
    }
    for cohort_label, cohort_color, cohort_dash in cohort_styles
]

# One line trace per cohort; `visible` defaults to True when a config does not set it.
traces = [
    go.Scatter(
        x=config["data"]["num_semaine"],
        y=config["data"]["share"],
        marker_color=config["color"],
        hovertemplate="%{y:.2f}% des conducteurs ont été actifs %{x} semaine(s)",
        name=config["name"],
        visible=config.get("visible", True),
        line_dash=config["line_dash"],
    )
    for config in plot_configs
]

# The update_* methods return the figure, so the styling calls are chained.
fig_weeks_by_driver_multi = (
    go.Figure(traces)
    .update_layout(
        plot_bgcolor="white",
        title="Courbes d'attrition des conducteurs<br>Quel cohorte a réussi à retenir les conducteurs le plus longtemps ?",
        hovermode="x unified",
    )
    .update_yaxes(
        showgrid=True,
        griddash="dot",
        gridwidth=1,
        gridcolor="gray",
        title="Part des conducteurs (%)",
        zeroline=True,
        zerolinecolor="black",
    )
    .update_xaxes(
        title="Numéro de la semaine",
        range=[0, 12],
        dtick=1,
        tickprefix="Semaine ",
        showtickprefix="none",
    )
)

fig_weeks_by_driver_multi.show()

# Export an interactive HTML version and a static 1920x1080 SVG.
html_export_path = OUTPUT_PATH / "churn_by_incentives_cohortes.html"
svg_export_path = OUTPUT_PATH / "num_semaines_multichurn_by_incentives_cohortes.svg"
fig_weeks_by_driver_multi.write_html(html_export_path)
fig_weeks_by_driver_multi.write_image(svg_export_path, format="svg", width=1920, height=1080)