In [None]:
import pandas as pd

In [None]:
import pelage as plg
import polars as pl

In [None]:
# Explicit polars dtypes for every column of the raw emissions CSV, listed in
# the same order as before. Consecutive columns that share a dtype are grouped
# with dict.fromkeys; unpacking keeps insertion order intact.
emission_schema = {
    "Vehicle Identifier": pl.Int64,
    "OBFCM data source": pl.String,
    "OBFCM ReportingPeriod": pl.Int64,
    # Lifetime fuel / distance / energy counters.
    **dict.fromkeys(
        [
            "Total fuel consumed (lifetime) (l)",
            "Total distance travelled (lifetime) (km)",
            "Total distance travelled in charge depleting operation with engine off (lifetime) (km)",
            "Total distance travelled in charge depleting operation with engine running (lifetime) (km)",
            "Total distance travelled in driver-selectable charge increasing operation (lifetime) (km)",
            "Total fuel consumed in charge depleting operation (lifetime) (l)",
            "Total fuel consumed in driver-selectable charge increasing operation (lifetime) (l)",
            "Total grid energy into the battery (lifetime) (kWh)",
        ],
        pl.Float64,
    ),
    # Free-text descriptors.
    **dict.fromkeys(
        ["Country", "VFN", "Mh", "T", "Va", "Ve", "Mk", "Cn", "Cr"],
        pl.String,
    ),
    **dict.fromkeys(["M (kg)", "Mt", "Ewltp (g/km)"], pl.Float64),
    **dict.fromkeys(["Ft", "Fm"], pl.String),
    **dict.fromkeys(["Ec (cm3)", "Ep (KW)", "Z (Wh/km)"], pl.Float64),
    "Year": pl.Int64,
    **dict.fromkeys(["Fuel consumption", "Electric range (km)"], pl.Float64),
    "Used in calculation": pl.Int64,
}

In [None]:
# Lazily scan the raw 2023 cars CSV with the explicit schema ("NULL" cells are
# read as nulls) and give the columns used downstream snake_case names.
# Columns that are not listed in the mapping keep their original CSV headers.
emissions = pl.scan_csv(
    "eea_t_real-world-co2-emission_p_2024_v01_r00/2023_Cars_Raw.csv",
    schema=emission_schema,
    null_values=["NULL"],
).rename(
    {
        "Vehicle Identifier": "vehicle_id",
        "OBFCM data source": "obfcm_data_source",
        "OBFCM ReportingPeriod": "reporting_period",
        "Country": "country",
        "VFN": "vehicle_family_number",
        "Mh": "manufacturer_name",
        "T": "model_type",
        "Va": "model_variant",
        # NOTE(review): the EEA column guide appears to describe "Ve" as the
        # vehicle *version*; confirm that "license_plate" is the intended name.
        "Ve": "license_plate",
        "Mk": "brand_name",
        "Cn": "commercial_name",
        "Cr": "registered_category",
        # Fixed typo: this column was previously renamed to "mass_kd".
        "M (kg)": "mass_kg",
        "Mt": "wltp_test_mass",
        "Ewltp (g/km)": "ewltp_g_per_km",
        "Ft": "fuel_type",
        "Fm": "fuel_mode",
        "Year": "year",
        "Fuel consumption": "fuel_consumption",
        "Electric range (km)": "electric_range_km",
    }
)

# Data-quality checks with pelage: the data source must be one of the two
# accepted labels and both period columns must fall within 2021-2023.
# The result is not assigned; it is only the cell's displayed output.
(
    emissions.pipe(plg.accepted_values, {"obfcm_data_source": ["OEM", "MS"]})
    .pipe(plg.accepted_range, {"reporting_period": (2021, 2023)})
    .pipe(plg.accepted_range, {"year": (2021, 2023)})
)

In [None]:
# Execute the lazy query and show the resulting DataFrame as the cell output.
emissions.collect()

In [None]:
# Distinct values of the `country` column.
emissions.select(pl.col("country").unique()).collect()

In [None]:
# Materialise the lazy query and continue in pandas.
df = emissions.collect().to_pandas()
df = (
    # Leave out the countries excluded from this analysis.
    df[~df["country"].isin(["RO", "LT", "LU", "LV", "EE", "CY"])]
    # Keep vehicles whose electric range is below 40 km (null ranges are dropped
    # too, because the comparison is False for missing values).
    .loc[lambda d: d["electric_range_km"] < 40]
    .drop(columns=["vehicle_family_number"])
    .dropna(subset=["fuel_consumption"])
    .assign(
        # `.str.replace` propagates missing names instead of raising like
        # `.apply(lambda x: x.replace(...))` does on None.
        manufacturer_name=lambda d: d["manufacturer_name"].str.replace(
            " ", "-", regex=False
        ),
        # Scale by 100 to express consumption per 100 km
        # (assumes the source value is per km; TODO confirm).
        fuel_consumption_per_100km=lambda d: d["fuel_consumption"] * 100,
    )
)

# One pass over the data instead of re-filtering the whole frame for every
# country x manufacturer x year combination. Unlike the nested loops, this
# no longer emits all-NaN / zero-count rows for combinations with no vehicles.
grouped = (
    df.groupby(["country", "manufacturer_name", "year"], as_index=False)
    .agg(
        mean_fuel_consumption=("fuel_consumption_per_100km", "mean"),
        mean_electric_range=("electric_range_km", "mean"),
        vehicle_count=("vehicle_id", "nunique"),
    )
    .sort_values("mean_fuel_consumption", ascending=False)
)
# Filter for years 2022 and 2023 only
grouped = grouped[grouped["year"].isin([2022, 2023])]

In [None]:
# Display the country / manufacturer / year summary computed in the previous cell.
grouped

In [None]:
# Same preparation as the loop-based cell, followed by a groupby aggregation.
df = emissions.collect().to_pandas()
df = df[~df["country"].isin(["RO", "LT", "LU", "LV", "EE", "CY"])]
df = df.drop(columns=["vehicle_family_number"])
df = df.dropna(subset=["fuel_consumption"])
df = df[df["electric_range_km"] < 40]
df = df.assign(
    # `.str.replace` keeps missing names as missing instead of raising on None.
    manufacturer_name=df["manufacturer_name"].str.replace(" ", "-", regex=False),
    # Scale by 100 to express consumption per 100 km
    # (assumes the source value is per km; TODO confirm).
    fuel_consumption_per_100km=df["fuel_consumption"] * 100,
)
grouped = (
    df.groupby(["country", "manufacturer_name", "year"])
    .agg(
        # Average the per-100-km column (previously the unscaled column was
        # averaged, so the derived column was never used) and count distinct
        # vehicles, matching the loop-based version of this summary.
        mean_fuel_consumption=("fuel_consumption_per_100km", "mean"),
        mean_electric_range=("electric_range_km", "mean"),
        vehicle_count=("vehicle_id", "nunique"),
    )
    .reset_index()
)
grouped = grouped.sort_values("mean_fuel_consumption", ascending=False)

# Keep only the 2022 and 2023 rows.
grouped = grouped[grouped["year"].isin([2022, 2023])]

In [None]:
# Method-chained version of the preparation + aggregation.
df = emissions.collect().to_pandas()
df = (
    df[~df["country"].isin(["RO", "LT", "LU", "LV", "EE", "CY"])]
    .drop(["vehicle_family_number"], axis=1)
    .dropna(subset=["fuel_consumption"])
    # Lambda parameters are named `d` so they do not shadow the outer `df`.
    .loc[lambda d: d["electric_range_km"] < 40]
    .assign(
        manufacturer_name=lambda d: d["manufacturer_name"].str.replace(
            " ", "-", regex=False
        ),
        # Scale by 100 to express consumption per 100 km
        # (assumes the source value is per km; TODO confirm).
        fuel_consumption_per_100km=lambda d: d["fuel_consumption"] * 100,
    )
)

grouped = (
    df.groupby(["country", "manufacturer_name", "year"])
    .agg(
        # Average the per-100-km column (the unscaled column was averaged
        # before, leaving the derived column unused) and count distinct
        # vehicles, matching the loop-based version of this summary.
        mean_fuel_consumption=("fuel_consumption_per_100km", "mean"),
        mean_electric_range=("electric_range_km", "mean"),
        vehicle_count=("vehicle_id", "nunique"),
    )
    .reset_index()
    .sort_values("mean_fuel_consumption", ascending=False)
    # Keep only the 2022 and 2023 rows.
    .loc[lambda d: d["year"].isin([2022, 2023])]
)

In [None]:
# Display the country / manufacturer / year summary computed in the previous cell.
grouped

In [None]:
# Restrict the prepared frame to France, Spain and Italy.
# `vehicle_family_number` has already been removed by the cell that builds
# `df`, so an unconditional drop raised KeyError on a fresh run; with
# errors="ignore" the drop is a no-op when the column is absent.
df = df[df["country"].isin(["FR", "ES", "IT"])].drop(
    columns=["vehicle_family_number"], errors="ignore"
)

In [None]:
# Read the pathologies parquet file; the frame is displayed, not stored.
pd.read_parquet(
    path="Pathologies-effectif de patients par pathologie, sexe, classe d'âge et territoire.parquet"
)

In [None]:
# Load the semicolon-delimited first-names CSV.
first_names = pd.read_csv("DS_PRENOM_2024_data.csv", sep=";")

In [None]:
# Download the declarations XML and parse it with the "etree" parser;
# the resulting frame is displayed, not stored.
pd.read_xml(
    path_or_buffer="https://www.hatvp.fr/livraison/merge/declarations.xml",
    parser="etree",
)