In [None]:
import pandas as pd
import polars as pl

import load_data

# Load the emissions dataset via `load_data.data_loader()`.
# NOTE(review): later cells call `.select(...)` and `.collect()` on the result,
# so it is presumably a polars LazyFrame — confirm in `load_data`.
emissions = load_data.data_loader()

In [None]:
# Show the distinct values of the `country` column.
emissions.select(pl.col("country").unique()).collect()

In [None]:
# Summary of fuel consumption and electric range per country, manufacturer
# and year, restricted to 2022 and 2023.

# --- Preparation -----------------------------------------------------------
# Every step reassigns `df` (no `inplace=True`), so the cell can be re-run
# safely and never writes into a filtered view of another frame.
df = emissions.collect().to_pandas()
df = df[~df["country"].isin(["RO", "LT", "LU", "LV", "EE", "CY"])]
df = df[df["electric_range_km"] < 40]
df = df.drop(columns=["vehicle_family_number"])
df = df.dropna(subset=["fuel_consumption"])
df = df.assign(
    # `.str.replace` is vectorised and passes missing names through instead of
    # raising on a non-string value; `regex=False` keeps the match literal.
    manufacturer_name=df["manufacturer_name"].str.replace(" ", "-", regex=False),
    # NOTE(review): assumes `fuel_consumption` is expressed per km, so that
    # multiplying by 100 yields a per-100-km figure — confirm the unit.
    fuel_consumption_per_100km=df["fuel_consumption"] * 100,
)

# --- Aggregation -----------------------------------------------------------
# One groupby pass replaces a scan of the whole frame for every
# country x manufacturer x year combination. Only combinations that actually
# occur in the data produce a row (no all-NaN rows with a zero vehicle count).
# `observed=True` guarantees the same if a key column is categorical.
grouped = (
    # Filter for years 2022 and 2023 only; `year` is a grouping key, so
    # filtering before aggregating yields the same groups with less work.
    df[df["year"].isin([2022, 2023])]
    .groupby(["country", "manufacturer_name", "year"], observed=True)
    .agg(
        mean_fuel_consumption=("fuel_consumption_per_100km", "mean"),
        mean_electric_range=("electric_range_km", "mean"),
        vehicle_count=("vehicle_id", "nunique"),
    )
    .reset_index()
    .sort_values("mean_fuel_consumption", ascending=False)
)

In [None]:
# Render the current `grouped` summary table as this cell's output.
grouped

In [None]:
# Summary of fuel consumption and electric range per country, manufacturer
# and year (2022 and 2023 only), built with a single groupby.

# Preparation: each step reassigns `df` instead of mutating it in place, which
# keeps the cell re-runnable and avoids SettingWithCopyWarning on the
# filtered frame.
df = emissions.collect().to_pandas()
df = df[~df["country"].isin(["RO", "LT", "LU", "LV", "EE", "CY"])]
df = df.drop(columns=["vehicle_family_number"])
df = df.dropna(subset=["fuel_consumption"])
df = df[df["electric_range_km"] < 40]
df = df.assign(
    # Vectorised, literal replacement; missing names pass through unchanged.
    manufacturer_name=df["manufacturer_name"].str.replace(" ", "-", regex=False),
    # NOTE(review): assumes `fuel_consumption` is expressed per km — confirm.
    fuel_consumption_per_100km=df["fuel_consumption"] * 100,
)

grouped = (
    # Keep only 2022 and 2023; `year` is a grouping key, so filtering first
    # gives the same groups without aggregating years that are discarded.
    df[df["year"].isin([2022, 2023])]
    # `observed=True`: emit only key combinations present in the data, even
    # if a key column is categorical.
    .groupby(["country", "manufacturer_name", "year"], observed=True)
    .agg(
        # Average the derived per-100-km column; it was otherwise computed
        # above and never used, with the raw column averaged instead.
        mean_fuel_consumption=("fuel_consumption_per_100km", "mean"),
        mean_electric_range=("electric_range_km", "mean"),
        # Count distinct vehicle ids rather than rows, as the notebook's
        # first summary cell does.
        vehicle_count=("vehicle_id", "nunique"),
    )
    .reset_index()
    .sort_values("mean_fuel_consumption", ascending=False)
)

In [None]:
# Summary of fuel consumption and electric range per country, manufacturer
# and year (2022 and 2023 only), written as two method chains.

# Preparation chain. The lambdas take `frame` rather than `df` so they do not
# shadow the outer `df` being assigned.
df = (
    emissions.collect()
    .to_pandas()
    .loc[lambda frame: ~frame["country"].isin(["RO", "LT", "LU", "LV", "EE", "CY"])]
    .drop(columns=["vehicle_family_number"])
    .dropna(subset=["fuel_consumption"])
    .loc[lambda frame: frame["electric_range_km"] < 40]
    .assign(
        # Literal (non-regex) replacement of spaces by hyphens.
        manufacturer_name=lambda frame: frame["manufacturer_name"].str.replace(
            " ", "-", regex=False
        ),
        # NOTE(review): assumes `fuel_consumption` is expressed per km — confirm.
        fuel_consumption_per_100km=lambda frame: frame["fuel_consumption"] * 100,
    )
)

# Aggregation chain.
grouped = (
    # Keep only 2022 and 2023; `year` is a grouping key, so filtering first
    # gives the same groups without aggregating years that are discarded.
    df.loc[lambda frame: frame["year"].isin([2022, 2023])]
    # `observed=True`: emit only key combinations present in the data, even
    # if a key column is categorical.
    .groupby(["country", "manufacturer_name", "year"], observed=True)
    .agg(
        # Average the derived per-100-km column; it was otherwise computed
        # above and never used, with the raw column averaged instead.
        mean_fuel_consumption=("fuel_consumption_per_100km", "mean"),
        mean_electric_range=("electric_range_km", "mean"),
        # Count distinct vehicle ids rather than rows, as the notebook's
        # first summary cell does.
        vehicle_count=("vehicle_id", "nunique"),
    )
    .reset_index()
    .sort_values("mean_fuel_consumption", ascending=False)
)

In [None]:
# Render the current `grouped` summary table as this cell's output.
grouped

In [None]:
# Restrict the current `df` to the rows for France, Spain and Italy.
df = df[df["country"].isin(["FR", "ES", "IT"])]
# `errors="ignore"`: the preparation cells that build `df` already remove this
# column, so an unconditional drop raises KeyError on a top-to-bottom run (and
# on any re-run of this cell).
df = df.drop(columns=["vehicle_family_number"], errors="ignore")

In [None]:
# Read the pathologies Parquet file (path relative to the working directory).
# The resulting frame is only displayed as the cell output, not stored.
# NOTE(review): "Ã¢" in the file name looks like mis-decoded UTF-8 for "â"
# (as in "âge") — confirm the actual name on disk before changing the literal.
pd.read_parquet(
    "Pathologies-effectif de patients par pathologie, sexe, classe d'Ã¢ge et territoire.parquet"
)

In [None]:
# Read the semicolon-separated file `DS_PRENOM_2024_data.csv` (path relative
# to the working directory) into `first_names`.
first_names = pd.read_csv("DS_PRENOM_2024_data.csv", sep=";")

In [None]:
# Download and parse the HATVP declarations XML (requires network access).
# The resulting frame is only displayed as the cell output, not stored.
# `parser="etree"` selects pandas' ElementTree-based parser rather than lxml.
pd.read_xml(
    "https://www.hatvp.fr/livraison/merge/declarations.xml",
    parser="etree",
)