In [None]:
import pandas as pd
import pelage as plg
import polars as pl

In [None]:
# Explicit dtype for every column of the raw emissions CSV; passed as
# `schema=` to `pl.scan_csv` when the file is loaded.
# NOTE(review): polars may apply `schema` positionally, in which case the key
# order here has to follow the CSV header order — confirm before reordering.
emission_schema = {
    # Vehicle / report identification.
    "Vehicle Identifier": pl.Int64,
    "OBFCM data source": pl.String,
    "OBFCM ReportingPeriod": pl.Int64,
    # Lifetime counters (units are given in the column names: l, km, kWh).
    "Total fuel consumed (lifetime) (l)": pl.Float64,
    "Total distance travelled (lifetime) (km)": pl.Float64,
    "Total distance travelled in charge depleting operation with engine off (lifetime) (km)": pl.Float64,
    "Total distance travelled in charge depleting operation with engine running (lifetime) (km)": pl.Float64,
    "Total distance travelled in driver-selectable charge increasing operation (lifetime) (km)": pl.Float64,
    "Total fuel consumed in charge depleting operation (lifetime) (l)": pl.Float64,
    "Total fuel consumed in driver-selectable charge increasing operation (lifetime) (l)": pl.Float64,
    "Total grid energy into the battery (lifetime) (kWh)": pl.Float64,
    # Short column codes as they appear in the CSV header; they are given
    # readable names by the `.rename(...)` applied right after loading.
    "Country": pl.String,
    "VFN": pl.String,
    "Mh": pl.String,
    "T": pl.String,
    "Va": pl.String,
    "Ve": pl.String,
    "Mk": pl.String,
    "Cn": pl.String,
    "Cr": pl.String,
    "M (kg)": pl.Float64,
    "Mt": pl.Float64,
    "Ewltp (g/km)": pl.Float64,
    "Ft": pl.String,
    "Fm": pl.String,
    "Ec (cm3)": pl.Float64,
    "Ep (KW)": pl.Float64,
    "Z (Wh/km)": pl.Float64,
    "Year": pl.Int64,
    "Fuel consumption": pl.Float64,
    "Electric range (km)": pl.Float64,
    # Declared as an integer; presumably a 0/1 flag — TODO confirm.
    "Used in calculation": pl.Int64,
}

In [None]:
# Lazily scan the raw CSV with the explicit schema (the literal string "NULL"
# is read as a missing value) and give the coded columns readable snake_case
# names. Nothing is read from disk until the query is collected.
emissions = pl.scan_csv(
    "eea_t_real-world-co2-emission_p_2024_v01_r00/2023_Cars_Raw.csv",
    schema=emission_schema,
    null_values=["NULL"],
).rename(
    {
        "Vehicle Identifier": "vehicle_id",
        "OBFCM data source": "obfcm_data_source",
        "OBFCM ReportingPeriod": "reporting_period",
        # The "Total ... (lifetime) ..." counter columns are intentionally not
        # renamed and keep their original CSV header names.
        "Country": "country",
        "VFN": "vehicle_family_number",
        "Mh": "manufacturer_name",
        "T": "model_type",
        "Va": "model_variant",
        # `Ve` is the vehicle *version* code in the EEA data dictionary
        # (type / variant / version), not a licence plate.
        "Ve": "model_version",
        "Mk": "brand_name",
        "Cn": "commercial_name",
        "Cr": "registered_category",
        "M (kg)": "mass_kg",
        "Mt": "wltp_test_mass",
        "Ewltp (g/km)": "ewltp_g_per_km",
        "Ft": "fuel_type",
        "Fm": "fuel_mode",
        "Ec (cm3)": "engine_capacity_cm3",
        "Ep (KW)": "engine_power_kw",
        "Z (Wh/km)": "electric_consumption_wh_per_km",
        "Year": "year",
        "Fuel consumption": "fuel_consumption",
        "Electric range (km)": "electric_range_km",
        "Used in calculation": "used_in_calculation",
    }
)


In [None]:
# Execute the lazy query, keeping only rows whose electric range is missing,
# and hand the materialised result over to pandas.
# NOTE(review): because of this filter, `electric_range_km` is null in every
# row of `emissions_pandas`, so any mean of that column downstream is NaN.
emissions_pandas = (
    emissions
    .filter(pl.col("electric_range_km").is_null())
    .collect()
    .to_pandas()
)

In [None]:
# Reference (loop-based) aggregation: mean fuel consumption, mean electric
# range and number of distinct vehicles per manufacturer and year. The result
# is stored in `grouped` and snapshotted in `old_df` as a baseline.

# Row / column clean-up. Every step returns a new frame, so `emissions_pandas`
# itself is never modified and the cell can be re-run safely.
df = emissions_pandas
df = df[~df["country"].isin(["RO", "LT"])]
df = df.drop(columns=["vehicle_family_number"])
df = df.dropna(subset=["fuel_consumption"])
# Detach from the filtered parent frame so the assignments below write to
# `df` itself rather than to a temporary (avoids SettingWithCopyWarning).
df = df.copy()

# `df[mask]["fuel_type"] = ...` assigns into a throw-away copy and is silently
# lost; `.loc[mask, column]` updates `df` for real.
df.loc[df["fuel_type"] == "PETROL", "fuel_type"] = "petrol"
df["fuel_consumption_per_100km"] = df["fuel_consumption"] * 100

# One record per (manufacturer, year) pair that actually occurs in the data.
# Iterating over the groupby avoids rescanning the whole frame for every
# manufacturer x year combination and never produces an empty subset; rows
# with a missing manufacturer or year are skipped by groupby.
grouped = []
for (manufacturer, year), subset_df in df.groupby(["manufacturer_name", "year"]):
    grouped.append(
        {
            "manufacturer_name": manufacturer,
            "year": year,
            "mean_fuel_consumption": subset_df["fuel_consumption_per_100km"].mean(),
            "mean_electric_range": subset_df["electric_range_km"].mean(),
            "vehicle_count": subset_df["vehicle_id"].nunique(),
        }
    )

# Passing `columns=` keeps the frame well-formed (and sortable) even when no
# group was produced.
summary_columns = [
    "manufacturer_name",
    "year",
    "mean_fuel_consumption",
    "mean_electric_range",
    "vehicle_count",
]
grouped = pd.DataFrame(grouped, columns=summary_columns)
# Only rows without a fuel-consumption mean are dropped. A bare `dropna()`
# would also drop every row whose `mean_electric_range` is NaN, which is all
# of them when the input holds only vehicles with a missing electric range.
grouped = grouped.dropna(subset=["mean_fuel_consumption"])
grouped = grouped.sort_values(["mean_fuel_consumption", "year"], ascending=False)
grouped = grouped[grouped["year"].isin([2022, 2023])]
grouped = grouped.reset_index(drop=True)

# Sanity check: exactly one summary row per (manufacturer, year).
assert not grouped.duplicated(["manufacturer_name", "year"]).any()

# Independent snapshot of the reference result (the previous self-comparison
# of `grouped` with an alias of itself could never fail).
old_df = grouped.copy()

In [None]:
# Method-chained pandas version of the manufacturer / year summary.

# Clean-up: exclude two countries, drop an unused column, require a fuel
# consumption value, normalise the petrol label and add the x100 column.
# Every step returns a new frame; `emissions_pandas` is left untouched.
df = (
    emissions_pandas.loc[lambda frame: ~frame["country"].isin(["RO", "LT"])]
    .drop(columns=["vehicle_family_number"])
    .dropna(subset=["fuel_consumption"])
    .assign(
        fuel_type=lambda frame: frame["fuel_type"].replace("PETROL", "petrol"),
        fuel_consumption_per_100km=lambda frame: frame["fuel_consumption"] * 100,
    )
)

# Aggregate per (manufacturer, year), order by descending mean consumption
# (then year) and keep the 2022 / 2023 rows.
grouped = (
    df.groupby(["manufacturer_name", "year"], as_index=False)
    .agg(
        mean_fuel_consumption=("fuel_consumption_per_100km", "mean"),
        mean_electric_range=("electric_range_km", "mean"),
        vehicle_count=("vehicle_id", "nunique"),
    )
    .sort_values(["mean_fuel_consumption", "year"], ascending=False)
    .loc[lambda summary: summary["year"].isin([2022, 2023])]
    # Finish with a clean 0..n-1 index, as the loop-based reference cell does,
    # so the two results are directly comparable.
    .reset_index(drop=True)
)

In [None]:
# Polars (lazy) version of the manufacturer / year summary.

df_pl = (
    # Explicit pandas -> polars conversion: NaN in float columns becomes a
    # polars null, which `drop_nulls` below relies on.
    pl.from_pandas(emissions_pandas, nan_to_null=True)
    .lazy()
    # `is_in` can yield null for a null country, and `filter` discards null
    # predicates; `fill_null(False)` keeps such rows, matching pandas
    # `~isin(...)`.
    .filter(~pl.col.country.is_in(["RO", "LT"]).fill_null(False))
    .drop("vehicle_family_number")
    .drop_nulls(subset=["fuel_consumption"])
    .with_columns(
        fuel_type=pl.col.fuel_type.replace("PETROL", "petrol"),
        fuel_consumption_per_100km=pl.col.fuel_consumption * 100,
    )
)

grouped = (
    df_pl.group_by("manufacturer_name", "year")
    .agg(
        mean_fuel_consumption=pl.col.fuel_consumption_per_100km.mean(),
        mean_electric_range=pl.col.electric_range_km.mean(),
        vehicle_count=pl.col.vehicle_id.n_unique(),
    )
    # Cast the distinct-vehicle count to Int64 (the integer width pandas uses
    # for `nunique`).
    .cast({"vehicle_count": pl.Int64})
    .sort(["mean_fuel_consumption", "year"], descending=True)
    .filter(pl.col.year.is_in([2022, 2023]))
    .collect()
)