In [1]:
import pandas as pd
import numpy as np
import upsetplot

@pd.api.extensions.register_dataframe_accessor("missing")
class MissingMethods:
    """DataFrame accessor, exposed as ``df.missing``, for summarising missing values.

    Every ``pct_*`` column produced by these methods is a percentage on the
    0-100 scale.
    """

    def __init__(self, pandas_obj):
        # The DataFrame the accessor was reached from (``df`` in ``df.missing``).
        self._obj = pandas_obj

    def missing_variable_summary(self) -> pd.DataFrame:
        """Count the missing values of every column.

        Returns
        -------
        pd.DataFrame
            One row per column of the wrapped frame, with the columns
            ``variable`` (column label), ``n_missing`` (number of null values),
            ``n_cases`` (number of rows) and ``pct_missing``
            (``n_missing / n_cases * 100``).
        """
        n_cases = len(self._obj)
        return (
            self._obj.isnull()
            .sum()
            .reset_index(name="n_missing")
            .rename(columns={"index": "variable"})
            .assign(
                n_cases=n_cases,
                # Scaled by 100 so it agrees with the other pct_* columns of this accessor.
                pct_missing=lambda summary: summary["n_missing"] / summary["n_cases"] * 100,
            )
        )

    def missing_case_summary(self) -> pd.DataFrame:
        """Count the missing values of every row.

        Returns
        -------
        pd.DataFrame
            Indexed like the wrapped frame, with the columns ``case`` (the row's
            index label), ``n_missing`` (number of null values in the row) and
            ``pct_missing`` (``n_missing`` over the number of columns, times 100).
        """
        is_missing = self._obj.isna()
        # Measure the column count on the untouched frame: the helper columns
        # added below must not inflate the denominator of pct_missing.
        n_variables = is_missing.shape[1]
        n_missing = is_missing.sum(axis="columns")
        return pd.DataFrame(index=self._obj.index).assign(
            case=lambda frame: frame.index,
            n_missing=n_missing,
            pct_missing=n_missing / n_variables * 100,
        )

    def missing_variable_table(self) -> pd.DataFrame:
        """Tabulate how many columns share each missing-value count.

        Returns
        -------
        pd.DataFrame
            One row per distinct missing-value count, with the columns
            ``n_missing_in_variable`` (the count), ``n_variables`` (how many
            columns have exactly that many nulls) and ``pct_variables``
            (``n_variables`` as a percentage of all columns), sorted by
            ``pct_variables`` in descending order.
        """
        return (
            self.missing_variable_summary()["n_missing"]
            .value_counts()
            # Name the axis and the counts explicitly: the labels value_counts()
            # produces on its own differ between pandas versions.
            .rename_axis("n_missing_in_variable")
            .reset_index(name="n_variables")
            .assign(
                pct_variables=lambda table: (
                    table["n_variables"] / table["n_variables"].sum() * 100
                )
            )
            .sort_values("pct_variables", ascending=False)
        )

    def missing_variable_span(self, variable: str, span_every: int) -> pd.DataFrame:
        """Summarise the missing values of one column over consecutive row spans.

        Rows are grouped by position into blocks of ``span_every`` rows; the
        last block is shorter when the row count is not a multiple of
        ``span_every``.

        Parameters
        ----------
        variable : str
            Label of the column to inspect.
        span_every : int
            Number of consecutive rows per span; must be at least 1.

        Returns
        -------
        pd.DataFrame
            One row per span, with the columns ``span_counter`` (0-based span
            number), ``n_missing``, ``n_complete``, ``pct_missing`` and
            ``pct_complete``.

        Raises
        ------
        ValueError
            If ``span_every`` is smaller than 1.
        """
        if span_every < 1:
            raise ValueError("span_every must be a positive integer")

        # Row position // span size gives the span number directly, without
        # building an array span_every times longer than the frame.
        span_counter = np.arange(self._obj.shape[0]) // span_every

        return (
            self._obj.assign(span_counter=span_counter)
            .groupby("span_counter")
            .aggregate(
                n_in_span=(variable, "size"),
                n_missing=(variable, lambda values: values.isnull().sum()),
            )
            .assign(
                n_complete=lambda spans: spans["n_in_span"] - spans["n_missing"],
                pct_missing=lambda spans: spans["n_missing"] / spans["n_in_span"] * 100,
                pct_complete=lambda spans: 100 - spans["pct_missing"],
            )
            .drop(columns=["n_in_span"])
            .reset_index()
        )

    def missing_upsetplot(self, variables: list[str] = None, **kwargs):
        """Draw an UpSet plot of the missing-value combinations across columns.

        Parameters
        ----------
        variables : list[str], optional
            Columns to include; defaults to every column of the wrapped frame.
        **kwargs
            Forwarded unchanged to ``upsetplot.plot``.

        Returns
        -------
        Whatever ``upsetplot.plot`` returns.
        """
        if variables is None:
            variables = self._obj.columns.tolist()

        # Each distinct True/False missingness pattern becomes one counted combination.
        combination_counts = self._obj.isna().value_counts(variables)
        return upsetplot.plot(combination_counts, **kwargs)

In [2]:
# Read the CO2 dataset from the CSV file in the local data directory.
data = pd.read_csv("./data/owid-co2-data.csv")

# Show the first rows as a quick check that the file parsed as expected.
data.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_cumulative_other_co2,share_global_flaring_co2,share_global_gas_co2,share_global_luc_co2,share_global_oil_co2,share_global_other_co2,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,0.121,,,,,,
1,Afghanistan,1851,AFG,3769828.0,,,,,,,...,,,,0.118,,,,,,
2,Afghanistan,1852,AFG,3787706.0,,,,,,,...,,,,0.116,,,,,,
3,Afghanistan,1853,AFG,3806634.0,,,,,,,...,,,,0.115,,,,,,
4,Afghanistan,1854,AFG,3825655.0,,,,,,,...,,,,0.114,,,,,,


In [12]:
# Wrap the loaded table in a DataFrame bound to the shorter name used by later cells.
df = pd.DataFrame(data=data)

In [25]:
# Display the frame with its columns ordered from most to fewest missing values.
df.loc[:, df.isnull().sum().sort_values(ascending=False).index]

Unnamed: 0,other_industry_co2,share_global_other_co2,other_co2_per_capita,cumulative_other_co2,share_global_cumulative_other_co2,consumption_co2_per_gdp,consumption_co2_per_capita,trade_co2_share,trade_co2,consumption_co2,...,co2,land_use_change_co2_per_capita,population,share_global_cumulative_luc_co2,cumulative_luc_co2,share_global_luc_co2,land_use_change_co2,iso_code,year,country
0,,,,,,,,,,,...,,0.781,3752993.0,0.121,2.931,0.121,2.931,AFG,1850,Afghanistan
1,,,,,,,,,,,...,,0.787,3769828.0,0.119,5.899,0.118,2.968,AFG,1851,Afghanistan
2,,,,,,,,,,,...,,0.784,3787706.0,0.118,8.867,0.116,2.968,AFG,1852,Afghanistan
3,,,,,,,,,,,...,,0.789,3806634.0,0.118,11.871,0.115,3.004,AFG,1853,Afghanistan
4,,,,,,,,,,,...,,0.785,3825655.0,0.117,14.876,0.114,3.004,AFG,1854,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46518,,,,,,0.479,0.712,9.486,0.910,10.506,...,9.596,0.643,14751101.0,0.314,2281.390,0.219,9.490,ZWE,2017,Zimbabwe
46519,,,,,,0.553,0.835,6.537,0.771,12.567,...,11.795,0.587,15052191.0,0.314,2290.220,0.211,8.830,ZWE,2018,Zimbabwe
46520,,,,,,,0.788,8.795,0.978,12.092,...,11.115,0.542,15354606.0,0.313,2298.537,0.183,8.317,ZWE,2019,Zimbabwe
46521,,,,,,,0.741,9.481,1.006,11.614,...,10.608,0.503,15669663.0,0.312,2306.415,0.194,7.878,ZWE,2020,Zimbabwe
