# **Pandas API Extension**

In [None]:
#Imports
import pandas as pd
import upsetplot

In [None]:
# Drop any "explore" attribute already attached to pd.DataFrame so the
# accessor registration below starts from a clean slate when this code is
# executed more than once in the same session.
try:
    delattr(pd.DataFrame, "explore")
except AttributeError:
    # Nothing was attached yet; there is nothing to remove.
    pass

In [None]:
@pd.api.extensions.register_dataframe_accessor("explore")
class ExploreMethods:
    """DataFrame accessor with helpers for exploring missing values.

    Registered under the name ``explore``, so the methods are reachable as
    ``df.explore.<method>()`` on any ``pandas.DataFrame``.
    """

    def __init__(self, pandas_obj):
        # The DataFrame the accessor was reached from.
        self._obj = pandas_obj

    # Methods for Exploration of Missing Values

    def number_missing(self):
        """Return the total number of missing values in the dataset."""
        return self._obj.isnull().sum().sum()

    def number_complete(self):
        """Return the total number of non-missing values in the dataset."""
        return self._obj.size - self.number_missing()

    def missing_variable_summary(self) -> pd.DataFrame:
        """Summarise missing values per variable (column).

        Returns:
            One row per column with ``variable``, ``n_missing``, ``n_cases``
            (number of rows) and ``pct_missing`` (rounded to 2 decimals).
        """
        n_missing = self._obj.isnull().sum()
        return (
            n_missing.reset_index(name="n_missing")
            .rename(columns={"index": "variable"})
            .assign(
                n_cases=len(self._obj),
                pct_missing=lambda df: (
                    df["n_missing"] / df["n_cases"] * 100
                ).round(2),
            )
        )

    def missing_case_summary(self) -> pd.DataFrame:
        """Summarise missing values per case (row).

        Returns:
            A frame sharing the original index with ``case`` (the index
            value), ``n_missing`` and ``pct_missing`` (share of the row's
            variables that are missing, in percent).
        """
        # Count on the untouched frame: the helper columns added below must
        # not leak into the number of variables used as the denominator.
        n_missing = self._obj.isnull().sum(axis="columns")
        n_variables = self._obj.shape[1]
        return n_missing.to_frame("n_missing").assign(
            case=lambda df: df.index,
            pct_missing=lambda df: df["n_missing"] / n_variables * 100,
        )[["case", "n_missing", "pct_missing"]]

    @staticmethod
    def _tabulate_n_missing(summary, missing_name, count_name, pct_name):
        """Count how often each ``n_missing`` value occurs in *summary*."""
        return (
            summary.value_counts("n_missing")
            # Name the counts explicitly: the default label of this column
            # differs between pandas versions (0 vs. "count").
            .reset_index(name=count_name)
            .rename(columns={"n_missing": missing_name})
            .assign(
                **{
                    pct_name: lambda df: (
                        df[count_name] / df[count_name].sum() * 100
                    )
                }
            )
            .sort_values(pct_name, ascending=False)
        )

    def missing_variable_table(self) -> pd.DataFrame:
        """Tabulate how many variables share each number of missing values.

        Returns:
            Columns ``n_missing_in_variable``, ``n_variables`` and
            ``pct_variables``, sorted by ``pct_variables`` descending.
        """
        return self._tabulate_n_missing(
            self.missing_variable_summary(),
            missing_name="n_missing_in_variable",
            count_name="n_variables",
            pct_name="pct_variables",
        )

    def missing_case_table(self) -> pd.DataFrame:
        """Tabulate how many cases (rows) share each number of missing values.

        Returns:
            Columns ``n_missing_in_case``, ``n_cases`` and ``pct_case``,
            sorted by ``pct_case`` descending.
        """
        return self._tabulate_n_missing(
            self.missing_case_summary(),
            missing_name="n_missing_in_case",
            count_name="n_cases",
            pct_name="pct_case",
        )

    def missing_variable_span(self, variable: str, span_every: int) -> pd.DataFrame:
        """Summarise missingness of *variable* over consecutive row spans.

        Rows are grouped positionally into spans of ``span_every`` rows (the
        last span may be shorter).

        Args:
            variable: Name of the column to inspect.
            span_every: Number of consecutive rows per span; must be >= 1.

        Returns:
            One row per span with ``span_counter``, ``n_missing``,
            ``n_complete``, ``pct_missing`` and ``pct_complete``.

        Raises:
            ValueError: If ``span_every`` is smaller than 1.
        """
        if span_every < 1:
            raise ValueError("span_every must be a positive integer")

        # Row position i belongs to span i // span_every.
        span_counter = (
            pd.Series(range(len(self._obj)), dtype="int64") // span_every
        ).to_numpy()

        return (
            self._obj.assign(span_counter=span_counter)
            .groupby("span_counter")
            .aggregate(
                n_in_span=(variable, "size"),
                n_missing=(variable, lambda s: s.isnull().sum()),
            )
            .assign(
                n_complete=lambda df: df["n_in_span"] - df["n_missing"],
                pct_missing=lambda df: df["n_missing"] / df["n_in_span"] * 100,
                pct_complete=lambda df: 100 - df["pct_missing"],
            )
            .drop(columns=["n_in_span"])
            .reset_index()
        )

    def sort_variables_by_missingness(self, ascending=False):
        """Return the dataset with its columns ordered by missing-value count.

        Args:
            ascending: If False (default), columns with the most missing
                values come first.
        """
        ordered_columns = (
            self._obj.isnull().sum().sort_values(ascending=ascending).index
        )
        return self._obj[ordered_columns]

    # Plotting methods

    def missing_variable_plot(self):
        """Draw, per variable, the proportion of missing vs. present values."""
        # NOTE(review): relies on a module-level `sns` name that the visible
        # imports do not provide -- confirm it is imported before calling.
        sns.displot(
            data=self._obj.isnull().melt(value_name="Missing"),
            y="variable",
            hue="Missing",
            multiple="fill",
            height=10,
            aspect=1,
        )

    def missing_variable_plot_2(self):
        """Draw a lollipop chart of the number of missing values per variable."""
        # NOTE(review): relies on a module-level `plt` name that the visible
        # imports do not provide -- confirm it is imported before calling.
        df = self.missing_variable_summary().sort_values(by="n_missing")
        plot_range = range(1, len(df.index) + 1)

        # Stems of the lollipops.
        plt.hlines(
            y=plot_range,
            xmin=0,
            xmax=df["n_missing"],
            color="black",
        )

        # Heads of the lollipops.
        plt.plot(
            df["n_missing"],
            plot_range,
            "o",
            color="black",
        )

        plt.yticks(plot_range, df.variable)
        plt.grid(axis="y")
        plt.xlabel("Number missing")
        plt.ylabel("Variable")

    def missing_variable_plot_matrix(self):
        """Plot the missing-values matrix of the dataset via ``msno.matrix``."""
        # NOTE(review): relies on a module-level `msno` name that the visible
        # imports do not provide -- confirm it is imported before calling.
        msno.matrix(
            self._obj,
            color=(0.3, 0.36, 0.44),
        )

    def missing_upsetplot(self, variables: list[str] = None, **kwargs):
        """Draw an UpSet plot of co-occurring missing values.

        Args:
            variables: Columns to include; defaults to every column.
            **kwargs: Forwarded unchanged to ``upsetplot.plot``.

        Returns:
            Whatever ``upsetplot.plot`` returns.
        """
        if variables is None:
            variables = self._obj.columns.tolist()

        # Counts of each True/False missingness combination across variables.
        missing_combinations = self._obj.isnull().value_counts(variables)
        return upsetplot.plot(missing_combinations, **kwargs)