#### Loading the DataFrame dictionary and setting other variables

In [None]:
# Lookup table read from CSV; get_indicator_name reads its 'Indicator Code',
# 'Indicator Code Snake Case' and 'Indicator Name' columns.
df_dictionary = pd.read_csv('../data/utils/DF_DICTIONARY.csv')

# (start_year, end_year) pairs, one per period.
years_of_military_dictatorship = [
    (1930, 1932),
    (1943, 1946),
    (1955, 1958),
    (1962, 1963),
    (1966, 1973),
    (1976, 1983),
]

# Setting initial plot styles.
# sns.set_style() only applies rc keys that belong to the seaborn style
# definition, so "figure.figsize" was silently ignored there. sns.set_theme()
# forwards rc to matplotlib's rcParams, so the default figure size takes effect.
# NOTE: set_theme() also applies seaborn's default context and palette.
sns.set_theme(
    style="whitegrid",
    rc={
        "figure.figsize": (8, 6),
    },
)

## DataFrame Dictionary Functions

In [1]:
def get_indicator_name(indicator_code, indicator_code_in_snake_case=False):
    """Look up the name of an indicator in the module-level ``df_dictionary``.

    Args:
        indicator_code: Code to search for.
        indicator_code_in_snake_case: When true, ``indicator_code`` is matched
            against the 'Indicator Code Snake Case' column; otherwise it is
            matched against the 'Indicator Code' column.

    Returns:
        The 'Indicator Name' value of the first matching row, or an empty
        string when no row matches.
    """
    # Both lookups are identical except for the column the code is matched on.
    code_column = (
        'Indicator Code Snake Case' if indicator_code_in_snake_case else 'Indicator Code'
    )
    matches = df_dictionary.loc[df_dictionary[code_column] == indicator_code, 'Indicator Name']
    return '' if matches.empty else matches.iloc[0]

# Getting DataFrame columns with missing values

In [None]:
def get_columns_with_missing_values(data):
    """Return the names of the columns of ``data`` that contain missing values.

    Args:
        data: DataFrame to inspect.

    Returns:
        A list of column labels, ordered from most to fewest missing values.
        Columns without any missing value are left out.
    """
    # Rank every column by its missing-value count first, then drop the
    # columns whose count is zero.
    ranked_counts = data.isna().sum().sort_values(ascending=False)
    has_missing = ranked_counts.gt(0)
    return ranked_counts.loc[has_missing].index.to_list()

## DataFrame filtering by missing values count

In [None]:
def filter_by_cols_first(df, percentage):
    """Drop sparse columns, then drop sparse rows from what remains.

    Args:
        df: DataFrame to filter.
        percentage: Factor multiplied by the row count (and afterwards by the
            remaining column count) and truncated with ``int`` to obtain the
            minimum number of non-missing values required to keep a
            column/row. Presumably a fraction between 0 and 1 - TODO confirm.

    Returns:
        The filtered DataFrame.
    """
    total_rows = df.shape[0]
    min_values_per_column = int(total_rows * percentage)
    kept_columns = df.dropna(axis=1, thresh=min_values_per_column)

    # The row threshold is based on the columns that survived the first pass.
    remaining_columns = kept_columns.shape[1]
    min_values_per_row = int(remaining_columns * percentage)
    return kept_columns.dropna(axis=0, thresh=min_values_per_row)

In [None]:
def filter_by_rows_first(df, percentage):
    """Drop sparse rows, then drop sparse columns from what remains.

    Args:
        df: DataFrame to filter.
        percentage: Factor multiplied by the column count (and afterwards by
            the remaining row count) and truncated with ``int`` to obtain the
            minimum number of non-missing values required to keep a
            row/column. Presumably a fraction between 0 and 1 - TODO confirm.

    Returns:
        The filtered DataFrame.
    """
    total_columns = df.shape[1]
    min_values_per_row = int(total_columns * percentage)
    kept_rows = df.dropna(axis=0, thresh=min_values_per_row)

    # The column threshold is based on the rows that survived the first pass.
    remaining_rows = kept_rows.shape[0]
    min_values_per_column = int(remaining_rows * percentage)
    return kept_rows.dropna(axis=1, thresh=min_values_per_column)


In [None]:
#TODO: Check this alternative function
# Calculate the percentage of non-missing values for each column
# threshold = 0.7  # Keep indicators with at least 70% non-missing data
# non_missing_percentage = data.notnull().mean()

# # Filter indicators
# useful_indicators = non_missing_percentage[non_missing_percentage >= threshold].index
# filtered_data = data[useful_indicators]

In [None]:
def add_indicators_of_year(df, year):
    """Append the rows of the global ``arg_di_df`` for ``year`` to ``df``.

    Args:
        df: DataFrame to extend. It is not modified in place.
        year: Value matched against the "Year" column of ``arg_di_df``.

    Returns:
        A new DataFrame holding the rows of ``df`` plus the matching rows of
        ``arg_di_df``, sorted by index. The original built this frame but
        never returned it, so callers always received ``None``.
    """
    new_year_indicator = arg_di_df[arg_di_df["Year"] == year]
    # NOTE(review): the result of this call is discarded; presumably a
    # leftover from interactive exploration - confirm and remove.
    new_year_indicator.missing.missing_case_summary()
    return pd.concat([df, new_year_indicator]).sort_index()

## Plotting functions

In [1]:
def plot_missing_vs_variable(data, variable, variable_na, type_of_plot="displot", kind="kde"):
    """Plot ``variable`` split by the values of ``variable_na``.

    Args:
        data: Dataset handed to seaborn.
        variable: Column whose values are plotted.
        variable_na: Column used as hue (displot) or as x axis (boxenplot).
        type_of_plot: "displot" or "boxenplot"; any other value draws nothing.
        kind: Forwarded to ``sns.displot``; unused for "boxenplot".
    """
    if type_of_plot == "boxenplot":
        sns.boxenplot(data=data, x=variable_na, y=variable)
        return

    if type_of_plot == "displot":
        sns.displot(data=data, x=variable, hue=variable_na, kind=kind)

In [2]:
def plot_2_missing_vs_variable(data, variable, variable_1_na, variable_2_na, type_of_plot="displot"):
    """Draw a grid of distribution plots of ``variable``.

    Args:
        data: Dataset handed to seaborn.
        variable: Column plotted on the x axis.
        variable_1_na: Column that defines the grid columns.
        variable_2_na: Column that defines the grid rows.
        type_of_plot: Only "displot" is handled; any other value draws nothing.
    """
    if type_of_plot != "displot":
        return

    sns.displot(data=data, x=variable, col=variable_1_na, row=variable_2_na)
        
def plot_2_missing(data, variable_1, variable_2, type_of_plot="scatterplot"):
    """Scatter two variables, highlighting rows where either one is missing.

    Missing values of the two plotted columns are replaced by the column
    minimum so that those rows still appear in the plot.

    Args:
        data: DataFrame holding the value columns and the columns named by
            ``variable_1`` and ``variable_2``.
        variable_1: Name of a column whose value "Missing" flags a missing
            entry. The column plotted on the x axis is this name without its
            trailing "_NA".
        variable_2: Same as ``variable_1``, for the y axis.
        type_of_plot: Only "scatterplot" is handled; any other value draws
            nothing.
    """
    if type_of_plot != "scatterplot":
        return

    def _strip_na_suffix(column_name):
        # Remove only a trailing "_NA". The previous split("_NA")[0] cut the
        # name at the FIRST "_NA", mangling names such as "GDP_NATIONAL_NA".
        suffix = "_NA"
        if column_name.endswith(suffix):
            return column_name[:-len(suffix)]
        return column_name

    x_column = _strip_na_suffix(variable_1)
    y_column = _strip_na_suffix(variable_2)
    value_columns = (x_column, y_column)

    data_with_na_filled = data.apply(
        axis="rows",
        func=lambda column: column.fillna(column.min()) if column.name in value_columns else column,
    ).assign(
        nullity=lambda df: (df[variable_1] == "Missing") | (df[variable_2] == "Missing")
    )

    sns.scatterplot(data=data_with_na_filled, x=x_column, y=y_column, hue="nullity")

# Data imputation

In [None]:
def imputation_of_missing_variable(data, variable):
    """Plot the distribution of ``variable`` after mean imputation.

    Only ``variable`` and its companion column ``variable + "_NA"`` are used.
    Missing values of ``variable`` are replaced by the column mean, and the
    result is drawn with ``sns.displot`` using the companion column as hue.

    Args:
        data: DataFrame containing both columns.
        variable: Name of the column to impute and plot.
    """
    shadow_column = variable + "_NA"

    def _fill_with_mean(column):
        # Only the value column is imputed; the companion column passes through.
        if column.name == variable:
            return column.fillna(column.mean())
        return column

    imputed = data[[variable, shadow_column]].apply(_fill_with_mean, axis="rows")
    sns.displot(data=imputed, x=variable, hue=shadow_column)

## Old functions

#### Getting the percentage of variables with a minimum percentage of missing values

In [None]:
def percentage_of_variables_with_pct_missing(df: pd.DataFrame, percentage: float, greater_than_pct = True):
    """Return the share of variables whose 'pct_missing' passes a threshold.

    Args:
        df: DataFrame exposing the ``missing`` accessor.
        percentage: Threshold compared against the 'pct_missing' column of
            ``df.missing.missing_variable_summary()``.
        greater_than_pct: When true, count variables with
            ``pct_missing >= percentage``; otherwise those with
            ``pct_missing <= percentage``.

    Returns:
        Matching variables divided by the number of rows in the summary,
        rounded to two decimals.
    """
    summary = df.missing.missing_variable_summary()
    pct_missing = summary['pct_missing']
    selected = pct_missing >= percentage if greater_than_pct else pct_missing <= percentage
    matching_variables = summary[selected]['variable'].count()
    return round(matching_variables / summary.shape[0], 2)

#### Getting the column names with a maximum percentage of missing values

In [None]:
def columns_with_pct_missing(df: pd.DataFrame, percentage: float, greater_than_pct = True):
    """List the variables whose 'pct_missing' passes a threshold.

    Args:
        df: DataFrame exposing the ``missing`` accessor.
        percentage: Threshold compared against the 'pct_missing' column of
            ``df.missing.missing_variable_summary()``.
        greater_than_pct: When true, keep variables with
            ``pct_missing >= percentage``; otherwise those with
            ``pct_missing <= percentage``.

    Returns:
        A list with the 'variable' value of every selected summary row.
    """
    summary = df.missing.missing_variable_summary()
    pct_missing = summary['pct_missing']
    if greater_than_pct:
        selected = pct_missing >= percentage
    else:
        selected = pct_missing <= percentage
    return list(summary[selected]['variable'])

### Getting the rows with more missing values

In [None]:
def cases_with_pct_missing(df: pd.DataFrame, percentage: float, greater_than_pct = True):
    """Return the rows of ``df`` whose 'pct_missing' passes a threshold.

    Args:
        df: DataFrame exposing the ``missing`` accessor.
        percentage: Threshold compared against the 'pct_missing' column of
            ``df.missing.missing_case_summary()``.
        greater_than_pct: When true, select cases with
            ``pct_missing > percentage``; otherwise those with
            ``pct_missing < percentage`` (both comparisons are strict).

    Returns:
        The selected rows, taken from ``df`` by position using the 'case'
        values of the summary.
    """
    case_summary = df.missing.missing_case_summary()
    pct_missing = case_summary['pct_missing']
    selected = pct_missing > percentage if greater_than_pct else pct_missing < percentage
    indexes_of_cases = list(case_summary[selected]['case'])
    # Index into the frame that was summarised. The original read the global
    # ``arg_di_df`` here, which returned unrelated rows for any other input.
    return df.iloc[indexes_of_cases]