In [None]:
def instance_based_na_values_stats(train_dat, output_path):
    """
    Calculates the number and percentage of NA values for each sample (row) in
    the df, creates two visualizations, and saves them to the output path.

    Two PNG files are written into output_path:
    - 'na_percentage_samples_histogram.png': number of samples per 5%-wide
      bin of missing-value percentage.
    - 'na_percentage_samples_cumulative_plot.png': samples sorted by their
      missing-value percentage against their running (cumulative) count.

    Parameters:
    - train_dat: pandas df containing the data.
    - output_path: string, directory to save the output graphs. It is created
      if it does not exist; an empty string saves into the current directory.

    Returns:
    - na_data_samples: pandas df with one row per sample and the columns
      'Sample' (the index of train_dat), 'NA_Count', 'NA_Percentage' and
      'Percentage_Bin' (the 5% interval the sample was counted in).
    """

    # Make sure the target directory exists so savefig does not fail after all
    # the work is done. os.makedirs('') raises, hence the guard.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Calculate the number and percentage of NA values for each sample
    na_counts_samples = train_dat.isna().sum(axis=1)
    na_percentages_samples = (na_counts_samples / train_dat.shape[1]) * 100

    # Create a df with the results for samples
    na_data_samples = pd.DataFrame({
        'Sample': train_dat.index,
        'NA_Count': na_counts_samples.values,
        'NA_Percentage': na_percentages_samples.values
    })

    # Create bins for percentages (0% to 100% in steps of 5%)
    bins = np.arange(0, 105, 5)

    # The bins are left-closed ([0, 5), ..., [95, 100)), so a value of exactly
    # 100 would fall outside the last bin and be dropped as NaN. Clip the
    # values used for binning to just below 100 so that fully-missing samples
    # are counted in the last ("95-100%") bin. NA_Percentage itself is left
    # untouched.
    just_below_100 = np.nextafter(100.0, 0.0)
    na_data_samples['Percentage_Bin'] = pd.cut(
        na_data_samples['NA_Percentage'].clip(upper=just_below_100),
        bins,
        right=False,
        include_lowest=True,
    )

    # Count the number of samples in each bin
    bin_counts = na_data_samples['Percentage_Bin'].value_counts().sort_index()

    # Plotting the histogram
    plt.figure(figsize=(12, 6))
    bin_labels = [f'{int(interval.left)}-{int(interval.right)}%' for interval in bin_counts.index]
    plt.bar(bin_labels, bin_counts.values, width=0.8, edgecolor='black', align='center')
    plt.xlabel('Percentage of NA Values per Sample')
    plt.ylabel('Number of Samples')
    plt.title('Frequency of Samples by Percentage of Missing Values')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.2)
    plt.tight_layout()
    # Save the histogram
    histogram_path = os.path.join(output_path, 'na_percentage_samples_histogram.png')
    plt.savefig(histogram_path)
    plt.close()

    # Plotting the cumulative count of samples ordered by missing percentage
    na_data_samples_sorted = na_data_samples.sort_values(by='NA_Percentage').reset_index(drop=True)
    cumulative_samples = np.arange(1, len(na_data_samples_sorted) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(na_data_samples_sorted['NA_Percentage'], cumulative_samples, marker='o', linestyle='-', color='c')
    plt.xlabel('Percentage of NA Values per Sample')
    # The y values are a running count (1..n), so label them as cumulative.
    plt.ylabel('Cumulative Number of Samples')
    plt.title('Distribution of Missing Values per Sample')
    plt.grid(True)
    plt.tight_layout()
    # Save the plot
    cdf_plot_path = os.path.join(output_path, 'na_percentage_samples_cumulative_plot.png')
    plt.savefig(cdf_plot_path)
    plt.close()

    return na_data_samples

In [None]:
def na_values_percentage_per_feature(train_dat):
    """
    Summarises missing values feature by feature (column by column) and shows
    the percentage of missing values per feature as a bar plot.

    Parameters:
    - train_dat: pandas df whose columns are inspected for NA values.

    Returns:
    - na_data: pandas df with one row per feature and the columns 'Feature',
      'NA_Count' and 'NA_Percentage', sorted by 'NA_Percentage' in
      descending order.
    """
    total_rows = len(train_dat)

    # Per-feature NA count, then its share of all rows expressed in percent
    missing_per_feature = train_dat.isna().sum()
    missing_share = (missing_per_feature / total_rows) * 100

    # Assemble the summary table and put the most incomplete features first
    summary = pd.DataFrame({
        'Feature': missing_per_feature.index,
        'NA_Count': missing_per_feature.values,
        'NA_Percentage': missing_share.values,
    })
    summary = summary.sort_values(by='NA_Percentage', ascending=False)

    # Bar plot of the NA percentages, one bar per feature
    plt.figure(figsize=(12, 6))
    plt.bar(summary['Feature'], summary['NA_Percentage'])
    plt.title('Percentage of Missing Values per Feature')
    plt.xlabel('Features')
    plt.ylabel('Percentage of NA Values')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

    return summary
