In [3]:
import pandas as pd
import os
import glob
import re
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
from difflib import SequenceMatcher
from collections import defaultdict


def assess_completeness(df, column_name):
    """Summarise how completely a single column of ``df`` is populated.

    Args:
        df: DataFrame holding the data to inspect.
        column_name: Label of the column to assess; it must exist in ``df``.

    Returns:
        A one-row DataFrame with the columns 'Column Name', 'Total Rows',
        'Missing Values', 'Non-Missing Values' and 'Completeness (%)'
        (rounded to two decimals). When ``df`` has no rows the percentage
        is NaN, because the ratio is undefined.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    total_rows = len(df)
    missing_values = df[column_name].isnull().sum()
    non_missing_values = total_rows - missing_values

    if total_rows:
        completeness_percentage = round(
            (non_missing_values / total_rows) * 100, 2
        )
    else:
        # 0 / 0 is undefined. Report NaN explicitly instead of relying on
        # NumPy's scalar division, which only yields NaN after emitting a
        # RuntimeWarning.
        completeness_percentage = float('nan')

    completeness_df = pd.DataFrame({
        'Column Name': [column_name],
        'Total Rows': [total_rows],
        'Missing Values': [missing_values],
        'Non-Missing Values': [non_missing_values],
        'Completeness (%)': [completeness_percentage],
    })

    return completeness_df

def read_data_quality_template(excel_file_path):
    """Load the 'Data Quality Checks' sheet of the template workbook.

    The first row of the sheet is skipped before pandas reads the header.

    Args:
        excel_file_path: Path to the Excel template file.

    Returns:
        The sheet contents as a DataFrame.
    """
    return pd.read_excel(
        excel_file_path,
        sheet_name='Data Quality Checks',
        skiprows=1,
    )

def evaluate_data_quality(data_file_path, template_file_path):
    """Run the completeness checks requested by a template on a data file.

    The template is read with ``read_data_quality_template``. Each template
    row whose 'test_completeness' value is 'yes' (ignoring case and
    surrounding whitespace) names, in 'column_names', a data column to
    assess with ``assess_completeness``. The combined result is printed and
    returned.

    Args:
        data_file_path: Path to the data file; its extension must be
            '.csv' or '.xlsx' (matched case-insensitively).
        template_file_path: Path to the Excel template workbook.

    Returns:
        A DataFrame with one row per assessed column, or an empty DataFrame
        when the data file is empty or no requested column was assessed.

    Raises:
        ValueError: If the data file extension is neither .csv nor .xlsx.
        KeyError: If the data is non-empty and the template lacks the
            'column_names' or 'test_completeness' column.
    """
    df_template = read_data_quality_template(template_file_path)

    # Compare the extension in lower case so names such as 'DATA.CSV' are
    # accepted as well.
    lowered_path = data_file_path.lower()
    if lowered_path.endswith('.csv'):
        df_data = pd.read_csv(data_file_path)
    elif lowered_path.endswith('.xlsx'):
        df_data = pd.read_excel(data_file_path)
    else:
        raise ValueError("Unsupported file format. Please use .csv or .xlsx files.")

    # Collect the one-row frames and concatenate once at the end; calling
    # pd.concat inside the loop copies the accumulated rows every iteration.
    per_column_frames = []

    if not df_data.empty:
        template_rows = zip(
            df_template['column_names'],
            df_template['test_completeness'],
        )
        for column_name, completeness_flag in template_rows:
            if str(completeness_flag).strip().lower() != 'yes':
                continue
            if column_name in df_data.columns:
                per_column_frames.append(
                    assess_completeness(df_data, column_name)
                )
            else:
                print(f"Warning: Column '{column_name}' not found in data file.")
    else:
        print("Warning: The data file is empty.")

    if per_column_frames:
        completeness_results = pd.concat(per_column_frames, ignore_index=True)
    else:
        completeness_results = pd.DataFrame()

    if completeness_results.empty:
        print("No completeness analysis results to display.")
    else:
        print("Completeness analysis results:")
        print(completeness_results)

    return completeness_results

# Example run: the data file to check and the template describing the checks.
data_file_path = 'data/appointments.csv'
template_file_path = 'data/data_quality_checks_template.xlsx'


# Assess completeness for the columns the template flags with 'yes'.
completeness_results = evaluate_data_quality(
    data_file_path=data_file_path,
    template_file_path=template_file_path,
)

# Bare trailing expression so the notebook renders the resulting DataFrame.
completeness_results


Completeness analysis results:
        Column Name  Total Rows  Missing Values  Non-Missing Values  \
0    appointment_id         500               0                 500   
1        patient_id         500               0                 500   
2       provider_id         500               0                 500   
3  appointment_date         500               0                 500   
4            reason         500               0                 500   
5           waiting         500               0                 500   
6      waiting_time         500               0                 500   

   Completeness (%)  
0             100.0  
1             100.0  
2             100.0  
3             100.0  
4             100.0  
5             100.0  
6             100.0  


Unnamed: 0,Column Name,Total Rows,Missing Values,Non-Missing Values,Completeness (%)
0,appointment_id,500,0,500,100.0
1,patient_id,500,0,500,100.0
2,provider_id,500,0,500,100.0
3,appointment_date,500,0,500,100.0
4,reason,500,0,500,100.0
5,waiting,500,0,500,100.0
6,waiting_time,500,0,500,100.0
