In [1]:
import pandas as pd

# Percentage table, one record per (characteristic, category) row.
# Each tuple below is: characteristic, category, Kambia, Kailahun, Pujehun, Total.
data = [
    dict(zip(
        ("Characteristic", "Category", "Kambia", "Kailahun", "Pujehun", "Total"),
        values,
    ))
    for values in (
        ("Age", "under 18", 20.3, 14.0, 9.5, 14.6),
        ("Age", "18-29", 27.1, 56.8, 76.2, 53.4),
        ("Age", "30-49", 35.1, 23.6, 13.9, 24.2),
        ("Age", "50+", 17.5, 5.6, 0.4, 7.8),
        # Add other characteristics similarly...
    )
]

# Tabular view of the records above (column order follows the key order)
df = pd.DataFrame(data)

# Per-district sample sizes, used as weights by the percentage checks
n_kambia, n_kailahun, n_pujehun = 250, 250, 250
# Overall sample size is the sum of the three districts (750)
n_total = n_kambia + n_kailahun + n_pujehun

# Error-checking functions
def check_percentage_totals(row, sample_sizes=None, tolerance=0.1):
    """Check whether a row's "Total" matches the size-weighted mean of its district percentages.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a "Total" entry plus one entry per weighted district.
    sample_sizes : mapping of str to number, optional
        District column name -> sample size used as its weight. When omitted,
        the notebook-level constants ``n_kambia``, ``n_kailahun`` and
        ``n_pujehun`` weight the "Kambia", "Kailahun" and "Pujehun" entries and
        ``n_total`` is the divisor.
    tolerance : float, optional
        Exclusive upper bound on the allowed absolute difference (default 0.1).

    Returns
    -------
    bool-like
        True when ``abs(weighted mean - row["Total"]) < tolerance``.

    Raises
    ------
    ValueError
        If explicitly supplied sample sizes do not sum to a positive number.
    """
    if sample_sizes is None:
        # Default path: the globals are looked up at call time, so edits to the
        # constants cell take effect without redefining this function.
        weights = {"Kambia": n_kambia, "Kailahun": n_kailahun, "Pujehun": n_pujehun}
        denominator = n_total
    else:
        weights = dict(sample_sizes)
        denominator = sum(weights.values())
        if denominator <= 0:
            raise ValueError("sample_sizes must sum to a positive number")

    # Plain left-to-right accumulation keeps the arithmetic identical to a
    # hand-written a*x + b*y + c*z expression.
    weighted_sum = 0
    for district, size in weights.items():
        weighted_sum += row[district] * size

    weighted_average = weighted_sum / denominator
    return abs(weighted_average - row['Total']) < tolerance

def check_category_totals(df, characteristic, columns=("Kambia", "Kailahun", "Pujehun", "Total"), tolerance=0.1):
    """Check that every percentage column sums to 100 for one characteristic.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Characteristic" column and the columns named in ``columns``.
    characteristic : str
        Value of "Characteristic" selecting the rows to add up.
    columns : sequence of str, optional
        Columns whose sums are checked; defaults to the three districts plus "Total".
    tolerance : float, optional
        Exclusive upper bound on the allowed absolute deviation from 100 (default 0.1).

    Returns
    -------
    bool-like
        True only if every selected column sum is within ``tolerance`` of 100.
        A characteristic with no matching rows sums to 0 and therefore fails.
    """
    matches = df["Characteristic"] == characteristic
    column_sums = df.loc[matches, list(columns)].sum()
    return (abs(column_sums - 100) < tolerance).all()

# Row-level check: does each row's "Total" agree with the weighted district average?
df['Percentage Check'] = df.apply(check_percentage_totals, axis=1)

# Report the outcome for every row
print("Checking percentage totals for each category:")
print(df.loc[:, ['Characteristic', 'Category', 'Percentage Check']])

# Characteristic-level check: do the categories of each characteristic add up to 100%?
characteristics = df['Characteristic'].unique()
for characteristic in characteristics:
    result = check_category_totals(df, characteristic)
    print(f"Totals check for {characteristic}: {'Pass' if result else 'Fail'}")

# Pull out the rows that failed the row-level check, if any
issues = df.loc[~df['Percentage Check']]
if issues.empty:
    print("\nNo percentage mismatches found.")
else:
    print("\nRows with percentage mismatch:")
    print(issues)


Checking percentage totals for each category:
  Characteristic  Category  Percentage Check
0            Age  under 18              True
1            Age     18-29              True
2            Age     30-49              True
3            Age       50+              True
Totals check for Age: Pass

No percentage mismatches found.


## Anomalies found in the table: sex distribution per district

In [3]:
import pandas as pd

# Male/female counts per district, one (district, male, female) tuple per row
data = [
    dict(zip(("District", "Male", "Female"), counts))
    for counts in (
        ("Kambia", 94, 157),
        ("Kailahun", 23, 192),
        ("Pujehun", 23, 229),
    )
]

# Number of participants each district is expected to report
expected_total = 250

df_sex = pd.DataFrame(data)

# Row-wise sum of the two sex columns, then compare against the expectation
df_sex["Total Participants"] = df_sex[["Male", "Female"]].sum(axis=1)
df_sex["Is Total Correct"] = df_sex["Total Participants"].eq(expected_total)

print("Sex variable validation results:")
print(df_sex)

# Districts whose counts do not add up to the expected total
errors = df_sex.loc[~df_sex["Is Total Correct"]]
if errors.empty:
    print("\nAll districts have the correct total of participants.")
else:
    print("\nDiscrepancies found in the following districts:")
    print(errors)


Sex variable validation results:
   District  Male  Female  Total Participants  Is Total Correct
0    Kambia    94     157                 251             False
1  Kailahun    23     192                 215             False
2   Pujehun    23     229                 252             False

Discrepancies found in the following districts:
   District  Male  Female  Total Participants  Is Total Correct
0    Kambia    94     157                 251             False
1  Kailahun    23     192                 215             False
2   Pujehun    23     229                 252             False


## Anomalies found in the table: marital status distribution per district

In [10]:
import pandas as pd

# Marital-status counts per district.
# The category label is spelled "Widowed" (it was previously misspelled "Windowed").
data = [
    {"District": "Kambia", "Single": 54, "Married": 180, "Widowed": 15, "Divorced": 0, "Separated": 2},
    {"District": "Kailahun", "Single": 55, "Married": 181, "Widowed": 11, "Divorced": 1, "Separated": 2},
    {"District": "Pujehun", "Single": 94, "Married": 145, "Widowed": 9, "Divorced": 1, "Separated": 3},
]

# Convert data to a DataFrame
df_marital = pd.DataFrame(data)

# Total participants per district: row-wise sum over every marital-status column
status_columns = ["Single", "Married", "Widowed", "Divorced", "Separated"]
df_marital["Total Participants"] = df_marital[status_columns].sum(axis=1)

# Number of participants each district is expected to report
expected_total = 250

# Flag districts whose counts add up to the expected total
df_marital["Is Total Correct"] = df_marital["Total Participants"] == expected_total

# Print results
print("Marital Status variable validation results:")
print(df_marital)

# Highlight districts whose counts do not add up
errors = df_marital[~df_marital["Is Total Correct"]]
if not errors.empty:
    print("\nDiscrepancies found in the following districts:")
    print(errors)
else:
    print("\nAll districts have the correct total of participants.")


Marital Status variable validation results:
   District  Single  Married  Windowed  Divorced  Separated  \
0    Kambia      54      180        15         0          2   
1  Kailahun      55      181        11         1          2   
2   Pujehun      94      145         9         1          3   

   Total Participants  Is Total Correct  
0                 251             False  
1                 250              True  
2                 252             False  

Discrepancies found in the following districts:
  District  Single  Married  Windowed  Divorced  Separated  \
0   Kambia      54      180        15         0          2   
2  Pujehun      94      145         9         1          3   

   Total Participants  Is Total Correct  
0                 251             False  
2                 252             False  


## Anomalies found in the table: facility ownership distribution per district

In [12]:
import pandas as pd

# Facility-ownership counts per district, one (district, public, private, unknown) tuple per row
data = [
    dict(zip(("District", "Public", "Private", "Unknown"), counts))
    for counts in (
        ("Kambia", 242, 3, 6),
        ("Kailahun", 221, 3, 27),
        ("Pujehun", 212, 1, 39),
    )
]

# Number of participants each district is expected to report
expected_total = 250

df_facility = pd.DataFrame(data)

# Row-wise sum of the ownership columns, then compare against the expectation
df_facility["Total Participants"] = df_facility[["Public", "Private", "Unknown"]].sum(axis=1)
df_facility["Is Total Correct"] = df_facility["Total Participants"].eq(expected_total)

print("Facility Ownership variable validation results:")
print(df_facility)

# Districts whose counts do not add up to the expected total
errors = df_facility.loc[~df_facility["Is Total Correct"]]
if errors.empty:
    print("\nAll districts have the correct total of participants.")
else:
    print("\nDiscrepancies found in the following districts:")
    print(errors)


Facility Ownership variable validation results:
   District  Public  Private  Unknown  Total Participants  Is Total Correct
0    Kambia     242        3        6                 251             False
1  Kailahun     221        3       27                 251             False
2   Pujehun     212        1       39                 252             False

Discrepancies found in the following districts:
   District  Public  Private  Unknown  Total Participants  Is Total Correct
0    Kambia     242        3        6                 251             False
1  Kailahun     221        3       27                 251             False
2   Pujehun     212        1       39                 252             False


## Anomalies found in the table: education level distribution per district

In [16]:
import pandas as pd

# Education-level counts per district.
# Tuple layout: district, None, Primary, Secondary, Tertiary, Other.
data = [
    dict(zip(("District", "None", "Primary", "Secondary", "Tertiary", "Other"), counts))
    for counts in (
        ("Kambia", 117, 55, 56, 6, 17),
        ("Kailahun", 91, 51, 98, 9, 1),
        ("Pujehun", 75, 40, 134, 0, 3),
    )
]

# Number of participants each district is expected to report
expected_total = 250

df_education = pd.DataFrame(data)

# Row-wise sum of the education-level columns, then compare against the expectation
df_education["Total Participants"] = df_education[
    ["None", "Primary", "Secondary", "Tertiary", "Other"]
].sum(axis=1)
df_education["Is Total Correct"] = df_education["Total Participants"].eq(expected_total)

print("Education Level variable validation results:")
print(df_education)

# Districts whose counts do not add up to the expected total
errors = df_education.loc[~df_education["Is Total Correct"]]
if errors.empty:
    print("\nAll districts have the correct total of participants.")
else:
    print("\nDiscrepancies found in the following districts:")
    print(errors)


Education Level variable validation results:
   District  None  Primary  Secondary  Tertiary  Other  Total Participants  \
0    Kambia   117       55         56         6     17                 251   
1  Kailahun    91       51         98         9      1                 250   
2   Pujehun    75       40        134         0      3                 252   

   Is Total Correct  
0             False  
1              True  
2             False  

Discrepancies found in the following districts:
  District  None  Primary  Secondary  Tertiary  Other  Total Participants  \
0   Kambia   117       55         56         6     17                 251   
2  Pujehun    75       40        134         0      3                 252   

   Is Total Correct  
0             False  
2             False  


## Anomalies found in the table: health facility utilization distribution per district

In [18]:
import pandas as pd

# Health-facility utilization counts per district, one (district, yes, no) tuple per row
data = [
    dict(zip(("District", "Yes", "No"), counts))
    for counts in (
        ("Kambia", 245, 6),
        ("Kailahun", 224, 26),
        ("Pujehun", 213, 39),
    )
]

# Number of participants each district is expected to report
expected_total = 250

df_utilization = pd.DataFrame(data)

# Row-wise sum of the Yes/No columns, then compare against the expectation
df_utilization["Total Participants"] = df_utilization[["Yes", "No"]].sum(axis=1)
df_utilization["Is Total Correct"] = df_utilization["Total Participants"].eq(expected_total)

print("Health Facility Utilization variable validation results:")
print(df_utilization)

# Districts whose counts do not add up to the expected total
errors = df_utilization.loc[~df_utilization["Is Total Correct"]]
if errors.empty:
    print("\nAll districts have the correct total of participants.")
else:
    print("\nDiscrepancies found in the following districts:")
    print(errors)


Health Facility Utilization variable validation results:
   District  Yes  No  Total Participants  Is Total Correct
0    Kambia  245   6                 251             False
1  Kailahun  224  26                 250              True
2   Pujehun  213  39                 252             False

Discrepancies found in the following districts:
  District  Yes  No  Total Participants  Is Total Correct
0   Kambia  245   6                 251             False
2  Pujehun  213  39                 252             False
