In [1]:
import pandas as pd
import numpy as np
import unittest

# ------------- Generate synthetic healthcare dataset --------------

def generate_healthcare_data(n=500):
    np.random.seed(42)
    data = {
        'patient_id': np.arange(n),
        'age': np.random.randint(0, 100, size=n),
        'blood_pressure': np.random.normal(120, 15, size=n),
        'heart_rate': np.random.normal(70, 10, size=n),
        'diagnosis_code': np.random.choice(['A01', 'B02', 'C03', 'D04', None], size=n, p=[0.25, 0.25, 0.25, 0.20, 0.05]),
        'prediction_score': np.random.uniform(0, 1, size=n)
    }
    # Introduce some errors
    data['age'][np.random.choice(n, 5)] = -5  # Invalid ages
    data['blood_pressure'][np.random.choice(n, 5)] = 300  # Unrealistic BP
    data['heart_rate'][np.random.choice(n, 5)] = 0  # Invalid HR
    return pd.DataFrame(data)

# -------------------- Validation Rules ---------------------

def validate_age(df: pd.DataFrame) -> bool:
    try:
        invalid_ages = df['age'][(df['age'] < 0) | (df['age'] > 120)]
        print(f"Invalid ages found: {len(invalid_ages)}")
        return len(invalid_ages) == 0
    except Exception as e:
        print(f"Error validating age: {e}")
        return False

def validate_blood_pressure(df: pd.DataFrame) -> bool:
    try:
        invalid_bp = df['blood_pressure'][(df['blood_pressure'] < 40) | (df['blood_pressure'] > 250)]
        print(f"Invalid blood pressure readings found: {len(invalid_bp)}")
        return len(invalid_bp) == 0
    except Exception as e:
        print(f"Error validating blood pressure: {e}")
        return False

def validate_heart_rate(df: pd.DataFrame) -> bool:
    try:
        invalid_hr = df['heart_rate'][(df['heart_rate'] < 30) | (df['heart_rate'] > 220)]
        print(f"Invalid heart rate readings found: {len(invalid_hr)}")
        return len(invalid_hr) == 0
    except Exception as e:
        print(f"Error validating heart rate: {e}")
        return False

def validate_diagnosis_code(df: pd.DataFrame, valid_codes=None) -> bool:
    if valid_codes is None:
        valid_codes = {'A01', 'B02', 'C03', 'D04'}
    try:
        invalid_codes = df[~df['diagnosis_code'].isin(valid_codes) & df['diagnosis_code'].notnull()]
        print(f"Invalid diagnosis codes found: {len(invalid_codes)}")
        return len(invalid_codes) == 0
    except Exception as e:
        print(f"Error validating diagnosis codes: {e}")
        return False

def validate_prediction_score(df: pd.DataFrame) -> bool:
    try:
        invalid_scores = df[(df['prediction_score'] < 0) | (df['prediction_score'] > 1)]
        print(f"Invalid prediction scores found: {len(invalid_scores)}")
        return len(invalid_scores) == 0
    except Exception as e:
        print(f"Error validating prediction scores: {e}")
        return False

# ----------------------- Unit Tests ---------------------------

class TestHealthcareDataValidation(unittest.TestCase):

    def setUp(self):
        self.df = generate_healthcare_data()

    def test_validate_age(self):
        self.assertIsInstance(validate_age(self.df), bool)

    def test_validate_blood_pressure(self):
        self.assertIsInstance(validate_blood_pressure(self.df), bool)

    def test_validate_heart_rate(self):
        self.assertIsInstance(validate_heart_rate(self.df), bool)

    def test_validate_diagnosis_code(self):
        self.assertIsInstance(validate_diagnosis_code(self.df), bool)

    def test_validate_prediction_score(self):
        self.assertIsInstance(validate_prediction_score(self.df), bool)

# ----------------------- Main Execution -------------------------

if __name__ == "__main__":
    print("---- Healthcare Dataset Validation ----")

    df = generate_healthcare_data()

    print("\n1️⃣ Age Validation:")
    age_valid = validate_age(df)

    print("\n2️⃣ Blood Pressure Validation:")
    bp_valid = validate_blood_pressure(df)

    print("\n3️⃣ Heart Rate Validation:")
    hr_valid = validate_heart_rate(df)

    print("\n4️⃣ Diagnosis Code Validation:")
    diag_valid = validate_diagnosis_code(df)

    print("\n5️⃣ Prediction Score Validation:")
    pred_valid = validate_prediction_score(df)

    if all([age_valid, bp_valid, hr_valid, diag_valid, pred_valid]):
        print("\n✅ All validation checks passed.")
    else:
        print("\n❌ Some validation checks failed. Investigate the data errors.")

    print("\n---- Running Unit Tests ----")
    unittest.main(argv=['first-arg-is-ignored'], exit=False)


.....
----------------------------------------------------------------------
Ran 5 tests in 0.011s

OK


---- Healthcare Dataset Validation ----

1️⃣ Age Validation:
Invalid ages found: 5

2️⃣ Blood Pressure Validation:
Invalid blood pressure readings found: 5

3️⃣ Heart Rate Validation:
Invalid heart rate readings found: 5

4️⃣ Diagnosis Code Validation:
Invalid diagnosis codes found: 0

5️⃣ Prediction Score Validation:
Invalid prediction scores found: 0

❌ Some validation checks failed. Investigate the data errors.

---- Running Unit Tests ----
Invalid ages found: 5
Invalid blood pressure readings found: 5
Invalid diagnosis codes found: 0
Invalid heart rate readings found: 5
Invalid prediction scores found: 0
