### Healthcare – Patient Data Accuracy

**Task 1**: Patient Record Accuracy Assessment

**Objective**: Achieve high accuracy in patient records.

**Steps**:
1. Examine a sample patient dataset for common inaccuracies.
2. Identify at least three common issues, such as medication errors or misdiagnoses.
3. Propose validation measures to ensure data accuracy at the point of entry.

In [1]:
# Write your code from here
import pandas as pd

# Sample Patient Data
data = {
    'Patient_ID': ['001', '002', '003', '004', '005', '006'],
    'Name': ['John Doe', 'Jane Doe', 'Mary Johnson', 'James Brown', 'Sarah Green', 'Michael White'],
    'Date_of_Birth': ['1980-05-01', '1992-07-15', '1970-08-22', '1995-02-28', '2003-09-10', '2000-03-15'],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
    'Medications': ['Aspirin', 'Metformin', 'Incorrect Medicine', 'None', 'Ibuprofen', '-'],
    'Diagnosis': ['Hypertension', 'Diabetes Type 2', 'Hypertension', '-', 'Asthma', 'Unspecified'],
    'Date_of_Last_Visit': ['2025-03-01', '2025-01-15', '2024-12-10', '2025-03-20', '2025-04-05', '2025-03-10'],
    'Primary_Care_Physician': ['Dr. Smith', 'Dr. Taylor', 'Dr. Davis', 'Dr. Smith', 'Dr. Taylor', 'Dr. Davis'],
    'Insurance_Details': ['ABC Health', 'XYZ Insurance', 'DEF Health', 'ABC Health', 'XYZ Insurance', 'None']
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('patient_data.csv', index=False)
print("CSV file 'patient_data.csv' has been generated successfully!")

CSV file 'patient_data.csv' has been generated successfully!


**Task 2**: Implement Healthcare Data Quality Checks

**Objective**: Maintain accurate health records within a healthcare system.

**Steps**:
1. Develop a validation workflow for patient data.
2. Use appropriate software to automate checks for common errors.

In [2]:
# Write your code from here
import pandas as pd
from datetime import datetime

# Sample Patient Data
data = {
    'Patient_ID': ['001', '002', '003', '004', '005', '006'],
    'Name': ['John Doe', 'Jane Doe', 'Mary Johnson', 'James Brown', 'Sarah Green', 'Michael White'],
    'Date_of_Birth': ['1980-05-01', '1992-07-15', '1970-08-22', '1995-02-28', '2003-09-10', '2000-03-15'],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
    'Medications': ['Aspirin', 'Metformin', 'Incorrect Medicine', 'None', 'Ibuprofen', '-'],
    'Diagnosis': ['Hypertension', 'Diabetes Type 2', 'Hypertension', '-', 'Asthma', 'Unspecified'],
    'Date_of_Last_Visit': ['2025-03-01', '2025-01-15', '2024-12-10', '2025-03-20', '2025-04-05', '2025-03-10'],
    'Primary_Care_Physician': ['Dr. Smith', 'Dr. Taylor', 'Dr. Davis', 'Dr. Smith', 'Dr. Taylor', 'Dr. Davis'],
    'Insurance_Details': ['ABC Health', 'XYZ Insurance', 'DEF Health', 'ABC Health', 'XYZ Insurance', 'None']
}

# Convert to DataFrame
df = pd.DataFrame(data)

def validate_patient_data(df):
    issues = []
    
    # Error Handling for empty DataFrame
    if df.empty:
        issues.append("Error: DataFrame is empty!")
        return issues

    # Check for missing required fields
    required_columns = ['Patient_ID', 'Name', 'Date_of_Birth', 'Gender', 'Medications', 'Diagnosis', 
                        'Date_of_Last_Visit', 'Primary_Care_Physician', 'Insurance_Details']
    for column in required_columns:
        if column not in df.columns:
            issues.append(f"Error: Missing required column '{column}'")
    
    # Medication Validation
    valid_medications = ['Aspirin', 'Metformin', 'Ibuprofen']
    for index, row in df.iterrows():
        if row['Medications'] not in valid_medications and row['Medications'] != '-':
            issues.append(f"Invalid medication for Patient ID {row['Patient_ID']}: {row['Medications']}")
    
    # Diagnosis Validation
    for index, row in df.iterrows():
        if row['Diagnosis'] in ['-', 'Unspecified', 'None']:
            issues.append(f"Missing or invalid diagnosis for Patient ID {row['Patient_ID']}: {row['Diagnosis']}")
    
    # Insurance Validation
    for index, row in df.iterrows():
        if row['Insurance_Details'] == 'None':
            issues.append(f"Missing insurance details for Patient ID {row['Patient_ID']}")
    
    # Date Validation (DOB and Visit Date)
    current_date = datetime.now()
    for index, row in df.iterrows():
        try:
            dob = pd.to_datetime(row['Date_of_Birth'])
            if dob > current_date:
                issues.append(f"Future birthdate for Patient ID {row['Patient_ID']}: {row['Date_of_Birth']}")
        except Exception as e:
            issues.append(f"Error in Date_of_Birth for Patient ID {row['Patient_ID']}: {e}")
        
        try:
            visit_date = pd.to_datetime(row['Date_of_Last_Visit'])
            if visit_date > current_date:
                issues.append(f"Future visit date for Patient ID {row['Patient_ID']}: {row['Date_of_Last_Visit']}")
        except Exception as e:
            issues.append(f"Error in Date_of_Last_Visit for Patient ID {row['Patient_ID']}: {e}")
    
    # Gender Consistency
    valid_genders = ['Male', 'Female', 'Other']
    for index, row in df.iterrows():
        if row['Gender'] not in valid_genders:
            issues.append(f"Invalid gender for Patient ID {row['Patient_ID']}: {row['Gender']}")
    
    return issues

# Validate Patient Data
validation_issues = validate_patient_data(df)
if validation_issues:
    for issue in validation_issues:
        print(issue)
else:
    print("No issues found in the patient records.")

Invalid medication for Patient ID 003: Incorrect Medicine
Invalid medication for Patient ID 004: None
Missing or invalid diagnosis for Patient ID 004: -
Missing or invalid diagnosis for Patient ID 006: Unspecified
Missing insurance details for Patient ID 006


In [3]:
import unittest
import pandas as pd
from datetime import datetime

# Assuming `validate_patient_data` function is imported from the validation module
# Test DataFrame with invalid data
test_data = {
    'Patient_ID': ['001', '002', '003'],
    'Name': ['John Doe', 'Jane Doe', 'Mary Johnson'],
    'Date_of_Birth': ['1980-05-01', '1992-07-15', '1970-08-22'],
    'Gender': ['Male', 'Female', 'Invalid'],
    'Medications': ['Aspirin', 'Metformin', 'InvalidMed'],
    'Diagnosis': ['Hypertension', '-', 'Diabetes'],
    'Date_of_Last_Visit': ['2025-03-01', '2025-05-01', '2025-03-20'],
    'Primary_Care_Physician': ['Dr. Smith', 'Dr. Taylor', 'Dr. Davis'],
    'Insurance_Details': ['ABC Health', 'None', 'DEF Health']
}

df_test = pd.DataFrame(test_data)

class TestPatientDataValidation(unittest.TestCase):
    def setUp(self):
        self.df = df_test

    def test_validate_medications(self):
        issues = validate_patient_data(self.df)
        self.assertIn("Invalid medication for Patient ID 003: InvalidMed", issues)
    
    def test_validate_gender(self):
        issues = validate_patient_data(self.df)
        self.assertIn("Invalid gender for Patient ID 003: Invalid", issues)
    
    def test_validate_diagnosis(self):
        issues = validate_patient_data(self.df)
        self.assertIn("Missing or invalid diagnosis for Patient ID 002: -", issues)

    def test_validate_future_visit(self):
        issues = validate_patient_data(self.df)
        self.assertIn("Future visit date for Patient ID 002: 2025-05-01", issues)
    
    def test_empty_dataframe(self):
        empty_df = pd.DataFrame()
        issues = validate_patient_data(empty_df)
        self.assertIn("Error: DataFrame is empty!", issues)

if __name__ == '__main__':
    unittest.main()

usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/vscode/.local/share/jupyter/runtime/kernel-v37840ad6cfc2a7c2664184bc44f23849a067fc06c.json'


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
