In [1]:
import pandas as pd
import re

# --------- Validation Helper ---------
def validate_dataframe(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    if df.empty or df.isnull().all().all():
        return False
    return True

# --------- Completeness Metric ---------
def calculate_completeness(df):
    if not validate_dataframe(df):
        return 0.0
    total_cells = df.size
    non_missing = df.notnull().sum().sum()
    return round(non_missing / total_cells, 2)

# --------- Accuracy Metric ---------
def calculate_accuracy(df):
    if not validate_dataframe(df):
        return 0.0
    if 'email' not in df.columns:
        return 0.0

    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    valid_emails = df['email'].dropna().apply(lambda x: bool(re.match(pattern, str(x))))
    return round(valid_emails.sum() / len(valid_emails), 2) if len(valid_emails) > 0 else 0.0

# --------- Consistency Metric ---------
def calculate_consistency(df):
    if not validate_dataframe(df):
        return 0.0
    if 'grade' not in df.columns:
        return 0.0

    valid_grades = {'A', 'B', 'C', 'D', 'E', 'F'}
    consistent_grades = df['grade'].dropna().apply(lambda x: x in valid_grades)
    return round(consistent_grades.sum() / len(consistent_grades), 2) if len(consistent_grades) > 0 else 0.0

# --------- Combined DQI Score ---------
def calculate_dqi(df):
    completeness = calculate_completeness(df)
    accuracy = calculate_accuracy(df)
    consistency = calculate_consistency(df)
    dqi_score = round((completeness + accuracy + consistency) / 3, 2)
    return {
        "completeness": completeness,
        "accuracy": accuracy,
        "consistency": consistency,
        "dqi_score": dqi_score
    }

# --------- Sample Usage ---------
df = pd.read_csv('data_quality_sample.csv')
score = calculate_dqi(df)
print(score)

EmptyDataError: No columns to parse from file

In [2]:
import pandas as pd
import re
import unittest

# --------- Validation Helper ---------
def validate_dataframe(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    if df.empty or df.isnull().all().all():
        return False
    return True

# --------- Completeness Metric ---------
def calculate_completeness(df):
    if not validate_dataframe(df):
        return 0.0
    total_cells = df.size
    non_missing = df.notnull().sum().sum()
    return round(non_missing / total_cells, 2)

# --------- Accuracy Metric ---------
def calculate_accuracy(df):
    if not validate_dataframe(df):
        return 0.0
    if 'email' not in df.columns:
        return 0.0
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    valid_emails = df['email'].dropna().apply(lambda x: bool(re.match(pattern, str(x))))
    return round(valid_emails.sum() / len(valid_emails), 2) if len(valid_emails) > 0 else 0.0

# --------- Consistency Metric ---------
def calculate_consistency(df):
    if not validate_dataframe(df):
        return 0.0
    if 'grade' not in df.columns:
        return 0.0
    valid_grades = {'A', 'B', 'C', 'D', 'E', 'F'}
    consistent_grades = df['grade'].dropna().apply(lambda x: x in valid_grades)
    return round(consistent_grades.sum() / len(consistent_grades), 2) if len(consistent_grades) > 0 else 0.0

# --------- Combined DQI Score ---------
def calculate_dqi(df):
    completeness = calculate_completeness(df)
    accuracy = calculate_accuracy(df)
    consistency = calculate_consistency(df)
    dqi_score = round((completeness + accuracy + consistency) / 3, 2)
    return {
        "completeness": completeness,
        "accuracy": accuracy,
        "consistency": consistency,
        "dqi_score": dqi_score
    }

# --------- Sample Usage ---------
df = pd.DataFrame({
    'name': ['Alice', 'Bob', None],
    'email': ['alice@example.com', 'bob[at]email.com', None],
    'grade': ['A', 'Z', 'B']
})

print("DQI Metrics:", calculate_dqi(df))

# ----------------------------------
# Unit Tests
# ----------------------------------

class TestDataQualityFunctions(unittest.TestCase):

    def test_completeness_all_nan(self):
        df = pd.DataFrame({'A': [None, None], 'B': [None, None]})
        self.assertEqual(calculate_completeness(df), 0.0)

    def test_completeness_partial(self):
        df = pd.DataFrame({'A': [1, None], 'B': [None, 2]})
        self.assertEqual(calculate_completeness(df), 0.5)

    def test_accuracy_valid_and_invalid(self):
        df = pd.DataFrame({'email': ['good@email.com', 'bademail', None]})
        self.assertEqual(calculate_accuracy(df), 0.5)

    def test_accuracy_missing_column(self):
        df = pd.DataFrame({'name': ['Alice']})
        self.assertEqual(calculate_accuracy(df), 0.0)

    def test_consistency_valid_and_invalid(self):
        df = pd.DataFrame({'grade': ['A', 'Z', 'C', None]})
        self.assertEqual(calculate_consistency(df), 0.5)

    def test_consistency_missing_column(self):
        df = pd.DataFrame({'name': ['Alice']})
        self.assertEqual(calculate_consistency(df), 0.0)

    def test_dqi_combined(self):
        df = pd.DataFrame({
            'name': ['Alice', 'Bob'],
            'email': ['a@b.com', 'wrong'],
            'grade': ['A', 'X']
        })
        result = calculate_dqi(df)
        self.assertIn('dqi_score', result)
        self.assertTrue(0.0 <= result['dqi_score'] <= 1.0)

# Run the tests if this notebook is executed as main
unittest.main(argv=[''], exit=False)

.....F.
FAIL: test_consistency_valid_and_invalid (__main__.TestDataQualityFunctions)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_11700/2605225692.py", line 87, in test_consistency_valid_and_invalid
    self.assertEqual(calculate_consistency(df), 0.5)
AssertionError: np.float64(0.67) != 0.5

----------------------------------------------------------------------
Ran 7 tests in 0.011s

FAILED (failures=1)


DQI Metrics: {'completeness': np.float64(0.78), 'accuracy': np.float64(0.5), 'consistency': np.float64(0.67), 'dqi_score': np.float64(0.65)}


<unittest.main.TestProgram at 0x77fc53c1fc70>

In [3]:
import pandas as pd

data = {
    "id": [1,2,3,4,5,6,7,8,9,10],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack"],
    "age": [25, None, 30, 27, None, 22, 29, 35, 28, 31],
    "email": ["alice@example.com", "bob[at]example.com", None, "david@example.com", "eve@example.com",
              "frank@example.com", "grace@example.com", "hank@example.com", "ivy@example.com", "jack@example.com"],
    "gender": ["F", "M", "M", "M", "F", "M", "F", "M", "F", "M"],
    "grade": ["A", "B", "C", "A", "B", "D", "E", "F", "G", "H"]
}

df = pd.DataFrame(data)
df.to_csv("data_quality_sample.csv", index=False)