## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [1]:
# Ques_2.ipynb

import pandas as pd
import re
import unittest

# -- Data Quality Validation Functions --

def is_grade_valid(grade):
    """
    Check if grade is numeric and between 0 and 100 inclusive.
    Returns True if valid, else False.
    """
    if pd.isna(grade):
        return False
    try:
        val = float(grade)
        return 0 <= val <= 100
    except (ValueError, TypeError):
        return False

def is_email_valid(email):
    """
    Validate email format using regex pattern.
    Returns True if valid, else False.
    """
    if not isinstance(email, str) or not email.strip():
        return False
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    return bool(re.fullmatch(pattern, email.strip()))

def is_age_valid(age):
    """
    Validate age: must be integer and non-negative.
    Returns True if valid, else False.
    """
    if pd.isna(age):
        return False
    if isinstance(age, float) and age.is_integer():
        age = int(age)
    if not isinstance(age, int):
        return False
    return age >= 0

def check_completeness(df, mandatory_fields):
    """
    Check for rows missing any mandatory fields.
    Returns DataFrame of rows with missing data.
    """
    missing_mask = df[mandatory_fields].isnull() | (df[mandatory_fields].astype(str).apply(lambda x: x.str.strip()) == '')
    return df[missing_mask.any(axis=1)]

# -- Load Data --

df = pd.read_csv('students.csv')

# -- Apply Checks --

df['Grade_Valid'] = df['Grade'].apply(is_grade_valid)
df['Email_Valid'] = df['Email'].apply(is_email_valid)
df['Age_Valid'] = df['Age'].apply(is_age_valid)

mandatory_columns = ['ID', 'Name', 'Age', 'Grade', 'Email']
rows_missing = check_completeness(df, mandatory_columns)

# -- Display Summary --

print("Invalid Grade rows:")
print(df.loc[~df['Grade_Valid']])

print("\nInvalid Email rows:")
print(df.loc[~df['Email_Valid']])

print("\nInvalid Age rows:")
print(df.loc[~df['Age_Valid']])

print("\nRows with missing mandatory fields:")
print(rows_missing)

# -- Unit Tests --

class TestDataQuality(unittest.TestCase):

    def test_is_grade_valid(self):
        self.assertTrue(is_grade_valid(0))
        self.assertTrue(is_grade_valid(100))
        self.assertTrue(is_grade_valid(75.5))
        self.assertFalse(is_grade_valid(-1))
        self.assertFalse(is_grade_valid(150))
        self.assertFalse(is_grade_valid('abc'))
        self.assertFalse(is_grade_valid(None))

    def test_is_email_valid(self):
        self.assertTrue(is_email_valid('test@example.com'))
        self.assertTrue(is_email_valid('user.name-123@sub.domain.co'))
        self.assertFalse(is_email_valid('test@example'))
        self.assertFalse(is_email_valid(''))
        self.assertFalse(is_email_valid(None))
        self.assertFalse(is_email_valid('invalid@.com'))

    def test_is_age_valid(self):
        self.assertTrue(is_age_valid(0))
        self.assertTrue(is_age_valid(25))
        self.assertFalse(is_age_valid(-5))
        self.assertFalse(is_age_valid(25.5))
        self.assertFalse(is_age_valid('25'))
        self.assertFalse(is_age_valid(None))

if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)


FileNotFoundError: [Errno 2] No such file or directory: 'students.csv'