## Check Accuracy & Completeness

**Objective**: Learn to assess data quality by checking for accuracy and completeness using Python.

For this, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Accuracy
    - Verify Numerical Data Accuracy
    - Validate Email Format
    - Integer Accuracy Check for Age
2. Check Completeness
    - Identify Missing Values
    - Rows with Missing Data
    - Column Specific Missing Value Check

In [1]:
# Write your code from here
import pandas as pd
import re

# Function to check for missing values in a DataFrame
def check_null_values(df):
    """
    Check for null values in the DataFrame.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame to check for null values.
    
    Returns:
    pandas.DataFrame: Rows with null values.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    
    return df[df.isnull().any(axis=1)]

# Function to check numerical accuracy of a column
def check_numerical_accuracy(df, column, min_value, max_value):
    """
    Check if the values in a numerical column fall within the specified range.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to check.
    min_value (int or float): Minimum valid value for the column.
    max_value (int or float): Maximum valid value for the column.
    
    Returns:
    pandas.DataFrame: Rows where values in the column are out of the specified range.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    if df[column].dtype not in ['int64', 'float64']:
        raise TypeError(f"Column '{column}' must be numeric.")

    return df[(df[column] < min_value) | (df[column] > max_value)]

# Function to validate email format using regex
def validate_email_format(df, column):
    """
    Validate if email addresses in the specified column have the correct format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to validate.
    
    Returns:
    pandas.DataFrame: Rows with invalid email format.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    if not df[column].dtype == 'object':
        raise TypeError(f"Column '{column}' must be of type string.")
    
    email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return df[~df[column].str.match(email_regex, na=False)]

# Function to check if the age is valid
def check_age_validity(df, column, min_age=0, max_age=120):
    """
    Ensure that the age values in the column are within a reasonable human range.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to check age validity.
    min_age (int): Minimum valid age (default is 0).
    max_age (int): Maximum valid age (default is 120).
    
    Returns:
    pandas.DataFrame: Rows with invalid age values.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    return check_numerical_accuracy(df, column, min_age, max_age)

# Function to check for missing values in a specific column
def check_column_missing(df, column):
    """
    Check if a specific column has missing values.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to check for missing values.
    
    Returns:
    pandas.DataFrame: Rows where the specified column has missing values.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    return df[df[column].isnull()]

# Unit tests for validation functions
import unittest

class TestDataQualityFunctions(unittest.TestCase):
    
    def setUp(self):
        """Setup for unit tests - creating a sample dataframe."""
        data = {
            'ID': [1, 2, 3, 4, 5, 6],
            'Name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Alice White', 'Charlie Brown', 'David Lee'],
            'Age': [20, 22, 19, 25, 21, None],
            'Grade': [85, 90, 78, 88, 95, None],
            'Email': ['johndoe@example.com', 'janesmith@example.com', 'bobjohnson@example.com', 
                      'alicewhite@example.com', 'charliebrown@example.com', 'invalid-email']
        }
        self.df = pd.DataFrame(data)
    
    def test_check_null_values(self):
        """Test checking for rows with null values."""
        result = check_null_values(self.df)
        self.assertEqual(len(result), 2, "Should identify 2 rows with missing values.")
    
    def test_check_numerical_accuracy(self):
        """Test numerical accuracy for age and grade."""
        result = check_numerical_accuracy(self.df, 'Age', 0, 120)
        self.assertEqual(len(result), 1, "Should identify 1 invalid age.")

    def test_validate_email_format(self):
        """Test email validation."""
        result = validate_email_format(self.df, 'Email')
        self.assertEqual(len(result), 1, "Should identify 1 invalid email.")

    def test_check_age_validity(self):
        """Test the validity of the age column."""
        result = check_age_validity(self.df, 'Age', 0, 120)
        self.assertEqual(len(result), 1, "Should identify 1 invalid age.")
    
    def test_check_column_missing(self):
        """Test checking for missing values in specific column."""
        result = check_column_missing(self.df, 'Grade')
        self.assertEqual(len(result), 1, "Should identify 1 row with missing grade.")

if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)

test_check_age_validity (__main__.TestDataQualityFunctions)
Test the validity of the age column. ... FAIL
test_check_column_missing (__main__.TestDataQualityFunctions)
Test checking for missing values in specific column. ... ok
test_check_null_values (__main__.TestDataQualityFunctions)
Test checking for rows with null values. ... FAIL
test_check_numerical_accuracy (__main__.TestDataQualityFunctions)
Test numerical accuracy for age and grade. ... FAIL
test_validate_email_format (__main__.TestDataQualityFunctions)
Test email validation. ... ok

FAIL: test_check_age_validity (__main__.TestDataQualityFunctions)
Test the validity of the age column.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_21504/2373566044.py", line 135, in test_check_age_validity
    self.assertEqual(len(result), 1, "Should identify 1 invalid age.")
AssertionError: 0 != 1 : Should identify 1 invalid age.

FAIL: test_check_null_values (__