## Check Uniqueness & Validity

**Objective**: Evaluate data quality by checking for uniqueness and validity of data entries.

For this activity, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Uniqueness
    - Unique IDs
    - Unique Email Addresses
    - Unique Combination

2. Check Validity
    - Validate Age Range
    - Validate Grade Scale
    - Validate Name Format

In [1]:
# Write your code from here
import pandas as pd
import re

# Function to check for uniqueness of a column
def check_unique(df, column):
    """
    Check if all values in a specified column are unique.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to check uniqueness.
    
    Returns:
    bool: True if all values are unique, False otherwise.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    return df[column].is_unique

# Function to check for unique combination of multiple columns
def check_unique_combination(df, columns):
    """
    Check if the combination of multiple columns has unique rows.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    columns (list): List of column names to check unique combination.
    
    Returns:
    bool: True if all combinations of values are unique, False otherwise.
    """
    if not all(col in df.columns for col in columns):
        raise KeyError("One or more columns not found in DataFrame.")
    
    return df.duplicated(subset=columns).sum() == 0

# Function to validate Age (age should be within 0 to 120)
def validate_age(df, column, min_age=0, max_age=120):
    """
    Check if the Age values fall within a valid range.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to check age validity.
    min_age (int): Minimum valid age (default is 0).
    max_age (int): Maximum valid age (default is 120).
    
    Returns:
    pandas.DataFrame: Rows with invalid age values.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    return df[(df[column] < min_age) | (df[column] > max_age)]

# Function to validate Grade (grade should be between 0 and 100)
def validate_grade(df, column, min_grade=0, max_grade=100):
    """
    Check if the Grade values fall within a valid range.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to check grade validity.
    min_grade (int): Minimum valid grade (default is 0).
    max_grade (int): Maximum valid grade (default is 100).
    
    Returns:
    pandas.DataFrame: Rows with invalid grade values.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    return df[(df[column] < min_grade) | (df[column] > max_grade)]

# Function to validate Name format (name should be capitalized)
def validate_name_format(df, column):
    """
    Validate if the Name follows proper format (capitalized).
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame.
    column (str): Column name to check name validity.
    
    Returns:
    pandas.DataFrame: Rows with invalid name format.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    name_regex = r'^[A-Z][a-z]* [A-Z][a-z]*$'  # First and Last name, both capitalized
    return df[~df[column].str.match(name_regex, na=False)]

# Unit tests for validation functions
import unittest

class TestDataQualityFunctions(unittest.TestCase):
    
    def setUp(self):
        """Setup for unit tests - creating a sample dataframe."""
        data = {
            'ID': [1, 2, 3, 4, 5, 5],
            'Name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'alice white', 'Charlie Brown', 'David Lee'],
            'Age': [20, 22, 19, 25, 121, None],
            'Grade': [85, 90, 78, 88, 101, None],
            'Email': ['johndoe@example.com', 'janesmith@example.com', 'bobjohnson@example.com', 
                      'alicewhite@example.com', 'charliebrown@example.com', 'invalid-email']
        }
        self.df = pd.DataFrame(data)
    
    def test_check_unique(self):
        """Test uniqueness of a single column (ID)."""
        result = check_unique(self.df, 'ID')
        self.assertFalse(result, "IDs should not be unique due to duplication.")
    
    def test_check_unique_combination(self):
        """Test uniqueness of the combination of multiple columns (ID and Email)."""
        result = check_unique_combination(self.df, ['ID', 'Email'])
        self.assertFalse(result, "Combination of ID and Email should be unique.")
    
    def test_validate_age(self):
        """Test validation of the age column."""
        result = validate_age(self.df, 'Age', 0, 120)
        self.assertEqual(len(result), 1, "Should identify 1 invalid age.")

    def test_validate_grade(self):
        """Test validation of the grade column."""
        result = validate_grade(self.df, 'Grade', 0, 100)
        self.assertEqual(len(result), 1, "Should identify 1 invalid grade.")
    
    def test_validate_name_format(self):
        """Test validation of the name format column."""
        result = validate_name_format(self.df, 'Name')
        self.assertEqual(len(result), 1, "Should identify 1 invalid name format.")

if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)

test_check_unique (__main__.TestDataQualityFunctions)
Test uniqueness of a single column (ID). ... ok
test_check_unique_combination (__main__.TestDataQualityFunctions)
Test uniqueness of the combination of multiple columns (ID and Email). ... FAIL
test_validate_age (__main__.TestDataQualityFunctions)
Test validation of the age column. ... ok
test_validate_grade (__main__.TestDataQualityFunctions)
Test validation of the grade column. ... ok
test_validate_name_format (__main__.TestDataQualityFunctions)
Test validation of the name format column. ... ok

FAIL: test_check_unique_combination (__main__.TestDataQualityFunctions)
Test uniqueness of the combination of multiple columns (ID and Email).
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_22294/1524112830.py", line 120, in test_check_unique_combination
    self.assertFalse(result, "Combination of ID and Email should be unique.")
AssertionError: True is not 