## Check Uniqueness & Validity

**Objective**: Evaluate data quality by checking for uniqueness and validity of data entries.

For this activity, you will use a sample dataset students.csv that contains the following
columns: ID , Name , Age , Grade , Email .

**Steps**:
1. Check Uniqueness
    - Unique IDs
    - Unique Email Addresses
    - Unique Combination

2. Check Validity
    - Validate Age Range
    - Validate Grade Scale
    - Validate Name Format

In [1]:
import pandas as pd
import re
import unittest
import os

# ---------------------------
# Data Loading with Checks
# ---------------------------
def load_data(filepath: str) -> pd.DataFrame:
    """
    Loads the CSV data ensuring the file exists and required columns are present.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded DataFrame.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If required columns are missing.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    df = pd.read_csv(filepath)
    required_cols = {'ID', 'Name', 'Age', 'Grade', 'Email'}
    missing_cols = required_cols - set(df.columns)
    if missing_cols:
        raise ValueError(f"Missing columns in dataset: {missing_cols}")

    return df

# ---------------------------
# Uniqueness Checks
# ---------------------------
def check_unique_ids(df: pd.DataFrame) -> bool:
    """
    Check if all IDs are unique.

    Args:
        df (pd.DataFrame): Dataset containing 'ID' column.

    Returns:
        bool: True if all IDs are unique, else False.
    """
    return df['ID'].is_unique

def check_unique_emails(df: pd.DataFrame) -> bool:
    """
    Check if all Email addresses are unique.

    Args:
        df (pd.DataFrame): Dataset containing 'Email' column.

    Returns:
        bool: True if all emails are unique, else False.
    """
    return df['Email'].is_unique

def check_unique_combinations(df: pd.DataFrame) -> bool:
    """
    Check if combination of Name, Age, and Grade is unique for each record.

    Args:
        df (pd.DataFrame): Dataset with 'Name', 'Age', 'Grade' columns.

    Returns:
        bool: True if all combinations are unique, else False.
    """
    combination = df[['Name', 'Age', 'Grade']]
    return combination.duplicated().sum() == 0

# ---------------------------
# Validity Checks
# ---------------------------
def validate_age(age) -> bool:
    """
    Validate age is an integer between 0 and 120 (inclusive).

    Args:
        age: The age value to validate.

    Returns:
        bool: True if valid, False otherwise.
    """
    if not isinstance(age, (int, float)) or pd.isna(age):
        return False
    if isinstance(age, float) and not age.is_integer():
        return False
    age_int = int(age)
    return 0 <= age_int <= 120

def validate_grade(grade) -> bool:
    """
    Validate grade is a number between 0 and 100 (inclusive).

    Args:
        grade: The grade value to validate.

    Returns:
        bool: True if valid, False otherwise.
    """
    if not isinstance(grade, (int, float)) or pd.isna(grade):
        return False
    return 0 <= grade <= 100

def validate_name(name: str) -> bool:
    """
    Validate name contains only alphabetic characters and spaces.

    Args:
        name (str): The name string.

    Returns:
        bool: True if valid, False otherwise.
    """
    if not isinstance(name, str) or pd.isna(name):
        return False
    # Allow letters (any case) and spaces only
    pattern = r'^[A-Za-z\s]+$'
    return bool(re.fullmatch(pattern, name.strip()))

# ---------------------------
# Aggregate Validity Checks on DataFrame
# ---------------------------
def check_age_validity(df: pd.DataFrame) -> pd.Series:
    """
    Returns a boolean Series indicating valid ages per record.

    Args:
        df (pd.DataFrame): Dataset with 'Age' column.

    Returns:
        pd.Series: True for valid ages, False otherwise.
    """
    return df['Age'].apply(validate_age)

def check_grade_validity(df: pd.DataFrame) -> pd.Series:
    """
    Returns a boolean Series indicating valid grades per record.

    Args:
        df (pd.DataFrame): Dataset with 'Grade' column.

    Returns:
        pd.Series: True for valid grades, False otherwise.
    """
    return df['Grade'].apply(validate_grade)

def check_name_validity(df: pd.DataFrame) -> pd.Series:
    """
    Returns a boolean Series indicating valid names per record.

    Args:
        df (pd.DataFrame): Dataset with 'Name' column.

    Returns:
        pd.Series: True for valid names, False otherwise.
    """
    return df['Name'].apply(validate_name)

# ---------------------------
# Example Usage (Wrap in main guard if needed)
# ---------------------------
if __name__ == "__main__":
    try:
        df = load_data('students.csv')
    except (FileNotFoundError, ValueError) as e:
        print(e)
        exit(1)

    print("Unique IDs:", check_unique_ids(df))
    print("Unique Emails:", check_unique_emails(df))
    print("Unique Name-Age-Grade Combinations:", check_unique_combinations(df))

    age_valid = check_age_validity(df)
    grade_valid = check_grade_validity(df)
    name_valid = check_name_validity(df)

    print("Number of invalid ages:", (~age_valid).sum())
    print("Number of invalid grades:", (~grade_valid).sum())
    print("Number of invalid names:", (~name_valid).sum())

# ---------------------------
# Unit Tests
# ---------------------------
class TestDataQuality(unittest.TestCase):

    def setUp(self):
        self.df = pd.DataFrame({
            'ID': [1, 2, 3, 4],
            'Name': ['Alice', 'Bob', 'Charlie', 'David'],
            'Age': [25, 30, 35, 40],
            'Grade': [88, 92, 79, 85],
            'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david@example.com']
        })

    def test_unique_ids(self):
        self.assertTrue(check_unique_ids(self.df))
        df_dup = self.df.copy()
        df_dup.loc[1, 'ID'] = 1
        self.assertFalse(check_unique_ids(df_dup))

    def test_unique_emails(self):
        self.assertTrue(check_unique_emails(self.df))
        df_dup = self.df.copy()
        df_dup.loc[2, 'Email'] = 'bob@example.com'
        self.assertFalse(check_unique_emails(df_dup))

    def test_unique_combinations(self):
        self.assertTrue(check_unique_combinations(self.df))
        df_dup = self.df.copy()
        df_dup.loc[3, ['Name', 'Age', 'Grade']] = ['Alice', 25, 88]
        self.assertFalse(check_unique_combinations(df_dup))

    def test_validate_age(self):
        self.assertTrue(validate_age(0))
        self.assertTrue(validate_age(120))
        self.assertFalse(validate_age(-1))
        self.assertFalse(validate_age(121))
        self.assertFalse(validate_age('twenty'))
        self.assertFalse(validate_age(25.5))  # not integer

    def test_validate_grade(self):
        self.assertTrue(validate_grade(0))
        self.assertTrue(validate_grade(100))
        self.assertFalse(validate_grade(-1))
        self.assertFalse(validate_grade(101))
        self.assertFalse(validate_grade('A+'))

    def test_validate_name(self):
        self.assertTrue(validate_name("John Doe"))
        self.assertFalse(validate_name("John123"))
        self.assertFalse(validate_name("Jane_Doe"))
        self.assertFalse(validate_name(""))
        self.assertFalse(validate_name(None))

if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)


File not found: students.csv


NameError: name 'df' is not defined