**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [1]:
# Write your code from here
import pandas as pd

def check_nulls_for_completeness(df):
    """
    Report whether the DataFrame contains any null values.

    Parameters:
    - df: pandas DataFrame to inspect

    Returns:
    - A message string: either a confirmation that no nulls were found, or
      the total number of nulls followed by the per-column counts for the
      columns that have at least one null.
    """
    missing_per_column = df.isna().sum()
    missing_total = missing_per_column.sum()

    # Guard clause: nothing missing anywhere, so the dataset is complete.
    if missing_total == 0:
        return "The dataset is complete. No null values found."

    # Only columns that actually contain nulls are listed in the report.
    affected_columns = missing_per_column[missing_per_column > 0]
    return f"Incomplete dataset. Total missing values: {missing_total}\nMissing values by column:\n{affected_columns}"


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [2]:
# Write your code from here
import pandas as pd

def _accepts_bool(expected_type):
    """Return True if expected_type admits booleans other than through ``int``."""
    if isinstance(expected_type, tuple):
        return any(_accepts_bool(member) for member in expected_type)
    # bool subclasses int, so a plain `int` expectation must not admit True/False.
    return expected_type is not int and isinstance(True, expected_type)


def _is_valid_cell(value, expected_type, bool_ok):
    """Return True if a single cell value is missing or has the expected type."""
    # Missing values are a completeness concern, not a type concern. The
    # is_scalar guard keeps pd.isna from returning an array for list-like cells.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return True
    if isinstance(value, bool):
        return bool_ok
    return isinstance(value, expected_type)


def check_data_type_validity(df, expected_types):
    """
    Checks if each column in the DataFrame has the expected data type.

    Missing values (None, NaN, NaT, pd.NA) are ignored. Booleans are not
    accepted where ``int`` is expected, even though ``bool`` subclasses
    ``int``; list ``bool`` explicitly (or use a tuple containing it) to
    allow them. Empty columns are reported as valid.

    Parameters:
    - df: pandas DataFrame
    - expected_types: dict where keys are column names and values are expected
      types (e.g., int, float, str) or tuples of types

    Returns:
    - A dict mapping each requested column name to one of:
      "All types valid", "Invalid type count: <n>", or "Column not found".
    """
    report = {}
    for column, expected_type in expected_types.items():
        if column not in df.columns:
            report[column] = "Column not found"
            continue

        bool_ok = _accepts_bool(expected_type)
        # tolist() yields native Python scalars for numeric dtypes, so
        # isinstance checks against int/float behave as callers expect.
        invalid_count = sum(
            1
            for value in df[column].tolist()
            if not _is_valid_cell(value, expected_type, bool_ok)
        )

        if invalid_count > 0:
            report[column] = f"Invalid type count: {invalid_count}"
        else:
            report[column] = "All types valid"
    return report


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [4]:
# Write your code from here
import pandas as pd

def check_unique_identifiers(df, column_name):
    """
    Determine whether every value in one column occurs only once.

    Parameters:
    - df: pandas DataFrame
    - column_name: name of the column expected to hold unique identifiers

    Returns:
    - A tuple (is_unique, duplicate_count, duplicate_values):
      * (True, 0, None) when no value repeats;
      * (False, n, values) when values repeat, where n counts every row
        taking part in a repetition and values lists the distinct repeated
        values;
      * (False, 0, message) when the column does not exist.
    """
    if column_name not in df.columns:
        return (False, 0, f"Column '{column_name}' not found in DataFrame.")

    # keep=False flags every occurrence of a repeated value, not just the
    # second and later ones.
    repeated_mask = df[column_name].duplicated(keep=False)
    rows_involved = repeated_mask.sum()

    if rows_involved != 0:
        repeated_values = df.loc[repeated_mask, column_name].unique().tolist()
        return (False, rows_involved, repeated_values)

    return (True, 0, None)


**Task 4**: Validate Email Format Using Regex

**Description**: Validate if email addresses in a dataset have the correct format.

In [5]:
# Write your code from here
import pandas as pd
import re

def validate_email_format(df, column_name):
    """
    Validates email format in the specified column using regex.

    Each value is converted with ``str()`` and must match the pattern in
    full. Missing values therefore count as invalid, because their string
    form ("nan", "None") is not an email address.

    Parameters:
    - df: pandas DataFrame
    - column_name: name of the column with email addresses

    Returns:
    - A DataFrame containing the rows whose email address is invalid
      (original index preserved).

    Raises:
    - ValueError: if column_name is not a column of df.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    # Simple email regex pattern (can be adjusted for stricter validation).
    # '+' is allowed in the local part so plus-addressed emails
    # (user+tag@example.com) are not reported as invalid.
    email_pattern = re.compile(r'[\w.+-]+@[\w.-]+\.\w{2,}')

    # fullmatch is used instead of '^...$' because '$' also matches just
    # before a trailing newline, which would let "a@b.com\n" pass.
    # The explicit bool dtype keeps the mask usable when df is empty.
    is_invalid = pd.Series(
        [email_pattern.fullmatch(str(value)) is None
         for value in df[column_name].tolist()],
        index=df.index,
        dtype=bool,
    )

    return df[is_invalid]


**Task 5**: Check for Logical Age Validity

**Description**: Ensure ages are within a reasonable human range (e.g., 0-120).

In [None]:
# Write your code from here

**Task 6**: Identify and Handle Missing Data

**Description**: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [None]:
# Write your code from here

**Task 7**: Detect Duplicates

**Description**: Detect duplicate rows in the dataset.

In [None]:
# Write your code from here

**Task 8**: Validate Correctness of Numerical Values

**Description**: Ensure numerical columns are within a specified range.

In [None]:
# Write your code from here

**Task 9**: Custom Completeness Rule Violation Report

**Description**: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [None]:
# Write your code from here

**Task 10**: Advanced Regex for Data Validity Check

**Description**: Check for validity with advanced regex patterns, such as validating complex fields with multi-level rules.

In [None]:
# Write your code from here