**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [1]:
import pandas as pd

def check_null_values(df):
    """
    Check if there are any null values in the DataFrame.

    Args:
    - df (pd.DataFrame): Input dataset.

    Returns:
    - bool: True if any null values exist, False otherwise.
    - pd.Series: Count of null values per column.
    """
    null_exists = df.isnull().values.any()
    null_counts = df.isnull().sum()
    return null_exists, null_counts

# Example usage:
data = {
    'Name': ['Alice', 'Bob', None],
    'Age': [25, None, 30],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)

has_null, null_counts = check_null_values(df)
print("Any null values?", has_null)
print("Null counts per column:\n", null_counts)


Any null values? True
Null counts per column:
 Name    1
Age     1
City    0
dtype: int64


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [2]:
import pandas as pd

def check_column_types(df, expected_types):
    """
    Checks if columns contain data of the expected types.

    Args:
    - df (pd.DataFrame): Input dataset.
    - expected_types (dict): Dictionary mapping column names to expected Python types
      e.g., {'Age': int, 'Name': str}

    Returns:
    - dict: Keys are column names, values are booleans indicating if column matches expected type.
    """
    results = {}
    for col, expected_type in expected_types.items():
        # Check if column exists
        if col not in df.columns:
            results[col] = False
            continue
        
        # Check types for each value in the column (ignoring nulls)
        col_types_valid = df[col].dropna().apply(lambda x: isinstance(x, expected_type)).all()
        results[col] = col_types_valid
    return results

# Example usage:
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 'thirty-five'],  # 'thirty-five' is invalid
    'Salary': [50000.0, 60000.0, 55000.0]
}
df = pd.DataFrame(data)

expected = {'Name': str, 'Age': int, 'Salary': float}
print(check_column_types(df, expected))
# Output might be: {'Name': True, 'Age': False, 'Salary': True}


{'Name': np.True_, 'Age': np.False_, 'Salary': np.True_}


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [3]:
import pandas as pd

def check_unique_identifiers(df, column):
    """
    Checks if the specified column contains unique identifiers.

    Args:
    - df (pd.DataFrame): Input dataset.
    - column (str): Column name to check for uniqueness.

    Returns:
    - bool: True if all values in the column are unique, False otherwise.
    - pd.Series: Counts of duplicate entries if any (empty if none).
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")
    
    duplicates = df[column][df[column].duplicated(keep=False)]
    all_unique = duplicates.empty
    
    return all_unique, duplicates

# Example usage:
data = {
    'email': ['alice@example.com', 'bob@example.com', 'alice@example.com', 'charlie@example.com'],
    'name': ['Alice', 'Bob', 'Alice', 'Charlie']
}
df = pd.DataFrame(data)

unique, duplicate_entries = check_unique_identifiers(df, 'email')
print("All unique?", unique)
print("Duplicates:\n", duplicate_entries)


All unique? False
Duplicates:
 0    alice@example.com
2    alice@example.com
Name: email, dtype: object


Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [4]:
import re
import pandas as pd

def validate_email_format(emails):
    """
    Validate email addresses using a regex pattern.

    Args:
    - emails (list or pd.Series): List or Series of email strings.

    Returns:
    - list of bool: True for valid emails, False for invalid.
    """
    pattern = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')
    return [bool(pattern.match(email)) for email in emails]

# Example usage with list:
email_list = [
    'test.email@example.com',
    'invalid-email',
    'user123@domain.co.uk',
    'user@domain',
    '@missingusername.com'
]

print(validate_email_format(email_list))
# Output: [True, False, True, False, False]

# Example usage with DataFrame:
data = {'email': email_list}
df = pd.DataFrame(data)
df['email_valid'] = validate_email_format(df['email'])
print(df)


[True, False, True, False, False]
                    email  email_valid
0  test.email@example.com         True
1           invalid-email        False
2    user123@domain.co.uk         True
3             user@domain        False
4    @missingusername.com        False


Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [5]:
import pandas as pd

def check_logical_age_validity(ages, min_age=0, max_age=120):
    """
    Checks if ages are integers within a specified range.

    Args:
    - ages (list, pd.Series): List or Series of age values.
    - min_age (int): Minimum valid age (inclusive).
    - max_age (int): Maximum valid age (inclusive).

    Returns:
    - list of bool: True if age is valid, False otherwise.
    """
    results = []
    for age in ages:
        if isinstance(age, int) and min_age <= age <= max_age:
            results.append(True)
        else:
            results.append(False)
    return results

# Example usage:
age_list = [25, 130, -5, 0, 45, 120, 'thirty']
print(check_logical_age_validity(age_list))
# Output: [True, False, False, True, True, True, False]

# With Pandas DataFrame:
data = {'Age': age_list}
df = pd.DataFrame(data)
df['valid_age'] = check_logical_age_validity(df['Age'])
print(df)


[True, False, False, True, True, True, False]
      Age  valid_age
0      25       True
1     130      False
2      -5      False
3       0       True
4      45       True
5     120       True
6  thirty      False


Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [6]:
import pandas as pd

def impute_missing_with_mean(df):
    """
    Identify missing values and fill them with the column mean for numeric columns.

    Args:
    - df (pd.DataFrame): Input dataset.

    Returns:
    - pd.DataFrame: Copy of dataset with missing numeric values imputed by mean.
    """
    df_imputed = df.copy()
    for col in df_imputed.select_dtypes(include=['number']).columns:
        mean_value = df_imputed[col].mean()
        df_imputed[col].fillna(mean_value, inplace=True)
    return df_imputed

# Example usage:
data = {
    'Age': [25, None, 30, 22, None],
    'Salary': [50000, 60000, None, 55000, 58000],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

df_filled = impute_missing_with_mean(df)

print("\nDataFrame after imputing missing numeric values with mean:")
print(df_filled)


Original DataFrame:
    Age   Salary     Name
0  25.0  50000.0    Alice
1   NaN  60000.0      Bob
2  30.0      NaN  Charlie
3  22.0  55000.0    David
4   NaN  58000.0     None

DataFrame after imputing missing numeric values with mean:
         Age   Salary     Name
0  25.000000  50000.0    Alice
1  25.666667  60000.0      Bob
2  30.000000  55750.0  Charlie
3  22.000000  55000.0    David
4  25.666667  58000.0     None


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(mean_value, inplace=True)


Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [7]:
import pandas as pd

def detect_duplicates(df):
    """
    Detect duplicate rows in the DataFrame.

    Args:
    - df (pd.DataFrame): Input dataset.

    Returns:
    - pd.DataFrame: DataFrame containing only duplicate rows (including first occurrences).
    """
    duplicates = df[df.duplicated(keep=False)]
    return duplicates

# Example usage:
data = {
    'Name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    'Age': [25, 30, 25, 35, 30],
    'City': ['NY', 'LA', 'NY', 'Chicago', 'LA']
}
df = pd.DataFrame(data)

duplicate_rows = detect_duplicates(df)
print("Duplicate rows:")
print(duplicate_rows)


Duplicate rows:
    Name  Age City
0  Alice   25   NY
1    Bob   30   LA
2  Alice   25   NY
4    Bob   30   LA


Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [8]:
import pandas as pd

def validate_numerical_ranges(df, column_ranges):
    """
    Validates if numerical columns fall within the specified ranges.

    Args:
    - df (pd.DataFrame): Input dataset.
    - column_ranges (dict): Dictionary with column names as keys and (min, max) tuples as values.
      Example: {'Age': (0, 120), 'Salary': (30000, 200000)}

    Returns:
    - dict: Keys are column names, values are lists of booleans indicating validity for each row.
    """
    results = {}
    for col, (min_val, max_val) in column_ranges.items():
        if col not in df.columns:
            results[col] = None  # Or raise an error
            continue
        # Check if value is within range (ignoring NaNs)
        validity = df[col].apply(lambda x: pd.isna(x) or (min_val <= x <= max_val))
        results[col] = validity.tolist()
    return results

# Example usage:
data = {
    'Age': [25, 130, 45, -5, 60],
    'Salary': [50000, 60000, 250000, 40000, None]
}
df = pd.DataFrame(data)

ranges = {'Age': (0, 120), 'Salary': (30000, 200000)}
validity = validate_numerical_ranges(df, ranges)
print(validity)
# Output:
# {'Age': [True, False, True, False, True], 'Salary': [True, True, False, True, True]}


{'Age': [True, False, True, False, True], 'Salary': [True, True, False, True, True]}


Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [9]:
import pandas as pd

def completeness_violation_report(df, mandatory_fields):
    """
    Generates a report of rows violating completeness rules where mandatory fields are empty.

    Args:
    - df (pd.DataFrame): Input dataset.
    - mandatory_fields (list of str): List of column names that must not be null or empty.

    Returns:
    - pd.DataFrame: Rows violating completeness rules with info on which fields are missing.
    """
    # Create a mask for each mandatory field where values are null or empty strings
    violation_mask = pd.DataFrame(False, index=df.index, columns=mandatory_fields)

    for field in mandatory_fields:
        if field not in df.columns:
            raise ValueError(f"Mandatory field '{field}' not found in DataFrame")
        violation_mask[field] = df[field].isnull() | (df[field].astype(str).str.strip() == '')

    # Rows where any mandatory field violates completeness
    violating_rows = violation_mask.any(axis=1)
    report = df.loc[violating_rows].copy()

    # Add columns showing which mandatory fields are violated (True = violation)
    for field in mandatory_fields:
        report[f'{field}_missing'] = violation_mask.loc[violating_rows, field]

    return report

# Example usage:
data = {
    'Name': ['Alice', '', 'Charlie', None],
    'Email': ['alice@example.com', 'bob@example.com', '', None],
    'Age': [25, 30, 22, 45]
}
df = pd.DataFrame(data)

mandatory = ['Name', 'Email']
violation_report = completeness_violation_report(df, mandatory)
print(violation_report)


      Name            Email  Age  Name_missing  Email_missing
1           bob@example.com   30          True          False
2  Charlie                    22         False           True
3     None             None   45          True           True


Task 10: Advanced Regex for Data Validity Check

Description: Check for validity with advanced regex patterns, such as validating complex fields with multi-level rules.

In [10]:
import re

def advanced_regex_validation(values, pattern):
    """
    Validate list of strings against an advanced regex pattern.

    Args:
    - values (list of str): List of strings to validate.
    - pattern (str): Regex pattern string.

    Returns:
    - list of bool: True if valid, False otherwise.
    """
    regex = re.compile(pattern)
    return [bool(regex.fullmatch(value)) for value in values]

# Example: US phone number validation pattern
# Format examples it accepts:
# +1-800-555-1234
# 800-555-1234
# (800) 555-1234
# 8005551234
# 800.555.1234
# 800 555 1234 x123 (extension)
phone_pattern = r'^(\+1[-\s]?)*(\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}(\s*x\d+)?$'

# Test data
phone_numbers = [
    '+1-800-555-1234',
    '800-555-1234',
    '(800) 555-1234',
    '8005551234',
    '800.555.1234',
    '800 555 1234 x123',
    '5551234',        # invalid
    '180055512345',   # invalid
]

results = advanced_regex_validation(phone_numbers, phone_pattern)
for number, valid in zip(phone_numbers, results):
    print(f"{number}: {'Valid' if valid else 'Invalid'}")


+1-800-555-1234: Valid
800-555-1234: Valid
(800) 555-1234: Valid
8005551234: Valid
800.555.1234: Valid
800 555 1234 x123: Valid
5551234: Invalid
180055512345: Invalid
