# In [6]:
import pandas as pd
import numpy as np
import re
import logging

# Configure the root logger once for the whole module: every helper below
# reports through logging.info / logging.error, so records at INFO level and
# above are appended to 'data_cleaning.log' as "<time> - <level> - <message>".
logging.basicConfig(
    filename='data_cleaning.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load dataset with error handling
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    A successful load is logged at INFO level. Any failure is logged at
    ERROR level (with a dedicated message for a missing file) and the
    original exception is re-raised unchanged.
    """
    try:
        frame = pd.read_csv(file_path)
        logging.info(f"Successfully loaded dataset from: {file_path}")
    except Exception as err:
        # A missing file gets its own message; everything else is reported
        # with the exception text. Either way the caller sees the error.
        if isinstance(err, FileNotFoundError):
            logging.error(f"File not found: {file_path}")
        else:
            logging.error(f"Unexpected error while loading data: {str(err)}")
        raise
    return frame

# 13. Standardize Date Format
def standardize_dates(df, column):
    """Rewrite ``column`` of ``df`` as 'YYYY-MM-DD' strings.

    Values are parsed with ``pd.to_datetime(errors='coerce')``, so entries
    that cannot be parsed end up as missing values rather than raising.
    ``df`` is modified in place and returned.

    Raises:
        KeyError: If ``column`` is not one of ``df``'s columns.
    """
    if column not in df.columns:
        error_text = f"Column '{column}' not found. Available columns: {list(df.columns)}"
        logging.error(error_text)
        raise KeyError(error_text)

    try:
        parsed = pd.to_datetime(df[column], errors='coerce')
        iso_strings = parsed.dt.strftime('%Y-%m-%d')
        df[column] = iso_strings
        logging.info(f"Date format standardized for column '{column}'")
    except Exception as exc:
        logging.error(f"Date conversion failed for column '{column}': {str(exc)}")
        raise

    return df

# 14. Enforce Numeric Constraints
def enforce_numeric_constraint(df, column, condition=lambda x: x > 0):
    """Replace values in ``column`` that fail a numeric ``condition`` with NaN.

    Each value is coerced to a number (non-numeric values become NaN) and
    passed to ``condition``; rows where the result is falsy are set to NaN.
    Values that pass keep their original representation. ``df`` is
    modified in place and also returned.

    Args:
        df: DataFrame to clean.
        column: Name of the column to check.
        condition: Callable taking one numeric value and returning a truthy
            result for valid values. Defaults to "strictly positive".

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: If ``column`` is not one of ``df``'s columns.
    """
    if column not in df.columns:
        msg = f"Column '{column}' not found. Available columns: {list(df.columns)}"
        logging.error(msg)
        raise KeyError(msg)
    try:
        numeric = pd.to_numeric(df[column], errors='coerce')
        # Force a boolean mask: on an empty column apply() does not evaluate
        # the condition, so the result may not be boolean and `~mask` can fail.
        mask = numeric.apply(condition).astype(bool)
        invalid_count = int((~mask).sum())
        if invalid_count:
            # where() returns a new Series and upcasts as needed (e.g. int ->
            # float), avoiding an in-place write of NaN into an integer column.
            df[column] = df[column].where(mask.to_numpy())
        logging.info(f"Numeric constraint applied on '{column}', invalid values set to NaN: {invalid_count}")
    except Exception as e:
        logging.error(f"Constraint enforcement failed on '{column}': {str(e)}")
        raise
    return df

# 15. Email Format Validation
def validate_email_format(df, column):
    """Replace entries in ``column`` that do not look like an email with NaN.

    The check is a case-insensitive pattern match of the form
    ``local@domain.tld``. The local part may contain letters, digits and
    ``_ . + % -`` (so plus-addressed mail such as ``user+tag@example.com``
    is kept); the final label must be 2-63 letters, so long TLDs such as
    ``.photography`` are accepted. Missing values stay missing. ``df`` is
    modified in place and also returned.

    Args:
        df: DataFrame to clean.
        column: Name of the column holding email addresses.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: If ``column`` is not one of ``df``'s columns.
    """
    if column not in df.columns:
        msg = f"Column '{column}' not found. Available columns: {list(df.columns)}"
        logging.error(msg)
        raise KeyError(msg)
    try:
        pattern = r'^[a-z0-9_.+%-]+@[\da-z.-]+\.[a-z]{2,63}$'
        # Pass the pattern as a string plus flags (the documented signature of
        # str.match) and treat missing values as non-matching so the mask is
        # always strictly boolean.
        mask = df[column].astype(str).str.match(pattern, flags=re.IGNORECASE, na=False)
        df.loc[~mask, column] = np.nan
        logging.info(f"Email format validated in '{column}', invalid entries: {(~mask).sum()}")
    except Exception as e:
        logging.error(f"Email validation failed for '{column}': {str(e)}")
        raise
    return df

# In [7]:
# 16. Handle Mixed Date Formats (already covered by standardize_dates)

# 17. Standardize Phone Number Patterns
def standardize_phone_format(df, column):
    """Rewrite phone numbers in ``column`` as ``(XXX) XXX-XXXX``.

    Every value is reduced to its digits; values with exactly ten digits are
    reformatted, anything else (including missing values) becomes NaN.
    Whole-number floats are converted to integers first, so a numeric phone
    column that pandas loaded as float (e.g. ``5551234567.0``) is not
    rejected because of the trailing ``.0``. ``df`` is modified in place and
    also returned.

    Args:
        df: DataFrame to clean.
        column: Name of the column holding phone numbers.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: If ``column`` is not one of ``df``'s columns.
    """
    if column not in df.columns:
        msg = f"Column '{column}' not found. Available columns: {list(df.columns)}"
        logging.error(msg)
        raise KeyError(msg)
    try:
        def _format_phone(value):
            # str(5551234567.0) is '5551234567.0'; stripping non-digits would
            # leave eleven digits, so drop the fractional part up front.
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            digits = re.sub(r'\D', '', str(value))
            if len(digits) != 10:
                return np.nan
            return f"({digits[0:3]}) {digits[3:6]}-{digits[6:10]}"

        df[column] = df[column].map(_format_phone)
        logging.info(f"Phone numbers standardized in '{column}'")
    except Exception as e:
        logging.error(f"Phone standardization failed for '{column}': {str(e)}")
        raise
    return df

# 18. Standardize Text Case
def standardize_text_case(df, column):
    """Convert the values of ``column`` to upper-case strings.

    Non-missing values are converted with ``str`` and upper-cased. Missing
    values are left missing rather than being turned into the literal
    strings ``"NAN"`` / ``"NONE"``. ``df`` is modified in place and also
    returned.

    Args:
        df: DataFrame to clean.
        column: Name of the column to upper-case.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: If ``column`` is not one of ``df``'s columns.
    """
    if column not in df.columns:
        msg = f"Column '{column}' not found. Available columns: {list(df.columns)}"
        logging.error(msg)
        raise KeyError(msg)
    try:
        original = df[column]
        upper = original.astype(str).str.upper()
        # Restore missing values: astype(str) would otherwise have turned
        # them into the text 'nan' / 'None' before upper-casing.
        df[column] = upper.where(original.notna().to_numpy(), np.nan)
        logging.info(f"Text converted to uppercase in column '{column}'")
    except Exception as e:
        logging.error(f"Case standardization failed for '{column}': {str(e)}")
        raise
    return df