In [1]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).


import pandas as pd
import re

# Sample DataFrame for demonstration
data = {
    'Date': ['2025-05-10', '10/05/2025', 'May 5, 2025', '2025/05/06'],
    'Age': [25, -5, 30, 45],
    'Email': ['example@mail.com', 'invalid-email', 'hello@domain.com', 'user@site.org']
}

df = pd.DataFrame(data)

# Task 13: Date Format Standardization (convert all dates to YYYY-MM-DD format)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Task 14: Numeric Constraints Enforcement (ensure age > 0)
df['Age'] = df['Age'].apply(lambda x: x if x > 0 else None)  # Replace invalid ages with None

# Task 15: String Format Checks (validate email format)
def is_valid_email(email):
    # Simple regex pattern for basic email validation
    pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    return bool(re.match(pattern, email))

df['Email'] = df['Email'].apply(lambda x: x if is_valid_email(x) else None)  # Replace invalid emails with None

# Display the cleaned DataFrame
print(df)



# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

         Date   Age             Email
0  2025-05-10  25.0  example@mail.com
1         NaN   NaN              None
2         NaN  30.0  hello@domain.com
3         NaN  45.0     user@site.org


In [2]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.

import pandas as pd
import re

# Sample DataFrame for demonstration
data = {
    'Date': ['2025-05-10', '10/05/2025', 'May 5, 2025', '2025/05/06'],
    'PhoneNumber': ['123-456-7890', '(123) 456-7890', '123 456 7890', '123.456.7890'],
    'Name': ['John Doe', 'jane smith', 'ALICE Johnson', 'boB Brown']
}

df = pd.DataFrame(data)

# Task 16: Standardizing Date Formats (convert all dates to YYYY-MM-DD format)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Task 17: Pattern Matching for Consistency (standardize phone numbers to (XXX) XXX-XXXX format)
def standardize_phone_number(phone):
    # Remove all non-numeric characters and format as (XXX) XXX-XXXX
    digits = re.sub(r'\D', '', phone)
    return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}" if len(digits) == 10 else None

df['PhoneNumber'] = df['PhoneNumber'].apply(standardize_phone_number)

# Task 18: Handling Mixed Case Text (convert all text entries in 'Name' to uppercase)
df['Name'] = df['Name'].apply(lambda x: x.upper())

# Display the cleaned DataFrame
print(df)







# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).











         Date     PhoneNumber           Name
0  2025-05-10  (123) 456-7890       JOHN DOE
1         NaN  (123) 456-7890     JANE SMITH
2         NaN  (123) 456-7890  ALICE JOHNSON
3         NaN  (123) 456-7890      BOB BROWN
