## Detect Schema Mismatches in Data Pipelines
**Objective**: Identify and resolve schema mismatches that commonly occur in data pipelines.

**Task**: Missing Column

1. Load the source DataFrame with the following schema:
    - id : Integer
    - email : String
    - signup_date : Date
2. Load the target DataFrame with the following schema:
    - id : Integer
    - email : String
3. Implement a check to identify any columns that are present in the source DataFrame but missing in the target.
4. Add the missing `signup_date` column to the target DataFrame.

In [1]:
# write your code from here
import pandas as pd
from datetime import datetime

# === Step 1: Define Source and Target DataFrames ===

def load_source_data():
    """Build the sample source DataFrame with id, email and signup_date columns."""
    signup_days = ('2023-01-01', '2023-01-02', '2023-01-03')
    columns = {
        'id': [1, 2, 3],
        'email': ['a@example.com', 'b@example.com', 'c@example.com'],
        # Parse each day individually so every cell is a pandas timestamp.
        'signup_date': [pd.to_datetime(day) for day in signup_days],
    }
    return pd.DataFrame(columns)

def load_target_data():
    """Build the sample target DataFrame, which has only id and email columns."""
    records = [
        (4, 'd@example.com'),
        (5, 'e@example.com'),
        (6, 'f@example.com'),
    ]
    # Transpose the row tuples into per-column lists.
    ids, emails = zip(*records)
    return pd.DataFrame({'id': list(ids), 'email': list(emails)})

# === Step 2: Schema Validation ===

def validate_dataframe(df, required_columns):
    """Check that ``df`` has rows and contains every required column.

    Args:
        df: DataFrame to inspect.
        required_columns: Iterable of column names that must be present.

    Returns:
        True when the DataFrame passes both checks.

    Raises:
        ValueError: If ``df`` is empty, or if any required column is absent.
    """
    if df.empty:
        raise ValueError("DataFrame is empty.")

    # Whatever is required but not among the frame's columns is missing.
    missing = set(required_columns).difference(df.columns)
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    return True

# === Step 3: Detect Missing Columns ===

def detect_missing_columns(source_df, target_df):
    """Return the source columns that the target DataFrame lacks.

    The result follows the column order of ``source_df`` so that repeated
    runs produce the same list (a plain set difference yields an order that
    depends on string hashing and can change between interpreter runs).

    Args:
        source_df: DataFrame whose columns define the expected schema.
        target_df: DataFrame to compare against the source.

    Returns:
        A list of column names present in ``source_df`` but not in
        ``target_df``, without duplicates, in source column order.
    """
    target_columns = set(target_df.columns)
    # dict.fromkeys drops repeated source column names while keeping order.
    return [
        col for col in dict.fromkeys(source_df.columns)
        if col not in target_columns
    ]

# === Step 4: Add Missing Columns with Default Values ===

def add_missing_columns(target_df, missing_columns, source_df):
    """Add each missing column to ``target_df`` filled with a default value.

    The default depends on the dtype of the same column in ``source_df``:
    ``NaT`` for datetime columns, ``0`` for numeric columns, and ``None``
    for everything else. Datetime and numeric columns are created with the
    source column's exact dtype (e.g. float stays float, timezone-aware
    stays timezone-aware) so the target schema actually matches the source
    instead of only matching by column name.

    Args:
        target_df: DataFrame to extend. It is modified in place.
        missing_columns: Iterable of column names to add.
        source_df: DataFrame that supplies the dtype for each new column.

    Returns:
        The same ``target_df`` object, now containing the added columns.

    Raises:
        KeyError: If a name in ``missing_columns`` is not a column of
            ``source_df``.
    """
    for col in missing_columns:
        source_dtype = source_df[col].dtype

        if pd.api.types.is_datetime64_any_dtype(source_dtype):
            default = pd.NaT  # default for datetime
        elif pd.api.types.is_numeric_dtype(source_dtype):
            default = 0
        else:
            target_df[col] = None
            continue

        # Build the column explicitly so it carries the source dtype; a bare
        # scalar assignment would let pandas infer int64 / tz-naive instead.
        target_df[col] = pd.Series(
            default, index=target_df.index, dtype=source_dtype
        )
    return target_df

# === Execution ===

try:
    source_df = load_source_data()
    target_df = load_target_data()

    # Each frame must be non-empty and carry its own expected columns
    # before the two schemas are compared.
    for frame, expected in (
        (source_df, ['id', 'email', 'signup_date']),
        (target_df, ['id', 'email']),
    ):
        validate_dataframe(frame, expected)

    # Columns the source has that the target lacks.
    missing_cols = detect_missing_columns(source_df, target_df)
    print("🔍 Missing columns in target:", missing_cols)

    # Patch the target, then re-validate it against the full column list.
    target_df = add_missing_columns(target_df, missing_cols, source_df)
    validate_dataframe(target_df, ['id', 'email', 'signup_date'])

    print("\n✅ Updated target DataFrame:")
    print(target_df)

except Exception as e:
    # Notebook-level boundary: report the failure instead of a traceback.
    print(f"❌ Error: {e}")


🔍 Missing columns in target: ['signup_date']

✅ Updated target DataFrame:
   id          email signup_date
0   4  d@example.com         NaT
1   5  e@example.com         NaT
2   6  f@example.com         NaT


In [2]:
import unittest

class TestSchemaMismatchFix(unittest.TestCase):
    """Unit tests for the schema validation, detection and repair helpers."""

    FULL_SCHEMA = ['id', 'email', 'signup_date']
    PARTIAL_SCHEMA = ['id', 'email']

    def setUp(self):
        # Fresh frames per test so in-place column additions never leak.
        self.source = load_source_data()
        self.target = load_target_data()

    def test_valid_dataframes(self):
        """Both sample frames satisfy their own expected column lists."""
        self.assertTrue(validate_dataframe(self.source, self.FULL_SCHEMA))
        self.assertTrue(validate_dataframe(self.target, self.PARTIAL_SCHEMA))

    def test_detect_missing_column(self):
        """signup_date is reported as missing from the target."""
        reported = detect_missing_columns(self.source, self.target)
        self.assertIn('signup_date', reported)

    def test_add_missing_column(self):
        """The added signup_date column exists and holds only null values."""
        to_add = detect_missing_columns(self.source, self.target)
        patched = add_missing_columns(self.target, to_add, self.source)
        self.assertIn('signup_date', patched.columns)
        self.assertTrue(patched['signup_date'].isna().all())

    def test_empty_dataframe_validation(self):
        """An empty DataFrame is rejected with ValueError."""
        self.assertRaises(
            ValueError, validate_dataframe, pd.DataFrame(), self.PARTIAL_SCHEMA
        )

    def test_incomplete_schema_validation(self):
        """A frame lacking a required column is rejected with ValueError."""
        only_id = pd.DataFrame({'id': [1]})
        self.assertRaises(
            ValueError, validate_dataframe, only_id, self.PARTIAL_SCHEMA
        )


if __name__ == "__main__":
    # Explicit argv so unittest does not parse the host process's arguments;
    # exit=False keeps the interpreter running after the tests finish.
    unittest.main(exit=False, argv=[''])


.....
----------------------------------------------------------------------
Ran 5 tests in 0.011s

OK
