## Detect Schema Mismatches in Data Pipelines
**Objective**: Identify and resolve schema mismatches that commonly occur in data pipelines.

**Task**: Column Name Mismatch

**Steps**:
1. Load the source DataFrame with the following schema:
    - id : Integer
    - name : String
    - age : Integer
2. Load the target DataFrame with the following schema:
    - id : Integer
    - fullname : String
    - age : Integer
3. Use a schema comparison tool or write a simple function to detect mismatches in column names.
4. Resolve the mismatch by renaming the `fullname` column in the target DataFrame to `name`.

In [2]:
import pandas as pd

def load_source_data():
    """Build the sample source DataFrame.

    Returns:
        A three-row DataFrame with the columns ``id``, ``name`` and ``age``.
    """
    ids = [1, 2, 3]
    names = ['Alice', 'Bob', 'Charlie']
    ages = [25, 30, 35]
    return pd.DataFrame({'id': ids, 'name': names, 'age': ages})

def load_target_data():
    """Build the sample target DataFrame.

    Returns:
        A three-row DataFrame with the columns ``id``, ``fullname`` and
        ``age`` (note ``fullname`` rather than the source's ``name``).
    """
    ids = [4, 5, 6]
    full_names = ['David', 'Eve', 'Frank']
    ages = [28, 32, 38]
    return pd.DataFrame({'id': ids, 'fullname': full_names, 'age': ages})

def validate_dataframe(df, expected_columns):
    """Check that *df* holds data and contains every expected column.

    Args:
        df: DataFrame to check.
        expected_columns: Iterable of column labels that must all be present
            in ``df``. Additional columns in ``df`` are allowed.

    Raises:
        ValueError: If ``df.empty`` is true, or if one or more expected
            columns are absent. In the latter case the message names the
            missing labels and lists the columns that were found.
    """
    if df.empty:
        raise ValueError("DataFrame is empty.")
    # Report the absent labels (de-duplicated, in the caller's order) so the
    # error says exactly what needs fixing instead of only what is present.
    missing = [
        col for col in dict.fromkeys(expected_columns) if col not in df.columns
    ]
    if missing:
        raise ValueError(
            f"Missing expected columns: {missing}. Found: {df.columns.tolist()}"
        )

def detect_column_mismatches(df1, df2):
    """Find column labels that appear in only one of two DataFrames.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        A 2-tuple of sets ``(only_in_df1, only_in_df2)``; both are empty when
        the two frames carry the same set of column labels.
    """
    first_labels, second_labels = set(df1.columns), set(df2.columns)
    only_in_first = first_labels.difference(second_labels)
    only_in_second = second_labels.difference(first_labels)
    return only_in_first, only_in_second

def resolve_column_mismatch(df, rename_map):
    """Rename columns of *df* according to *rename_map*.

    Args:
        df: DataFrame whose column labels should be changed.
        rename_map: Mapping of existing column label -> new column label.

    Returns:
        The DataFrame produced by ``df.rename``; ``df`` itself is not
        modified because the rename is not done in place.
    """
    renamed = df.rename(rename_map, axis='columns')
    return renamed

# === Main Execution ===
# End-to-end walk-through: load both frames, report how their column sets
# differ, rename the offending target column, re-check, then concatenate.
try:
    source_df = load_source_data()
    target_df = load_target_data()

    # Check each frame against the schema it is expected to arrive with
    # (the target still carries 'fullname' at this point).
    validate_dataframe(df=source_df, expected_columns=['id', 'name', 'age'])
    validate_dataframe(df=target_df, expected_columns=['id', 'fullname', 'age'])

    # Report the column labels that are unique to each side.
    source_only, target_only = detect_column_mismatches(df1=source_df, df2=target_df)
    print("Before Fix:")
    print("Only in Source:", source_only)
    print("Only in Target:", target_only)

    # Align the target's column name with the source's.
    rename_map = {'fullname': 'name'}
    target_df = resolve_column_mismatch(df=target_df, rename_map=rename_map)

    # Run the same comparison again to confirm nothing is left over.
    updated_source_only, updated_target_only = detect_column_mismatches(
        df1=source_df, df2=target_df
    )
    print("\nAfter Fix:")
    print("Only in Source:", updated_source_only)
    print("Only in Target:", updated_target_only)

    # Only stack the two frames when neither side has a stray column.
    if updated_source_only or updated_target_only:
        print("❌ Schema mismatch remains. Cannot combine.")
    else:
        combined_df = pd.concat([source_df, target_df], ignore_index=True)
        print("\n✅ Combined DataFrame:")
        print(combined_df)

except Exception as e:
    # Top-level boundary of the demo: report the failure instead of a traceback.
    print(f"❌ Error: {e}")


Before Fix:
Only in Source: {'name'}
Only in Target: {'fullname'}

After Fix:
Only in Source: set()
Only in Target: set()

✅ Combined DataFrame:
   id     name  age
0   1    Alice   25
1   2      Bob   30
2   3  Charlie   35
3   4    David   28
4   5      Eve   32
5   6    Frank   38


In [3]:
import unittest

class TestSchemaMismatch(unittest.TestCase):
    """Checks for the sample loaders and the schema helper functions."""

    def test_valid_dataframes(self):
        # Both loaders must hand back frames that actually contain rows.
        for frame in (load_source_data(), load_target_data()):
            self.assertFalse(frame.empty)

    def test_detect_mismatch(self):
        # 'name' exists only in the source, 'fullname' only in the target.
        only_source, only_target = detect_column_mismatches(
            load_source_data(), load_target_data()
        )
        self.assertIn('name', only_source)
        self.assertIn('fullname', only_target)

    def test_resolve_mismatch(self):
        # After renaming, the new label is present and the old one is gone.
        renamed_columns = resolve_column_mismatch(
            load_target_data(), {'fullname': 'name'}
        ).columns
        self.assertIn('name', renamed_columns)
        self.assertNotIn('fullname', renamed_columns)

    def test_empty_df(self):
        # A frame with no data must be rejected.
        self.assertRaises(
            ValueError, validate_dataframe, pd.DataFrame(), ['id', 'name']
        )

    def test_missing_column(self):
        # 'name' is expected but absent from this frame.
        incomplete = pd.DataFrame({'id': [1], 'age': [25]})
        self.assertRaises(
            ValueError, validate_dataframe, incomplete, ['id', 'name', 'age']
        )

# Run the test suite when this code is executed as the main module.
# argv=[''] supplies only a placeholder program name, so unittest does not
# parse the host process's own command-line arguments; exit=False stops
# unittest.main() from calling sys.exit() once the run finishes, which lets
# it be used from an interactive session.
if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)


.....
----------------------------------------------------------------------
Ran 5 tests in 0.008s

OK
