## Ensuring Consistency in Multi-source Data Integration

**Description**: Validate the integration of two datasets `products_A.csv` and `products_B.csv` . Ensure consistency in product "category" information.

In [9]:
import pandas as pd
from io import StringIO

def validate_category_consistency(file_path_a, file_path_b, id_column='product_id', category_column_a='category_A', category_column_b='category_B'):
    """
    Validates the consistency of product category information between two datasets.

    Args:
        file_path_a (str): Path to the first CSV file.
        file_path_b (str): Path to the second CSV file.
        id_column (str): Name of the common product identifier column.
        category_column_a (str): Name of the category column in the first dataset.
        category_column_b (str): Name of the category column in the second dataset.

    Returns:
        pandas.DataFrame: DataFrame containing products with inconsistent category information,
                          including the product ID and the categories from both datasets.
        float: Percentage of products with inconsistent categories.
    """
    try:
        products_A_df = pd.read_csv(file_path_a)
        products_B_df = pd.read_csv(file_path_b)
    except FileNotFoundError as e:
        print(f"Error: One or both files not found: {e}")
        return pd.DataFrame(), 0.0

    # Merge the two dataframes on the specified ID column
    merged_df = pd.merge(products_A_df, products_B_df, on=id_column, how='inner', suffixes=('_A', '_B'))

    # Identify inconsistencies in product categories
    inconsistent_categories_df = merged_df[merged_df[category_column_a] != merged_df[category_column_b]]
    inconsistent_categories_df = inconsistent_categories_df[[id_column, category_column_a, category_column_b]]
    inconsistent_categories_df.rename(columns={category_column_a: f'category_from_{file_path_a.split("/")[-1]}',
                                                category_column_b: f'category_from_{file_path_b.split("/")[-1]}'},
                                       inplace=True)

    # Calculate the number and percentage of inconsistencies
    num_inconsistencies = len(inconsistent_categories_df)
    total_products = len(merged_df)
    percentage_inconsistent = (num_inconsistencies / total_products) * 100 if total_products > 0 else 0

    return inconsistent_categories_df, percentage_inconsistent

if __name__ == "__main__":
    # Sample data (using StringIO for demonstration - replace with your file paths)
    products_A_data = """product_id,name,category_A
    1,Laptop,Electronics
    2,T-Shirt,Apparel
    3,Coffee Maker,Home Goods
    4,Running Shoes,Apparel
    5,Smartphone,Electronics
    """
    products_B_data = """product_id,description,category_B
    1,High-performance laptop,Electronics
    2,Cotton T-shirt,Clothing
    3,Brewing machine,Kitchen
    4,Athletic footwear,Shoes
    5,Latest mobile device,Mobile
    """

    file_path_a_sample = StringIO(products_A_data)
    file_path_b_sample = StringIO(products_B_data)

    # Use your actual file paths here:
    # file_path_a = 'products_A.csv'
    # file_path_b = 'products_B.csv'

    inconsistent_df, inconsistency_percentage = validate_category_consistency(file_path_a_sample, file_path_b_sample)

    print("Products with Inconsistent Category Information:")
    if not inconsistent_df.empty:
        print(inconsistent_df)
    else:
        print("No inconsistent categories found.")

    print(f"\nPercentage of Products with Inconsistent Categories: {inconsistency_percentage:.2f}%")

AttributeError: '_io.StringIO' object has no attribute 'split'