## Ensuring Consistency in Multi-source Data Integration

**Description**: Validate the integration of two datasets, `products_A.csv` and `products_B.csv`, and ensure that the product "category" information is consistent between them.

In [1]:
# Write your code from here
import pandas as pd

# Sample catalogue as recorded by source A.
data_A = dict(
    product_id=[1, 2, 3, 4, 5],
    category=['Electronics', 'Clothing', 'Furniture', 'Toys', 'Electronics'],
)

# Sample catalogue as recorded by source B. Products 2 and 4 carry different
# category labels than in source A, so the consistency check has something to find.
data_B = dict(
    product_id=[1, 2, 3, 4, 5],
    category=['Electronics', 'Apparel', 'Furniture', 'Kids', 'Electronics'],
)

# Build one DataFrame per source.
df_A = pd.DataFrame(data_A)
df_B = pd.DataFrame(data_B)

# Persist both frames as CSV; index=False keeps the row index out of the files.
df_A.to_csv('products_A.csv', index=False)
df_B.to_csv('products_B.csv', index=False)

print("CSV files 'products_A.csv' and 'products_B.csv' have been created.")

CSV files 'products_A.csv' and 'products_B.csv' have been created.


In [2]:
import pandas as pd

def load_and_validate_data(file_path):
    """
    Load a product CSV file and validate it for the consistency check.

    The file must be readable by ``pd.read_csv``, contain at least one row,
    and provide both the 'category' column (the value being compared) and the
    'product_id' column (the key the datasets are merged on).

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data when every check passes; ``None`` when reading or
        validation fails. The reason is printed, no exception is propagated.
    """
    try:
        data = pd.read_csv(file_path)
        if data.empty:
            raise ValueError(f"Dataset at {file_path} is empty.")
        # 'category' is checked first so its message stays the one reported
        # when both columns are absent. 'product_id' is validated here as well
        # because merging on a missing key would otherwise fail later with a
        # KeyError instead of a clear validation message.
        for required_column in ('category', 'product_id'):
            if required_column not in data.columns:
                raise ValueError(f"'{required_column}' column is missing in {file_path}.")
        return data
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data from {file_path}: {e}")
        return None


def merge_datasets(products_A, products_B):
    """
    Combine both product datasets into one frame keyed on 'product_id'.

    An outer join is used, so a product present in only one dataset is kept
    and has missing values in the other dataset's columns. Columns that share
    a name in both inputs are disambiguated with the '_A' / '_B' suffixes
    (e.g. 'category_A' and 'category_B').
    """
    merged = products_A.merge(
        products_B,
        how='outer',
        on='product_id',
        suffixes=('_A', '_B'),
    )
    return merged


def check_category_consistency(merged_data):
    """
    Return the rows of the merged data whose categories disagree.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Frame containing the 'category_A' and 'category_B' columns.

    Returns
    -------
    pandas.DataFrame
        The subset of ``merged_data`` where 'category_A' differs from
        'category_B'. A row with a category on only one side is reported.
        A row where both categories are missing is not, since neither source
        states a conflicting value.
    """
    category_a = merged_data['category_A']
    category_b = merged_data['category_B']
    # A plain `!=` treats two missing values as different, which would flag
    # rows where neither source has a category; mask those out explicitly.
    both_missing = category_a.isna() & category_b.isna()
    differs = (category_a != category_b) & ~both_missing
    return merged_data[differs]


def print_inconsistencies(inconsistent_data):
    """
    Report the outcome of the consistency check on standard output.

    Prints a confirmation message when ``inconsistent_data`` is empty;
    otherwise prints a heading followed by the inconsistent rows.
    """
    # Guard clause: nothing to list when the frame has no rows.
    if inconsistent_data.empty:
        print("No inconsistencies found.")
        return

    print("Inconsistent data found:")
    print(inconsistent_data)


def run_data_consistency_check(file_A, file_B):
    """
    Execute the full category consistency check for two CSV files.

    Both files are loaded and validated first. If either one fails
    validation, a message is printed and the check stops. Otherwise the
    datasets are merged, compared on their category columns, and the result
    is printed.
    """
    # Load both files up front so a problem with each of them gets reported.
    loaded_frames = [load_and_validate_data(path) for path in (file_A, file_B)]

    if any(frame is None for frame in loaded_frames):
        print("One of the datasets is invalid. Exiting the process.")
        return

    products_A, products_B = loaded_frames

    # Merge -> compare -> report.
    combined = merge_datasets(products_A, products_B)
    mismatches = check_category_consistency(combined)
    print_inconsistencies(mismatches)


# Entry point: compare the two sample CSV files.
if __name__ == "__main__":
    run_data_consistency_check(file_A='products_A.csv', file_B='products_B.csv')

Inconsistent data found:
   product_id category_A category_B
1           2   Clothing    Apparel
3           4       Toys       Kids


In [3]:
import pytest
import pandas as pd

# Unit Test 1: merging keeps both category columns side by side
def test_merge_datasets():
    """
    Verify that merge_datasets pairs the categories of each product.

    Product 1 has the same category in both inputs and product 2 does not;
    the merged frame must expose both values under the suffixed columns.
    """
    df_A = pd.DataFrame({'product_id': [1, 2], 'category': ['Electronics', 'Furniture']})
    df_B = pd.DataFrame({'product_id': [1, 2], 'category': ['Electronics', 'Office']})
    merged = merge_datasets(df_A, df_B)

    # The shared 'category' column must be split into the suffixed pair.
    assert {'product_id', 'category_A', 'category_B'} <= set(merged.columns)
    assert len(merged) == 2

    # Index by product so each assertion reads one row explicitly.
    by_product = merged.set_index('product_id')

    # Categories are consistent for product 1 ...
    assert by_product.loc[1, 'category_A'] == by_product.loc[1, 'category_B']
    # ... and inconsistent for product 2.
    assert by_product.loc[2, 'category_A'] != by_product.loc[2, 'category_B']

# Unit Test 2: the loader accepts a valid CSV and rejects empty or invalid ones
def test_load_and_validate_data():
    """
    Verify load_and_validate_data against files created by the test itself.

    The fixtures are written to a temporary directory so the test does not
    depend on files that happen to exist in the working directory.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        valid_path = os.path.join(tmp_dir, "valid_file.csv")
        empty_path = os.path.join(tmp_dir, "empty_file.csv")
        missing_column_path = os.path.join(tmp_dir, "missing_column_file.csv")
        nonexistent_path = os.path.join(tmp_dir, "nonexistent_file.csv")

        # Valid file: both columns present and one data row.
        pd.DataFrame({'product_id': [1], 'category': ['Electronics']}).to_csv(valid_path, index=False)
        # Empty file: header row only, no data rows.
        pd.DataFrame(columns=['product_id', 'category']).to_csv(empty_path, index=False)
        # File lacking the 'category' column.
        pd.DataFrame({'product_id': [1]}).to_csv(missing_column_path, index=False)

        assert load_and_validate_data(valid_path) is not None
        assert load_and_validate_data(empty_path) is None
        assert load_and_validate_data(missing_column_path) is None
        # A path that was never created must be reported as invalid too.
        assert load_and_validate_data(nonexistent_path) is None

In [4]:
import pandas as pd

# NOTE(review): this cell re-defines functions that an earlier cell of the
# notebook already defines; the later definition is the one in effect once
# this cell has run. Consider keeping a single copy.
def load_and_validate_data(file_path):
    """
    Load a product CSV file and validate it for the consistency check.

    The file must be readable by ``pd.read_csv``, contain at least one row,
    and provide both the 'category' column (the value being compared) and the
    'product_id' column (the key the datasets are merged on).

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data when every check passes; ``None`` when reading or
        validation fails. The reason is printed, no exception is propagated.
    """
    try:
        data = pd.read_csv(file_path)
        if data.empty:
            raise ValueError(f"Dataset at {file_path} is empty.")
        # 'category' is checked first so its message stays the one reported
        # when both columns are absent. 'product_id' is validated here as well
        # because merging on a missing key would otherwise fail later with a
        # KeyError instead of a clear validation message.
        for required_column in ('category', 'product_id'):
            if required_column not in data.columns:
                raise ValueError(f"'{required_column}' column is missing in {file_path}.")
        return data
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data from {file_path}: {e}")
        return None


def merge_datasets(products_A, products_B):
    """
    Combine both product datasets into one frame keyed on 'product_id'.

    An outer join is used, so a product present in only one dataset is kept
    and has missing values in the other dataset's columns. Columns that share
    a name in both inputs are disambiguated with the '_A' / '_B' suffixes
    (e.g. 'category_A' and 'category_B').
    """
    merged = products_A.merge(
        products_B,
        how='outer',
        on='product_id',
        suffixes=('_A', '_B'),
    )
    return merged


def check_category_consistency(merged_data):
    """
    Return the rows of the merged data whose categories disagree.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Frame containing the 'category_A' and 'category_B' columns.

    Returns
    -------
    pandas.DataFrame
        The subset of ``merged_data`` where 'category_A' differs from
        'category_B'. A row with a category on only one side is reported.
        A row where both categories are missing is not, since neither source
        states a conflicting value.
    """
    category_a = merged_data['category_A']
    category_b = merged_data['category_B']
    # A plain `!=` treats two missing values as different, which would flag
    # rows where neither source has a category; mask those out explicitly.
    both_missing = category_a.isna() & category_b.isna()
    differs = (category_a != category_b) & ~both_missing
    return merged_data[differs]


def print_inconsistencies(inconsistent_data):
    """
    Report the outcome of the consistency check on standard output.

    Prints a confirmation message when ``inconsistent_data`` is empty;
    otherwise prints a heading followed by the inconsistent rows.
    """
    # Guard clause: nothing to list when the frame has no rows.
    if inconsistent_data.empty:
        print("No inconsistencies found.")
        return

    print("Inconsistent data found:")
    print(inconsistent_data)


def run_data_consistency_check(file_A, file_B):
    """
    Execute the full category consistency check for two CSV files.

    Both files are loaded and validated first. If either one fails
    validation, a message is printed and the check stops. Otherwise the
    datasets are merged, compared on their category columns, and the result
    is printed.
    """
    # Load both files up front so a problem with each of them gets reported.
    loaded_frames = [load_and_validate_data(path) for path in (file_A, file_B)]

    if any(frame is None for frame in loaded_frames):
        print("One of the datasets is invalid. Exiting the process.")
        return

    products_A, products_B = loaded_frames

    # Merge -> compare -> report.
    combined = merge_datasets(products_A, products_B)
    mismatches = check_category_consistency(combined)
    print_inconsistencies(mismatches)


# Entry point: compare the two sample CSV files.
if __name__ == "__main__":
    run_data_consistency_check(file_A='products_A.csv', file_B='products_B.csv')

Inconsistent data found:
   product_id category_A category_B
1           2   Clothing    Apparel
3           4       Toys       Kids
