# Ensuring Consistency

**Activity Overview**: Ensure consistency by identifying and resolving conflicting values across datasets.

## Title: Customer Address Discrepancies

**Task**: Address customer address mismatches between CRM and marketing databases.

**Steps**:
1. Compare customer addresses in the CRM with those in the marketing database.
2. Identify records with conflicting address information.
3. Propose a method to consolidate records with verified addresses.

In [5]:
import pandas as pd
from fuzzywuzzy import fuzz

def load_data(crm_file, marketing_file):
    """Loads customer data from CRM and marketing database files (e.g., CSV)."""
    try:
        crm_df = pd.read_csv(crm_file)
        marketing_df = pd.read_csv(marketing_file)
        return crm_df, marketing_df
    except FileNotFoundError:
        print("Error: One or both files not found.")
        return None, None

def standardize_address(address):
    """Simple standardization of an address string (lowercase, remove extra spaces)."""
    if isinstance(address, str):
        return " ".join(address.lower().split())
    return ""

def compare_addresses(crm_address, marketing_address, threshold=80):
    """Compares two addresses using fuzzy matching.

    Args:
        crm_address (str): Address from the CRM database.
        marketing_address (str): Address from the marketing database.
        threshold (int): Percentage threshold for considering addresses similar.

    Returns:
        bool: True if the addresses are considered similar, False otherwise.
    """
    if not crm_address or not marketing_address:
        return False

    # Using a combination of ratio and partial ratio for better accuracy
    ratio = fuzz.ratio(crm_address, marketing_address)
    partial_ratio = fuzz.partial_ratio(crm_address, marketing_address)
    return max(ratio, partial_ratio) >= threshold

def identify_address_discrepancies(crm_df, marketing_df, on_column_crm='customer_id', on_column_marketing='customer_id', address_column_crm='address', address_column_marketing='address'):
    """Identifies customer records with conflicting address information.

    Args:
        crm_df (pd.DataFrame): DataFrame containing CRM data.
        marketing_df (pd.DataFrame): DataFrame containing marketing data.
        on_column_crm (str): Column name for the customer identifier in the CRM data.
        on_column_marketing (str): Column name for the customer identifier in the marketing data.
        address_column_crm (str): Column name for the address in the CRM data.
        address_column_marketing (str): Column name for the address in the marketing data.

    Returns:
        pd.DataFrame: DataFrame containing records with identified address discrepancies,
                      including customer IDs and both CRM and marketing addresses.
    """
    if crm_df is None or marketing_df is None:
        return pd.DataFrame()

    # Merge the two dataframes on the customer ID
    merged_df = pd.merge(crm_df[[on_column_crm, address_column_crm]],
                         marketing_df[[on_column_marketing, address_column_marketing]],
                         left_on=on_column_crm, right_on=on_column_marketing,
                         suffixes=('_crm', '_marketing'))

    discrepancies = []
    for index, row in merged_df.iterrows():
        crm_addr_standardized = standardize_address(row[f'{address_column_crm}_crm'])
        marketing_addr_standardized = standardize_address(row[f'{address_column_marketing}_marketing'])

        if not compare_addresses(crm_addr_standardized, marketing_addr_standardized):
            discrepancies.append({
                'customer_id': row[on_column_crm],
                'crm_address': row[f'{address_column_crm}_crm'],
                'marketing_address': row[f'{address_column_marketing}_marketing']
            })

    return pd.DataFrame(discrepancies)

def propose_consolidation_method(discrepancy_df):
    """Proposes a method to consolidate records with verified addresses.

    Args:
        discrepancy_df (pd.DataFrame): DataFrame containing address discrepancies.
    """
    if discrepancy_df.empty:
        print("No address discrepancies found.")
        return

    print("\nProposed Method for Consolidating Records with Verified Addresses:")
    print("-" * 60)
    print("1. **Identify Data Sources with Higher Address Accuracy:**")
    print("   - Determine which database (CRM or Marketing) generally contains more accurate and up-to-date address information. This might involve analyzing data entry processes, validation rules, or recent update frequencies.")
    print("\n2. **Implement a Verification Process:**")
    print("   - For each identified discrepancy, attempt to verify the correct address through:")
    print("     a. **Automated Verification:** Utilize address validation APIs or services to check the validity and suggest corrections for both addresses.")
    print("     b. **Manual Review:** If automated verification is inconclusive, manually review the customer's information, potentially contacting the customer or checking other internal systems.")
    print("\n3. **Prioritize Verified Addresses:**")
    print("   - If one of the addresses can be confidently verified (either automatically or manually), use that as the consolidated address.")
    print("\n4. **Establish a Rule-Based Consolidation Strategy:**")
    print("   - If neither address can be definitively verified immediately, establish rules for consolidation, such as:")
    print("     a. Prioritize the address from the system known to be more reliable.")
    print("     b. If there's a significant difference in the last updated date of the address fields, prioritize the more recent one.")
    print("     c. Flag the record for further review if no clear rule can be applied.")
    print("\n5. **Update and Audit:**")
    print("   - Once a consolidated address is determined, update the relevant systems consistently.")
    print("   - Maintain an audit log of the discrepancies found, the verification process, and the consolidation decisions made.")
    print("\n6. **Prevent Future Discrepancies:**")
    print("   - Review and improve data entry processes and validation rules in both CRM and marketing systems to minimize future address mismatches.")
    print("   - Consider implementing a centralized address management system if the issue is persistent.")

if __name__ == "__main__":
    # Replace with your actual file paths and column names
    crm_file_path = 'crm_data.csv'
    marketing_file_path = 'marketing_data.csv'
    customer_id_crm_col = 'CustomerID'
    customer_id_marketing_col = 'ID'
    address_crm_col = 'MailingAddress'
    address_marketing_col = 'StreetAddress'

    # Load the data
    crm_df, marketing_df = load_data(crm_file_path, marketing_file_path)

    if crm_df is not None and marketing_df is not None:
        # Identify address discrepancies
        discrepancy_df = identify_address_discrepancies(
            crm_df,
            marketing_df,
            on_column_crm=customer_id_crm_col,
            on_column_marketing=customer_id_marketing_col,
            address_column_crm=address_crm_col,
            address_column_marketing=address_marketing_col
        )

        if not discrepancy_df.empty:
            print("\nIdentified Customer Address Discrepancies:")
            print(discrepancy_df)

            # Propose a consolidation method
            propose_consolidation_method(discrepancy_df)
        else:
            print("\nNo significant customer address discrepancies found.")
    else:
        print("\n

SyntaxError: unterminated string literal (detected at line 141) (2363898067.py, line 141)