# Measuring Data Accuracy

**Activity Overview**: Assess the accuracy of a dataset by comparing it against a trusted source and flagging incorrect or mismatched values.

## Title: Product Pricing

**Task**: Compare a dataset of product prices with the latest official price list.

**Steps**:
1. Obtain the latest product price list from the official company website.
2. Compare the dataset's product prices against the verified list.
3. Identify any discrepancies and mark them for correction.

In [1]:
# Write your code from here
import pandas as pd
import requests
from bs4 import BeautifulSoup

def _extract_official_prices(soup):
    """Build a ``{product_id: price}`` dict from the parsed official price page."""
    # NOTE: the tag names and CSS classes below are placeholders; adjust them
    # to match the real markup of the official price page.
    official_prices = {}
    for product in soup.find_all('div', class_='product-item'):  # Example class
        product_id_element = product.find('span', class_='product-code')  # Example class
        price_element = product.find('span', class_='product-price')      # Example class
        if product_id_element is None or price_element is None:
            continue

        product_id = product_id_element.text.strip()
        # Strip currency symbol and thousands separators before parsing.
        price_str = price_element.text.strip().replace('$', '').replace(',', '')
        try:
            official_prices[product_id] = float(price_str)
        except ValueError:
            print(f"Warning: Could not parse price '{price_str}' for product '{product_id}' from official list.")
    return official_prices


def _find_price_discrepancies(dataset_df, official_prices, product_id_col,
                              price_col_dataset, price_col_official, price_tolerance):
    """Return the dataset rows whose price disagrees with ``official_prices``."""
    # Scraped IDs are always strings, while read_csv may infer integers for the
    # dataset IDs. Look prices up through a string key so both sides agree,
    # but keep the dataset's original ID values in the output.
    lookup_keys = dataset_df[product_id_col].astype(str).str.strip()
    official_series = lookup_keys.map(official_prices).astype(float)

    # Non-numeric dataset prices become NaN and are reported as discrepancies.
    dataset_numeric = pd.to_numeric(dataset_df[price_col_dataset], errors='coerce')

    # Compare with a tolerance: read_csv's default float parser and float() can
    # differ in the last bits for the same decimal text.
    price_gap = (dataset_numeric - official_series).abs()
    mismatch = official_series.notna() & (price_gap.isna() | (price_gap > price_tolerance))

    discrepancies_df = dataset_df.loc[mismatch, [product_id_col, price_col_dataset]].copy()
    discrepancies_df[price_col_official] = official_series[mismatch]
    return discrepancies_df


def compare_product_prices(dataset_file, official_price_url, product_id_col='product_id', price_col_dataset='price_dataset', price_col_official='price_official', timeout=10, price_tolerance=1e-9):
    """
    Compare a dataset of product prices against the official price list
    scraped from a web page.

    Only products that appear in the official list are checked; dataset rows
    with no official price are not reported.

    Args:
        dataset_file (str): Path to the CSV file containing the dataset.
        official_price_url (str): URL of the webpage containing the official price list.
        product_id_col (str, optional): Name of the product ID column in the dataset.
                                       Defaults to 'product_id'.
        price_col_dataset (str, optional): Name of the price column in the dataset.
                                          Defaults to 'price_dataset'.
        price_col_official (str, optional): Name to use for the official price
                                           column in the returned DataFrame.
                                           Defaults to 'price_official'.
        timeout (float, optional): Seconds to wait for the official price page
                                   before giving up. Defaults to 10.
        price_tolerance (float, optional): Largest absolute price difference that
                                           is still treated as a match.
                                           Defaults to 1e-9.

    Returns:
        pandas.DataFrame: Rows with price discrepancies, holding the product ID
                          and the prices from both sources. Returns None (after
                          printing an error) if the dataset file is missing or
                          empty, a required column is absent, or the official
                          price list cannot be fetched or processed.
    """
    try:
        dataset_df = pd.read_csv(dataset_file)
    except FileNotFoundError:
        print(f"Error: Dataset file not found at '{dataset_file}'")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row to parse.
        print(f"Error: Dataset file '{dataset_file}' is empty.")
        return None

    if product_id_col not in dataset_df.columns or price_col_dataset not in dataset_df.columns:
        print(f"Error: Required columns '{product_id_col}' or '{price_col_dataset}' not found in the dataset.")
        return None

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(official_price_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # --- STEP 1: Extract the official price list ---
        official_prices = _extract_official_prices(soup)

        # --- STEP 2: Compare the dataset's product prices against the verified list ---
        return _find_price_discrepancies(
            dataset_df,
            official_prices,
            product_id_col,
            price_col_dataset,
            price_col_official,
            price_tolerance,
        )

    except requests.exceptions.RequestException as e:
        print(f"Error: Could not retrieve official price list from '{official_price_url}': {e}")
        return None
    except Exception as e:
        # Boundary handler: keep the documented "print and return None" contract.
        print(f"An unexpected error occurred while processing the official price list: {e}")
        return None

# Example usage: compare a local CSV dataset against the official price page
# and report the outcome.
dataset_file = 'product_data.csv'
official_price_url = 'https://www.example-company.com/prices'  # Replace with the actual URL

price_discrepancies = compare_product_prices(dataset_file, official_price_url)

# A None result means the comparison could not be run, so nothing more is
# printed here in that case.
if price_discrepancies is not None and price_discrepancies.empty:
    print("No price discrepancies found between the dataset and the official price list.")
elif price_discrepancies is not None:
    print("Price Discrepancies Found:")
    print(price_discrepancies)
    print("\nAction: Review these discrepancies and mark them for correction in the dataset.")

ModuleNotFoundError: No module named 'requests'