# Measuring Data Accuracy

**Activity Overview**: Assess the accuracy of a dataset by comparing its values against a trusted source and flagging any incorrect or mismatched values.

## Title: Product Pricing

**Task**: Compare a dataset of product prices with the latest official price list.

**Steps**:
1. Obtain the latest product price list from the official company website.
2. Compare the dataset's product prices against the verified list.
3. Identify any discrepancies and mark them for correction.

In [1]:
# Write your code from here
import pandas as pd

# Both tables describe the same five products
product_ids = [101, 102, 103, 104, 105]

# Prices as recorded in the company's own dataset
company_prices = pd.DataFrame(
    {'product_id': product_ids, 'price': [20.00, 30.00, 25.00, 40.00, 15.00]}
)

# Prices from the trusted reference list
trusted_prices = pd.DataFrame(
    {'product_id': product_ids, 'price': [19.99, 30.00, 25.00, 40.00, 16.00]}
)

# Write each table to its own CSV file, leaving out the row index
for frame, target in (
    (company_prices, 'company_prices.csv'),
    (trusted_prices, 'trusted_prices.csv'),
):
    frame.to_csv(target, index=False)

print("CSV files created successfully!")

CSV files created successfully!


In [2]:
import pandas as pd
import os

# Function to load a CSV file with error handling
def load_csv(file_path):
    """Load a price CSV into a DataFrame, printing a message on failure.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file does not exist, is
        empty, lacks a 'product_id' or 'price' column, or cannot be read.
        Every failure prints an explanatory message instead of raising.
    """
    # Both datasets are expected to provide these columns
    required_columns = ['product_id', 'price']

    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' not found.")
        return None

    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # EmptyDataError is a ValueError subclass, so it has to be handled
        # before the generic ValueError branch or this message is never shown.
        print(f"Error: The file '{file_path}' is empty.")
        return None
    except FileNotFoundError as e:
        # The file can still disappear between the existence check and the read
        print(f"Error: {e}")
        return None
    except ValueError as e:
        print(f"Error: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while reading '{file_path}': {e}")
        return None

    if not all(col in df.columns for col in required_columns):
        print(f"Error: The CSV file '{file_path}' is missing required columns.")
        return None

    return df

# Function to compare product prices
def compare_prices(company_prices_df, trusted_prices_df, tolerance=1e-9):
    """Report products whose company price differs from the trusted price.

    The company rows are left-joined to the trusted rows on 'product_id'.
    A row counts as a discrepancy unless the two prices are equal, or are
    numeric and differ by at most ``tolerance``. A price that is missing on
    either side is therefore always reported.

    Args:
        company_prices_df: DataFrame with 'product_id' and 'price' columns,
            or None if loading failed.
        trusted_prices_df: DataFrame with 'product_id' and 'price' columns,
            or None if loading failed.
        tolerance: Largest absolute price difference still treated as equal.
            The small default only absorbs floating-point representation
            noise (e.g. 0.1 + 0.2 vs 0.3), not real price differences.

    Returns:
        None. Discrepancies are printed and written to
        'price_discrepancies.csv' in the current working directory.
    """
    if company_prices_df is None or trusted_prices_df is None:
        print("One or both datasets failed to load, exiting the comparison.")
        return

    # Merge the two datasets on 'product_id' to compare the prices
    merged_df = pd.merge(
        company_prices_df,
        trusted_prices_df,
        on='product_id',
        how='left',
        suffixes=('_company', '_trusted'),
    )

    company = merged_df['price_company']
    trusted = merged_df['price_trusted']

    # Exact equality keeps non-numeric prices comparable; the numeric check
    # additionally accepts values that differ only by float rounding noise.
    # NaN fails both tests, so missing prices stay flagged as discrepancies.
    exact_match = company == trusted
    numeric_gap = (
        pd.to_numeric(company, errors='coerce')
        - pd.to_numeric(trusted, errors='coerce')
    ).abs()
    close_match = numeric_gap <= tolerance

    discrepancies = merged_df[~(exact_match | close_match)]

    # Display the discrepancies
    if not discrepancies.empty:
        print("Discrepancies found:")
        print(discrepancies[['product_id', 'price_company', 'price_trusted']])
    else:
        print("No discrepancies found. All prices are accurate.")

    # The report file is always written, even when it has no rows
    discrepancies.to_csv('price_discrepancies.csv', index=False)

# Main function to load data and compare prices
def main():
    """Load the company and trusted price files, then compare them."""
    # Company file first, trusted file second: the same order the loads run in
    csv_paths = ('company_prices.csv', 'trusted_prices.csv')
    company_prices_df, trusted_prices_df = (load_csv(path) for path in csv_paths)

    # compare_prices handles the case where either load returned None
    compare_prices(company_prices_df, trusted_prices_df)

# Run the comparison only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()

Discrepancies found:
   product_id  price_company  price_trusted
0         101           20.0          19.99
4         105           15.0          16.00
