### Task 1: Measure Data Accuracy using a Trusted Source

**Description**: You have two datasets of product prices: `company_prices.csv` and
`trusted_prices.csv`. Check whether the prices in `company_prices.csv` match the prices in
`trusted_prices.csv`. Assume both files have a "product_id" and a "price" column.

In [1]:
import pandas as pd

def compare_prices(company_file, trusted_file):
    """Compare company prices against a trusted price list and print the result.

    Both CSV files must contain a ``product_id`` and a ``price`` column. Every
    product in the company file is accounted for:

    * products present in both files whose prices differ are printed as
      discrepancies;
    * products that have no entry in the trusted file are printed separately,
      because their price could not be verified.

    "All prices match with the trusted source." is printed only when neither
    list has any entries.

    Args:
        company_file: Path of the CSV file holding the company prices.
        trusted_file: Path of the CSV file holding the trusted prices.

    Returns:
        None. Results are written to standard output. Load and processing
        errors are caught and printed instead of being raised.
    """
    try:
        # Step 1: Load the datasets (only necessary columns)
        company_df = pd.read_csv(company_file, usecols=['product_id', 'price'])
        trusted_df = pd.read_csv(trusted_file, usecols=['product_id', 'price'])

        # Step 2: Merge the datasets on 'product_id'. The inner join keeps only
        # products that exist in both files, i.e. the ones that can be compared.
        merged_df = pd.merge(company_df, trusted_df, on='product_id', suffixes=('_company', '_trusted'))

        # Step 3: Compare prices and identify mismatches. A missing (NaN) price
        # never compares equal, so it is reported as a discrepancy as well.
        mismatches = merged_df[merged_df['price_company'] != merged_df['price_trusted']]

        # Step 4: Find company products the inner join dropped. Without this,
        # products absent from the trusted file would silently count as matching.
        has_trusted_price = company_df['product_id'].isin(trusted_df['product_id'])
        unverified = company_df[~has_trusted_price]

        # Step 5: Output the results
        if not mismatches.empty:
            print("Discrepancies found in the following products:")
            print(mismatches[['product_id', 'price_company', 'price_trusted']])
        if not unverified.empty:
            print("Products missing from the trusted source (price could not be verified):")
            print(unverified[['product_id', 'price']])
        if mismatches.empty and unverified.empty:
            print("All prices match with the trusted source.")

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
    except pd.errors.ParserError as e:
        print(f"Error: Parsing error - {e}")
    except Exception as e:
        # Notebook-level boundary: report anything else (e.g. a missing column)
        # instead of aborting the cell with a traceback.
        print(f"An unexpected error occurred: {e}")

# Example usage: check the company price list against the trusted price list.
compare_prices(company_file='company_prices.csv', trusted_file='trusted_prices.csv')


Error: File not found - [Errno 2] No such file or directory: 'company_prices.csv'


### Task 2: Detect Incorrect Values

**Description**: In `company_prices.csv`, detect any negative price values; a negative price is an incorrect value.

In [2]:
import pandas as pd

def detect_negative_prices(company_file):
    """Print every row of a price CSV whose ``price`` is below zero.

    Only the ``product_id`` and ``price`` columns are read from the file. If
    at least one negative price exists, the offending rows are printed;
    otherwise a message saying that no incorrect values were found is printed.

    Args:
        company_file: Path of the CSV file holding the company prices.

    Returns:
        None. Results are written to standard output. Load and processing
        errors are caught and printed instead of being raised.
    """
    try:
        # Read just the two columns this check needs.
        prices = pd.read_csv(company_file, usecols=['product_id', 'price'])

        # Boolean mask of the rows whose price is negative.
        is_negative = prices['price'] < 0
        flagged = prices.loc[is_negative]

        # Report the outcome.
        if flagged.empty:
            print("No incorrect price values found.")
        else:
            print("Detected incorrect (negative) price values:")
            print(flagged[['product_id', 'price']])

        # Sanity check: a row count is never negative, so this cannot fail.
        assert flagged.shape[0] >= 0, "Negative price detection failed"

    except FileNotFoundError as missing:
        print(f"Error: File not found - {missing}")
    except pd.errors.ParserError as malformed:
        print(f"Error: Parsing error - {malformed}")
    except Exception as unexpected:
        print(f"An unexpected error occurred: {unexpected}")

# Example usage: scan the company price list for negative prices.
detect_negative_prices(company_file='company_prices.csv')


Error: File not found - [Errno 2] No such file or directory: 'company_prices.csv'


### Task 3: Check Missing Data Rates

**Description**: Calculate the percentage of missing values in `customer_data.csv`.

In [3]:
import pandas as pd

def check_missing_data(customer_file):
    """Print the percentage of missing values in each column of a CSV file.

    All columns of the file are loaded. For every column, the share of rows
    whose value is missing (null/NaN) is printed as a percentage. A file that
    has a header but no data rows is reported as such, because a percentage
    of zero rows is undefined.

    Args:
        customer_file: Path of the CSV file to inspect.

    Returns:
        None. Results are written to standard output. Load and processing
        errors are caught and printed instead of being raised.
    """
    try:
        # Step 1: Load the dataset (all columns, since each one is measured)
        customer_df = pd.read_csv(customer_file)

        # Step 2: Guard against a dataset without rows. Dividing by a row count
        # of zero would yield NaN for every column instead of a percentage.
        row_count = len(customer_df)
        if row_count == 0:
            print("The dataset contains no rows; missing-value percentages cannot be calculated.")
            return

        # Step 3: Calculate the number of missing values per column
        missing_values = customer_df.isnull().sum()

        # Step 4: Calculate the percentage of missing values per column
        missing_percentage = (missing_values / row_count) * 100

        # Step 5: Output the results
        print("Percentage of missing values per column:")
        print(missing_percentage)

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
    except pd.errors.ParserError as e:
        print(f"Error: Parsing error - {e}")
    except Exception as e:
        # Notebook-level boundary: report anything else (e.g. a completely
        # empty file) instead of aborting the cell with a traceback.
        print(f"An unexpected error occurred: {e}")

# Example usage: report the missing-value percentages of the customer data.
check_missing_data(customer_file='customer_data.csv')


Error: File not found - [Errno 2] No such file or directory: 'customer_data.csv'


### Task 4: Handling Partially Available Records

**Description**: In `customer_data.csv`, identify records with a missing "email" or "phone number" and decide whether to drop or fill them.

In [4]:
import pandas as pd

def handle_missing_contact_info(customer_file, drop=True):
    """Drop or fill customer records whose ``email`` or ``phone`` is missing.

    Reads the ``customer_id``, ``name``, ``email`` and ``phone`` columns of the
    CSV file and prints the cleaned data.

    Args:
        customer_file: Path of the CSV file holding the customer records.
        drop: If True (the default), records with a missing ``email`` or
            ``phone`` are removed. If False, a missing ``email`` is replaced
            by ``'unknown@example.com'`` and a missing ``phone`` by ``'N/A'``.

    Returns:
        None. Results are written to standard output. Load and processing
        errors are caught and printed instead of being raised.
    """
    try:
        # Step 1: Load the dataset (only necessary columns)
        customer_df = pd.read_csv(customer_file, usecols=['customer_id', 'name', 'email', 'phone'])

        # Step 2: Drop or fill the records with missing 'email' or 'phone'
        if drop:
            cleaned_df = customer_df.dropna(subset=['email', 'phone'])
            print("Dropped records with missing contact info:")
            print(cleaned_df)
        else:
            # Fill both columns in one call on the frame itself. Calling
            # fillna(inplace=True) on a selected column is chained assignment,
            # which does not reliably write back to the DataFrame.
            filled_df = customer_df.fillna({'email': 'unknown@example.com', 'phone': 'N/A'})
            print("Filled missing contact info with default values:")
            print(filled_df)

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
    except pd.errors.ParserError as e:
        print(f"Error: Parsing error - {e}")
    except Exception as e:
        # Notebook-level boundary: report anything else (e.g. a missing column)
        # instead of aborting the cell with a traceback.
        print(f"An unexpected error occurred: {e}")

# Example usage: fill missing contact details (pass drop=True to drop those records instead).
handle_missing_contact_info(customer_file='customer_data.csv', drop=False)


Error: File not found - [Errno 2] No such file or directory: 'customer_data.csv'
