### Task 1: Detecting Missing Values during Data Ingestion
**Description**: You have a CSV file with missing values in some columns. Write a Python script to detect and report missing values during the ingestion process.

**Steps**:
1. Load data
2. Check for missing values
3. Report missing values

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
import os
import unittest

# Step 1: Define a function to load CSV with error handling for file existence
def load_csv_safe(path):
    """
    Load CSV file into a DataFrame. If the file doesn't exist, raises a FileNotFoundError.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    try:
        df = pd.read_csv(path)
        return df
    except Exception as e:
        raise ValueError(f"Error loading CSV: {str(e)}")

# Step 2: Simulate data with missing values
data = {
    'customer_id': [1, 2, np.nan, 4, 5],
    'amount': [100.0, None, 200.0, 300.0, None],
    'transaction_date': ['2024-01-01', '2024-01-02', '2024-01-03', None, '2024-01-05']
}
df = pd.DataFrame(data)
print("Raw Ingested Data:")
display(df)

# Step 3: Handle missing values by using imputation and dropping rows with missing customer_id
def handle_missing_values(df):
    """
    Handle missing values in the dataset:
    - Drop rows where 'customer_id' is missing.
    - Impute missing 'amount' values using mean.
    - Fill missing 'transaction_date' values using forward fill.
    """
    if 'customer_id' not in df.columns or 'amount' not in df.columns:
        raise ValueError("Required columns are missing from the dataset.")
    
    # Drop rows where customer_id is missing
    df_cleaned = df.dropna(subset=['customer_id'])
    
    # Impute missing 'amount' with mean value
    df_cleaned['amount'].fillna(df_cleaned['amount'].mean(), inplace=True)
    
    # Forward fill missing 'transaction_date'
    df_cleaned['transaction_date'].fillna(method='ffill', inplace=True)
    
    return df_cleaned

# Clean the data by handling missing values
df_clean = handle_missing_values(df)
print("Cleaned Data (missing values handled):")
display(df_clean)

# Step 4: Remove duplicates in the dataset
def remove_duplicates(df):
    """
    Remove duplicate rows from the DataFrame.
    """
    return df.drop_duplicates()

df_no_duplicates = remove_duplicates(df_clean)
print("Data after duplicate removal:")
display(df_no_duplicates)

# Step 5: Validate data types
def validate_data_types(df):
    """
    Validate that the data types of columns match the expected types.
    - 'customer_id' should be integer
    - 'amount' should be numeric
    - 'transaction_date' should be datetime-like
    """
    if not np.issubdtype(df['customer_id'].dtype, np.number):
        raise TypeError("'customer_id' column must be numeric")
    if not np.issubdtype(df['amount'].dtype, np.number):
        raise TypeError("'amount' column must be numeric")
    df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors='coerce')
    return df

# Step 6: Apply data type validation
df_validated = validate_data_types(df_no_duplicates)
print("Data after type validation:")
display(df_validated)

# Step 7: Unit Tests for the functions

class TestDataProcessing(unittest.TestCase):
    
    def test_missing_value_handling(self):
        test_data = pd.DataFrame({
            'customer_id': [1, 2, np.nan],
            'amount': [100, None, 200],
            'transaction_date': ['2024-01-01', '2024-01-02', None]
        })
        cleaned_data = handle_missing_values(test_data)
        self.assertEqual(cleaned_data.isnull().sum().sum(), 0)
    
    def test_duplicate_removal(self):
        test_data = pd.DataFrame({
            'customer_id': [1, 2, 2, 3],
            'amount': [100, 200, 200, 300],
            'transaction_date': ['2024-01-01', '2024-01-02', '2024-01-02', '2024-01-03']
        })
        no_duplicates = remove_duplicates(test_data)
        self.assertEqual(no_duplicates.shape[0], 3)
    
    def test_data_type_validation(self):
        test_data = pd.DataFrame({
            'customer_id': [1, 2, 3],
            'amount': [100, 200, 300],
            'transaction_date': ['2024-01-01', '2024-01-02', '2024-01-03']
        })
        validated_data = validate_data_types(test_data)
        self.assertTrue(pd.api.types.is_numeric_dtype(validated_data['amount']))
        self.assertTrue(pd.api.types.is_datetime64_any_dtype(validated_data['transaction_date']))

# Run the tests
unittest.main(argv=[''], exit=False)

# Step 8: Save the cleaned data
df_validated.to_csv("cleaned_output.csv", index=False)
print("Cleaned data saved to cleaned_output.csv")

Raw Ingested Data:


Unnamed: 0,customer_id,amount,transaction_date
0,1.0,100.0,2024-01-01
1,2.0,,2024-01-02
2,,200.0,2024-01-03
3,4.0,300.0,
4,5.0,,2024-01-05


Cleaned Data (missing values handled):


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['amount'].fillna(df_cleaned['amount'].mean(), inplace=True)
  df_cleaned['transaction_date'].fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['transaction_date'].fillna(method='ffill', inplace=True)


Unnamed: 0,customer_id,amount,transaction_date
0,1.0,100.0,2024-01-01
1,2.0,200.0,2024-01-02
3,4.0,300.0,2024-01-02
4,5.0,200.0,2024-01-05


Data after duplicate removal:


Unnamed: 0,customer_id,amount,transaction_date
0,1.0,100.0,2024-01-01
1,2.0,200.0,2024-01-02
3,4.0,300.0,2024-01-02
4,5.0,200.0,2024-01-05


Data after type validation:


Unnamed: 0,customer_id,amount,transaction_date
0,1.0,100.0,2024-01-01
1,2.0,200.0,2024-01-02
3,4.0,300.0,2024-01-02
4,5.0,200.0,2024-01-05


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['amount'].fillna(df_cleaned['amount'].mean(), inplace=True)
  df_cleaned['transaction_date'].fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['transaction_date'].fillna(method='ffill', inplace=True)
.
----------------------------------------------------------------------
Ran 3 tests in 0.006s

OK


Cleaned data saved to cleaned_output.csv


### Task 2: Validate Data Types during Extraction
**Description**: You have a JSON file that should have specific data types for each field. Write a script to validate if the data types match the expected schema.

**Steps**:
1. Define expected schema
2. Validate data types

In [None]:
# Write your code from here

### Task 3: Remove Duplicate Records in Data
**Description**: You have a dataset with duplicate entries. Write a Python script to find and remove duplicate records using Pandas.

**Steps**:
1. Find duplicate records
2. Remove duplicates
3. Report results

In [None]:
# Write your code from here