### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [1]:
# write your code from here
import pandas as pd
import numpy as np
import json
from datetime import datetime

# Step 1: Create the dataset directly within the code
data = {
    'customer_id': [1, 2, 3, 4, 5, 2, 6, 3],
    'amount': [200, 150, np.nan, 300, 250, 150, 500, 100],
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08']
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 2: Metadata structure
metadata = {
    'file': 'generated_in_notebook',
    'row_count': len(df),
    'columns': list(df.columns),
    'column_types': {col: str(df[col].dtype) for col in df.columns},
    'last_updated': str(datetime.now()),
    'transformations': []
}

# Display the dataset and metadata
print("Loaded Data:")
print(df)
print("\nMetadata:")
print(metadata)

# Step 3: Function to validate data quality based on metadata
def validate_data_quality(df, metadata):
    """
    Check for missing values, invalid data types, and other quality checks.
    """
    issues = []
    
    # Check for missing values
    missing_values = df.isnull().sum()
    for col in missing_values.index:
        if missing_values[col] > 0:
            issues.append(f"Column '{col}' has {missing_values[col]} missing values.")
    
    # Check for data type validity
    for col in df.columns:
        expected_type = metadata['column_types'].get(col)
        if expected_type and not np.issubdtype(df[col].dtype, np.generic):
            issues.append(f"Column '{col}' has an unexpected data type. Expected {expected_type}, got {str(df[col].dtype)}")

    # Check if 'date' column has a valid date format
    try:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Invalid date values turn into NaT
        invalid_dates = df['date'].isnull().sum()
        if invalid_dates > 0:
            issues.append(f"Column 'date' contains {invalid_dates} invalid dates.")
    except Exception as e:
        issues.append(f"Error parsing 'date' column: {str(e)}")
    
    # Print issues if any, else indicate no issues
    if issues:
        print("\nData Quality Issues:")
        for issue in issues:
            print(issue)
    else:
        print("\nData passed all quality checks!")
    
    return issues

# Validate the data quality
data_quality_issues = validate_data_quality(df, metadata)

# Step 4: Log transformations dynamically
def log_transformation(metadata, transformation_details):
    """
    Add details of data transformations to the metadata.
    """
    transformation_entry = {
        'transformation': transformation_details,
        'timestamp': str(datetime.now())
    }
    metadata['transformations'].append(transformation_entry)

# Example transformations
log_transformation(metadata, "Imputed missing values in 'amount' column with mean.")
log_transformation(metadata, "Removed duplicates based on 'customer_id' column.")

# Step 5: Apply data cleaning or transformations and log those
# Filling missing values in 'amount' column with the mean
df['amount'].fillna(df['amount'].mean(), inplace=True)  # Example transformation
log_transformation(metadata, "Imputed missing 'amount' values using mean.")

# Remove duplicates based on customer_id
df.drop_duplicates(subset='customer_id', keep='first', inplace=True)
log_transformation(metadata, "Removed duplicate rows based on 'customer_id'.")

# Step 6: Save metadata to a JSON file for future tracking
def save_metadata(metadata, metadata_path='metadata.json'):
    """
    Save metadata to a JSON file for tracking.
    """
    try:
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=4)
    except Exception as e:
        print(f"Error saving metadata: {e}")

# Save the updated metadata
save_metadata(metadata)

# Step 7: Save cleaned data to a new file (simulating saving to CSV)
df.to_csv("cleaned_data.csv", index=False)
print("\nCleaned data saved to 'cleaned_data.csv'")

# Final metadata and data output
print("\nFinal Cleaned Data:")
print(df)

# Unit Test Functions (to check validation, cleaning, and transformation)
def test_validate_data_quality():
    # Test valid data
    df_valid = pd.DataFrame({
        'customer_id': [1, 2, 3],
        'amount': [100, 200, 300],
        'date': ['2023-01-01', '2023-01-02', '2023-01-03']
    })
    metadata_valid = {'columns': df_valid.columns.tolist(), 'column_types': {col: str(df_valid[col].dtype) for col in df_valid.columns}}
    assert len(validate_data_quality(df_valid, metadata_valid)) == 0  # Should pass with no issues

    # Test data with missing value in 'amount' column
    df_invalid = df_valid.copy()
    df_invalid['amount'].iloc[1] = np.nan
    assert len(validate_data_quality(df_invalid, metadata_valid)) > 0  # Should flag missing value issue

    # Test invalid date format
    df_invalid['date'].iloc[1] = 'invalid_date'
    assert len(validate_data_quality(df_invalid, metadata_valid)) > 0  # Should flag invalid date issue


# Run the unit tests
test_validate_data_quality()
print("\nUnit Tests Passed!")

Loaded Data:
   customer_id  amount        date
0            1   200.0  2023-01-01
1            2   150.0  2023-01-02
2            3     NaN  2023-01-03
3            4   300.0  2023-01-04
4            5   250.0  2023-01-05
5            2   150.0  2023-01-06
6            6   500.0  2023-01-07
7            3   100.0  2023-01-08

Metadata:
{'file': 'generated_in_notebook', 'row_count': 8, 'columns': ['customer_id', 'amount', 'date'], 'column_types': {'customer_id': 'int64', 'amount': 'float64', 'date': 'object'}, 'last_updated': '2025-05-28 10:25:21.544388', 'transformations': []}

Data Quality Issues:
Column 'amount' has 1 missing values.

Cleaned data saved to 'cleaned_data.csv'

Final Cleaned Data:
   customer_id      amount       date
0            1  200.000000 2023-01-01
1            2  150.000000 2023-01-02
2            3  235.714286 2023-01-03
3            4  300.000000 2023-01-04
4            5  250.000000 2023-01-05
6            6  500.000000 2023-01-07

Data passed all quality c

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invalid['amount'].iloc[1] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invalid['date'].iloc[1] = 'invalid_date'
  df_invalid['date'].iloc[1] = 'invalid_date'
