# In [3]:
import pandas as pd
import numpy as np
import great_expectations as ge
from great_expectations.dataset import PandasDataset

# Build a reproducible sample dataset to work with. A dedicated, seeded
# generator keeps the generated values (and therefore every validation count
# printed further down) identical from run to run.
rng = np.random.default_rng(42)
data = {
    'customer_id': range(1, 101),
    'name': [f'Customer {i}' for i in range(1, 101)],
    # Upper bound is exclusive: ages span 15-79, so some fall below 18 or above 65
    'age': rng.integers(15, 80, 100),
    'purchase_amount': rng.uniform(10.0, 500.0, 100),
    'purchase_date': pd.date_range(start='2023-01-01', periods=100),
    'product_category': rng.choice(['Electronics', 'Clothing', 'Food', 'Books'], 100),
}

# Create DataFrame and convert it to a Great Expectations Dataset
df = pd.DataFrame(data)
ge_df = ge.from_pandas(df)

# Preview the first rows, then report the overall dimensions
print("Sample data:", df.head(), sep="\n")
print("\nDataset shape:", df.shape)

# Task 1: Validate Column Existence
print("\n### Task 1: Validate Column Existence ###")

# NOTE: no suite is fetched here. `get_expectation_suite()` does not take a
# suite name -- its first positional parameter is `discard_failed_expectations`
# -- so passing "column_existence_suite" to it did not name anything. The
# expectations below are recorded on `ge_df` as each `expect_*` method runs.

# The table is expected to hold exactly these columns, in this order
ge_df.expect_table_columns_to_match_ordered_list([
    'customer_id', 'name', 'age', 'purchase_amount', 'purchase_date', 'product_category'
])

# Check if specific columns exist
result_customer_id = ge_df.expect_column_to_exist('customer_id')
# This should fail as the sample data has no 'product_id' column
result_product_id = ge_df.expect_column_to_exist('product_id')

print(f"Does 'customer_id' column exist? {result_customer_id.success}")
print(f"Does 'product_id' column exist? {result_product_id.success}")

# Task 2: Validate Column Data Types
print("\n### Task 2: Validate Column Data Types ###")

# Two type checks on the numeric column: one against a single type name,
# one against a list of acceptable type names
result_purchase_amount_type = ge_df.expect_column_values_to_be_of_type(
    'purchase_amount',
    'float',
)
result_purchase_amount_kind = ge_df.expect_column_values_to_be_in_type_list(
    'purchase_amount',
    ['float', 'int', 'numpy.float64'],
)

# Report the outcome of each check
print(f"Is 'purchase_amount' of type float? {result_purchase_amount_type.success}")
print(f"Is 'purchase_amount' a numeric type? {result_purchase_amount_kind.success}")

# For comparison, show the dtype pandas assigned to the column
print(f"Actual type of 'purchase_amount': {df['purchase_amount'].dtype}")

# Task 3: Validate Range of Values
print("\n### Task 3: Validate Range of Values ###")

# Expect every age to lie between 18 and 65
result_age_range = ge_df.expect_column_values_to_be_between(
    'age',
    min_value=18,
    max_value=65,
)

print(f"Are all 'age' values between 18 and 65? {result_age_range.success}")

# On failure, show how many values were out of range along with a sample
if not result_age_range.success:
    print(f"Number of values outside range: {result_age_range.result['unexpected_count']}")
    print(f"Proportion of values outside range: {result_age_range.result['unexpected_percent']}%")
    print(f"Examples of invalid values: {result_age_range.result['partial_unexpected_list']}")

# Let's add additional checks for other columns
print("\n### Additional Validation Checks ###")

# Check if purchase amount is strictly positive. `strict_min=True` excludes 0
# itself (the lower bound is inclusive by default), and leaving `max_value`
# unset means "no upper bound" without storing a non-finite float in the
# expectation's arguments.
result_purchase_amount_positive = ge_df.expect_column_values_to_be_between(
    'purchase_amount', min_value=0, strict_min=True
)
print(f"Are all 'purchase_amount' values positive? {result_purchase_amount_positive.success}")

# Check if product_category values belong to a specific set
result_category_values = ge_df.expect_column_values_to_be_in_set(
    'product_category', ['Electronics', 'Clothing', 'Food', 'Books']
)
print(f"Do all 'product_category' values belong to the expected set? {result_category_values.success}")

# Save our expectations as a suite. Failed expectations are kept on purpose:
# by default get_expectation_suite() discards every expectation that failed
# on its last run, which would make the summary below report zero failures
# regardless of what the data looks like.
expectation_suite = ge_df.get_expectation_suite(discard_failed_expectations=False)

# Run all validations and get a validation report
validation_result = ge_df.validate(expectation_suite=expectation_suite, only_return_failures=False)

# Tally once rather than re-walking the results for every printed line
total_count = len(validation_result.results)
passed_count = sum(1 for result in validation_result.results if result.success)

print("\n### Validation Summary ###")
print(f"Total validations performed: {total_count}")
print(f"Passing validations: {passed_count}")
print(f"Failing validations: {total_count - passed_count}")

# Demonstrate how a failing check can be resolved, using the age range as the example
print("\n### Fixing Data Issues ###")

# Snapshot of the age column before any rows are removed
print("Before fix: Age range statistics:")
print(f"Min age: {df['age'].min()}, Max age: {df['age'].max()}")
print(f"Number of records with age < 18: {len(df[df['age'] < 18])}")
print(f"Number of records with age > 65: {len(df[df['age'] > 65])}")

# Keep only rows whose age is within 18-65; copy so the result is
# independent of the original DataFrame
age_in_range = (df['age'] >= 18) & (df['age'] <= 65)
df_fixed = df.loc[age_in_range].copy()
ge_df_fixed = ge.from_pandas(df_fixed)

# Run the same age range expectation against the filtered data
result_age_range_fixed = ge_df_fixed.expect_column_values_to_be_between('age', 18, 65)

print("\nAfter fix: Age range validation result:")
print(f"Are all 'age' values between 18 and 65? {result_age_range_fixed.success}")
print(f"New data shape: {df_fixed.shape}")

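# Cell output:
# ModuleNotFoundError: No module named 'great_expectations.dataset'
# NOTE(review): this is raised by the `from great_expectations.dataset import PandasDataset`
# line at the top of the cell; `PandasDataset` is not used anywhere in the visible code.
# The legacy `great_expectations.dataset` module (and `ge.from_pandas`) appears to be
# absent from newer Great Expectations releases -- confirm the installed version.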