In [1]:
import pandas as pd
import great_expectations as gx
import great_expectations.expectations as gxe
import os
import shutil

# --- Activity 4: Data Quality Automation Tools ---
# --- Task A: Using Great Expectations ---

# --- 19. Setting Up Expectations & Validation ---
# Objective: Install GE, set up expectation suite, validate, list unmet expectations.

# 1. Create a sample dataset (using pandas DataFrame)
print("1. Creating a sample dataset...")
data = {
    'product_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'product_name': ['Laptop', 'Keyboard', 'Mouse', 'Monitor', 'Webcam', 'Printer', 'Scanner', 'Speakers', 'Headphones', 'Microphone'],
    'price': [1200.50, 75.00, 25.99, 350.75, 80.00, 250.00, 150.00, 100.00, 99.50, 70.00],
    'in_stock': [True, True, False, True, True, False, True, True, True, True],
    'rating': [4.5, 4.0, 3.0, 4.8, 4.2, 3.5, 4.1, 4.3, 4.6, 4.0],
    'last_updated': ['2023-01-15', '2024-02-20', '2023-11-10', '2024-03-01', '2024-04-05', '2023-09-30', '2024-01-25', '2024-03-15', '2024-04-20', '2024-05-01']
}
df = pd.DataFrame(data)

# Introduce some data quality issues for demonstration
df.loc[2, 'price'] = -10.0 # Invalid price (outside range)
df.loc[5, 'in_stock'] = None # Missing value
df.loc[8, 'rating'] = 5.5 # Invalid rating (outside range)
df.loc[0, 'product_id'] = 102 # Duplicate product_id

print("Sample DataFrame created with some data quality issues:")
print(df)
print("-" * 30)

# 2. Set up a basic Great Expectations Data Context
# In a real-world scenario, you would typically run `great_expectations init`
# in your terminal to set up the project structure.
# For this script, we'll create a temporary file-based context to support Data Docs.

# Define a temporary GE project directory
ge_root_dir = "gx_temp_project"

# Clean up previous runs if the directory exists
if os.path.exists(ge_root_dir):
    print(f"Removing existing GE project directory: {ge_root_dir}")
    shutil.rmtree(ge_root_dir)

print(f"Initializing Great Expectations Data Context in '{ge_root_dir}'...")
# Initialize the context - this simulates `great_expectations init`
# We use a simple file-based configuration
context = gx.get_context(project_root_dir=ge_root_dir)

# Add a simple datasource (e.g., for pandas DataFrames)
# This simulates configuring a datasource via `great_expectations add datasource`
print("Adding a datasource...")
datasource_name = "my_pandas_datasource"
try:
    context.add_datasource(
        name=datasource_name,
        module_name="great_expectations.datasource",
        class_name="Datasource",
        execution_engine={
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        },
        data_connectors={
            "default_runtime_data_connector": {
                "class_name": "RuntimeDataConnector",
                "module_name": "great_expectations.datasource.data_connector",
                "batch_identifiers":["batch_id"] # Required for RuntimeDataConnector
            }
        }
    )
    print(f"Datasource '{datasource_name}' added.")
except gx.exceptions.DataContextError as e:
    print(f"Datasource '{datasource_name}' already exists or error adding: {e}")
    # If it already exists, retrieve it
    context.get_datasource(datasource_name)


# 3. Create an Expectation Suite
# This simulates creating an expectation suite, e.g., via `great_expectations suite new`
print("Creating an Expectation Suite...")
suite_name = "product_data_quality_suite"

# Check if suite already exists, if so, load it
try:
    suite = context.get_expectation_suite(expectation_suite_name=suite_name)
    print(f"Expectation Suite '{suite_name}' loaded.")
except gx.exceptions.DataContextError:
    suite = context.create_expectation_suite(expectation_suite_name=suite_name)
    print(f"Expectation Suite '{suite_name}' created.")

# --- 20. Testing for Expectation (Adding Expectations) ---
# Objective: Create expectations like "column values must fall within a certain range."

print("Adding expectations to the suite...")

# Add expectations based on the dataset and potential quality issues

# Expectation: 'product_id' column values must be unique
suite.add_expectation(gxe.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_unique",
    kwargs={"column": "product_id"}
))

# Expectation: 'product_name' column values must not be null
suite.add_expectation(gxe.ExpectationConfiguration(
    expectation_type="expect_column_values_to_not_be_null",
    kwargs={"column": "product_name"}
))

# Expectation: 'price' column values must be between 0 and 10000 (Task 20 - Range Expectation)
suite.add_expectation(gxe.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={"column": "price", "min_value": 0, "max_value": 10000}
))

# Expectation: 'in_stock' column values must be boolean
suite.add_expectation(gxe.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_boolean",
    kwargs={"column": "in_stock"}
))

# Expectation: 'rating' column values must be between 0 and 5 (Task 20 - Another Range Expectation)
suite.add_expectation(gxe.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={"column": "rating", "min_value": 0, "max_value": 5}
))

# Expectation: 'last_updated' column values must be in a date format
suite.add_expectation(gxe.ExpectationConfiguration(
    expectation_type="expect_column_values_to_match_regex",
    kwargs={"column": "last_updated", "regex": r"^\d{4}-\d{2}-\d{2}$"}
))


# Save the Expectation Suite
# This saves the suite definition to the great_expectations/expectations directory
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=suite_name)
print(f"Expectation Suite '{suite_name}' saved.")
print("-" * 30)

# 4. Validate the dataset against the expectation suite (Task 19 - Validate a dataset)

print("Validating the dataset against the expectation suite...")

# Create a Batch Request for the DataFrame
# We use a RuntimeBatchRequest because our data is already in a DataFrame
batch_request = gx.BatchRequest(
    datasource_name=datasource_name,
    data_connector_name="default_runtime_data_connector",
    data_asset_name="my_product_data", # A name for this specific data asset
    runtime_parameters={"batch_data": df}, # Pass the DataFrame here
    batch_identifiers={"batch_id": "first_batch"} # A unique identifier for this batch
)

# Create and run a Checkpoint
# Checkpoints are the primary way to validate data in GE
checkpoint_name = "my_product_checkpoint"
checkpoint_config = {
    "name": checkpoint_name,
    "class_name": "SimpleCheckpoint",
    "validations": [
        {
            "batch_request": batch_request,
            "expectation_suite_name": suite_name,
        }
    ],
}

# Add the checkpoint to the context
try:
    context.add_checkpoint(**checkpoint_config)
    print(f"Checkpoint '{checkpoint_name}' added.")
except gx.exceptions.DataContextError as e:
     print(f"Checkpoint '{checkpoint_name}' already exists or error adding: {e}")
     # If it exists, retrieve it
     context.get_checkpoint(checkpoint_name)


# Run the checkpoint
checkpoint_result = context.run_checkpoint(checkpoint_name=checkpoint_name)

print("\nValidation Complete.")
print("-" * 30)

# 5. List unmet expectations (Task 19 - List unmet expectations)

print("Listing unmet expectations:")

# Check the validation result
if not checkpoint_result["success"]:
    print("Validation failed. Unmet expectations:")
    # Iterate through the results to find failed expectations
    for validation_result_identifier, validation_result in checkpoint_result["run_results"].items():
        # Access the results of the validation
        results = validation_result['validation_result']['results']
        for result in results:
            if not result['success']:
                expectation = result['expectation_config']
                print(f"- Expectation Type: {expectation['expectation_type']}")
                print(f"  Column: {expectation['kwargs'].get('column', 'N/A')}")
                print(f"  Details: {result['result']}")
                print(f"  Meta: {result['meta']}")
else:
    print("Validation successful! All expectations met.")

print("-" * 30)

# --- 21. Generating Data Docs ---
# Objective: Automatically generate data quality documentation.

print("Attempting to build Data Docs...")

# To build Data Docs, you typically need the full GE project structure
# created by `great_expectations init`.
# The context.build_data_docs() command generates the HTML documentation.
# You can then open the index.html file in your browser to view the docs.

try:
    # Build Data Docs
    docs_build_results = context.build_data_docs()
    print("\nData Docs built successfully.")
    # The URL to open will be printed in the console output of the GE command.
    # In this script, we'll just indicate where they are.
    data_docs_path = os.path.join(ge_root_dir, 'uncommitted', 'data_docs', 'local_site')
    print(f"Data Docs are located in: {os.path.abspath(data_docs_path)}")
    print("Open the 'index.html' file in that directory in your web browser to view them.")

except Exception as e:
    print(f"\nCould not build Data Docs. This might require a more complete GE project setup.")
    print(f"Error: {e}")
    print("Please ensure you have run 'great_expectations init' or have a valid GE project structure.")

print("-" * 30)
print("Script finished.")

# Optional: Clean up the temporary GE project directory
# print(f"Cleaning up temporary GE project directory: {ge_root_dir}")
# shutil.rmtree(ge_root_dir)







ModuleNotFoundError: No module named 'great_expectations'

In [2]:
import pandas as pd
import numpy as np

# --- Introduction ---
# This script demonstrates basic data quality checks using the pandas library in Python.
# We will cover:
# 1. Loading data (using a sample DataFrame)
# 2. Checking for missing values
# 3. Handling missing values (filling or dropping)
# 4. Checking for duplicate rows
# 5. Identifying and handling duplicates

# --- 1. Loading Data ---
# Create a sample DataFrame for demonstration purposes
data = {
    'CustomerID': [101, 102, 103, 104, 105, 101, 106, 107, 108, 109],
    'ProductName': ['Laptop', 'Keyboard', 'Mouse', 'Monitor', 'Webcam', 'Laptop', 'Printer', 'Mouse', 'Keyboard', 'Monitor'],
    'Price': [1200, 75, np.nan, 300, 50, 1200, 250, 75, 75, 300],
    'Quantity': [1, 2, 1, 1, 3, 1, 1, 2, 2, np.nan],
    'OrderDate': ['2023-01-10', '2023-01-11', '2023-01-11', '2023-01-12', '2023-01-12', '2023-01-10', '2023-01-13', '2023-01-11', '2023-01-11', '2023-01-12']
}

df = pd.DataFrame(data)

print("--- Original DataFrame ---")
print(df)
print("\n")

# --- 2. Checking for Missing Values ---
# Check for missing values in the entire DataFrame
print("--- Missing Values Count per Column ---")
print(df.isnull().sum())
print("\n")

# Check for missing values in specific columns
print("--- Missing Values in 'Price' column ---")
print(df['Price'].isnull().sum())
print("\n")

# Display rows with missing values
print("--- Rows with any Missing Value ---")
print(df[df.isnull().any(axis=1)])
print("\n")

# --- 3. Handling Missing Values ---

# Option A: Dropping rows with missing values
# Note: This is often not ideal as it can lead to data loss.
df_dropped = df.dropna()
print("--- DataFrame after Dropping Rows with Missing Values ---")
print(df_dropped)
print("\n")

# Option B: Filling missing values
# Fill missing 'Price' with the mean price
mean_price = df['Price'].mean()
df['Price'].fillna(mean_price, inplace=True)

# Fill missing 'Quantity' with a default value (e.g., 1)
df['Quantity'].fillna(1, inplace=True)

print("--- DataFrame after Filling Missing Values ---")
print(df)
print("\n")

# --- 4. Checking for Duplicate Rows ---
# Check for exact duplicate rows
print("--- Check for Exact Duplicate Rows ---")
print(df.duplicated().sum())
print("\n")

# --- 5. Identifying and Handling Duplicates ---

# Identify duplicate rows
print("--- Identified Duplicate Rows ---")
# `keep=False` marks all occurrences of a duplicate as True
print(df[df.duplicated(keep=False)])
print("\n")

# Handle duplicates: Drop duplicate rows (keeping the first occurrence by default)
df_no_duplicates = df.drop_duplicates()
print("--- DataFrame after Dropping Duplicate Rows ---")
print(df_no_duplicates)
print("\n")

# --- Conclusion ---
# This script provided a basic overview of checking and handling missing values
# and duplicates using pandas. More advanced data quality techniques exist
# for specific scenarios.











--- Original DataFrame ---
   CustomerID ProductName   Price  Quantity   OrderDate
0         101      Laptop  1200.0       1.0  2023-01-10
1         102    Keyboard    75.0       2.0  2023-01-11
2         103       Mouse     NaN       1.0  2023-01-11
3         104     Monitor   300.0       1.0  2023-01-12
4         105      Webcam    50.0       3.0  2023-01-12
5         101      Laptop  1200.0       1.0  2023-01-10
6         106     Printer   250.0       1.0  2023-01-13
7         107       Mouse    75.0       2.0  2023-01-11
8         108    Keyboard    75.0       2.0  2023-01-11
9         109     Monitor   300.0       NaN  2023-01-12


--- Missing Values Count per Column ---
CustomerID     0
ProductName    0
Price          1
Quantity       1
OrderDate      0
dtype: int64


--- Missing Values in 'Price' column ---
1


--- Rows with any Missing Value ---
   CustomerID ProductName  Price  Quantity   OrderDate
2         103       Mouse    NaN       1.0  2023-01-11
9         109     Monito

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Price'].fillna(mean_price, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Quantity'].fillna(1, inplace=True)
