In [1]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.



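# Install Great Expectations (uncomment the line below if you haven't installed it).
# NOTE: this notebook relies on the legacy Dataset API (`great_expectations.dataset`,
# `ge.from_pandas`). Newer Great Expectations releases no longer ship that module and
# fail with "ModuleNotFoundError: No module named 'great_expectations.dataset'", so pin
# an older release that still includes it (e.g. a 0.17.x version).
# %pip install "great_expectations<0.18"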

import great_expectations as ge
from great_expectations.dataset import PandasDataset
import pandas as pd

# Small in-memory sample used by the expectations below. The fourth Age is
# deliberately missing (None) so the data contains a null value.
data = dict(
    Age=[25, 30, 35, None, 45],
    Salary=[50000, 60000, 70000, 75000, 80000],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
)

# One column per key, in the order the keys were declared above.
df = pd.DataFrame.from_dict(data)

# Wrap the pandas DataFrame in a Great Expectations dataset. Every
# `expect_*` call made on `ge_df` below is recorded on the dataset itself,
# which is why the suite is later pulled back out of `ge_df` before saving.
ge_df = ge.from_pandas(df)

# Task 19: Setting Up Expectations
# Register a named expectation suite in the data context.
suite_name = "basic_expectations_suite"
# NOTE(review): this appears to require an already-initialised Great
# Expectations project at this path -- confirm, or point it at your project.
context = ge.data_context.DataContext("/tmp/great_expectations")  # Temporary context for demonstration
# overwrite_existing=True keeps the cell re-runnable: without it, a second
# run fails because the suite saved at the end of the first run already exists.
expectation_suite = context.create_expectation_suite(suite_name, overwrite_existing=True)

# Task 20: Testing for Expectation
# Age column values should be between 20 and 50 (inclusive).
ge_df.expect_column_values_to_be_between("Age", min_value=20, max_value=50)

# Salary should be strictly greater than 40000. A lower bound with
# strict_min=True expresses "greater than"; a fixed set of allowed values
# would instead reject any valid salary not present in the sample.
ge_df.expect_column_values_to_be_between("Salary", min_value=40000, strict_min=True)

# Validate the dataset against the expectations declared above and keep only
# the results that failed, so the printed list really is the unmet ones.
validation_result = ge_df.validate()
unmet_expectations = [
    result for result in validation_result.results if not result.success
]
print(f"Unmet Expectations: {unmet_expectations}")

# Save the expectation suite. The suite object returned by
# create_expectation_suite() is empty; the expectations live on `ge_df`, so
# export them from the dataset (keeping failed ones too) and store them under
# the registered suite name.
expectation_suite = ge_df.get_expectation_suite(discard_failed_expectations=False)
context.save_expectation_suite(expectation_suite, expectation_suite_name=suite_name)

# Task 21: Generating Data Docs
# Build the documentation last so that it is generated after the populated
# suite has been written to the context.
context.build_data_docs()




# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.








ModuleNotFoundError: No module named 'great_expectations.dataset'

In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.




# Note: DQ Labs is a tool for Data Quality, so specific integration will depend on your environment.
# You need to install DQ Labs (this example assumes DQ Labs is already installed and configured)

# 22. Tool Setup and Configuration:
# Typically, setting up DQ Labs involves downloading the package and configuring your environment.
# In a real-world scenario, you'd run the DQ Labs setup from a command line or an environment-specific tool.
# Below is an example of setting up a data quality project using DQ Labs in a Python script.

# Assuming DQ Labs is set up locally:
from dqlabs import DataQualityProject

# --- 22. Tool Setup and Configuration ---------------------------------------
# NOTE(review): the DQ Labs calls in this cell are a conceptual sketch; the
# method names and arguments are assumed -- confirm against the real package.

# Create the project handle (name and path are placeholders).
dq_project = DataQualityProject(
    project_name="Data_Quality_Project_1",
    project_path="path_to_your_project",
)

# Example environment configuration: source and file format of the data.
dq_project.configure_environment(data_source="local_db", data_format="csv")

# The same placeholder file is registered here and profiled below.
dataset_file = "your_data.csv"
dq_project.add_data(dataset_file)

# --- 23. Data Analysis Automation -------------------------------------------
# Profile the registered file, run the built-in quality checks, then print
# whatever the project reports as profiling results.
dq_project.profile_data(dataset_file)
dq_project.run_data_quality_checks()

profiling_results = dq_project.view_profile_results()
print(profiling_results)

# --- 24. Quality Rule Creation ----------------------------------------------
# Custom rules are declared as data and registered in order:
#   1. flag duplicates in the 'Name' column
#   2. require 'Age' to lie between 18 and 100
rule_definitions = [
    dict(rule_name="Check_Duplicates_in_Name", rule_type="duplicate", column="Name"),
    dict(
        rule_name="Check_Age_Range",
        rule_type="range",
        column="Age",
        min_value=18,
        max_value=100,
    ),
]
for rule_kwargs in rule_definitions:
    dq_project.create_quality_rule(**rule_kwargs)

# Run the registered rules and print their results.
dq_project.apply_quality_rules()

rule_results = dq_project.view_quality_rule_results()
print(rule_results)

# Note: The above code is a conceptual example and assumes that DQ Labs has methods like `configure_environment`, 
# `add_data`, `profile_data`, `run_data_quality_checks`, `create_quality_rule`, and others.
# Replace placeholders like "path_to_your_project" and "your_data.csv" with your actual project path and data.





# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.








