In [7]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.






# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.


import pandas as pd
from great_expectations.dataset import PandasDataset

# Load data
df = pd.read_csv("my_data.csv")

# Wrap it with Great Expectations
class CustomDataset(PandasDataset): pass
gdf = CustomDataset(df)

# Validate against basic expectations
gdf.expect_table_row_count_to_be_between(min_value=10, max_value=1000)
gdf.expect_column_to_exist("age")

# See results
print(gdf.validate())






ModuleNotFoundError: No module named 'great_expectations.dataset'

In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.

import requests

API_KEY = "your_dqlabs_api_key"
BASE_URL = "https://api.dqlabs.ai/v1"

# Upload dataset
def upload_dataset(file_path):
    files = {'file': open(file_path, 'rb')}
    headers = {'Authorization': f'Bearer {API_KEY}'}
    response = requests.post(f"{BASE_URL}/datasets/upload", files=files, headers=headers)
    return response.json()

# Trigger profiling
def trigger_profiling(dataset_id):
    headers = {'Authorization': f'Bearer {API_KEY}'}
    response = requests.post(f"{BASE_URL}/datasets/{dataset_id}/profile", headers=headers)
    return response.json()

# Create a quality rule
def create_rule(dataset_id, rule_name, sql_query):
    headers = {'Authorization': f'Bearer {API_KEY}', 'Content-Type': 'application/json'}
    payload = {
        "name": rule_name,
        "dataset_id": dataset_id,
        "query": sql_query
    }
    response = requests.post(f"{BASE_URL}/rules", json=payload, headers=headers)
    return response.json()

# Example usage
upload_resp = upload_dataset("sample_data.csv")
profile_resp = trigger_profiling(upload_resp["id"])
rule_resp = create_rule(upload_resp["id"], "Check Duplicate IDs", "SELECT id, COUNT(*) FROM sample GROUP BY id HAVING COUNT(*) > 1")







