# Great Expectations Approach

- Define the BaseDataContext for GE. This includes:
  - DataSourceConfig
  - Stores
    - expectations_store
    - validations_store
    - evaluation_parameter_store
    - profile_store
    - checkpoint_store
  - Data docs sites
  - Checkpoints
  
### Every pipeline will have two or more expectation suites
  - One for data profiling or collecting metrics
  - One for validating the data
  - Define two checkpoints, one for profile and the other for validation.
  - The profiling checkpoint runs only on the first day of the month.
  - The validation checkpoint runs every day.
  
### TODO / Questions:
- How to create an expectation suite for data profiling and view it in Data Docs?
  - I see this as a one-time activity that would be revisited as needs change.
  - This suite will be used at a regular cadence (say, once a month).
- How to create an expectation suite for validating the data?
  - This suite will be used every day to validate the data.
- Define actions based on errors - Pending
  - Can send email based on error.
  - How to filter out failed records?
  
#### Validation Operator / Checkpoint
- The Validation Operator will eventually be deprecated.
- Checkpoint is the way forward.

In [0]:
import datetime
import time

import pandas as pd
from ruamel import yaml

from great_expectations.data_context.types.base import DataContextConfig, DatasourceConfig, FilesystemStoreBackendDefaults
from great_expectations.data_context.store.tuple_store_backend import TupleAzureBlobStoreBackend
from great_expectations.data_context import BaseDataContext
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler
from great_expectations.core.batch import BatchRequest
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.checkpoint import Checkpoint
from great_expectations.execution_engine import (
   PandasExecutionEngine,
   SparkDFExecutionEngine,
   SqlAlchemyExecutionEngine,
)


# Spark-backed datasource. It exposes a single RuntimeDataConnector, and each
# batch handed to it is identified by one "batch_id" batch identifier.
my_spark_datasource_config = DatasourceConfig(
    class_name="Datasource",
    execution_engine={"class_name": "SparkDFExecutionEngine"},
    data_connectors={
        "default_runtime_data_connector_name": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["batch_id"],
        },
    },
)

  
def _azure_blob_store_backend(prefix):
    """Return the Azure Blob store-backend config used by the stores below.

    Every blob-backed store uses the same container and connection string;
    only the key ``prefix`` inside the container differs.

    Args:
        prefix: Blob-name prefix under which the store keeps its objects.

    Returns:
        A ``store_backend`` dict for a ``TupleAzureBlobStoreBackend``.
    """
    # STORAGE_CONTAINER and AZURE_STORAGE_CONNECTION_STRING are expected to be
    # defined earlier in the notebook (not visible in this section).
    return {
        "class_name": "TupleAzureBlobStoreBackend",
        "container": STORAGE_CONTAINER,
        "connection_string": AZURE_STORAGE_CONNECTION_STRING,
        "prefix": prefix,
    }


# In-code Great Expectations project configuration: one Spark datasource,
# blob-backed stores, a Data Docs site and a legacy validation operator.
data_context_config = DataContextConfig(
    datasources={"spark_test_datasource": my_spark_datasource_config},
    stores={
        "expectations_store": {
            "class_name": "ExpectationsStore",
            "store_backend": _azure_blob_store_backend("expectations"),
        },
        "validations_store": {
            "class_name": "ValidationsStore",
            "store_backend": _azure_blob_store_backend("validations"),
        },
        "profile_store": {
            "class_name": "ProfilerStore",
            "store_backend": _azure_blob_store_backend("profilers"),
        },
        # No store_backend is given for the evaluation parameter store.
        "evaluation_parameter_store": {
            "class_name": "EvaluationParameterStore",
        },
        # Metrics go to a database rather than blob storage. The MS_* names
        # are expected to be defined earlier in the notebook. This store is
        # declared here but not referenced by any *_store_name argument below.
        # (A TupleAzureBlobStoreBackend with prefix "metrics" was previously
        # tried as an alternative backend.)
        "metric_store": {
            "class_name": "MetricStore",
            "store_backend": {
                "class_name": "DatabaseStoreBackend",
                "credentials": {
                    "username": MS_USERNAME,
                    "password": MS_PASSWORD,
                    "port": MS_PORT,
                    "host": MS_HOST,
                    "database": MS_DATABASE,
                    "drivername": MS_DRIVERNAME,
                },
            },
        },
        "checkpoint_store": {
            "class_name": "CheckpointStore",
            "store_backend": _azure_blob_store_backend("checkpoint"),
        },
    },
    expectations_store_name="expectations_store",
    validations_store_name="validations_store",
    checkpoint_store_name="checkpoint_store",
    evaluation_parameter_store_name="evaluation_parameter_store",
    profiler_store_name="profile_store",
    data_docs_sites={
        "az_site": {
            "class_name": "SiteBuilder",
            "store_backend": {
                "class_name": "TupleAzureBlobStoreBackend",
                # The runtime value is backslash + "$web". The backslash is
                # kept so Great Expectations does not treat "$web" as a
                # config variable to substitute; it is written as "\\$" because
                # "\$" is an invalid Python escape sequence that triggers a
                # warning on recent interpreters.
                "container": "\\$web",
                "connection_string": AZURE_STORAGE_CONNECTION_STRING,
            },
            "site_index_builder": {
                "class_name": "DefaultSiteIndexBuilder",
            },
            "site_section_builders": {
                "expectations": {
                    "class_name": "DefaultSiteSectionBuilder",
                    "source_store_name": "expectations_store",
                    "renderer": {
                        "module_name": "great_expectations.render.renderer",
                        "class_name": "ExpectationSuitePageRenderer",
                    },
                },
                # Validation and profiling results both live in
                # validations_store; the run name "profiling" decides which
                # section a result is rendered in.
                "validations": {
                    "class_name": "DefaultSiteSectionBuilder",
                    "source_store_name": "validations_store",
                    "run_name_filter": {
                        "not_equals": "profiling",
                    },
                    "renderer": {
                        "module_name": "great_expectations.render.renderer",
                        "class_name": "ValidationResultsPageRenderer",
                    },
                },
                "profiling": {
                    "class_name": "DefaultSiteSectionBuilder",
                    "source_store_name": "validations_store",
                    "run_name_filter": {
                        "equals": "profiling",
                    },
                    "renderer": {
                        "module_name": "great_expectations.render.renderer",
                        "class_name": "ProfilingResultsPageRenderer",
                    },
                },
            },
        },
    },
    # Legacy validation operator: store the result, store evaluation
    # parameters, then refresh Data Docs, in that order.
    validation_operators={
        "action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {"class_name": "StoreValidationResultAction"},
                },
                {
                    "name": "store_evaluation_params",
                    "action": {"class_name": "StoreEvaluationParametersAction"},
                },
                {
                    "name": "update_data_docs",
                    "action": {"class_name": "UpdateDataDocsAction"},
                },
            ],
        },
    },
    # NOTE: store_backend_defaults is intentionally not passed. It expects a
    # BaseStoreBackendDefaults-style object that only supplies fallbacks for
    # stores, store names and data_docs_sites, all of which are set explicitly
    # above. The previous value was a TupleAzureBlobStoreBackend instance,
    # which is not a defaults object and had no effect on the configuration.
)

# Build the data context from the in-code data_context_config object.
context = BaseDataContext(project_config=data_context_config)



  "container":  "\$web",
