In [106]:
#!pip install great_expectations
import json
import great_expectations as ge
from ruamel import yaml
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest

import pandas as pd
from pandas_profiling import ProfileReport

In [107]:
context = ge.get_context()


In [108]:
datasource_yaml = f"""
name: taxi_datasource
class_name: Datasource
module_name: great_expectations.datasource
execution_engine:
  module_name: great_expectations.execution_engine
  class_name: PandasExecutionEngine
data_connectors:
    default_runtime_data_connector_name:
        class_name: RuntimeDataConnector
        batch_identifiers:
            - default_identifier_name
    default_inferred_data_connector_name:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: C:\\Users\\scarl\\Downloads\\Data\\Regression\\Data
        default_regex:
          group_names:
            - data_asset_name
          pattern: (.*)
"""

In [109]:
context.test_yaml_config(datasource_yaml)


Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	default_inferred_data_connector_name : InferredAssetFilesystemDataConnector

	Available data_asset_names (1 of 1):
		Regression.csv (1 of 1): ['Regression.csv']

	Unmatched data_references (0 of 0):[]

	default_runtime_data_connector_name:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x1cabeb49340>

In [110]:
context.test_yaml_config(datasource_yaml)


Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	default_inferred_data_connector_name : InferredAssetFilesystemDataConnector

	Available data_asset_names (1 of 1):
		Regression.csv (1 of 1): ['Regression.csv']

	Unmatched data_references (0 of 0):[]

	default_runtime_data_connector_name:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x1cabebb2c70>

In [111]:
context.add_datasource(**yaml.load(datasource_yaml))
#context.add_datasource(**datasource_config)


The default 'Loader' for 'load(stream)' without further arguments can be unsafe.
Use 'load(stream, Loader=ruamel.yaml.Loader)' explicitly if that is OK.
Alternatively include the following in your code:


In most other cases you should consider using 'safe_load(stream)'
  context.add_datasource(**yaml.load(datasource_yaml))


<great_expectations.datasource.new_datasource.Datasource at 0x1cabec6a3a0>

In [112]:
batch_request = RuntimeBatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="ras",  # This can be anything that identifies this data_asset for you
    runtime_parameters={"path": "<PATH_TO_YOUR_DATA_HERE>"},  # Add your path here.
    batch_identifiers={"default_identifier_name": "default_identifier"},
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your path directly in the BatchRequest above.
batch_request.runtime_parameters["path"] = "C:\\Users\\scarl\\Downloads\\Data\\Regression\\Data\\Regression.csv"


In [113]:
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

         Date Month   Week  Quantity  Year Product  RatePerKg  Amount  Profit
0  2018-01-15   Jan  Week1    1000.0  2018    Pork       4.85  4850.0  1940.0
1  2018-01-15   Jan  Week2     650.0  2018    Pork       4.45  2892.5  1157.0
2  2018-01-31   Jan  Week3     100.0  2018    Pork       4.35   435.0   174.0
3  2018-01-31   Jan  Week4     120.0  2018    Pork       4.45   534.0   213.6
4  2018-02-15   Feb  Week1     200.0  2018    Pork       4.75   950.0   380.0


In [114]:
assert isinstance(validator, ge.validator.validator.Validator)


In [115]:
# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="ras",
)


In [116]:
# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "Regression.csv"


In [117]:
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)

{
  "expectation_suite_name": "test_suite",
  "expectations": [],
  "ge_cloud_id": null,
  "meta": {
    "great_expectations_version": "0.13.45"
  },
  "data_asset_type": null
}

In [118]:
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

         Date Month   Week  Quantity  Year Product  RatePerKg  Amount  Profit
0  2018-01-15   Jan  Week1    1000.0  2018    Pork       4.85  4850.0  1940.0
1  2018-01-15   Jan  Week2     650.0  2018    Pork       4.45  2892.5  1157.0
2  2018-01-31   Jan  Week3     100.0  2018    Pork       4.35   435.0   174.0
3  2018-01-31   Jan  Week4     120.0  2018    Pork       4.45   534.0   213.6
4  2018-02-15   Feb  Week1     200.0  2018    Pork       4.75   950.0   380.0


In [132]:
validator.expect_column_values_to_be_between(
    column="Date",
    min_value="2018-01-14",
    max_value="2022-12-31",
    mostly=0.95,
    parse_strings_as_datetimes=True,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




{
  "meta": {},
  "result": {
    "element_count": 1152,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [136]:
validator.expect_column_value_lengths_to_be_between(
    column="Month", max_value=9, min_value=3
)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "element_count": 1152,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [137]:
validator.expect_column_mean_to_be_between(
    column="Profit",
    max_value = 2000,
    min_value = 700.36
)

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "observed_value": 1947.6152765700685
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [138]:
validator.expect_column_values_to_be_in_set(
    column="Product",
    value_set =  [
          "Pork",
          "Turkey",
          "Chicken",
          "Mutton",
          "Tuna",
          "Caviar"
        ]
  
)


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "element_count": 1152,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [145]:
#create custom expectations


In [150]:
from great_expectations.execution_engine import (
   PandasExecutionEngine,
   SparkDFExecutionEngine,
   SqlAlchemyExecutionEngine,
)
from great_expectations.expectations.metrics import (
   ColumnAggregateMetricProvider,
   column_aggregate_value, column_aggregate_partial
)
from great_expectations.expectations.metrics.import_manager import F, sa

class ColumnCustomMax(ColumnAggregateMetricProvider):
    """MetricProvider Class for Custom Aggregate Max MetricProvider"""

    metric_name = "column.aggregate.custom.max"

    @column_aggregate_value(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        """Pandas Max Implementation"""
        return column.max()

    @column_aggregate_partial(engine=SqlAlchemyExecutionEngine)
    def _sqlalchemy(cls, column, **kwargs):
        """SqlAlchemy Max Implementation"""
        return sa.func.max(column)

    @column_aggregate_partial(engine=SparkDFExecutionEngine)
    def _spark(cls, column, _table, _column_name, **kwargs):
        """Spark Max Implementation"""
        types = dict(_table.dtypes)
        return F.maxcolumn()

metric column.aggregate.custom.max is being registered with different metric_provider; overwriting metric_provider
metric column.aggregate.custom.max.aggregate_fn is being registered with different metric_provider; overwriting metric_provider
metric column.aggregate.custom.max.aggregate_fn is being registered with different metric_provider; overwriting metric_provider
