In [2]:
import great_expectations as ge
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
import pyspark
from pyspark.sql import functions as f, Window
import json
from pyspark.sql import SparkSession


In [3]:
# Get the active Spark session, creating one named 'com.spark-dataframe' if needed.
spark = (
    SparkSession.builder
    .appName('com.spark-dataframe')
    .getOrCreate()
)

23/03/12 20:10:50 WARN Utils: Your hostname, Antonys-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.19 instead (on interface en0)
23/03/12 20:10:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/12 20:10:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/12 20:10:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Read the semicolon-separated CSV (first row used as header), replace the
# 'unknown' placeholder in the 'job' column with null, and append an 'id'
# column produced by monotonically_increasing_id().
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('sep', ';')
    .load('./Data/bank/bank-full.csv')
    .withColumn(
        'job',
        f.when(f.col('job') == 'unknown', f.lit(None)).otherwise(f.col('job')),
    )
    .withColumn('id', f.monotonically_increasing_id())
)

In [5]:
def load_expectation_suite(path: str) -> dict:
    """Load an expectation suite stored in JSON format and convert it into a dictionary.

    Args:
        path (str): path to expectation suite json file
    Returns:
        dict: expectation suite
    Raises:
        OSError: if the file cannot be opened
        json.JSONDecodeError: if the file content is not valid JSON
    """
    # Decode explicitly as UTF-8 instead of the platform's locale default, so a
    # suite containing non-ASCII text loads identically on every machine.
    # The handle is deliberately not called `f`: this notebook uses `f` as the
    # alias for pyspark.sql.functions.
    with open(path, 'r', encoding='utf-8') as suite_file:
        return json.load(suite_file)

In [6]:
def great_expectation_validation(df: pyspark.sql.DataFrame,
                                 expectation_suite_path: str,
                                 result_format: str = 'SUMMARY',
                                 catch_exceptions: bool = True) -> dict:
    """Run validation on a DataFrame based on an expectation suite.

    Args:
        df (pyspark.sql.DataFrame): DataFrame to validate
        expectation_suite_path (str): path to expectation suite json file
        result_format (str): forwarded to ``SparkDFDataset.validate`` as
            ``result_format``. Defaults to 'SUMMARY', the value this function
            previously hard-coded.
        catch_exceptions (bool): forwarded to ``SparkDFDataset.validate`` as
            ``catch_exceptions``. Defaults to True, the value this function
            previously hard-coded.
    Returns:
        dict: Validation result, exactly as returned by
            ``SparkDFDataset.validate``.
    """
    expectation_suite = load_expectation_suite(expectation_suite_path)
    # Wrap the Spark DataFrame so Great Expectations can evaluate the suite on it.
    gdf = SparkDFDataset(df)
    return gdf.validate(expectation_suite=expectation_suite,
                        result_format=result_format,
                        catch_exceptions=catch_exceptions)

In [7]:
# Validate the loaded DataFrame against the suite stored in 'my_expectation_suite.json'.
validation_result = great_expectation_validation(
    df=df,
    expectation_suite_path='my_expectation_suite.json',
)

                                                                                

In [8]:
# Display the 'success' entry of the validation result.
validation_result['success']


False

In [9]:
# Display the 'statistics' entry of the validation result.
validation_result['statistics']


{'evaluated_expectations': 4,
 'successful_expectations': 3,
 'unsuccessful_expectations': 1,
 'success_percent': 75.0}

In [10]:
# Display the complete validation result object.
validation_result

{
  "statistics": {
    "evaluated_expectations": 4,
    "successful_expectations": 3,
    "unsuccessful_expectations": 1,
    "success_percent": 75.0
  },
  "results": [
    {
      "result": {
        "observed_value": [
          "age",
          "job",
          "marital",
          "education",
          "default",
          "balance",
          "housing",
          "loan",
          "contact",
          "day",
          "month",
          "duration",
          "campaign",
          "pdays",
          "previous",
          "poutcome",
          "y",
          "id"
        ]
      },
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "expectation_config": {
        "kwargs": {
          "column_set": [
            "age",
            "job",
            "marital",
            "education",
            "default",
            "balance",
            "housing",
            "loan",
            "