In [1]:
# Complete Professional PySpark Workflow for Great Expectations
# This demonstrates the logical flow: Data Assistants -> Validation -> Checkpoints -> Documentation
# Adapted for PySpark DataFrame operations

import great_expectations as gx
import pandas as pd
import json
from pathlib import Path
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


## Prerequisites for PostgreSQL Connection

**Note:** This notebook uses pandas + SQLAlchemy to connect to PostgreSQL, which is more reliable than Spark JDBC for most environments.

**Required packages:**
- `pandas` - for data manipulation
- `psycopg2` or `psycopg2-binary` - PostgreSQL adapter for Python
- `sqlalchemy` - SQL toolkit and ORM

**Installation:**
```bash
pip install pandas psycopg2-binary sqlalchemy
```

**Alternative approaches:**

1. **Direct Spark JDBC** (requires PostgreSQL JDBC driver):
```bash
# Download PostgreSQL JDBC driver
wget https://jdbc.postgresql.org/download/postgresql-42.6.0.jar
# Add to Spark classpath or use spark.jars configuration
```

2. **Using Spark packages:**
```bash
spark-submit --packages org.postgresql:postgresql:42.6.0
```

The notebook will automatically fall back to sample data if PostgreSQL connection fails.


In [2]:
# Initialize Spark Session and Great Expectations Context
spark = SparkSession.builder \
    .appName("GreatExpectationsPySpark") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# Initialize Great Expectations context
context = gx.get_context()

print("Spark Session initialized successfully")
print(f"Spark Version: {spark.version}")
print(f"Great Expectations Version: {gx.__version__}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/03 17:17:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session initialized successfully
Spark Version: 4.0.0
Great Expectations Version: 0.18.22


In [3]:
# Step 1: Connect to PostgreSQL Data Source
# Alternative approach: Use pandas to read from PostgreSQL, then convert to Spark DataFrame
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# PostgreSQL connection string
connection_string = "postgresql+psycopg2://try_gx:try_gx@postgres.workshops.greatexpectations.io/gx_example_db"


# Method 1: Use pandas + SQLAlchemy to read data
print("Attempting to connect to PostgreSQL using pandas + SQLAlchemy...")

# Create SQLAlchemy engine
engine = create_engine(connection_string)

# Read data using pandas
pandas_df = pd.read_sql("SELECT * FROM nyc_taxi_data LIMIT 1000", engine)

# Convert pandas DataFrame to Spark DataFrame
df = spark.createDataFrame(pandas_df)

print("✅ Successfully connected to PostgreSQL!")
    

# Show sample data
print("\nNYC Taxi Data:")
df.show(5, truncate=False)
df.printSchema()

print(f"\nData loaded successfully:")
print(f"Total rows: {df.count()}")
print(f"Total columns: {len(df.columns)}")


Attempting to connect to PostgreSQL using pandas + SQLAlchemy...
✅ Successfully connected to PostgreSQL!

NYC Taxi Data:


                                                                                

+-----+---------+---------------+-------------+------------+------------------+------------------+-------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-------------------+-------------------+
|index|vendor_id|passenger_count|trip_distance|rate_code_id|store_and_fwd_flag|pickup_location_id|dropoff_location_id|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|pickup             |dropoff            |
+-----+---------+---------------+-------------+------------+------------------+------------------+-------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-------------------+-------------------+
|1    |1        |1              |0.8          |1           |N                 |112               |112                |1           |6.0        |1.0  |0.5    |1.55     

In [4]:
# Step 2: Add Spark Data Source to Great Expectations
# Add Spark datasource to Great Expectations context
try:
    spark_source = context.get_datasource('spark_professional')
except:
    spark_source = context.sources.add_spark('spark_professional')

# Add DataFrame asset
try:
    spark_asset = spark_source.get_asset('nyc_taxi_spark')
except:
    spark_asset = spark_source.add_dataframe_asset(name='nyc_taxi_spark', dataframe=df)

# Build batch request
spark_batch_request = spark_asset.build_batch_request()
spark_batch_list = spark_asset.get_batch_list_from_batch_request(spark_batch_request)
spark_batch = spark_batch_list[0]

print("Spark data source connected successfully")
print(f"Spark batch: {spark_batch.id}")
print(f"DataFrame shape: {df.count()} rows, {len(df.columns)} columns")


Spark data source connected successfully
Spark batch: spark_professional-nyc_taxi_spark
DataFrame shape: 1000 rows, 19 columns


## Data Assistants in Great Expectations with PySpark

Data Assistants in Great Expectations are tools designed to automate the process of profiling and understanding your data. They analyze datasets to generate useful insights and create initial sets of expectations, which are rules or assertions about your data's properties.

**Main types of Data Assistants:**
- **Missingness Data Assistant:** Profiles your data to detect missing values and suggests expectations related to nullity and completeness for each column.
- **Onboarding Data Assistant:** Provides a broad overview of your dataset, including data types, cardinality, and basic statistics, and generates a comprehensive set of expectations to help you get started with data quality monitoring.

Data Assistants are typically run before manual validation, as they help you quickly establish a baseline of data quality checks and understand the structure and issues in your data.

For more details, see the [Great Expectations documentation](https://docs.greatexpectations.io/docs/oss/guides/expectations/data_assistants/overview/) (Great Expectations, n.d.).

**Reference:**  
Great Expectations. (n.d.). Data Assistants Overview. https://docs.greatexpectations.io/docs/oss/guides/expectations/data_assistants/overview/


In [5]:
# Step 3: Run Data Assistants FIRST (before any validation)
# Create expectation suites for automatic profiling
spark_profiling_suite = context.add_expectation_suite('spark_auto_profiling')

# Create validator for Data Assistants
spark_validator = context.get_validator(
    batch_request=spark_batch_request,
    expectation_suite_name='spark_auto_profiling'
)

# Run Data Assistants
print("\nRunning Data Assistants for automatic profiling...")

# Missingness Data Assistant for Spark DataFrame
spark_missingness_result = context.assistants.missingness.run(
    validator=spark_validator,
    exclude_column_names=['pickup_datetime', 'dropoff_datetime']
)
spark_missingness_suite = spark_missingness_result.get_expectation_suite(
    expectation_suite_name='spark_missingness_final'
)
context.save_expectation_suite(spark_missingness_suite)

# Onboarding Data Assistant for Spark DataFrame
spark_onboarding_result = context.assistants.onboarding.run(
    validator=spark_validator,
    exclude_column_names=['pickup_datetime', 'dropoff_datetime']
)
spark_onboarding_suite = spark_onboarding_result.get_expectation_suite(
    expectation_suite_name='spark_onboarding_final'
)
context.save_expectation_suite(spark_onboarding_suite)

print(f"- Spark Missingness: {len(spark_missingness_suite.expectations)} expectations")
print(f"- Spark Onboarding: {len(spark_onboarding_suite.expectations)} expectations")



Running Data Assistants for automatic profiling...



Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

25/10/03 17:18:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/19 [00:00<?, ?it/s]

25/10/03 17:18:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:20 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]




Generating Expectations:   0%|          | 0/8 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

25/10/03 17:18:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:29 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:30 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:30 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/3 [00:00<?, ?it/s]

25/10/03 17:18:47 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:18:48 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:48 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:48 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:18:49 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:18:49 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:49 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:49 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:18:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:18:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:18:51 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/49 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

25/10/03 17:18:51 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/49 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/15 [00:00<?, ?it/s]

25/10/03 17:18:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:54 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:18:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:18:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:19:01 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:19:03 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:19:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:19:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:19:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:19:42 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/7 [00:05<?, ?it/s]

25/10/03 17:19:53 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:19:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:07 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:20:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:20:26 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:35 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:39 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:41 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:20:41 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:43 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:47 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:20:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:51 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:20:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:20:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:20:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:20:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:21:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:21:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:03 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:03 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:03 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:21:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:21:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/03 17:21:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/12 [00:00<?, ?it/s]

25/10/03 17:21:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:21:10 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:21:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.                                  
25/10/03 17:21:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:21:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:21:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values below lowest bin.
Discarding histogram values above highest bin.
25/10/03 17:21:29 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:30 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:21:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:21:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:21:33 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:35 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:37 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:39 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:40 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:21:41 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:21:41 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:21:42 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:42 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:42 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:43 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:43 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:43 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:44 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:21:44 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:21:45 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:21:45 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:46 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:46 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:46 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:46 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:47 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:47 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:21:47 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:21:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:21:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:21:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:21:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:21:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:21:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:21:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:22:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:22:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:22:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:22:03 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:22:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:22:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:22:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:22:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:22:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:22:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:22:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/03 17:22:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:22:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/2 [00:00<?, ?it/s]

25/10/03 17:22:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

25/10/03 17:22:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/03 17:22:29 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

25/10/03 17:22:29 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

                                                                                

Profiling Dataset:         0%|          | 0/9 [00:00<?, ?it/s]

25/10/03 17:22:35 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:35 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:36 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:37 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:37 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:37 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:38 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:38 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:39 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:39 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:40 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:41 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:41 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:44 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:46 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:47 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:47 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:47 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:48 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:48 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:49 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:49 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:49 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:50 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:22:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/03 17:22:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/03 17:22:59 WARN CacheManager: Asked to cache already cached data.        


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:23:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/03 17:23:01 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

- Spark Missingness: 19 expectations
- Spark Onboarding: 55 expectations


In [6]:
# Step 4: Add Manual Business Rules
manual_suite = context.add_expectation_suite('manual_business_rules_spark')
manual_validator = context.get_validator(
    batch_request=spark_batch_request,
    expectation_suite_name='manual_business_rules_spark'
)

# Add specific business expectations for Spark DataFrame
manual_validator.expect_column_values_to_be_between(
    column='passenger_count', min_value=1, max_value=6
)
manual_validator.expect_column_values_to_be_between(
    column='fare_amount', min_value=0
)
manual_validator.expect_column_values_to_not_be_null(
    column='trip_distance'
)
manual_validator.expect_column_values_to_be_between(
    column='trip_distance', min_value=0, max_value=100
)

# Additional Spark-specific expectations
manual_validator.expect_column_values_to_be_between(
    column='vendor_id', min_value=1, max_value=2
)
manual_validator.expect_column_values_to_be_between(
    column='total_amount', min_value=0
)

manual_validator.save_expectation_suite()
manual_suite = context.get_expectation_suite('manual_business_rules_spark')
print(f"\nManual business rules added: {len(manual_suite.expectations)} expectations")


25/10/03 17:23:02 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

                                                                                

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]


Manual business rules added: 3 expectations


In [7]:
# Step 5: Create and Run Checkpoints
print("\nCreating and running checkpoints...")

checkpoint_results = {}

# Define all checkpoints to run
checkpoints_to_run = [
    ('spark_missingness_cp', 'spark_missingness_final', spark_batch_request, spark_source.name, spark_asset.name),
    ('spark_onboarding_cp', 'spark_onboarding_final', spark_batch_request, spark_source.name, spark_asset.name),
    ('manual_business_spark_cp', 'manual_business_rules_spark', spark_batch_request, spark_source.name, spark_asset.name)
]

# Create and run each checkpoint
for cp_name, suite_name, batch_req, source_name, asset_name in checkpoints_to_run:
    checkpoint = context.add_or_update_checkpoint(
        name=cp_name,
        config_version=1.0,
        class_name='Checkpoint',
        run_name_template=f'%Y%m%d-%H%M%S-{cp_name}',
        expectation_suite_name=suite_name,
        batch_request={
            'datasource_name': source_name,
            'data_asset_name': asset_name
        },
        action_list=[
            {
                'name': 'store_validation_result',
                'action': {'class_name': 'StoreValidationResultAction'}
            },
            {
                'name': 'update_data_docs',
                'action': {'class_name': 'UpdateDataDocsAction'}
            }
        ]
    )
    
    result = context.run_checkpoint(checkpoint_name=cp_name)
    checkpoint_results[cp_name] = result

print("\nCheckpoint Results:")
for name, result in checkpoint_results.items():
    status = "PASSED" if result.success else "FAILED"
    print(f"  {name}: {status}")



Creating and running checkpoints...


25/10/03 17:23:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/118 [00:00<?, ?it/s]

25/10/03 17:23:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/223 [00:00<?, ?it/s]

25/10/03 17:23:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/10/03 17:23:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/25 [00:00<?, ?it/s]


Checkpoint Results:
  spark_missingness_cp: PASSED
  spark_onboarding_cp: FAILED
  manual_business_spark_cp: PASSED


In [8]:
# Step 6: Save Results and Generate Report
results_summary = {
    "generation_time": datetime.now().isoformat(),
    "data_sources": {
        "spark": {"source": spark_source.name, "asset": spark_asset.name},
    },
    "expectation_suites": {
        "spark_missingness": len(spark_missingness_suite.expectations),
        "spark_onboarding": len(spark_onboarding_suite.expectations),
        "manual_business_rules": len(manual_suite.expectations)
    },
    "checkpoint_results": {name: result.success for name, result in checkpoint_results.items()},
    "spark_info": {
        "spark_version": spark.version,
        "dataframe_rows": df.count(),
        "dataframe_columns": len(df.columns)
    }
}

# Save results to file with error handling
results_file = Path("spark_validation_results.json")
try:
    with open(results_file, 'w') as f:
        json.dump(results_summary, f, indent=2)
    print(f"\n✅ Results saved to: {results_file}")
except Exception as e:
    print(f"\n❌ Error saving results: {e}")
    print("Results summary (not saved):")
    print(json.dumps(results_summary, indent=2))

print("\nResults Summary:")
print(f"- Data Sources: {len(results_summary['data_sources'])}")
print(f"- Expectation Suites: {sum(results_summary['expectation_suites'].values())} total expectations")
print(f"- Checkpoints Run: {len(results_summary['checkpoint_results'])}")
print(f"- Spark DataFrame: {results_summary['spark_info']['dataframe_rows']} rows, {results_summary['spark_info']['dataframe_columns']} columns")



✅ Results saved to: spark_validation_results.json

Results Summary:
- Data Sources: 1
- Expectation Suites: 77 total expectations
- Checkpoints Run: 3
- Spark DataFrame: 1000 rows, 19 columns


In [9]:
# Step 7: Build and Open Data Docs
print("\nBuilding Data Docs...")
context.build_data_docs()

# Open Data Docs in browser
print("\nOpening Data Docs in browser...")
context.open_data_docs()

print("\nPySpark Workflow complete! Data Docs are now open in your browser.")



Building Data Docs...

Opening Data Docs in browser...

PySpark Workflow complete! Data Docs are now open in your browser.


In [10]:
# Step 8: Additional PySpark Data Analysis Examples
print("\nAdditional PySpark Data Analysis:")

# Show basic statistics
print("\nBasic Statistics:")
df.describe().show()

# Show data quality metrics
print("\nData Quality Metrics:")
print(f"Total rows: {df.count()}")
print(f"Total columns: {len(df.columns)}")
print(f"Null values per column:")
for col_name in df.columns:
    null_count = df.filter(col(col_name).isNull()).count()
    print(f"  {col_name}: {null_count}")

# Show unique values for categorical columns
print("\nUnique values:")
print(f"Vendor IDs: {sorted([row[0] for row in df.select('vendor_id').distinct().collect()])}")
print(f"Passenger counts: {sorted([row[0] for row in df.select('passenger_count').distinct().collect()])}")

# Show fare amount statistics using proper PySpark function calls
from pyspark.sql.functions import min as spark_min, max as spark_max, avg as spark_avg
fare_stats = df.select(
    spark_min('fare_amount').alias('min_fare'),
    spark_max('fare_amount').alias('max_fare'),
    spark_avg('fare_amount').alias('avg_fare')
).collect()[0]

print(f"\nFare Statistics:")
print(f"  Min fare: ${fare_stats['min_fare']:.2f}")
print(f"  Max fare: ${fare_stats['max_fare']:.2f}")
print(f"  Avg fare: ${fare_stats['avg_fare']:.2f}")



Additional PySpark Data Analysis:

Basic Statistics:


                                                                                

+-------+-----------------+------------------+---------------+----------------+-----------------+------------------+------------------+-------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+---------------------+------------------+--------------------+
|summary|            index|         vendor_id|passenger_count|   trip_distance|     rate_code_id|store_and_fwd_flag|pickup_location_id|dropoff_location_id|       payment_type|       fare_amount|              extra|             mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|
+-------+-----------------+------------------+---------------+----------------+-----------------+------------------+------------------+-------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+---------------------+------------------+--------------

                                                                                

  pickup_location_id: 0
  dropoff_location_id: 0
  payment_type: 0
  fare_amount: 0
  extra: 0
  mta_tax: 0
  tip_amount: 0
  tolls_amount: 0
  improvement_surcharge: 0
  total_amount: 0
  congestion_surcharge: 0
  pickup: 0
  dropoff: 0

Unique values:
Vendor IDs: [1, 2, 4]
Passenger counts: [1]

Fare Statistics:
  Min fare: $-0.01
  Max fare: $102.50
  Avg fare: $12.33


In [None]:
# Step 9: Cleanup and Session Management
# Stop Spark session
spark.stop()


Cleaning up Spark session...
Spark session stopped successfully.

PySpark Great Expectations workflow completed!
All data validation, profiling, and documentation has been generated.
