In [1]:
# !pip install pyspark findspark

In [None]:
# Complete Professional PySpark Workflow for Great Expectations
# This demonstrates the logical flow: Data Assistants -> Validation -> Checkpoints -> Documentation
# Adapted for PySpark DataFrame operations

import great_expectations as gx
import pandas as pd
import json
import pyspark as spark
from pathlib import Path
from datetime import datetime
import dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from dotenv import dotenv_values, load_dotenv




Using Ollama URL: http://localhost:11434
Using Ollama Model: gpt-oss:20b


## Prerequisites for PostgreSQL Connection

**Note:** This notebook uses pandas + SQLAlchemy to connect to PostgreSQL, which is more reliable than Spark JDBC for most environments.

**Required packages:**
- `pandas` - for data manipulation
- `psycopg2` or `psycopg2-binary` - PostgreSQL adapter for Python
- `sqlalchemy` - SQL toolkit and ORM

**Installation:**
```bash
pip install pandas psycopg2-binary sqlalchemy
```

**Alternative approaches:**

1. **Direct Spark JDBC** (requires PostgreSQL JDBC driver):
```bash
# Download PostgreSQL JDBC driver
wget https://jdbc.postgresql.org/download/postgresql-42.6.0.jar
# Add to Spark classpath or use spark.jars configuration
```

2. **Using Spark packages:**
```bash
spark-submit --packages org.postgresql:postgresql:42.6.0
```

The notebook will automatically fall back to sample data if PostgreSQL connection fails.


In [3]:
# Initialize Spark Session and Great Expectations Context
spark = SparkSession.builder \
    .appName("GreatExpectationsPySpark") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# Initialize Great Expectations context
context = gx.get_context()

print("Spark Session initialized successfully")
print(f"Spark Version: {spark.version}")
print(f"Great Expectations Version: {gx.__version__}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/06 16:05:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session initialized successfully
Spark Version: 4.0.0
Great Expectations Version: 0.18.22


In [4]:
# Step 1: Connect to PostgreSQL Data Source
# Alternative approach: Use pandas to read from PostgreSQL, then convert to Spark DataFrame
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# PostgreSQL connection string
connection_string = "postgresql+psycopg2://try_gx:try_gx@postgres.workshops.greatexpectations.io/gx_example_db"


# Method 1: Use pandas + SQLAlchemy to read data
# print("Attempting to connect to PostgreSQL using pandas + SQLAlchemy...")

# Create SQLAlchemy engine
engine = create_engine(connection_string)

# Read data using pandas
pandas_df = pd.read_sql("SELECT * FROM nyc_taxi_data", engine)

# Convert pandas DataFrame to Spark DataFrame
df = spark.createDataFrame(pandas_df)

# print("✅ Successfully connected to PostgreSQL!")
    

# Show sample data
print("\nNYC Taxi Data:")
df.show(5, truncate=False)
df.printSchema()

print(f"\nData loaded successfully:")
print(f"Total rows: {df.count()}")
print(f"Total columns: {len(df.columns)}")



NYC Taxi Data:


                                                                                

+-----+---------+---------------+-------------+------------+------------------+------------------+-------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-------------------+-------------------+
|index|vendor_id|passenger_count|trip_distance|rate_code_id|store_and_fwd_flag|pickup_location_id|dropoff_location_id|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|pickup             |dropoff            |
+-----+---------+---------------+-------------+------------+------------------+------------------+-------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-------------------+-------------------+
|1    |1        |1              |0.8          |1           |N                 |112               |112                |1           |6.0        |1.0  |0.5    |1.55     

In [5]:
# Step 2: Add Spark Data Source to Great Expectations
# Add Spark datasource to Great Expectations context
try:
    spark_source = context.get_datasource('spark_professional')
except:
    spark_source = context.sources.add_spark('spark_professional')

# Add DataFrame asset
try:
    spark_asset = spark_source.get_asset('nyc_taxi_spark')
except:
    spark_asset = spark_source.add_dataframe_asset(name='nyc_taxi_spark', dataframe=df)

# Build batch request
spark_batch_request = spark_asset.build_batch_request()
spark_batch_list = spark_asset.get_batch_list_from_batch_request(spark_batch_request)
spark_batch = spark_batch_list[0]

print("Spark data source connected successfully")
print(f"Spark batch: {spark_batch.id}")
print(f"DataFrame shape: {df.count()} rows, {len(df.columns)} columns")


Spark data source connected successfully
Spark batch: spark_professional-nyc_taxi_spark
DataFrame shape: 20000 rows, 19 columns


## Data Assistants in Great Expectations with PySpark

Data Assistants in Great Expectations are tools designed to automate the process of profiling and understanding your data. They analyze datasets to generate useful insights and create initial sets of expectations, which are rules or assertions about your data's properties.

**Main types of Data Assistants:**
- **Missingness Data Assistant:** Profiles your data to detect missing values and suggests expectations related to nullity and completeness for each column.
- **Onboarding Data Assistant:** Provides a broad overview of your dataset, including data types, cardinality, and basic statistics, and generates a comprehensive set of expectations to help you get started with data quality monitoring.

Data Assistants are typically run before manual validation, as they help you quickly establish a baseline of data quality checks and understand the structure and issues in your data.

For more details, see the [Great Expectations documentation](https://docs.greatexpectations.io/docs/oss/guides/expectations/data_assistants/overview/) (Great Expectations, n.d.).

**Reference:**  
Great Expectations. (n.d.). Data Assistants Overview. https://docs.greatexpectations.io/docs/oss/guides/expectations/data_assistants/overview/


In [None]:
# Step 3: Run Data Assistants FIRST (before any validation)
# Create expectation suites for automatic profiling
spark_profiling_suite = context.add_expectation_suite('spark_auto_profiling')

# Create validator for Data Assistants
spark_validator = context.get_validator(
    batch_request=spark_batch_request,
    expectation_suite_name='spark_auto_profiling'
)

# Run Data Assistants
print("\nRunning Data Assistants for automatic profiling...")

# Missingness Data Assistant for Spark DataFrame
spark_missingness_result = context.assistants.missingness.run(
    validator=spark_validator,
    exclude_column_names=['pickup_datetime', 'dropoff_datetime']
)
spark_missingness_suite = spark_missingness_result.get_expectation_suite(
    expectation_suite_name='spark_missingness_final'
)
context.save_expectation_suite(spark_missingness_suite)

# Onboarding Data Assistant for Spark DataFrame
spark_onboarding_result = context.assistants.onboarding.run(
    validator=spark_validator,
    exclude_column_names=['pickup_datetime', 'dropoff_datetime']
)
spark_onboarding_suite = spark_onboarding_result.get_expectation_suite(
    expectation_suite_name='spark_onboarding_final'
)
context.save_expectation_suite(spark_onboarding_suite)

# print(f"- Spark Missingness: {len(spark_missingness_suite.expectations)} expectations")
# print(f"- Spark Onboarding: {len(spark_onboarding_suite.expectations)} expectations")



Running Data Assistants for automatic profiling...



Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

25/10/06 16:05:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/19 [00:00<?, ?it/s]

25/10/06 16:05:52 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:53 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:54 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:55 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:56 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:57 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:58 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:05:59 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]




Generating Expectations:   0%|          | 0/8 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

25/10/06 16:06:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:00 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

25/10/06 16:06:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/49 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

25/10/06 16:06:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/49 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/15 [00:00<?, ?it/s]

25/10/06 16:06:04 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:05 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:06 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:07 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:08 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:09 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:10 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

25/10/06 16:06:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/12 [00:00<?, ?it/s]

25/10/06 16:06:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:11 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:12 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:13 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:14 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:15 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:16 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:17 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:18 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:19 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:20 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:21 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:22 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:23 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:24 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:25 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Discarding histogram values above highest bin.
25/10/06 16:06:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:26 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

25/10/06 16:06:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/2 [00:00<?, ?it/s]

25/10/06 16:06:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:27 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

25/10/06 16:06:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:28 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

25/10/06 16:06:29 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/8 [00:00<?, ?it/s]

25/10/06 16:06:29 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:29 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:30 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:30 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/06 16:06:30 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:30 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/06 16:06:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:31 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/06 16:06:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/06 16:06:32 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:33 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:33 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:33 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/06 16:06:33 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:33 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:33 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/06 16:06:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

25/10/06 16:06:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

25/10/06 16:06:34 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:35 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

25/10/06 16:06:35 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Step 4: Add Manual Business Rules
manual_suite = context.add_expectation_suite('manual_business_rules_spark')
manual_validator = context.get_validator(
    batch_request=spark_batch_request,
    expectation_suite_name='manual_business_rules_spark'
)

# Add specific business expectations for Spark DataFrame
manual_validator.expect_column_values_to_be_between(
    column='passenger_count', min_value=1, max_value=6
)
manual_validator.expect_column_values_to_be_between(
    column='fare_amount', min_value=0
)
manual_validator.expect_column_values_to_not_be_null(
    column='trip_distance'
)
manual_validator.expect_column_values_to_be_between(
    column='trip_distance', min_value=0, max_value=100
)

# Additional Spark-specific expectations
manual_validator.expect_column_values_to_be_between(
    column='vendor_id', min_value=1, max_value=2
)
manual_validator.expect_column_values_to_be_between(
    column='total_amount', min_value=0
)

manual_validator.save_expectation_suite()
manual_suite = context.get_expectation_suite('manual_business_rules_spark')
print(f"\nManual business rules added: {len(manual_suite.expectations)} expectations")


25/10/06 16:06:35 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]


Manual business rules added: 3 expectations


In [None]:
# Step 5: Create and Run Checkpoints
print("\nCreating and running checkpoints...")

checkpoint_results = {}

# Define all checkpoints to run
checkpoints_to_run = [
    ('spark_missingness_cp', 'spark_missingness_final', spark_batch_request, spark_source.name, spark_asset.name),
    ('spark_onboarding_cp', 'spark_onboarding_final', spark_batch_request, spark_source.name, spark_asset.name),
    ('manual_business_spark_cp', 'manual_business_rules_spark', spark_batch_request, spark_source.name, spark_asset.name)
]

# Create and run each checkpoint
for cp_name, suite_name, batch_req, source_name, asset_name in checkpoints_to_run:
    checkpoint = context.add_or_update_checkpoint(
        name=cp_name,
        config_version=1.0,
        class_name='Checkpoint',
        run_name_template=f'%Y%m%d-%H%M%S-{cp_name}',
        expectation_suite_name=suite_name,
        batch_request={
            'datasource_name': source_name,
            'data_asset_name': asset_name
        },
        action_list=[
            {
                'name': 'store_validation_result',
                'action': {'class_name': 'StoreValidationResultAction'}
            },
            {
                'name': 'update_data_docs',
                'action': {'class_name': 'UpdateDataDocsAction'}
            }
        ]
    )
    
    result = context.run_checkpoint(checkpoint_name=cp_name)
    checkpoint_results[cp_name] = result

print("\nCheckpoint Results:")
for name, result in checkpoint_results.items():
    status = "PASSED" if result.success else "FAILED"
    print(f"  {name}: {status}")



Creating and running checkpoints...


25/10/06 16:06:37 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/118 [00:00<?, ?it/s]

25/10/06 16:06:38 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/199 [00:00<?, ?it/s]

25/10/06 16:06:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/10/06 16:06:41 WARN CacheManager: Asked to cache already cached data.


Calculating Metrics:   0%|          | 0/25 [00:00<?, ?it/s]


Checkpoint Results:
  spark_missingness_cp: PASSED
  spark_onboarding_cp: FAILED
  manual_business_spark_cp: PASSED


In [9]:
# Step 6: Save Results and Generate Report
results_summary = {
    "generation_time": datetime.now().isoformat(),
    "data_sources": {
        "spark": {"source": spark_source.name, "asset": spark_asset.name},
    },
    "expectation_suites": {
        "spark_missingness": len(spark_missingness_suite.expectations),
        "spark_onboarding": len(spark_onboarding_suite.expectations),
        "manual_business_rules": len(manual_suite.expectations)
    },
    "checkpoint_results": {name: result.success for name, result in checkpoint_results.items()},
    "spark_info": {
        "spark_version": spark.version,
        "dataframe_rows": df.count(),
        "dataframe_columns": len(df.columns)
    }
}

# Save results to file with error handling
results_file = Path("spark_validation_results.json")
try:
    with open(results_file, 'w') as f:
        json.dump(results_summary, f, indent=2)
    print(f"\n✅ Results saved to: {results_file}")
except Exception as e:
    print(f"\n❌ Error saving results: {e}")
    print("Results summary (not saved):")
    print(json.dumps(results_summary, indent=2))

print("\nResults Summary:")
print(f"- Data Sources: {len(results_summary['data_sources'])}")
print(f"- Expectation Suites: {sum(results_summary['expectation_suites'].values())} total expectations")
print(f"- Checkpoints Run: {len(results_summary['checkpoint_results'])}")
print(f"- Spark DataFrame: {results_summary['spark_info']['dataframe_rows']} rows, {results_summary['spark_info']['dataframe_columns']} columns")



✅ Results saved to: spark_validation_results.json

Results Summary:
- Data Sources: 1
- Expectation Suites: 71 total expectations
- Checkpoints Run: 3
- Spark DataFrame: 20000 rows, 19 columns


In [10]:
# Step 7: Build and Open Data Docs
print("\nBuilding Data Docs...")
context.build_data_docs()

# Open Data Docs in browser
print("\nOpening Data Docs in browser...")
context.open_data_docs()

print("\nPySpark Workflow complete! Data Docs are now open in your browser.")



Building Data Docs...

Opening Data Docs in browser...

PySpark Workflow complete! Data Docs are now open in your browser.


In [11]:
# Step 8: Additional PySpark Data Analysis Examples
print("\nAdditional PySpark Data Analysis:")

# Show basic statistics
print("\nBasic Statistics:")
df.describe().show()

# Show data quality metrics
print("\nData Quality Metrics:")
print(f"Total rows: {df.count()}")
print(f"Total columns: {len(df.columns)}")
print(f"Null values per column:")
for col_name in df.columns:
    null_count = df.filter(col(col_name).isNull()).count()
    print(f"  {col_name}: {null_count}")

# Show unique values for categorical columns
print("\nUnique values:")
print(f"Vendor IDs: {sorted([row[0] for row in df.select('vendor_id').distinct().collect()])}")
print(f"Passenger counts: {sorted([row[0] for row in df.select('passenger_count').distinct().collect()])}")

# Show fare amount statistics using proper PySpark function calls
from pyspark.sql.functions import min as spark_min, max as spark_max, avg as spark_avg
fare_stats = df.select(
    spark_min('fare_amount').alias('min_fare'),
    spark_max('fare_amount').alias('max_fare'),
    spark_avg('fare_amount').alias('avg_fare')
).collect()[0]

print(f"\nFare Statistics:")
print(f"  Min fare: ${fare_stats['min_fare']:.2f}")
print(f"  Max fare: ${fare_stats['max_fare']:.2f}")
print(f"  Avg fare: ${fare_stats['avg_fare']:.2f}")



Additional PySpark Data Analysis:

Basic Statistics:
+-------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-------------------+------------------+------------------+-----------------+------------------+-------------------+---------------------+------------------+--------------------+
|summary|             index|        vendor_id|   passenger_count|     trip_distance|     rate_code_id|store_and_fwd_flag|pickup_location_id|dropoff_location_id|       payment_type|       fare_amount|             extra|          mta_tax|        tip_amount|       tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|
+-------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-------------------+------------------+------------------+-----------------+------------------+---------------

In [None]:
# Minimal Ollama client for local inference (using requests)
import requests

def ollama_infer(prompt, model=ollama_model, url=ollama_url):
    """
    Send a prompt to the Ollama cloud model and return the response.
    """
    try:
        response = requests.post(
            f"{url}/api/generate",
            json={"model": model, "prompt": prompt, "stream": False},
            timeout=60
        )
        response.raise_for_status()
        return response.json()["response"]
    except requests.exceptions.RequestException as e:
        print(f"Error calling Ollama API: {e}")
        print(f"URL: {url}/api/generate")
        print(f"Model: {model}")
        raise

# Function to ensure Spark session is active
def ensure_spark_session():
    """
    Ensure Spark session is active and properly initialized.
    """
    try:
        # Try to get existing session
        spark = SparkSession.getActiveSession()
        if spark is None:
            print("No active Spark session found. Creating new session...")
            spark = SparkSession.builder \
                .appName("GreatExpectationsPySpark") \
                .config("spark.sql.adaptive.enabled", "true") \
                .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
                .getOrCreate()
        else:
            print("Using existing Spark session")
        
        # Test the session by checking if it can execute a simple operation
        spark.sql("SELECT 1").collect()
        print("Spark session is active and working")
        return spark
    except Exception as e:
        print(f"Error with Spark session: {e}")
        print("Creating new Spark session...")
        spark = SparkSession.builder \
            .appName("GreatExpectationsPySpark") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .getOrCreate()
        return spark

# Mechanism to create expectations using PySpark and Ollama
def create_expectations_with_ollama(spark_df, table_name="spark_table"):
    """
    Generate Great Expectations-style expectations for a Spark DataFrame using Ollama.
    """
    # Ensure Spark session is active
    spark = ensure_spark_session()
    
    # Prepare schema and sample data for the prompt
    schema_info = []
    for field in spark_df.schema.fields:
        schema_info.append({"name": field.name, "type": str(field.dataType)})

    # Use collect() instead of toPandas() to avoid context issues
    try:
        sample_rows = spark_df.limit(10).collect()
        # Convert Row objects to dictionaries
        sample_data = [row.asDict() for row in sample_rows]
    except Exception as e:
        print(f"Error collecting sample data: {e}")
        # Fallback: use schema info only
        sample_data = []

    prompt = (
        "You are a data quality assistant. Given the following Spark DataFrame schema and sample data, "
        "generate a list of Great Expectations-style expectations in JSON format. "
        "Each expectation should include the column, expectation type, and parameters. "
        "Use the following format:\n"
        "[\n"
        "  {\n"
        "    \"expectation_type\": \"expect_column_values_to_not_be_null\",\n"
        "    \"column\": \"column_name\",\n"
        "    \"kwargs\": {\"mostly\": 1.0}\n"
        "  },\n"
        "  ...\n"
        "]\n"
        f"Schema: {json.dumps(schema_info, indent=2)}\n"
        f"Sample Data: {json.dumps(sample_data, indent=2)}\n"
    )

    print("Sending prompt to Ollama...")
    ollama_response = ollama_infer(prompt)
    print("Received response from Ollama")
    
    # Try to extract the JSON from the response
    try:
        # Find the first and last brackets to extract the JSON array
        start = ollama_response.find('[')
        end = ollama_response.rfind(']')
        if start != -1 and end != -1:
            expectations_json = ollama_response[start:end+1]
            expectations = json.loads(expectations_json)
        else:
            print("No JSON array found in response")
            expectations = []
    except Exception as e:
        print("Failed to parse expectations from Ollama response:", e)
        print("Ollama response was:", ollama_response)
        expectations = []

    return expectations

# Save expectations to expectations/json folder
def save_expectations(expectations, table_name="spark_table"):
    """
    Save the generated expectations to a JSON file in the expectations/json folder.
    """
    base_path = Path.cwd() / "expectations" / "json"
    base_path.mkdir(parents=True, exist_ok=True)
    file_path = base_path / f"{table_name}_expectations.json"
    with open(file_path, "w") as f:
        json.dump(expectations, f, indent=2)
    print(f"Expectations saved to {file_path}")

# Ensure Spark session is active before proceeding
spark = ensure_spark_session()

# Example usage:
# Generate and save expectations for the current DataFrame
expectations = create_expectations_with_ollama(df, table_name="nyc_taxi")
save_expectations(expectations, table_name="nyc_taxi")


Using Ollama URL: http://localhost:11434
Using Ollama Model: gpt-oss:20b
Using existing Spark session
Spark session is active and working
Using existing Spark session
Spark session is active and working
Error collecting sample data: 'NoneType' object has no attribute 'setCallSite'
Sending prompt to Ollama...
Error calling Ollama API: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
URL: http://localhost:11434/api/generate
Model: gpt-oss:20b


ReadTimeout: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)

In [None]:
# Step 9: Cleanup and Session Management
# Stop Spark session
# spark.stop()