#### ----------------------------------------------------------
## 🧩 Great Expectations - Basic Data Validation
#### ----------------------------------------------------------

- Great Expectations is used to check data quality.
- It helps confirm that your dataset meets certain expectations (such as having no nulls, or values that fall within a specific range).
- We'll use its modern API with an "ephemeral" context that runs entirely in memory (no setup or YAML needed).

In [13]:
#### REQUIREMENT: uv add great_expectations pandas

import os
from pathlib import Path

import pandas as pd
import great_expectations as gx

# Locate the dataset through a configurable directory instead of a
# machine-specific absolute path, so the notebook survives
# "Restart Kernel -> Run All" on any checkout.
# Set the DATA_DIR environment variable if Artists.csv does not live in the
# notebook's working directory.
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
ARTISTS_CSV = DATA_DIR / "Artists.csv"

# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks
# like a row index that was saved into the CSV -- consider index_col=0 if it
# is not needed as data. Left unchanged here to keep the frame's columns as-is.
df = pd.read_csv(ARTISTS_CSV)
df.head()


Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN
0,1,Robert Arneson,"American, 1930–1992",American,male,1930,1992,,
1,2,Doroteo Arnaiz,"Spanish, born 1936",Spanish,male,1936,0,,
2,3,Bill Arnold,"American, born 1941",American,male,1941,0,,
3,4,Charles Arnoldi,"American, born 1946",American,male,1946,0,Q1063584,500027998.0
4,5,Per Arnoldi,"Danish, born 1941",Danish,male,1941,0,,


In [14]:
# 2) Create the Great Expectations data context in "ephemeral" mode -- the
#    in-memory option described in the introduction (no files are configured
#    in this notebook).
context = gx.get_context(mode="ephemeral")

# 3) Register a pandas data source named "pandas_src" on the context, then
#    add a dataframe asset named "artists_asset" to that source.
ds = context.data_sources.add_pandas(name="pandas_src")
asset = ds.add_dataframe_asset(name="artists_asset")

# 4) Add a "whole dataframe" batch definition named "whole_df", then request
#    a batch from it, handing over the DataFrame `df` loaded in the previous
#    cell via the "dataframe" batch parameter.
#    `context` and `batch` are consumed by the validator cell that follows.
batch_def = asset.add_batch_definition_whole_dataframe("whole_df")
batch = batch_def.get_batch(batch_parameters={"dataframe": df})

In [15]:
# 5) Build a validator for the batch and register three column-level checks.
#    The thresholds are named up front so they are easy to find and tune.
BEGIN_YEAR_MIN, BEGIN_YEAR_MAX = 1800, 2025
ALLOWED_GENDERS = ["male", "female", "unknown"]

validator = context.get_validator(batch=batch)

# DisplayName must always be populated.
validator.expect_column_values_to_not_be_null(column="DisplayName")

# BeginDate must fall inside the accepted range of years.
validator.expect_column_values_to_be_between(
    column="BeginDate",
    min_value=BEGIN_YEAR_MIN,
    max_value=BEGIN_YEAR_MAX,
)

# Gender must be one of the allowed labels.
validator.expect_column_values_to_be_in_set(
    column="Gender",
    value_set=ALLOWED_GENDERS,
)

# 6) Run all registered expectations; the bare name on the last line makes
#    the notebook display the validation result.
result = validator.validate()
result



Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]



Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]



Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/19 [00:00<?, ?it/s]

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "batch_id": "pandas_src-artists_asset",
          "column": "DisplayName"
        },
        "meta": {},
        "severity": "critical"
      },
      "result": {
        "element_count": 15638,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "pandas_src-artists_asset",
          "column": "BeginDate",
          "min_value": 1800.0,
          "max_value": 2025.0
        },
        "meta": {},
        "severity": "critical"
      

In [17]:
# Print a compact, one-line-per-expectation summary of the validation run
# stored in `result`, followed by the overall pass/fail flag.
print("\n🧾 Data Validation Summary\n" + "-" * 60)

for r in result["results"]:
    exp_type = r["expectation_config"].type
    # Fall back to "N/A" when the expectation's kwargs carry no "column" entry.
    col = r["expectation_config"].kwargs.get("column", "N/A")
    ok = "✅" if r["success"] else "❌"

    # `or 0` covers both a missing key and a key that is present but None.
    # A plain .get(key, 0) would return None in the second case, and the
    # ":.2f" / ":>4" format specs below would then raise TypeError.
    # NOTE(review): Great Expectations appears to report a None percentage
    # when no non-null values were evaluated -- confirm against the GX docs.
    unexpected = r["result"].get("unexpected_count") or 0
    total = r["result"].get("element_count") or 0
    unexpected_pct = r["result"].get("unexpected_percent") or 0

    print(
        f"{ok} Column: {col:<15} | Check: {exp_type:<40} "
        f"| Failed: {unexpected:>4}/{total:<5} ({unexpected_pct:.2f}%)"
    )

print("-" * 60)
print(f"Overall Success: {'✅' if result['success'] else '❌'}")



🧾 Data Validation Summary
------------------------------------------------------------
✅ Column: DisplayName     | Check: expect_column_values_to_not_be_null      | Failed:    0/15638 (0.00%)
❌ Column: BeginDate       | Check: expect_column_values_to_be_between       | Failed: 3580/15638 (22.89%)
❌ Column: Gender          | Check: expect_column_values_to_be_in_set        | Failed:    6/15638 (0.05%)
------------------------------------------------------------
Overall Success: ❌


In [18]:
# Tally each distinct Gender label to see which values fall outside the
# allowed set used by the in-set expectation above.
gender_counts = df["Gender"].value_counts()
gender_counts

Gender
male                     9940
female                   2435
non-binary                  3
gender non-conforming       1
transgender woman           1
female (transwoman)         1
Name: count, dtype: int64

In [19]:
# Summary statistics for BeginDate, to see how its values sit relative to
# the range enforced by the between-expectation above.
begin_date_stats = df["BeginDate"].describe()
begin_date_stats

count    15638.000000
mean      1491.669843
std        810.180276
min          0.000000
25%       1855.250000
50%       1923.000000
75%       1948.000000
max       2017.000000
Name: BeginDate, dtype: float64