In [None]:
!pip -q install "pandera>=0.18" pandas numpy polars pyarrow hypothesis

import json
import numpy as np
import pandas as pd
import pandera as pa
from pandera.errors import SchemaError, SchemaErrors
from pandera.typing import Series, DataFrame

# Record the library versions next to the outputs they produced.
for _label, _module in (("pandera version:", pa), ("pandas  version:", pd)):
    print(_label, _module.__version__)

In [None]:
# Shared, seeded generator for the synthetic data. Calls that do not pass their
# own generator draw from (and advance) this stream.
rng = np.random.default_rng(42)


def make_raw_orders(n=250, generator=None):
    """Build a deliberately dirty synthetic orders frame for validation demos.

    Parameters
    ----------
    n : int, default 250
        Number of rows to generate. Must be at least 8, because the largest
        corruption step samples 8 distinct row positions without replacement.
    generator : numpy.random.Generator, optional
        Source of randomness. When omitted, the module-level ``rng`` is used,
        so repeated calls continue that generator's stream and return
        different data each time. Pass a freshly seeded generator to get the
        same frame on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``order_id``, ``customer_id``, ``email``, ``country``,
        ``channel``, ``items``, ``unit_price``, ``discount`` and
        ``ordered_at``. ``unit_price`` has object dtype: it holds floats plus
        six numeric strings injected on purpose.
    """
    gen = rng if generator is None else generator

    countries = np.array(["CA", "US", "MX"])
    channels = np.array(["web", "mobile", "partner"])
    raw = pd.DataFrame(
        {
            "order_id": gen.integers(1, 120, size=n),
            "customer_id": gen.integers(1, 90, size=n),
            "email": gen.choice(
                ["alice@example.com", "bob@example.com", "bad_email", None],
                size=n,
                p=[0.45, 0.45, 0.07, 0.03],
            ),
            "country": gen.choice(countries, size=n, p=[0.5, 0.45, 0.05]),
            "channel": gen.choice(channels, size=n, p=[0.55, 0.35, 0.10]),
            "items": gen.integers(0, 8, size=n),
            "unit_price": gen.normal(loc=35, scale=20, size=n),
            "discount": gen.choice([0.0, 0.05, 0.10, 0.20, 0.50], size=n, p=[0.55, 0.15, 0.15, 0.12, 0.03]),
            "ordered_at": pd.to_datetime("2025-01-01") + pd.to_timedelta(gen.integers(0, 120, size=n), unit="D"),
        }
    )

    # Inject known-bad values into randomly chosen rows: a negative price,
    # zero items, an oversized discount, and unknown country/channel codes.
    raw.loc[gen.choice(n, size=8, replace=False), "unit_price"] = -abs(raw["unit_price"].iloc[0])
    raw.loc[gen.choice(n, size=6, replace=False), "items"] = 0
    raw.loc[gen.choice(n, size=5, replace=False), "discount"] = 0.9
    raw.loc[gen.choice(n, size=4, replace=False), "country"] = "ZZ"
    raw.loc[gen.choice(n, size=3, replace=False), "channel"] = "unknown"

    # Finally, store some prices as text. The strings are computed while the
    # column is still numeric, then the column is widened to object explicitly:
    # writing str values into a float64 column makes pandas emit an
    # "incompatible dtype" FutureWarning.
    price_strings = raw["unit_price"].iloc[:6].round(2).astype(str).values
    raw["unit_price"] = raw["unit_price"].astype(object)
    raw.loc[gen.choice(n, size=6, replace=False), "unit_price"] = price_strings

    return raw

# Materialise the demo dataset once and preview its first rows.
raw_orders = make_raw_orders(n=250)
display(raw_orders.head(n=10))

In [None]:
# Shape check for e-mail addresses: local part, "@", domain, then a TLD of at
# least two letters.
EMAIL_RE = r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$"


# Schema for one order row. Per-column constraints are declared on the fields;
# rules that span several columns are expressed as dataframe-level checks.
class Orders(pa.DataFrameModel):
    order_id: Series[int] = pa.Field(ge=1)
    customer_id: Series[int] = pa.Field(ge=1)
    # E-mail may be missing; its format is enforced by email_valid below.
    email: Series[object] = pa.Field(nullable=True)
    country: Series[str] = pa.Field(isin=["CA", "US", "MX"])
    channel: Series[str] = pa.Field(isin=["web", "mobile", "partner"])
    items: Series[int] = pa.Field(ge=1, le=50)
    unit_price: Series[float] = pa.Field(gt=0)
    discount: Series[float] = pa.Field(ge=0.0, le=0.8)
    ordered_at: Series[pd.Timestamp]

    class Config:
        # coerce: cast columns to the annotated dtypes before checking.
        # strict: reject columns that are not declared on the model.
        # ordered: column order is not enforced.
        coerce = True
        strict = True
        ordered = False

    @pa.check("email")
    def email_valid(cls, s: pd.Series) -> pd.Series:
        # A value passes when it is missing or matches EMAIL_RE.
        missing = s.isna()
        well_formed = s.astype(str).str.match(EMAIL_RE)
        return missing | well_formed

    @pa.dataframe_check
    def total_value_reasonable(cls, df: pd.DataFrame) -> pd.Series:
        # Net order value must fall within [0.01, 5000.0], both ends inclusive.
        gross = df["items"] * df["unit_price"]
        net = gross * (1.0 - df["discount"])
        return (net >= 0.01) & (net <= 5000.0)

    @pa.dataframe_check
    def channel_country_rule(cls, df: pd.DataFrame) -> pd.Series:
        # The only forbidden combination is channel "partner" with country "MX".
        not_partner = df["channel"] != "partner"
        not_mexico = df["country"] != "MX"
        return not_partner | not_mexico

In [None]:
# Lazy validation gathers every failure into a single SchemaErrors instead of
# stopping at the first problem.
try:
    validated = Orders.validate(raw_orders, lazy=True)
except SchemaErrors as exc:
    failures = exc.failure_cases
    display(failures.head(25))
    err_json = failures.to_dict(orient="records")
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(err_json[:5], indent=2, default=str))
else:
    print(validated.dtypes)

In [None]:
def split_clean_quarantine(df: pd.DataFrame):
    """Split ``df`` into rows that satisfy ``Orders`` and rows that do not.

    The frame is validated once in lazy mode so that all failing rows are
    reported together. Rows named in the failure report are moved to a
    quarantine frame, and the remaining rows are validated again (eagerly) so
    that the returned clean frame is the schema's validated output.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw orders. Failure-report index values are cast to ``int`` before
        being matched back to rows, so an integer index is expected.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(clean, quarantine)``. ``clean`` is what ``Orders.validate``
        returned; ``quarantine`` holds the offending rows copied from ``df``
        unchanged. When nothing fails, ``quarantine`` is an empty frame with
        the same columns as ``df``.

    Raises
    ------
    pandera.errors.SchemaError
        If the rows left after quarantining still fail validation, e.g. when
        the failure report contains entries with no row index, which dropping
        rows cannot resolve.
    """
    try:
        # One lazy pass is enough: it succeeds outright on clean data and
        # otherwise reports every failure at once. (A preliminary eager pass
        # would only repeat the work and discard its error.)
        clean = Orders.validate(df, lazy=True)
    except SchemaErrors as exc:
        # Entries without a row index are skipped here; if they persist, the
        # final eager validation below surfaces them as a SchemaError.
        row_labels = exc.failure_cases["index"].dropna().astype(int)
        bad_idx = sorted(set(row_labels.tolist()))
        quarantine = df.loc[bad_idx].copy()
        remaining = df.drop(index=bad_idx)
        return Orders.validate(remaining, lazy=False), quarantine
    return clean, df.iloc[0:0].copy()

# Route the raw data through the splitter, then preview both partitions
# (quarantined rows first, clean rows second).
clean_orders, quarantine_orders = split_clean_quarantine(raw_orders)
display(quarantine_orders.head(10), clean_orders.head(10))

@pa.check_types
def enrich_orders(df: DataFrame[Orders]) -> DataFrame[Orders]:
    """Return a copy of ``df`` with ``unit_price`` and ``discount`` rounded to 2 decimals.

    The ``Orders`` annotations on the argument and the return value are
    enforced by the ``check_types`` decorator. The input frame is not
    modified.
    """
    # assign() builds a new frame, so the caller's data stays untouched.
    return df.assign(
        unit_price=df["unit_price"].round(2),
        discount=df["discount"].round(2),
    )

# Run the typed function on the validated rows and show a short preview.
enriched = enrich_orders(clean_orders)
display(enriched.head(n=5))

In [5]:
# Orders extended with a derived, strictly positive total_value column.
class EnrichedOrders(Orders):
    total_value: Series[float] = pa.Field(gt=0)

    class Config:
        # Restated on the subclass so the settings are visible where it is defined.
        strict = True
        coerce = True

    @pa.dataframe_check
    def totals_consistent(cls, df: pd.DataFrame) -> pd.Series:
        # Recompute the total from its inputs; the stored value may differ
        # from it by at most 1e-6 in absolute terms.
        expected = df["items"] * df["unit_price"] * (1.0 - df["discount"])
        drift = (df["total_value"] - expected).abs()
        return drift <= 1e-6

@pa.check_types
def add_totals(df: DataFrame[Orders]) -> DataFrame[EnrichedOrders]:
    """Return ``df`` plus a ``total_value`` column, validated as ``EnrichedOrders``.

    ``total_value`` is ``items * unit_price * (1 - discount)``. The input
    frame is not modified.
    """
    net_factor = 1.0 - df["discount"]
    with_totals = df.assign(total_value=df["items"] * df["unit_price"] * net_factor)
    # Validate eagerly so the first violation raises immediately.
    return EnrichedOrders.validate(with_totals, lazy=False)

# Derive totals for the validated rows and show a short preview.
enriched2 = add_totals(clean_orders)
display(enriched2.head(n=5))

pandera version: 0.28.1
pandas  version: 2.2.2

Raw sample:


  raw.loc[rng.choice(n, size=6, replace=False), "unit_price"] = raw["unit_price"].iloc[:6].round(2).astype(str).values


Unnamed: 0,order_id,customer_id,email,country,channel,items,unit_price,discount,ordered_at
0,11,16,bob@example.com,US,web,7,23.078222,0.0,2025-03-09
1,93,44,bob@example.com,US,web,0,35.170113,0.0,2025-02-19
2,78,17,bob@example.com,US,web,3,50.898645,0.0,2025-02-22
3,53,30,alice@example.com,US,web,3,38.607279,0.0,2025-02-27
4,52,59,alice@example.com,US,mobile,6,21.878901,0.5,2025-03-30
5,103,13,bob@example.com,US,web,1,59.525859,0.2,2025-04-14
6,11,61,bob@example.com,CA,web,3,66.58371,0.0,2025-03-05
7,83,10,alice@example.com,US,partner,1,44.891133,0.9,2025-03-19
8,24,16,alice@example.com,CA,web,6,54.47327,0.0,2025-03-30
9,12,53,bob@example.com,ZZ,web,5,59.8392,0.2,2025-03-24



--- Validating raw_orders with lazy=True (collect all errors) ---
❌ Validation failed with multiple errors (SchemaErrors).


Unnamed: 0,schema_context,column,check,check_number,failure_case,index
239,DataFrameSchema,country,total_value_reasonable,0,CA,215
315,DataFrameSchema,items,total_value_reasonable,0,0,170
327,DataFrameSchema,items,total_value_reasonable,0,0,222
326,DataFrameSchema,items,total_value_reasonable,0,2,218
325,DataFrameSchema,items,total_value_reasonable,0,3,215
324,DataFrameSchema,items,total_value_reasonable,0,0,214
323,DataFrameSchema,items,total_value_reasonable,0,0,213
322,DataFrameSchema,items,total_value_reasonable,0,6,211
321,DataFrameSchema,items,total_value_reasonable,0,2,203
320,DataFrameSchema,items,total_value_reasonable,0,0,202



As JSON (first 5):
[
  {
    "schema_context": "DataFrameSchema",
    "column": "country",
    "check": "total_value_reasonable",
    "check_number": 0,
    "failure_case": "CA",
    "index": 215
  },
  {
    "schema_context": "DataFrameSchema",
    "column": "items",
    "check": "total_value_reasonable",
    "check_number": 0,
    "failure_case": 0,
    "index": 170
  },
  {
    "schema_context": "DataFrameSchema",
    "column": "items",
    "check": "total_value_reasonable",
    "check_number": 0,
    "failure_case": 0,
    "index": 222
  },
  {
    "schema_context": "DataFrameSchema",
    "column": "items",
    "check": "total_value_reasonable",
    "check_number": 0,
    "failure_case": 2,
    "index": 218
  },
  {
    "schema_context": "DataFrameSchema",
    "column": "items",
    "check": "total_value_reasonable",
    "check_number": 0,
    "failure_case": 3,
    "index": 215
  }
]

--- Quarantine rows that fail validation ---
clean rows: 184 | quarantined rows: 66


Unnamed: 0,order_id,customer_id,email,country,channel,items,unit_price,discount,ordered_at
1,93,44,bob@example.com,US,web,0,35.170113,0.0,2025-02-19
7,83,10,alice@example.com,US,partner,1,44.891133,0.9,2025-03-19
9,12,53,bob@example.com,ZZ,web,5,59.8392,0.2,2025-03-24
14,86,89,bad_email,CA,partner,5,-23.078222,0.0,2025-03-20
18,100,55,alice@example.com,CA,web,5,61.071301,0.9,2025-01-09
21,45,3,alice@example.com,CA,mobile,5,29.566512,0.9,2025-03-29
22,22,14,alice@example.com,CA,mobile,0,11.955023,0.0,2025-01-11
24,94,38,bob@example.com,CA,web,0,37.040626,0.0,2025-04-27
31,28,44,alice@example.com,US,partner,1,-23.078222,0.1,2025-01-23
32,11,60,alice@example.com,CA,web,0,20.003733,0.5,2025-02-18


Unnamed: 0,order_id,customer_id,email,country,channel,items,unit_price,discount,ordered_at
0,11,16,bob@example.com,US,web,7,23.078222,0.0,2025-03-09
2,78,17,bob@example.com,US,web,3,50.898645,0.0,2025-02-22
3,53,30,alice@example.com,US,web,3,38.607279,0.0,2025-02-27
4,52,59,alice@example.com,US,mobile,6,21.878901,0.5,2025-03-30
5,103,13,bob@example.com,US,web,1,59.525859,0.2,2025-04-14
6,11,61,bob@example.com,CA,web,3,66.58371,0.0,2025-03-05
8,24,16,alice@example.com,CA,web,6,54.47327,0.0,2025-03-30
10,63,71,alice@example.com,CA,mobile,3,57.601986,0.0,2025-01-17
11,117,16,bob@example.com,US,web,3,47.281984,0.2,2025-04-28
12,88,27,alice@example.com,CA,web,1,46.966941,0.1,2025-01-20



--- Function contracts with @pa.check_types ---


Unnamed: 0,order_id,customer_id,email,country,channel,items,unit_price,discount,ordered_at
0,11,16,bob@example.com,US,web,7,23.08,0.0,2025-03-09
2,78,17,bob@example.com,US,web,3,50.9,0.0,2025-02-22
3,53,30,alice@example.com,US,web,3,38.61,0.0,2025-02-27
4,52,59,alice@example.com,US,mobile,6,21.88,0.5,2025-03-30
5,103,13,bob@example.com,US,web,1,59.53,0.2,2025-04-14



--- Schema composition: Enriched schema allows computed columns ---


Unnamed: 0,order_id,customer_id,email,country,channel,items,unit_price,discount,ordered_at,total_value
0,11,16,bob@example.com,US,web,7,23.078222,0.0,2025-03-09,161.547554
2,78,17,bob@example.com,US,web,3,50.898645,0.0,2025-02-22,152.695935
3,53,30,alice@example.com,US,web,3,38.607279,0.0,2025-02-27,115.821837
4,52,59,alice@example.com,US,mobile,6,21.878901,0.5,2025-03-30,65.636704
5,103,13,bob@example.com,US,web,1,59.525859,0.2,2025-04-14,47.620687



Done.
