Skip to content

Commit

Permalink
Update and adjust daft to version 0.2.23 & above (#238)
Browse files Browse the repository at this point in the history
* - Updated `daft_validation`
- Updated related tests
- Updated `pyproject.toml`

* - Fixed `has_entropy`

* - made `are_complete` backward compatible
  • Loading branch information
dsaad68 committed May 26, 2024
1 parent 3762e84 commit 643ed7c
Show file tree
Hide file tree
Showing 33 changed files with 214 additions and 114 deletions.
175 changes: 95 additions & 80 deletions cuallee/daft_validation.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ pdf = [
"fpdf2==2.7.8"
]
daft = [
-"getdaft == 0.2.19"
+"getdaft == 0.2.24"
]
all = [
"cuallee[dev,pyspark,pyspark_connect,snowpark,pandas,bigquery,duckdb,polars,test,dagster,cloud,pdf,daft]"
Expand Down
4 changes: 3 additions & 1 deletion test/unit/daft/test_are_complete.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def test_coverage(check: Check):
check.are_complete(("id", "id2"), 0.75)
df = daft.from_pydict({"id": [10, None], "id2": [300, 500]})
result = check.validate(df)

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 0.75).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.75).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_are_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [10, None], "id2": [300, 500]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 0.75).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.75).to_pandas().pass_rate.all()
)
3 changes: 2 additions & 1 deletion test/unit/daft/test_has_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": ["Herminio", "Hola", "Villain", "Heroe"]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 0.75).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.75).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_has_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def test_coverage(check: Check):
)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 4 / 6).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 4/6).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_between.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": np.arange(20)})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 0.55).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.55).to_pandas().pass_rate.all()
)
5 changes: 4 additions & 1 deletion test/unit/daft/test_is_complete.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,7 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [10, None], "id2": [300, 500]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
5 changes: 4 additions & 1 deletion test/unit/daft/test_is_contained_in.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,7 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": np.arange(10)})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
5 changes: 4 additions & 1 deletion test/unit/daft/test_is_daily.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,7 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.6).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.60).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_equal_than.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [1, 1, 1, 0]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 0.75).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.75).to_pandas().pass_rate.all()
)

6 changes: 5 additions & 1 deletion test/unit/daft/test_is_greater_or_equal_than.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.6).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.60).to_pandas().pass_rate.all()
)

5 changes: 4 additions & 1 deletion test/unit/daft/test_is_greater_than.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,7 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
6 changes: 5 additions & 1 deletion test/unit/daft/test_is_in_billions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [1.0, 1e9]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
6 changes: 5 additions & 1 deletion test/unit/daft/test_is_in_millions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [1.0, 1e6]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
5 changes: 4 additions & 1 deletion test/unit/daft/test_is_inside_interquartile_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,7 @@ def test_coverage(check: Check):
check.is_inside_interquartile_range("id", pct=0.5)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
6 changes: 5 additions & 1 deletion test/unit/daft/test_is_less_or_equal_than.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.6).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.60).to_pandas().pass_rate.all()
)

6 changes: 5 additions & 1 deletion test/unit/daft/test_is_less_than.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.6).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.60).to_pandas().pass_rate.all()
)

6 changes: 5 additions & 1 deletion test/unit/daft/test_is_negative.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [1, 2, -1, -2]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
5 changes: 4 additions & 1 deletion test/unit/daft/test_is_on_friday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 1 / 7).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 1 / 7).to_pandas().pass_rate.all()
)

5 changes: 4 additions & 1 deletion test/unit/daft/test_is_on_monday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 1 / 7).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 1/7).to_pandas().pass_rate.all()
)

6 changes: 4 additions & 2 deletions test/unit/daft/test_is_on_saturday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 1 / 7).to_pandas().pass_rate.all()
)
result.agg(col_pass_rate.max()).select(col_pass_rate == 1 / 7).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_on_schedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 7 / 8).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 7 / 8).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_on_sunday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 1 / 7).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 1 / 7).to_pandas().pass_rate.all()
)
6 changes: 4 additions & 2 deletions test/unit/daft/test_is_on_thursday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 1 / 7).to_pandas().pass_rate.all()
)
result.agg(col_pass_rate.max()).select(col_pass_rate == 1 / 7).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_on_tuesday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 1 / 7).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 1 / 7).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_on_wednesday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 1 / 7).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 1 / 7).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_on_weekday.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 5 / 7).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 5 / 7).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_on_weekend.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_coverage(check: Check):
df = daft.from_pandas(pd_df)
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 2 / 7).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 2 / 7).to_pandas().pass_rate.all()
)
6 changes: 5 additions & 1 deletion test/unit/daft/test_is_positive.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [1, 2, -1, -2]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
4 changes: 3 additions & 1 deletion test/unit/daft/test_is_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [10, 20, 30, 10]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.select(daft.col("pass_rate").max() == 0.75).to_pandas().pass_rate.all()
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.75).to_pandas().pass_rate.all()
)
6 changes: 5 additions & 1 deletion test/unit/daft/test_not_contained_in.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,8 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": np.arange(10)})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()

col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)
5 changes: 4 additions & 1 deletion test/unit/daft/test_satisfies.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,7 @@ def test_coverage(check: Check):
df = daft.from_pydict({"id": [10, -10], "id2": [300, 500]})
result = check.validate(df)
assert result.select(daft.col("status").str.match("PASS")).to_pandas().status.all()
assert result.select(daft.col("pass_rate").max() == 0.5).to_pandas().pass_rate.all()
col_pass_rate = daft.col("pass_rate")
assert (
result.agg(col_pass_rate.max()).select(col_pass_rate == 0.50).to_pandas().pass_rate.all()
)

0 comments on commit 643ed7c

Please sign in to comment.