### Download test files

In [1]:
! wget --quiet https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.annotation.gff3.gz -P ../data/

### Convert to blocked gzip and index (needed for parallel reading)

In [2]:
%%sh
# Stop at the first failing command: without this, a failed `cd` would let the
# conversion below run (and write its output) in the wrong directory.
set -e
cd ../data
# Re-compress the plain gzip download as blocked gzip (BGZF).
gunzip -c gencode.v49.annotation.gff3.gz | bgzip -c > gencode.v49.annotation.gff3.bgz
# (Re)build the index for the BGZF file.
bgzip -r gencode.v49.annotation.gff3.bgz

### Read GFF file with polars-bio

In [1]:
import polars_bio as pb
import polars as pl

In [2]:
# Path to the BGZF-compressed copy of the annotation written by the conversion cell above.
bgz_path = "../data/gencode.v49.annotation.gff3.bgz"

In [5]:
# Scan the original plain-gzip download (no `parallel` argument passed).
gff_gz = pb.scan_gff("../data/gencode.v49.annotation.gff3.gz")

In [6]:
# Collect only the first record to inspect the columns and dtypes.
gff_gz.limit(1).collect()

0rows [00:00, ?rows/s]

chrom,start,end,type,source,score,strand,phase,attributes
str,u32,u32,str,str,f32,str,u32,list[struct[2]]
"""chr1""",11121,24894,"""gene""","""HAVANA""",,"""+""",,"[{""ID"",""ENSG00000290825.2""}, {""gene_id"",""ENSG00000290825.2""}, … {""tag"",""overlaps_pseudogene""}]"


In [7]:
# Same scan, this time against the BGZF copy (still no `parallel` argument).
gff = pb.scan_gff(bgz_path)

In [8]:
# Collect only the first record; it should match the plain-gzip scan above.
gff.limit(1).collect()

0rows [00:00, ?rows/s]

chrom,start,end,type,source,score,strand,phase,attributes
str,u32,u32,str,str,f32,str,u32,list[struct[2]]
"""chr1""",11121,24894,"""gene""","""HAVANA""",,"""+""",,"[{""ID"",""ENSG00000290825.2""}, {""gene_id"",""ENSG00000290825.2""}, … {""tag"",""overlaps_pseudogene""}]"


In [29]:
# Per-column counts for the whole file (in the saved output `score` is 0 and
# `phase` is lower than the other columns).
# NOTE(review): this cell's execution count (29) is later than the cell that rebinds
# `gff` with parallel=True (In [24]), so the saved output may come from the parallel
# scan — re-run top to bottom to confirm.
gff.count().collect()

0rows [00:00, ?rows/s]

chrom,start,end,type,source,score,strand,phase,attributes
u32,u32,u32,u32,u32,u32,u32,u32,u32
7747875,7747875,7747875,7747875,7747875,0,7747875,2720577,7747875


### Parallel reading

In [24]:
# Re-scan the BGZF copy with parallel reading enabled (this rebinds `gff`).
gff = pb.scan_gff(bgz_path, parallel=True)

In [28]:
# Set DataFusion's `target_partitions` execution option to 4.
# NOTE(review): this cell's execution count (28) is later than the count cell below (26),
# so the saved count output may predate this setting — re-run in order to confirm.
pb.set_option("datafusion.execution.target_partitions", "4")

In [26]:
# Per-column counts on the parallel=True scan; the totals should equal the earlier count.
gff.count().collect()

0rows [00:00, ?rows/s]

chrom,start,end,type,source,score,strand,phase,attributes
u32,u32,u32,u32,u32,u32,u32,u32,u32
7747875,7747875,7747875,7747875,7747875,0,7747875,2720577,7747875


### Projection and predicate pushdown optimizations

#### Projection pushdown

In [30]:
# Back to a single target partition for the projection-pushdown comparison.
pb.set_option("datafusion.execution.target_partitions", "1")

In [31]:
# Baseline: select three columns with projection pushdown switched off.
(
    pb.scan_gff(bgz_path, parallel=True, projection_pushdown=False)
    .select(["chrom", "start", "end"])
    .collect()
)

0rows [00:00, ?rows/s]

chrom,start,end
str,u32,u32
"""chr1""",11121,24894
"""chr1""",11121,14413
"""chr1""",11121,11211
"""chr1""",12010,12227
"""chr1""",12613,12721
…,…,…
"""chrM""",15888,15953
"""chrM""",15888,15953
"""chrM""",15956,16023
"""chrM""",15956,16023


In [32]:
# Same three-column selection, now with projection pushdown switched on;
# the result should be identical to the baseline above.
(
    pb.scan_gff(bgz_path, parallel=True, projection_pushdown=True)
    .select(["chrom", "start", "end"])
    .collect()
)

0rows [00:00, ?rows/s]

chrom,start,end
str,u32,u32
"""chr1""",11121,24894
"""chr1""",11121,14413
"""chr1""",11121,11211
"""chr1""",12010,12227
"""chr1""",12613,12721
…,…,…
"""chrM""",15888,15953
"""chrM""",15888,15953
"""chrM""",15956,16023
"""chrM""",15956,16023


#### Increase parallelism

In [33]:
# Raise DataFusion's `target_partitions` to 4 for the next projection query.
pb.set_option("datafusion.execution.target_partitions", "4")

In [34]:
# Projection pushdown with 4 target partitions.
# The saved output ends with chr10 rows instead of chrM, so the row order of the
# result evidently differs from the single-partition run.
(
    pb.scan_gff(bgz_path, parallel=True, projection_pushdown=True)
    .select(["chrom", "start", "end"])
    .collect()
)

0rows [00:00, ?rows/s]

chrom,start,end
str,u32,u32
"""chr1""",11121,24894
"""chr1""",11121,14413
"""chr1""",11121,11211
"""chr1""",12010,12227
"""chr1""",12613,12721
…,…,…
"""chr10""",6510995,6511194
"""chr10""",6510995,6510998
"""chr10""",6510996,6510998
"""chr10""",6498396,6498558


#### Predicate pushdown

In [40]:
# Back to a single target partition for the predicate-pushdown comparison.
pb.set_option("datafusion.execution.target_partitions", "1")

In [41]:
# Baseline lazy query with predicate pushdown switched off:
# chrY features that start before 500,000 and end after 510,000.
lf = (
    pb.scan_gff(bgz_path, parallel=True, predicate_pushdown=False)
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
)

In [42]:
# Execute the baseline query (predicate pushdown off).
lf.collect()

0rows [00:00, ?rows/s]

chrom,start,end,type,source,score,strand,phase,attributes
str,u32,u32,str,str,f32,str,u32,list[struct[2]]
"""chrY""",386962,511616,"""gene""","""HAVANA""",,"""+""",,"[{""ID"",""ENSG00000292349.2""}, {""gene_id"",""ENSG00000292349.2""}, … {""havana_gene"",""OTTHUMG00000189992.1""}]"
"""chrY""",387035,511616,"""transcript""","""HAVANA""",,"""+""",,"[{""ID"",""ENST00000972808.1""}, {""Parent"",""ENSG00000292349.2""}, … {""havana_gene"",""OTTHUMG00000189992.1""}]"


In [43]:
# Same region filter as the baseline, now with predicate pushdown switched on
# (this rebinds `lf`).
lf = (
    pb.scan_gff(bgz_path, parallel=True, predicate_pushdown=True)
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
)

In [44]:
# Execute the pushdown-enabled query; the rows should match the baseline result.
lf.collect()

0rows [00:00, ?rows/s]

chrom,start,end,type,source,score,strand,phase,attributes
str,u32,u32,str,str,f32,str,u32,list[struct[2]]
"""chrY""",386962,511616,"""gene""","""HAVANA""",,"""+""",,"[{""ID"",""ENSG00000292349.2""}, {""gene_id"",""ENSG00000292349.2""}, … {""havana_gene"",""OTTHUMG00000189992.1""}]"
"""chrY""",387035,511616,"""transcript""","""HAVANA""",,"""+""",,"[{""ID"",""ENST00000972808.1""}, {""Parent"",""ENSG00000292349.2""}, … {""havana_gene"",""OTTHUMG00000189992.1""}]"


#### Increase parallelism

In [45]:
# Raise DataFusion's `target_partitions` to 4 before re-running the filtered query.
pb.set_option("datafusion.execution.target_partitions", "4")

In [46]:
# Re-collect the same `lf` (predicate pushdown on) now that target_partitions is 4.
lf.collect()

0rows [00:00, ?rows/s]

chrom,start,end,type,source,score,strand,phase,attributes
str,u32,u32,str,str,f32,str,u32,list[struct[2]]
"""chrY""",386962,511616,"""gene""","""HAVANA""",,"""+""",,"[{""ID"",""ENSG00000292349.2""}, {""gene_id"",""ENSG00000292349.2""}, … {""havana_gene"",""OTTHUMG00000189992.1""}]"
"""chrY""",387035,511616,"""transcript""","""HAVANA""",,"""+""",,"[{""ID"",""ENST00000972808.1""}, {""Parent"",""ENSG00000292349.2""}, … {""havana_gene"",""OTTHUMG00000189992.1""}]"


#### Combine projection and predicate pushdown

In [54]:
# Back to a single target partition for the combined-pushdown comparison.
pb.set_option("datafusion.execution.target_partitions", "1")

In [55]:
# Both pushdowns off, no filter: select the coordinates plus `ID`.
# `ID` is not one of the nine columns shown earlier; its values match the "ID"
# entries seen inside `attributes`.
(
    pb.scan_gff(
        bgz_path,
        parallel=True,
        predicate_pushdown=False,
        projection_pushdown=False,
    )
    .select(["chrom", "start", "end", "ID"])
    .collect()
)

0rows [00:00, ?rows/s]

chrom,start,end,ID
str,u32,u32,str
"""chr1""",11121,24894,"""ENSG00000290825.2"""
"""chr1""",11121,14413,"""ENST00000832824.1"""
"""chr1""",11121,11211,"""exon:ENST00000832824.1:1"""
"""chr1""",12010,12227,"""exon:ENST00000832824.1:2"""
"""chr1""",12613,12721,"""exon:ENST00000832824.1:3"""
…,…,…,…
"""chrM""",15888,15953,"""ENST00000387460.2"""
"""chrM""",15888,15953,"""exon:ENST00000387460.2:1"""
"""chrM""",15956,16023,"""ENSG00000210196.2"""
"""chrM""",15956,16023,"""ENST00000387461.2"""


In [48]:
# Baseline with both pushdowns off: select four columns, then keep chrY features
# that start before 500,000 and end after 510,000.
(
    pb.scan_gff(
        bgz_path,
        parallel=True,
        predicate_pushdown=False,
        projection_pushdown=False,
    )
    .select(["chrom", "start", "end", "ID"])
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
    .collect()
)

0rows [00:00, ?rows/s]

chrom,start,end,ID
str,u32,u32,str
"""chrY""",386962,511616,"""ENSG00000292349.2"""
"""chrY""",387035,511616,"""ENST00000972808.1"""


In [49]:
# Same select-then-filter query with both pushdowns switched on;
# the rows should match the baseline with pushdowns off.
(
    pb.scan_gff(
        bgz_path,
        parallel=True,
        predicate_pushdown=True,
        projection_pushdown=True,
    )
    .select(["chrom", "start", "end", "ID"])
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
    .collect()
)

0rows [00:00, ?rows/s]

chrom,start,end,ID
str,u32,u32,str
"""chrY""",386962,511616,"""ENSG00000292349.2"""
"""chrY""",387035,511616,"""ENST00000972808.1"""


In [51]:
# Raise DataFusion's `target_partitions` to 4 for the final combined-pushdown run.
pb.set_option("datafusion.execution.target_partitions", "4")

In [52]:
# Both pushdowns on, executed with 4 target partitions.
(
    pb.scan_gff(
        bgz_path,
        parallel=True,
        predicate_pushdown=True,
        projection_pushdown=True,
    )
    .select(["chrom", "start", "end", "ID"])
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
    .collect()
)

0rows [00:00, ?rows/s]

chrom,start,end,ID
str,u32,u32,str
"""chrY""",386962,511616,"""ENSG00000292349.2"""
"""chrY""",387035,511616,"""ENST00000972808.1"""


#### Bug reproduction - filter with ID selection

In [None]:
# Regression check for the filter-then-select ordering when `ID` is selected.
# This query used to be able to return the wrong number of rows; 2 rows are expected.
result_with_id = (
    pb.scan_gff(
        bgz_path,
        parallel=True,
        predicate_pushdown=False,
        projection_pushdown=False,
    )
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
    .select(["chrom", "start", "end", "ID"])
    .collect()
)

# Report the row count and whether it matches the expected 2 rows.
print(f"Rows with ID: {result_with_id.height}")
print("✅ BUG FIXED!" if result_with_id.height == 2 else "❌ Still broken")
result_with_id

In [28]:
# Control case: the same filter-then-select query, but without the `ID` column.
result_without_id = (
    pb.scan_gff(
        bgz_path,
        parallel=True,
        predicate_pushdown=False,
        projection_pushdown=False,
    )
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
    .select(["chrom", "start", "end"])
    .collect()
)

print(f"Rows without ID: {result_without_id.height}")
result_without_id

0rows [00:00, ?rows/s]

Rows without ID: 2


chrom,start,end
str,u32,u32
"""chrY""",386962,511616
"""chrY""",387035,511616


In [29]:
# Control case: select the columns (including `ID`) first, then apply the filter.
result_select_first = (
    pb.scan_gff(
        bgz_path,
        parallel=True,
        predicate_pushdown=False,
        projection_pushdown=False,
    )
    .select(["chrom", "start", "end", "ID"])
    .filter(
        (pl.col("chrom") == "chrY")
        & (pl.col("start") < 500_000)
        & (pl.col("end") > 510_000)
    )
    .collect()
)

print(f"Rows with select-first pattern: {result_select_first.height}")
result_select_first

0rows [00:00, ?rows/s]

Rows with select-first pattern: 2


chrom,start,end,ID
str,u32,u32,str
"""chrY""",386962,511616,"""ENSG00000292349.2"""
"""chrY""",387035,511616,"""ENST00000972808.1"""
