In [1]:
import oxbow as ox
import polars as pl

# Relative path to the .vcf.gz fixture read by the cells below. The index file
# used for partitioning is looked up at this same path with ".tbi" appended.
vcf_path = "../../fixtures/ALL.chrY.phase3_integrated_v1a.20130502.genotypes.vcf.gz"

In [2]:
# Requested partition size handed to the index partitioner. The unit is not
# visible here; in the recorded output the first element of most consecutive
# tuples advances by roughly this amount.
chunk_size = 1_000_000
# The index file sits at the VCF path with ".tbi" appended.
tbi_path = vcf_path + ".tbi"
partitions = ox.partition_from_index_file(tbi_path, chunk_size)
partitions

[(0, 17799),
 (1011683, 45288),
 (2151905, 38743),
 (3152162, 51362),
 (4161902, 27635),
 (5226885, 50277),
 (5837934, 0)]

In [3]:
# Read everything between the first and the last partition boundary in a
# single call, then load the returned IPC payload into a polars DataFrame.
range_start, range_end = partitions[0], partitions[-1]
ipc = ox.read_vcf_vpos(vcf_path, range_start, range_end)
df = pl.read_ipc(ipc)
df.shape

(62042, 9)

In [4]:
# Reference read through the region-based reader: query "Y" with no coordinate
# bounds. The resulting frame is the baseline the vpos-based reads are compared to.
y_region = "Y"
y_ipc = ox.read_vcf(vcf_path, y_region)
y_df = pl.read_ipc(y_ipc)
y_df.shape

(62042, 9)

In [5]:
# The read spanning the first-to-last partition boundary (df) must have the
# same shape as the region-based read of "Y" (y_df).
assert y_df.shape == df.shape

In [6]:
# Read each pair of adjacent partition boundaries separately and add up the row
# counts; the total must match the row count of the full-range read (df).
cum_sum = sum(
    pl.read_ipc(ox.read_vcf_vpos(vcf_path, start_vpos, end_vpos)).shape[0]
    for start_vpos, end_vpos in zip(partitions, partitions[1:])
)
assert df.shape[0] == cum_sum

In [7]:
# Read a span that covers two adjacent partitions: boundary 1 up to boundary 3.
span_start, span_end = partitions[1], partitions[3]
test_ipc = ox.read_vcf_vpos(vcf_path, span_start, span_end)
test_df = pl.read_ipc(test_ipc)
# One call, same stdout as printing the frame and then its shape.
print(test_df, test_df.shape, sep="\n")

shape: (27_947, 9)
┌───────┬──────────┬─────┬─────┬───┬───────┬────────┬───────────────────────────────────┬────────┐
│ chrom ┆ pos      ┆ id  ┆ ref ┆ … ┆ qual  ┆ filter ┆ info                              ┆ format │
│ ---   ┆ ---      ┆ --- ┆ --- ┆   ┆ ---   ┆ ---    ┆ ---                               ┆ ---    │
│ cat   ┆ i32      ┆ str ┆ str ┆   ┆ f32   ┆ str    ┆ str                               ┆ str    │
╞═══════╪══════════╪═════╪═════╪═══╪═══════╪════════╪═══════════════════════════════════╪════════╡
│ Y     ┆ 8028497  ┆     ┆ G   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=G;AC=1;AF=0.00081103;AN=1233;… ┆ GT     │
│ Y     ┆ 8028609  ┆     ┆ T   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=T;AC=1;AF=0.00081103;AN=1233;… ┆ GT     │
│ Y     ┆ 8028736  ┆     ┆ A   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=A;AC=1;AF=0.00081103;AN=1233;… ┆ GT     │
│ Y     ┆ 8028804  ┆     ┆ A   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=A;AC=61;AF=0.0494728;AN=1233;… ┆ GT     │
│ …     ┆ …        ┆ …   ┆ …   ┆ … ┆ …     ┆ …      ┆ …                                 ┆ 

In [8]:
# Read a single partition: boundary 3 up to boundary 4.
single_start, single_end = partitions[3], partitions[4]
test2_ipc = ox.read_vcf_vpos(vcf_path, single_start, single_end)
test2_df = pl.read_ipc(test2_ipc)
print(test2_df)

shape: (10_897, 9)
┌───────┬──────────┬─────┬─────┬───┬───────┬────────┬───────────────────────────────────┬────────┐
│ chrom ┆ pos      ┆ id  ┆ ref ┆ … ┆ qual  ┆ filter ┆ info                              ┆ format │
│ ---   ┆ ---      ┆ --- ┆ --- ┆   ┆ ---   ┆ ---    ┆ ---                               ┆ ---    │
│ cat   ┆ i32      ┆ str ┆ str ┆   ┆ f32   ┆ str    ┆ str                               ┆ str    │
╞═══════╪══════════╪═════╪═════╪═══╪═══════╪════════╪═══════════════════════════════════╪════════╡
│ Y     ┆ 17629604 ┆     ┆ G   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=G;AC=1;AF=0.00081103;AN=1233;… ┆ GT     │
│ Y     ┆ 17629712 ┆     ┆ C   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=.;AC=2;AF=0.00162206;AN=1233;… ┆ GT     │
│ Y     ┆ 17629724 ┆     ┆ C   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=C;AC=1;AF=0.00081103;AN=1233;… ┆ GT     │
│ Y     ┆ 17629865 ┆     ┆ G   ┆ … ┆ 100.0 ┆ PASS   ┆ AA=G;AC=1;AF=0.00081103;AN=1233;… ┆ GT     │
│ …     ┆ …        ┆ …   ┆ …   ┆ … ┆ …     ┆ …      ┆ …                                 ┆ 

In [9]:
# Cross-check the virtual-position read (test_df) against a coordinate-based
# region query. The start coordinate is the first `pos` printed for test_df;
# the end coordinate is presumably its last `pos` (it lies just below the first
# `pos` of the following partition) — TODO confirm.
foo_ipc = ox.read_vcf(vcf_path, "Y:8028497-17629059")
foo_df = pl.read_ipc(foo_ipc)
print(foo_df.shape)
# Make the comparison explicit rather than relying on eyeballing two outputs,
# in line with the assertions used for the earlier checks in this notebook.
assert foo_df.shape == test_df.shape, (
    f"region query returned shape {foo_df.shape}, but the vpos read of "
    f"partitions[1]..partitions[3] returned shape {test_df.shape}"
)
foo_df

(27947, 9)


chrom,pos,id,ref,alt,qual,filter,info,format
cat,i32,str,str,str,f32,str,str,str
"""Y""",8028497,"""""","""G""","""T""",100.0,"""PASS""","""AA=G;AC=1;AF=0…","""GT"""
"""Y""",8028609,"""""","""T""","""C""",100.0,"""PASS""","""AA=T;AC=1;AF=0…","""GT"""
"""Y""",8028736,"""""","""A""","""G""",100.0,"""PASS""","""AA=A;AC=1;AF=0…","""GT"""
"""Y""",8028804,"""""","""A""","""AT""",100.0,"""PASS""","""AA=A;AC=61;AF=…","""GT"""
"""Y""",8028896,"""rs373433249""","""C""","""T""",100.0,"""PASS""","""AA=C;AC=35;AF=…","""GT"""
"""Y""",8028918,"""""","""T""","""A""",100.0,"""PASS""","""AA=T;AC=1;AF=0…","""GT"""
"""Y""",8029480,"""""","""G""","""A""",100.0,"""PASS""","""AA=G;AC=1;AF=0…","""GT"""
"""Y""",8029507,"""""","""A""","""G""",100.0,"""PASS""","""AA=A;AC=1;AF=0…","""GT"""
"""Y""",8029914,"""""","""G""","""T""",100.0,"""PASS""","""AA=G;AC=1;AF=0…","""GT"""
"""Y""",8030142,"""""","""C""","""T""",100.0,"""PASS""","""AA=C;AC=2;AF=0…","""GT"""
