In [1]:
# Download the example BAM and its .bai index from S3 into the working directory.
!wget https://aveit.s3.amazonaws.com/higlass/bam/example_higlass.bam -O example.bam
!wget https://aveit.s3.amazonaws.com/higlass/bam/example_higlass.bam.bai -O example.bam.bai
# `touch` runs after both downloads, so the index's modification time is the most recent.
# NOTE(review): presumably this keeps the index from looking older than the BAM — confirm.
!touch example.bam.bai # ensures no errors from pysam

--2023-05-16 10:00:21--  https://aveit.s3.amazonaws.com/higlass/bam/example_higlass.bam
Resolving aveit.s3.amazonaws.com (aveit.s3.amazonaws.com)... 54.231.200.217, 3.5.10.233, 3.5.9.201, ...
Connecting to aveit.s3.amazonaws.com (aveit.s3.amazonaws.com)|54.231.200.217|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4993966 (4.8M) [binary/octet-stream]
Saving to: ‘example.bam’


2023-05-16 10:00:24 (2.08 MB/s) - ‘example.bam’ saved [4993966/4993966]

--2023-05-16 10:00:24--  https://aveit.s3.amazonaws.com/higlass/bam/example_higlass.bam.bai
Resolving aveit.s3.amazonaws.com (aveit.s3.amazonaws.com)... 3.5.29.126, 3.5.10.233, 3.5.9.201, ...
Connecting to aveit.s3.amazonaws.com (aveit.s3.amazonaws.com)|3.5.29.126|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28176 (28K) [binary/octet-stream]
Saving to: ‘example.bam.bai’


2023-05-16 10:00:24 (933 KB/s) - ‘example.bam.bai’ saved [28176/28176]



In [2]:
import pathlib
import io

import pysam
import polars as pl
import pyarrow as pa
import pandas as pd

import oxbow as ox

def pysam_run(file: pathlib.Path, chrom: str, start: int, stop: int):
    """Collect the reads pysam fetches for a region into a pandas DataFrame.

    Parameters
    ----------
    file
        Path to the BAM file; it is converted to ``str`` before being handed
        to ``pysam.AlignmentFile``.
    chrom
        Reference sequence name passed to ``AlignmentFile.fetch``.
    start, stop
        Region bounds passed unchanged to ``AlignmentFile.fetch``.
        NOTE(review): pysam documents these as 0-based, half-open, while the
        oxbow readers in this notebook build a ``chrom:start-stop`` region
        string from the same numbers — confirm both select the same reads.

    Returns
    -------
    pandas.DataFrame
        One row per fetched read with the columns ``ref_names`` (categorical),
        ``starts``, ``ends``, ``names``, ``cigars``, ``seqs`` and ``quals``.
        ``quals`` holds the base qualities as a Phred+33 ASCII string, or
        ``None`` for a read that carries no qualities.
    """
    ref_names = []
    starts = []
    ends = []
    names = []
    cigars = []
    seqs = []
    quals = []

    # The context manager closes the BAM handle even if fetching or decoding
    # raises, so repeated benchmark runs do not accumulate open files.
    with pysam.AlignmentFile(str(file), "rb") as bam:
        for read in bam.fetch(chrom, start, stop):
            ref_names.append(read.reference_name)
            starts.append(read.reference_start)
            ends.append(read.reference_end)
            names.append(read.query_name)
            cigars.append(read.cigarstring)
            seqs.append(read.query_sequence)

            # query_qualities is None when the record stores no qualities;
            # iterating it would raise TypeError, so record a missing value.
            phred = read.query_qualities
            if phred is None:
                quals.append(None)
            else:
                quals.append("".join(chr(score + 33) for score in phred))

    return pd.DataFrame(
        {
            "ref_names": ref_names,
            "starts": starts,
            "ends": ends,
            "names": names,
            "cigars": cigars,
            "seqs": seqs,
            "quals": quals,
        }
    ).astype(
        {
            "ref_names": "category",
        }
    )
    

def oxbow_pandas(file: pathlib.Path, chr: str, start: int, stop: int):
    """Read a BAM region with oxbow and return it as a pandas DataFrame.

    The region is sent to ``ox.read_bam`` as the string ``"{chr}:{start}-{stop}"``.
    The returned payload is wrapped in an in-memory buffer, opened with
    pyarrow's IPC file reader and converted with ``read_pandas()``.

    Note: the ``chr`` parameter shadows the builtin ``chr`` inside this function.
    """
    region = f"{chr}:{start}-{stop}"
    payload = ox.read_bam(str(file), region)
    reader = pa.ipc.open_file(io.BytesIO(payload))
    return reader.read_pandas()
    

def oxbow_polars(file: pathlib.Path, chr: str, start: int, stop: int):
    """Read a BAM region with oxbow and return it as a polars DataFrame.

    The region is sent to ``ox.read_bam`` as the string ``"{chr}:{start}-{stop}"``
    and the returned payload is passed directly to ``pl.read_ipc``.

    Note: the ``chr`` parameter shadows the builtin ``chr`` inside this function.
    """
    region = f"{chr}:{start}-{stop}"
    payload = ox.read_bam(str(file), region)
    return pl.read_ipc(payload)

In [3]:
# Benchmark inputs shared by the timing cells: the BAM downloaded above and
# the query region as (chrom, start, stop), unpacked positionally via *region.
file = pathlib.Path("example.bam")
region = ("chr1", 1, 8_000_000)

In [4]:
# Time the pysam read loop plus DataFrame construction over the shared region.
%timeit pysam_run(file, *region)

2.08 s ± 26.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Time the oxbow read loaded into polars for the same file and region.
%timeit oxbow_polars(file, *region)

1.2 s ± 19.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Time the oxbow read loaded into pandas (through pyarrow) for the same file and region.
%timeit oxbow_pandas(file, *region)

1.24 s ± 37.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
