# Base Sequence Quality

#### Import Dependencies

In [1]:
import polars_bio as pb
import pandas as pd

INFO:polars_bio:Creating BioSessionContext


#### Common

Load Tables

In [2]:
# FASTQ inputs: a small example file and a second file used for the benchmark query.
example_path = "./tests/data/base_quality/example.fastq"
bench_path = "./tests/data/base_quality/bench.fastq"

# Read both files in the same order as before (example first, then bench).
# The log output recorded for this cell shows each read registering a table
# named "example" / "bench"; the SQL cells refer to the data by those names.
example_data, bench_data = (
    pb.read_fastq(fastq_path) for fastq_path in (example_path, bench_path)
)

INFO:polars_bio:Table: example registered for path: ./tests/data/base_quality/example.fastq
INFO:polars_bio:Table: bench registered for path: ./tests/data/base_quality/bench.fastq


Set target partitions

In [3]:
# Set the "datafusion.execution.target_partitions" option to "8" (passed as a string).
# NOTE(review): presumably this controls how many partitions queries run on — confirm
# against the polars_bio / DataFusion configuration documentation.
pb.set_option("datafusion.execution.target_partitions", "8")

### UDTF for Quality Histogram

In [8]:
%time x = pb.sql("SELECT position, q.avg as avg FROM (SELECT position, quality_quartiles(score, count) as q FROM (SELECT * FROM quality_histogram('bench', 'quality_scores')) GROUP BY position)").collect()
x.limit(10)

0rows [00:00, ?rows/s]

CPU times: user 17min 38s, sys: 1min 55s, total: 19min 34s
Wall time: 1min 44s


position,avg
u64,f64
91,94.0
87,94.0
74,94.0
26,94.0
90,94.0
88,94.0
49,94.0
1,94.0
84,94.0
86,94.0


In [5]:
# Quartile struct per position for the 'example' table. The inner query first
# sums histogram counts per (position, score); the outer query aggregates those
# rows per position with quality_quartiles.
# The adjacent string literals concatenate to exactly the original one-line query.
x = pb.sql(
    "SELECT position, quality_quartiles(score, count) as quartiles "
    "FROM (SELECT position, score, SUM(count) AS count "
    "FROM quality_histogram('example', 'quality_scores') "
    "GROUP BY position, score) "
    "GROUP BY position"
).collect()

# Preview the first ten rows of the collected result.
x.limit(10)

0rows [00:00, ?rows/s]

position,quartiles
u64,struct[1]
84,{94.0}
55,{94.0}
42,{94.0}
81,{94.0}
24,{94.0}
53,{94.0}
44,{94.0}
62,{94.0}
19,{94.0}
86,{94.0}


In [6]:
# Histogram rows for the 'example' table with counts summed per (position, score).
# The adjacent string literals concatenate to exactly the original one-line query.
x = pb.sql(
    "SELECT position, score, SUM(count) AS count "
    "FROM quality_histogram('example', 'quality_scores') "
    "GROUP BY position, score"
).collect()

0rows [00:00, ?rows/s]

In [7]:
# Sum of every histogram count, divided by 100.
# NOTE(review): the divisor 100 is not explained in this notebook — presumably
# the read length, which would make `y` the number of reads; confirm.
y = x["count"].sum() / 100

# Display: number of histogram rows, the scaled total, their ratio, and the
# histogram ordered by score with ties ordered by count.
# A single multi-key sort replaces the chained .sort("count").sort("score"):
# the chained form only preserves the count order inside each score if the
# second sort is stable, which polars does not guarantee by default
# (maintain_order=False).
len(x), y, y / len(x), x.sort(["score", "count"])

(1924,
 202.0,
 0.104989604989605,
 shape: (1_924, 3)
 ┌──────────┬───────┬───────┐
 │ position ┆ score ┆ count │
 │ ---      ┆ ---   ┆ ---   │
 │ u64      ┆ u8    ┆ u64   │
 ╞══════════╪═══════╪═══════╡
 │ 89       ┆ 2     ┆ 3     │
 │ 90       ┆ 2     ┆ 3     │
 │ 72       ┆ 2     ┆ 5     │
 │ 84       ┆ 2     ┆ 5     │
 │ 59       ┆ 2     ┆ 6     │
 │ …        ┆ …     ┆ …     │
 │ 16       ┆ 41    ┆ 62    │
 │ 33       ┆ 41    ┆ 63    │
 │ 27       ┆ 41    ┆ 64    │
 │ 25       ┆ 41    ┆ 64    │
 │ 23       ┆ 41    ┆ 65    │
 └──────────┴───────┴───────┘)

### Calculate for file

In [None]:
# Run the quality UDAF over the example FASTQ file and print what it returns.
# `p`, `data` and `result` are notebook-level names; the benchmark cells below
# assign to the same names.
p = "./tests/data/base_quality/example.fastq"
data = pb.read_fastq(p)
result = pb.quality_udaf(data)
print(result)

### Benchmark

Prepare LazyDataFrame with sequences

In [None]:
# Benchmark input: rebind `p` and `data` to the bench FASTQ file.
# NOTE(review): a later cell calls data.collect(), so `data` is presumably lazy
# at this point — confirm.
p = "./tests/data/base_quality/bench.fastq"
data = pb.read_fastq(p)
# Sets target_partitions to "8" again (the same value the earlier
# "Set target partitions" cell uses).
pb.set_option("datafusion.execution.target_partitions", "8")

Measure time

In [None]:
# Time the quality UDAF call on the benchmark data; `result` is rebound to its
# return value.
%time result = pb.quality_udaf(data)

In [None]:
# Time collecting the benchmark data on its own.
# NOTE(review): this rebinds `data` to the collected result, so running this cell
# or the quality_udaf cell again afterwards operates on a different object than
# on a fresh top-to-bottom run — confirm that is intended.
%time data = data.collect()

In [None]:
# Take the first row of `result` and select its "pos" column.
result.limit(1)["pos"]