This example shows how to extract and analyse kmers from sequencing reads using BioNumPy. We start by downloading some example reads:

In [None]:
# Download the example FASTQ file and save it as reads.fq.gz
!wget -O reads.fq.gz https://github.com/bionumpy/bionumpy/raw/master/example_data/big.fq.gz
# Install BioNumPy
!pip install bionumpy

In [None]:
import bionumpy as bnp
import numpy as np
import plotly.express as px

In [None]:
# Read one chunk of reads from the downloaded FASTQ file
reads = bnp.open("reads.fq.gz").read_chunk()
# Extract kmers of length 5 from the read sequences.
# NOTE(review): an earlier comment here mentioned changing the encoding for
# faster kmer hashing, but no explicit re-encoding is done in this cell;
# presumably get_kmers handles it internally -- confirm against the BioNumPy docs.
kmers = bnp.get_kmers(reads.sequence, 5)
# kmers is now a RaggedArray that can be indexed
first_kmer_in_each_read = kmers[:, 0]
print(first_kmer_in_each_read)

# Get the most frequent kmers
counts = bnp.sequence.count_encoded(kmers, axis=None)
# np.argsort sorts in ascending order, so reverse it before slicing to get
# the indices of the 10 highest counts (largest first) rather than the 10 lowest.
sorting = np.argsort(counts.counts)[::-1][:10]

top_kmers = np.array(counts.alphabet)[sorting]
top_counts = counts.counts[sorting]

# Bar chart of the 10 most frequent kmers and their counts
fig = px.bar(x=top_kmers, y=top_counts)
fig.show()

