In [24]:
import sys
import os
import io
import gzip
from multiprocessing import Process, Pipe
import subprocess as sp

# Gzip-compressed FASTQ input; the functions in this notebook read it through the global `in_fq`.
# The commented-out assignment is an alternative input file; swap the two lines to use it.
# in_fq = '/mnt/nas1/hhl/fusarium/fastqs_cdna/guppy_3.3.0/Fu6_cdna_FAH31309.fq.gz'  # 190Mb
in_fq = '/mnt/nas1/hhl/ustilago/fastqs.nanopore/guppy_2.3.5/UEMT_2cells.min1000.fq.gz'  # 3.3G

In [25]:
def classic_way(path=None):
    """Count FASTQ records by decompressing the file in-process with ``gzip``.

    The file is read line by line through an ``io.BufferedReader`` wrapped
    around ``gzip.open``. For every 4-line record the length of the sequence
    line (second line, surrounding whitespace stripped) is collected, and the
    number of collected records is printed.

    Parameters
    ----------
    path : str, optional
        Gzip-compressed FASTQ file to read. When omitted, the notebook-level
        ``in_fq`` is used, so ``classic_way()`` behaves as before.

    Returns
    -------
    None
        The record count is written to stdout.

    Notes
    -----
    A trailing group of fewer than four lines (truncated file, stray line at
    EOF) is ignored rather than raising ``StopIteration`` out of the loop.
    """
    if path is None:
        path = in_fq

    # The context manager closes the reader (and the GzipFile beneath it)
    # even when reading or parsing raises.
    with io.BufferedReader(gzip.open(path)) as handle:
        # Handing the same iterator to zip() four times pulls four consecutive
        # lines per step: header, sequence, separator, quality. zip() stops
        # cleanly as soon as a full group of four can no longer be formed.
        lengths = [
            len(seq.strip())
            for _header, seq, _sep, _qual in zip(handle, handle, handle, handle)
        ]

    print(len(lengths))

%timeit -r3 classic_way()

586675
586675
586675
586675
1min 5s ± 49.4 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [26]:
def subproc(path=None):
    """Count FASTQ records while an external ``zcat`` does the decompression.

    ``zcat`` is started as a child process and its stdout is read line by
    line. For every 4-line record the length of the sequence line (second
    line, surrounding whitespace stripped) is collected, and the number of
    collected records is printed.

    Parameters
    ----------
    path : str, optional
        Gzip-compressed FASTQ file to read. When omitted, the notebook-level
        ``in_fq`` is used, so ``subproc()`` behaves as before.

    Returns
    -------
    None
        The record count is written to stdout.

    Raises
    ------
    subprocess.CalledProcessError
        If ``zcat`` exits with a non-zero status (missing or corrupt file),
        instead of silently reporting a partial count.

    Notes
    -----
    A trailing group of fewer than four lines is ignored rather than raising
    ``StopIteration`` out of the loop.
    """
    if path is None:
        path = in_fq

    # stderr is deliberately NOT piped: nothing here reads it, and an unread
    # pipe can fill up and stall the child. It is inherited from this process.
    # The Popen context manager closes stdout and waits for the child on exit,
    # including when parsing raises.
    with sp.Popen(['zcat', path], stdout=sp.PIPE) as proc:
        stream = proc.stdout
        # Same iterator four times -> zip() yields one 4-line record per step.
        lengths = [
            len(seq.strip())
            for _header, seq, _sep, _qual in zip(stream, stream, stream, stream)
        ]

    # returncode is populated by the wait performed when the `with` block exits.
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

    print(len(lengths))

%timeit -r3 subproc()

586675
586675
586675
586675
1min 9s ± 452 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


references:
- https://docs.python.org/3/library/multiprocessing.html
- https://laucyun.com/c5dafb12eba4cfb0c80159d57f06f8b4.html
- https://zwindr.blogspot.com/2017/04/python-multiprocessing.html
- https://stackoverflow.com/questions/6564395/why-doesnt-pipe-close-cause-eoferror-during-pipe-recv-in-python-multiproces

Not faster? What is the bottleneck?

In [27]:
def multiproc(path=None):
    """Count FASTQ records with decompression running in a child process.

    A child process decompresses the file with ``gzip`` and sends the length
    of each record's sequence line (second line of every 4-line record,
    surrounding whitespace stripped) over a one-way pipe, one message per
    record. The parent collects the lengths until the pipe reports EOF, then
    prints the number received followed by ``finish!``.

    Parameters
    ----------
    path : str, optional
        Gzip-compressed FASTQ file to read. When omitted, the notebook-level
        ``in_fq`` is used, so ``multiproc()`` behaves as before.

    Returns
    -------
    None
        The record count is written to stdout.

    Raises
    ------
    RuntimeError
        If the reader process ends with a non-zero exit code, so a crashed
        reader is not mistaken for a normal end of file.

    Notes
    -----
    * The ``fork`` start method is requested explicitly because the target is
      a nested function, which start methods that pickle the target cannot
      run. ``fork`` is only available on POSIX systems.
    * A trailing group of fewer than four lines is ignored.
    * NOTE(review): every ``send()`` transfers a single int; the per-message
      cost is presumably a large part of this variant's runtime. TODO
      measure, e.g. by sending lengths in batches.
    """
    import multiprocessing

    if path is None:
        path = in_fq

    def read_conn(conn):
        # Runs in the child: decompress, emit one length per record, then
        # close the write end so the parent's recv() sees EOF.
        try:
            with io.BufferedReader(gzip.open(path)) as handle:
                for _header, seq, _sep, _qual in zip(handle, handle, handle, handle):
                    conn.send(len(seq.strip()))
        finally:
            conn.close()

    ctx = multiprocessing.get_context('fork')
    parent_conn, child_conn = ctx.Pipe(False)  # parent end receives, child end sends
    p = ctx.Process(target=read_conn, args=(child_conn,))
    p.start()
    # Drop the parent's copy of the write end; otherwise recv() would never
    # raise EOFError, because a writer would still appear to be open.
    child_conn.close()

    lengths = []
    try:
        while True:
            lengths.append(parent_conn.recv())
    except EOFError:
        # Every write end is closed: the child is done (or died; checked below).
        pass
    except BaseException:
        # Interrupted while receiving: stop the child so join() cannot hang
        # on a writer blocked against a full pipe.
        p.terminate()
        raise
    finally:
        parent_conn.close()
        p.join()

    if p.exitcode != 0:
        raise RuntimeError(
            'reader process exited with code {}'.format(p.exitcode))

    print(len(lengths), 'finish!')

%timeit -r3 multiproc()

586675 finish!
586675 finish!
586675 finish!
586675 finish!
1min 16s ± 103 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


`zcat /mnt/nas1/hhl/ustilago/fastqs.nanopore/guppy_2.3.5/UEMT_2cells.min1000.fq.gz | ./fastq_stats.py -`

About 1:10 (min:sec) — not that fast either?