# Test I/O speed

In [13]:
import io
import gzip
import cProfile

# file = 'test.np.fq'  # 20M
file = '/mnt/nas1/hhl/phellinus/mitochondria/rebuild/pnok/O_PBA/02_0_Sprai_in.fastq'  # 1.5G
# fileGz = 'test.np.fq.gz'  # 12M
fileGz = '/mnt/nas1/hhl/ustilago/fastqs.nanopore/UEMT_FAH91148.fq.gz'  # 2.6G

def op(handle):
    lengths = []
    for line in handle:
        lengths.append(len(next(handle).strip()))
        next(handle)
        next(handle)
    return sum(lengths)

In [3]:
### uncompressed ###

def funcA(file):
    with open(file, 'rt') as f:
        print(op(f))

def funcB(file):
    with open(file, 'rb') as f:
        print(op(f))

def funcC(file):
    with open(file, 'rb') as f:
        handle = io.TextIOWrapper(f)
        print(op(handle))

def funcD(file):
    with open(file, 'rb') as f:
        handle = io.BufferedReader(f)
        print(op(handle))

In [4]:
### compressed ###

def funcA_gz(file):
    with gzip.open(fileGz) as f:
        print(op(f))

def funcB_gz(file):
    with io.BufferedReader(gzip.open(fileGz)) as f:
        print(op(f))

def funcC_gz(file):
    with open(fileGz, 'rb') as f:
        h = io.BufferedReader(gzip.GzipFile(fileobj=f))
        print(op(h))

## benchmark

In [14]:
cProfile.run('funcA(file)')
cProfile.run('funcB(file)')
cProfile.run('funcC(file)')
cProfile.run('funcD(file)')
# A > C > D > B

793548227
         1132126 function calls in 3.485 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.273    0.273    3.481    3.481 <ipython-input-13-c165535b06d2>:10(op)
        1    0.002    0.002    3.485    3.485 <ipython-input-3-9fbb4a90202a>:3(funcA)
        1    0.000    0.000    3.485    3.485 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 _bootlocale.py:23(getpreferredencoding)
        1    0.000    0.000    0.000    0.000 codecs.py:259(__init__)
        1    0.000    0.000    0.000    0.000 codecs.py:308(__init__)
   195021    0.211    0.000    0.659    0.000 codecs.py:318(decode)
        3    0.000    0.000    0.000    0.000 iostream.py:195(schedule)
        2    0.000    0.000    0.000    0.000 iostream.py:307(_is_master_process)
        2    0.000    0.000    0.000    0.000 iostream.py:320(_schedule_flush)
        2    0.000    0.000    0.000    0.000 iostream.py:382(write)
    

In [10]:
cProfile.run('funcA_gz(file)')
cProfile.run('funcB_gz(file)')
cProfile.run('funcC_gz(file)')
# A > B = C

2334796854
         20937836 function calls in 60.264 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.008    0.008   60.264   60.264 <ipython-input-4-6734752522f5>:3(funcA_gz)
        1    1.662    1.662   60.254   60.254 <ipython-input-9-811626be1f1f>:9(op)
        1    0.000    0.000   60.264   60.264 <string>:1(<module>)
  2222481    1.208    0.000    1.784    0.000 _compression.py:12(_check_not_closed)
        1    0.000    0.000    0.000    0.000 _compression.py:150(tell)
        1    0.000    0.000    0.000    0.000 _compression.py:36(readable)
        1    0.000    0.000    0.000    0.000 _compression.py:39(__init__)
        1    0.000    0.000    0.000    0.000 _compression.py:59(close)
   580941    2.559    0.000   45.855    0.000 _compression.py:66(readinto)
        1    0.000    0.000    0.001    0.001 gzip.py:123(__init__)
        1    0.000    0.000    0.001    0.001 gzip.py:20(open)
  2222484   

2334796854
         16594178 function calls (16013233 primitive calls) in 57.302 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.009    0.009   57.301   57.301 <ipython-input-4-6734752522f5>:11(funcC_gz)
        1    1.573    1.573   57.255   57.255 <ipython-input-9-811626be1f1f>:9(op)
        1    0.000    0.000   57.301   57.301 <string>:1(<module>)
   580943    0.321    0.000    0.466    0.000 _compression.py:12(_check_not_closed)
        1    0.000    0.000    0.000    0.000 _compression.py:120(seek)
        1    0.000    0.000    0.000    0.000 _compression.py:150(tell)
        1    0.000    0.000    0.000    0.000 _compression.py:36(readable)
        1    0.000    0.000    0.000    0.000 _compression.py:39(__init__)
        1    0.000    0.000    0.000    0.000 _compression.py:59(close)
        1    0.000    0.000    0.000    0.000 _compression.py:63(seekable)
   580942    2.628    0.000   46.130    0.0