### STEP3. Oligo dT barcode demultiplexing 

In [None]:
# Python library prep
!pip install pandas biopython

In [1]:
import os
import gc
import gzip
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
from datetime import datetime
from Bio import SeqIO, bgzf
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [2]:
# Read the oligo-dT barcode sheet (columns used later: `pool`, `sequence`)
# into a DataFrame and preview its first two rows.
df_barcode = pd.read_excel(io='barcode.xlsx')
df_barcode.head(n=2)

Unnamed: 0,pool,barcode,sequence
0,N701,G2,GAGTGG
1,N701,H1,ACTTGA


In [None]:
# Run-wide settings shared by the demultiplexing cells.
# Barcode and UMI lengths in bases; read 1 is sliced as
# [barcode][UMI][remaining sequence] using these values.
meta = {'len_barcode': 6, 'len_umi': 10}
meta['path_raw'] = r'./data/fastq/raw/'
# Keep only gzipped FASTQ files: the demultiplexing cells strip the trailing
# 'fastq.gz' (8 characters) from these names, and any stray file in the raw
# directory would otherwise turn into a bogus pool key. Sorted so the
# processing order is reproducible.
meta['raw_fastq'] = sorted(x for x in os.listdir(meta['path_raw']) if x.endswith('fastq.gz'))
# Timestamped output directory, one per run (minute resolution).
meta['path_output'] = f"output_{datetime.now().strftime('%Y%m%d_%H%M')}"
# Pool key = first 6 characters of a raw file name; each pool gets its own
# sub-directory below the output directory.
meta['path_pool'] = {y: os.path.join(meta['path_output'], y) for y in sorted({x[:6] for x in meta['raw_fastq']})}

# Pool key -> value matched against the `pool` column of the barcode sheet.
meta['pool_fastq'] = {'ML1_S4': 'N701', 'ML2_S5': 'N702'}

In [None]:
def write_seq(fout, seq):
    """Write ``seq`` to the open handle ``fout`` in FASTQ format.

    Parameters
    ----------
    fout : writable handle passed straight through to ``SeqIO.write``.
    seq : the record(s) to write; passed straight through to ``SeqIO.write``.

    Returns
    -------
    bool
        Always ``True``.
    """
    SeqIO.write(seq, fout, format='fastq')
    return True

In [None]:
# Streaming demultiplexer: walk the R1 and R2 files of each pool in lockstep
# and copy every read pair whose R1 starts with one of the pool's barcodes
# into <pool>/<barcode>/ as uncompressed FASTQ.
for pool_key, pool_path in meta['path_pool'].items():

    # Barcode sequences listed for this pool in the barcode sheet.
    _tgt_seq = {x: list() for x in df_barcode[df_barcode.pool == meta['pool_fastq'][pool_key]].sequence}

    # First raw file whose name starts with the pool key and contains the
    # read tag ('R1' / 'R2') somewhere after position 0.
    fastq = {'R1': '', 'R2': ''}
    for _key in fastq:
        fastq[_key] = [x for x in meta['raw_fastq'] if x.startswith(pool_key) and x.find(_key) > 0][0]

    _fobj = dict()
    try:
        # One directory per barcode, each holding an R1 and an R2 output file.
        for _barcode in _tgt_seq:
            _path_barcode = os.path.join(pool_path, _barcode)
            Path(_path_barcode).mkdir(parents=True, exist_ok=True)
            _fobj[_barcode] = dict()
            for _key, _val in fastq.items():
                # _val[:-8] drops the trailing 'fastq.gz' before 'fastq' is re-appended.
                _fobj[_barcode][_key] = open(os.path.join(_path_barcode, f"{_val[:-8]}fastq"), 'wt')

        with gzip.open(os.path.join(meta['path_raw'], fastq['R1']), 'rt') as fin_r1, \
                gzip.open(os.path.join(meta['path_raw'], fastq['R2']), 'rt') as fin_r2:
            _read1 = SeqIO.parse(fin_r1, "fastq")
            _read2 = SeqIO.parse(fin_r2, "fastq")
            iter_count = 0  # number of read pairs inspected for this pool
            # zip() stops cleanly when either file is exhausted; the previous
            # `while True` / `except` loop never left the loop at end-of-file.
            # Parse errors now propagate instead of being printed and ignored.
            for _r1_seq, _r2_seq in zip(_read1, _read2):
                _barcode = str(_r1_seq.seq[:meta['len_barcode']])
                if _barcode in _fobj:
                    write_seq(_fobj[_barcode]['R1'], _r1_seq)
                    write_seq(_fobj[_barcode]['R2'], _r2_seq)
                iter_count += 1
    finally:
        # Close every output handle exactly once, also when an error occurred.
        for _filer1r2 in _fobj.values():
            for _file in _filer1r2.values():
                _file.close()

In [None]:
# Scratch cell: in IPython `_` evaluates to the most recent cell output —
# presumably a leftover from interactive inspection; confirm before removing.
_

In [None]:
# Scratch/debug cell: rebuilds a SeqRecord from seq[0] (indexed as
# [id, description, sequence, phred]) and writes it as FASTQ.
# NOTE(review): `seq` is not defined in any visible cell — presumably one
# element of `_tgt_seq[<barcode>]` as built by the in-memory loop; confirm.
# NOTE(review): the streaming cell stores a {'R1': handle, 'R2': handle} dict
# under each barcode in `_fobj` and closes those handles when it finishes, so
# `_fobj['GAGTGG']` does not look like a writable handle here — TODO confirm.
SeqIO.write(SeqRecord(Seq(seq[0][2]),seq[0][0],seq[0][0],seq[0][1],letter_annotations={'phred_quality':seq[0][3]}), _fobj['GAGTGG'], 'fastq')

In [None]:
# In-memory demultiplexer: collect every matching read pair per barcode first,
# then write each barcode's reads as BGZF-compressed FASTQ under
# <pool>/<barcode>/.

def _as_record(_fields):
    """Rebuild a SeqRecord from a stored [id, description, sequence, phred] entry."""
    _rec_id, _rec_desc, _rec_bases, _rec_phred = _fields
    # The read id is used for both the record id and the record name.
    return SeqRecord(Seq(_rec_bases), _rec_id, _rec_id, _rec_desc,
                     letter_annotations={'phred_quality': _rec_phred})


for pool_key, pool_path in meta['path_pool'].items():

    # Barcode sequence -> list of [R1 fields, R2 fields] pairs.
    _tgt_seq = {x: list() for x in df_barcode[df_barcode.pool == meta['pool_fastq'][pool_key]].sequence}

    # First raw file whose name starts with the pool key and contains the
    # read tag ('R1' / 'R2') somewhere after position 0.
    fastq = {'R1': '', 'R2': ''}
    for _key in fastq:
        fastq[_key] = [x for x in meta['raw_fastq'] if x.startswith(pool_key) and x.find(_key) > 0][0]

    with gzip.open(os.path.join(meta['path_raw'], fastq['R1']), 'rt') as fin_r1, \
            gzip.open(os.path.join(meta['path_raw'], fastq['R2']), 'rt') as fin_r2:
        _read1 = SeqIO.parse(fin_r1, "fastq")
        _read2 = SeqIO.parse(fin_r2, "fastq")
        iter_count = 0  # number of read pairs inspected for this pool
        # zip() ends at end-of-file; a parse/decompression error now
        # propagates instead of being printed and treated like a normal end,
        # so truncated input no longer yields silently truncated output.
        for _r1_seq, _r2_seq in zip(_read1, _read2):
            _barcode = str(_r1_seq.seq[:meta['len_barcode']])
            if _barcode in _tgt_seq:
                # Store plain fields instead of the SeqRecord objects.
                _tgt_seq[_barcode].append([
                    [_r1_seq.id, _r1_seq.description, str(_r1_seq.seq), np.array(_r1_seq.letter_annotations['phred_quality'], dtype=int)],
                    [_r2_seq.id, _r2_seq.description, str(_r2_seq.seq), np.array(_r2_seq.letter_annotations['phred_quality'], dtype=int)],
                ])
            iter_count += 1
    print(f"loading finished: {pool_key}({datetime.now().strftime('%Y%m%d %H:%M')})")

    for _key, _pairs in _tgt_seq.items():
        _path_barcode = os.path.join(pool_path, _key)
        Path(_path_barcode).mkdir(parents=True, exist_ok=True)
        # [:-8] drops the trailing 'fastq.gz' before 'fastq.bgz' is appended.
        with bgzf.BgzfWriter(os.path.join(_path_barcode, f"{fastq['R1'][:-8]}fastq.bgz"), 'wb') as fout_r1, \
                bgzf.BgzfWriter(os.path.join(_path_barcode, f"{fastq['R2'][:-8]}fastq.bgz"), 'wb') as fout_r2:
            for _r1_fields, _r2_fields in _pairs:
                SeqIO.write(_as_record(_r1_fields), fout_r1, 'fastq')
                SeqIO.write(_as_record(_r2_fields), fout_r2, 'fastq')

    print(f"writing finished: {pool_key}({datetime.now().strftime('%Y%m%d %H:%M')})")
    # Drop the per-pool read store before the next pool is loaded.
    _tgt_seq = None
    gc.collect()

In [None]:
# Following Martin's advice (10 May 2021): instead of splitting reads by
# barcode, write one merged record per read pair. The barcode and UMI are cut
# from the start of R1 and moved into the record description; the rest of R1
# is concatenated with R2 (sequence and phred scores alike).

for pool_key, pool_path in meta['path_pool'].items():

    # First raw file whose name starts with the pool key and contains the
    # read tag ('R1' / 'R2') somewhere after position 0.
    fastq = {'R1': '', 'R2': ''}
    for _key in fastq:
        fastq[_key] = [x for x in meta['raw_fastq'] if x.startswith(pool_key) and x.find(_key) > 0][0]

    # R1 layout: [barcode][UMI][remaining sequence]
    _len_barcode = meta['len_barcode']
    _len_prefix = meta['len_barcode'] + meta['len_umi']

    # Load R1 file as Python dictionary: read id -> (sequence, phred scores)
    _r1 = dict()
    with gzip.open(os.path.join(meta['path_raw'], fastq['R1']), 'rt') as fin_r1:
        for _r1_seq in SeqIO.parse(fin_r1, 'fastq'):
            _r1[_r1_seq.id] = (str(_r1_seq.seq), np.array(_r1_seq.letter_annotations['phred_quality'], dtype=int))

    # Read R2 file record by record and write the merged reads to FASTQ
    Path(pool_path).mkdir(parents=True, exist_ok=True)
    # NOTE(review): `_key` is whatever the file-lookup loop above left behind
    # (its last key, 'R2'), so that tag ends up in the output file name. This
    # looks unintended, but the name is kept unchanged because later steps may
    # rely on it — confirm before renaming.
    _out_name = f"{fastq['R1'][:-8]}{_key}.fastq.bgz".replace('R1', 'R12')
    with gzip.open(os.path.join(meta['path_raw'], fastq['R2']), 'rt') as fin_r2, \
            bgzf.BgzfWriter(os.path.join(pool_path, _out_name), 'wb') as fout:
        # A plain for-loop ends at end-of-file; parse errors propagate
        # instead of being printed and treated like a normal end.
        for _r2_seq in SeqIO.parse(fin_r2, "fastq"):
            try:
                # Subscript (not .get) so a missing mate raises KeyError and
                # the message below names the offending read id.
                _r1_bases, _r1_phred = _r1[_r2_seq.id]
            except KeyError as ex:
                print(f'{ex} is not exist!')
                break

            _barcode = _r1_bases[:_len_barcode]
            _umi = _r1_bases[_len_barcode:_len_prefix]
            _seq = _r1_bases[_len_prefix:] + str(_r2_seq.seq)

            _r2_phred = np.array(_r2_seq.letter_annotations['phred_quality'], dtype=int)
            _phred = {'phred_quality': np.concatenate((_r1_phred[_len_prefix:], _r2_phred))}

            # id = R2 read id, empty name, barcode/UMI carried in the description
            SeqIO.write(SeqRecord(Seq(_seq), _r2_seq.id, '', f'BARCODE:{_barcode} UMI:{_umi}', letter_annotations=_phred), fout, 'fastq')

    # Drop the R1 lookup table before the next pool is loaded.
    _r1 = None
    gc.collect()

In [None]:
# for pool_key, pool_path in meta['path_pool'].items():
    
#     _tgt_seq = {x: list() for x in df_barcode[df_barcode.pool == meta['pool_fastq'][pool_key]].sequence}
#     fastq = {'R1':'', 'R2':''}
    
#     for _key in fastq.keys():
#         fastq[_key] = [x for x in meta['raw_fastq'] if x.startswith(pool_key) and x.find(_key) > 0][0]
    
#     with gzip.open(os.path.join(meta['path_raw'], fastq['R1']), 'rt') as fin_r1:
#         with gzip.open(os.path.join(meta['path_raw'], fastq['R2']), 'rt') as fin_r2:
#             _read1 = SeqIO.parse(fin_r1, "fastq")
#             _read2 = SeqIO.parse(fin_r2, "fastq")
#             iter_count = 0
#             while True:
#                 try:
#                     _r1_seq = next(_read1)
#                     _r2_seq = next(_read2)
#                     _barcode = str(_r1_seq.seq[:meta['len_barcode']])
#                     _umi = str(_r1_seq.seq[meta['len_barcode']:meta['len_barcode']+ meta['len_umi']])
#                     if _barcode in _tgt_seq.keys():
#                         _seq = str(_r1_seq.seq)[meta['len_barcode']+meta['len_umi']:]+str(_r2_seq.seq)
#                         _tgt_seq[_barcode].append([_r1_seq.id,_umi,_seq])
#                     iter_count += 1
                    
#                 except:
#                     break
                    
#     Path(pool_path).mkdir(parents=True,exist_ok=True)
#     for _key, _seqs in _tgt_seq.items():
#         with open(os.path.join(pool_path, f"{fastq['R1'][:-8]}{_key}.fasta".replace('R1','R12')),'wt') as fout:
#             for _seq in _seqs:
#                 SeqIO.write(SeqRecord(Seq(_seq[2]),_seq[0],'',f'BARCODE:{_key} UMI:{_seq[1]}'), fout, 'fasta')
    
#     _tgt_seq = None
#     gc.collect()
    