# In [2]:
import dnaio
import gzip
import edlib
import pysam
import pyfastx
import os
import collections
import pandas as pd
import numpy as py
import multiprocessing as mp

# FASTA file holding the gRNA reference sequences (read with pysam.FastxFile).
grna_fa = 'grna.fa'
# Upper bound handed to edlib as `k` when searching a read for a gRNA.
# NOTE: edlib measures edit distance, so this budget covers insertions and
# deletions as well as substitutions, despite the "mismatch" in the name.
grna_mismatch_allowed = 1
# Directory prefix that is joined onto every entry of `reads`.
read_dir = ''
# Path of the CSV summary written at the end of the script.
out_file = 'library_map.csv'


def reverse_complement(sequence):
    """Return the reverse complement of an uppercase DNA string.

    Only the symbols ``A``, ``C``, ``G``, ``T`` and ``N`` are recognised.
    Any other character (lowercase bases included) raises ``KeyError``.
    """
    pairing = dict(zip('ACGTN', 'TGCAN'))
    # Walk the input back to front and swap each base for its partner.
    flipped = [pairing[nucleotide] for nucleotide in reversed(sequence)]
    return ''.join(flipped)


# Lookup table from gRNA sequence to gRNA name. Every guide is registered in
# both orientations so that a read from either strand can be matched.
grna_dict = {}
with pysam.FastxFile(grna_fa) as grna_records:
    for record in grna_records:
        guide_name, forward = record.name, record.sequence
        grna_dict[forward] = guide_name
        grna_dict[reverse_complement(forward)] = guide_name


def get_grna_from_merged_reads(seq, grna_dict):
    """Return the name of the gRNA that best matches somewhere inside ``seq``.

    Each key of ``grna_dict`` is aligned against ``seq`` with edlib in infix
    ("HW") mode, so the gRNA may sit anywhere in the read. A gRNA is a
    candidate when its edit distance is at most the module-level
    ``grna_mismatch_allowed``.

    The candidate with the smallest edit distance wins; ties go to the gRNA
    that comes first in ``grna_dict``. Previously the first candidate within
    the threshold was returned, so a read that matched one guide exactly
    could be assigned to an earlier guide that was merely one edit away.

    Args:
        seq: Read sequence to search.
        grna_dict: Mapping of gRNA sequence -> gRNA name.

    Returns:
        The name of the best-matching gRNA, or ``None`` when no gRNA is
        within the allowed edit distance.
    """
    best_name = None
    max_edits = grna_mismatch_allowed
    for grna, name in grna_dict.items():
        align = edlib.align(
            grna,
            seq,
            mode="HW",
            task="distance",
            k=max_edits)
        distance = align['editDistance']
        # edlib reports -1 when the distance exceeds k.
        if distance == -1:
            continue
        best_name = name
        if distance == 0:
            # An exact hit cannot be beaten; stop scanning.
            break
        # Later gRNAs must be strictly closer to replace this one, which
        # also keeps first-seen priority on ties.
        max_edits = distance - 1
    return best_name


def get_grna_cb_map(reads_path):
    """Group the gRNAs detected in one read file by cell barcode.

    Every record of ``reads_path`` is searched for a gRNA from the
    module-level ``grna_dict``. Reads without a hit are skipped.

    Args:
        reads_path: Path of a read file that ``pyfastx.Fastx`` can parse into
            ``(name, sequence, quality)`` records.

    Returns:
        A dict mapping barcode -> list of gRNA names, one list entry per
        matching read, in file order.
    """
    dict_res = {}
    for name, seq, _qual in pyfastx.Fastx(reads_path):
        grna = get_grna_from_merged_reads(seq, grna_dict)
        if not grna:
            continue
        # The barcode is taken as the last ':'-separated field of the read
        # name. NOTE(review): presumably written there by an upstream
        # barcode-extraction step -- confirm against the read naming scheme.
        cb = name.split(':')[-1]
        dict_res.setdefault(cb, []).append(grna)
    return dict_res


# NOTE(review): `reads` is not assigned anywhere in the visible script; it is
# presumably a list of read file names defined in another notebook cell.
# Confirm before running this file standalone.
reads = [os.path.join(read_dir, x) for x in reads]

# Process each read file in its own worker. `pool.map` blocks until every
# file is done, so the pool can be shut down as soon as it returns; the
# context manager guarantees the workers are released even on error.
with mp.Pool(20) as pool:
    dict_res_all = pool.map(get_grna_cb_map, reads)


def collect_res(results):
    """Merge several ``{key: list}`` dicts into one.

    Lists that share a key are concatenated in the order the dicts appear
    in ``results``. The merged dict owns fresh lists, so neither the input
    dicts nor their lists are modified. Previously the first list seen for
    a key was reused and extended in place, which silently altered the
    caller's data.

    Args:
        results: Iterable of dicts mapping a key to a list of values.

    Returns:
        A new dict mapping every key to the concatenation of its lists.
    """
    dict_res = {}
    for res in results:
        for k, v in res.items():
            dict_res.setdefault(k, []).extend(v)
    return dict_res


# Fold the list of per-file maps produced by the pool into a single
# {cell barcode: [gRNA name, ...]} mapping.
dict_res_all = collect_res(dict_res_all)

# One summary row per cell barcode: the most frequent gRNA, its read count
# and share, the runner-up's count and share, and the ratio between the two.
rows = []
for barcode, hits in dict_res_all.items():
    total = len(hits)
    ranked = collections.Counter(hits).most_common(2)
    top_name, top_count = ranked[0]
    if len(ranked) > 1:
        second_count = ranked[1][1]
    else:
        # Only one gRNA was seen for this barcode. A 0.01 pseudo-count
        # stands in for the runner-up so the ratio below stays finite.
        second_count = 0.01
    rows.append([
        barcode,
        top_name,
        top_count,
        top_count / total,
        second_count,
        second_count / total,
        round(top_count / second_count, 2),
    ])

column_names = [
    'Cell Barcode',
    'gRna',
    'Firstly gRNA Num',
    'Firstly gRNA Ratio',
    'Secondly gRNA Num',
    'Secondly gRNA Ratio',
    'Firstly / Secondly',
]
res_df = pd.DataFrame(rows, columns=column_names)
res_df.to_csv(out_file, index=False)