# Exploratory notebook that removes C>T edited reads from a BAM file. Alex will check the expression profiles with and without C>T edited reads and see if UMAP differences are derived from edits or expression.

In [1]:
import pandas as pd
import numpy as np
import pysam 
import os
import re
import glob
from collections import defaultdict

In [2]:
input_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/inputs/'
output_dir = '/home/bay001/projects/kris_apobec_20200121/permanent_data2/07_scRNA_groups/outputs/'

In [3]:
def get_softclip(cigar):
    """
    Returns the number of bases to be softclipped on either left or right
    side. Or both. If not softclipped, return 0
    :param cigar: string
        BAM/SAM CIGAR string
    :return left: int
        number of softclipped reads at the beginning
    :return right: int
        number of softclipped reads at the end
    """
    softclip_regex = ur"(\d+)S"
    softclip = re.findall(softclip_regex,cigar)

    softclip_right_regex = ur"[\w\d]{1}(\d+)S" # if the softclip comes from the right side
    softclip_right = re.findall(softclip_right_regex,cigar)

    left = 0
    right = 0

    if softclip:
        if len(softclip) == 2: # softclipped on both sides
            left = int(softclip[0])
            right = int(softclip[1])
        elif len(softclip_right) == 1: # softclipped only on the RIGHT side
            right = int(softclip[0])
        else: # softclipped only on the LEFT side
            left = int(softclip[0])
    return left, right


def remove_softclipped_reads(left, right, read_seq):
    """
    Returns the read after removing softclipped bases.
    :param left: int
        left softclip
    :param right: int
        right softclip
    :param read_seq: string
        read sequence
    :return softclipped_read_sequence: string
    """
    if right == 0:
        return read_seq[left:]
    return read_seq[left:-right]


def ct_mismatches(read_seq, md, sense):
    """
    Given a read sequence, MD tag, and 'sense' (look for CT if sense,
    look for GA if antisense), return the number of CT/GA mismatches
    seen in the read.
    :param read_seq: string
    :param md: string
    :param sense: boolean
    :return nonAG: int
    """
    mismatches_regex = ur"(\d+)([ATCG])"
    mismatches = re.findall(mismatches_regex,md)
    ct_mm_counts = 0
    if mismatches:
        read_pos = 0
        for mismatch in mismatches:
            ref_allele = mismatch[1]
            read_pos += int(mismatch[0])

            read_allele = read_seq[read_pos]
            if((ref_allele == 'C' and read_allele == 'T' and sense == True) or
                   (ref_allele == 'G' and read_allele == 'A' and sense == False)):
                ct_mm_counts += 1
            read_pos += 1

    return ct_mm_counts

In [4]:
flags = defaultdict(int) # keep track of filtered reads
reverse_stranded = False # 10X is stranded
ct_threshold = 0 # remove any read with more than 0 C>T
counter = 0 # track read progress

# input_bam = os.path.join(input_dir, 'RBFOX2-TIA1-STAMP_round2E_read1_feature_FB_possorted_genome_bam_MD.bam')
input_bam = os.path.join(input_dir, 'HEK-NPC-APOBEC-STAMP_possorted_genome_bam_MD.bam')
output_bam = os.path.join(output_dir, 'HEK-NPC-APOBEC-STAMP_possorted_genome_bam_MD.noCT.bam')
i = pysam.AlignmentFile(input_bam)
o = pysam.AlignmentFile(output_bam, "wb", template=i)
for read in i:
    try:
        flag = 1  # start out as a 'good' read
        cigar = read.cigarstring
        if 'X' in cigar or '=' in cigar:
            warn_x = True
        read_seq = read.query_sequence
        read_name = read.query_name
        """
        Throw out unmapped reads
        """
        if read.is_unmapped:
            continue  # must be here, otherwise reads won't have CIGAR
        try:
            md = read.get_tag('MD')
        except KeyError:
            warn_mm = True
            md = ''
        """
        Remove
        """
        """
        Takes care of soft clipped bases
        (remove bases from the read_seq which are soft clipped
        to not interfere with mis-alignments downstream)
        """
        left_softclip, right_softclip = get_softclip(cigar)
        read_seq = remove_softclipped_reads(
            left_softclip, right_softclip, read_seq
        )
        
        """
        # Manually setting reversed reads to 'sense' strand per truseq
        library protocols (Default is truseq reverse stranded)
        """
        if reverse_stranded:
            sense = True if read.is_reverse == True else False
        else:
            sense = True if read.is_reverse == False else False
            
        """
        # 5c) If there exists indels, remove them.
        """
        if 'I' in cigar or 'D' in cigar:
            flag = 4

        elif read.is_secondary:
            flag = 5
        
        elif ct_mismatches(read_seq, md, sense) > ct_threshold:
            flag = 7
            
        flags[flag] += 1
        if flag == 1:
            o.write(read)
        
        if counter % 10000000 == 0:
            print("Parsed {} reads".format(counter))
        counter += 1
    except Exception as e:
        print(e)
        break
o.close()
i.close()

Parsed 0 reads
Parsed 10000000 reads
Parsed 20000000 reads
Parsed 30000000 reads
Parsed 40000000 reads
Parsed 50000000 reads
Parsed 60000000 reads
Parsed 70000000 reads
Parsed 80000000 reads
Parsed 90000000 reads
Parsed 100000000 reads
Parsed 110000000 reads
Parsed 120000000 reads
Parsed 130000000 reads
Parsed 140000000 reads
Parsed 150000000 reads
Parsed 160000000 reads
Parsed 170000000 reads
Parsed 180000000 reads
Parsed 190000000 reads
Parsed 200000000 reads
Parsed 210000000 reads
Parsed 220000000 reads
Parsed 230000000 reads
Parsed 240000000 reads
Parsed 250000000 reads
Parsed 260000000 reads
Parsed 270000000 reads
Parsed 280000000 reads
Parsed 290000000 reads
Parsed 300000000 reads
Parsed 310000000 reads
Parsed 320000000 reads
Parsed 330000000 reads
Parsed 340000000 reads
Parsed 350000000 reads
Parsed 360000000 reads
Parsed 370000000 reads
Parsed 380000000 reads
Parsed 390000000 reads
Parsed 400000000 reads
Parsed 410000000 reads
Parsed 420000000 reads
Parsed 430000000 reads
Parse

In [5]:
flags

defaultdict(int, {1: 415427692, 4: 6504555, 5: 60940143, 7: 8141740})

### Metrics: