# Disambiguate (using cvbio)

Result plots after separate alignment with bwa-mem to the human, Malayan pangolin and African green monkey genomes; the best mapping is then calculated using cvbio Disambiguate.


In [None]:
import re
import numpy as np
import time
import os.path
import sys
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
from io import StringIO
from Bio import SeqIO
import pysam
from pandas.plotting import scatter_matrix
import warnings
import json
import numpy as np
import subprocess
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Third-from-last component of the interpreter path; as the last expression of
# the cell its value is what the notebook displays (presumably the name of the
# active environment -- verify against the local install layout).
sys.executable.split('/')[-3]

In [None]:
# Widen the notebook's main container to 95% of the browser window.
# display/HTML come from the public IPython.display module: importing display
# from IPython.core.display is deprecated in recent IPython releases.
from IPython.display import HTML, display
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
def run_flagtools(bam_file, stats_file):
    """Run ``samtools flagstat`` on a BAM file and save the report to a file.

    The executable is ``{SAMTOOLS_PATH}samtools``, where ``SAMTOOLS_PATH`` is
    the module-level prefix (an empty prefix means ``samtools`` is looked up
    on ``PATH``).

    Args:
        bam_file: Path of the BAM file to summarise.
        stats_file: Path of the text file that receives the flagstat report;
            it is created or overwritten.

    Raises:
        RuntimeError: If samtools exits with a non-zero status (the message
            carries the exit code and samtools' stderr), or if the command
            cannot be started / the report file cannot be opened.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; stdout is redirected by handing over the file.
    cmd = [f"{SAMTOOLS_PATH}samtools", "flagstat", str(bam_file)]
    try:
        with open(stats_file, "w") as report:
            subprocess.run(
                cmd,
                stdout=report,
                stderr=subprocess.PIPE,  # captured so it can be reported on failure
                check=True,
                universal_newlines=True,
            )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            "command '{}' return with error (code {}): {}".format(
                ' '.join(cmd), e.returncode, e.stderr
            )
        ) from e
    except OSError as e:
        raise RuntimeError(
            "command '{}' could not be run: {}".format(' '.join(cmd), e)
        ) from e

In [None]:
def write_df(stats, aln_names, aln_name, aln_type):
    """Tabulate per-SRA flagstat counts and save them as a CSV file.

    Args:
        stats: One ``[mapped, properly_paired, aln_ref]`` row per entry of the
            module-level ``SRAs`` list, in the same order.
        aln_names: All reference names of the run; joined with ``_`` into the
            file name.
        aln_name: The reference name these rows belong to.
        aln_type: Alignment type tag used in the file name.

    Returns:
        Tuple ``(table, csv_name)``: the DataFrame (columns ``SRA``,
        ``mapped``, ``properly paired``, ``aln_ref``) and the name of the CSV
        written to ``{BASE_PATH}/general_plots/``.
    """
    table = pd.DataFrame(stats)
    table.columns = ['mapped', 'properly paired', 'aln_ref']
    table.insert(0, 'SRA', SRAs)

    name_parts = ('disambiguate', aln_name, aln_type, '_'.join(aln_names), 'basic_stats.csv')
    csv_name = '_'.join(name_parts)
    # The index column is written on purpose (default to_csv behaviour).
    table.to_csv(f'{BASE_PATH}/general_plots/{csv_name}')
    return table, csv_name

In [None]:
def get_flagtools_stats(stats_file):
    """Extract the mapped and properly-paired read counts from a flagstat report.

    Rows are located by their label, not by position, so reports with and
    without the additional ``primary ...`` rows are both parsed correctly.
    The ``primary mapped (`` row is ignored; the count is the first
    whitespace-separated field of the matching row.

    Args:
        stats_file: Path of a text file holding ``samtools flagstat`` output.

    Returns:
        Tuple ``(mapped, properly_paired)`` of ints.

    Raises:
        ValueError: If the report has no ``mapped (`` or no
            ``properly paired (`` row.
    """
    mapped = None
    properly_paired = None
    with open(stats_file) as report:
        for raw_line in report:
            line = raw_line.rstrip()
            if 'mapped (' in line:
                if 'primary mapped (' not in line:
                    mapped = int(line.split()[0])
            elif 'properly paired (' in line:
                # Take the count from the matched row itself; a fixed line
                # index picks a different row when the report layout differs.
                properly_paired = int(line.split()[0])
    if mapped is None or properly_paired is None:
        raise ValueError(
            f"'{stats_file}' is not a complete flagstat report: "
            "missing 'mapped (' or 'properly paired (' row"
        )
    return mapped, properly_paired

In [None]:
def plot_flagtools(df, fig_name, y_scale='log', save_fig=True):
    """Draw a bar chart of the ``mapped`` column against ``SRA``.

    Args:
        df: DataFrame with at least the columns ``SRA`` and ``mapped``.
        fig_name: Name used for the title and the output file; everything
            from the first ``.txt`` onwards is replaced by ``.png``.
        y_scale: ``'log'`` switches the y axis to a log scale; any other
            value leaves the default scale.
        save_fig: When true, the figure is saved under
            ``{BASE_PATH}/general_plots/``.
    """
    stem, _sep, _rest = fig_name.partition('.txt')
    png_name = stem + '.png'

    # Sets the default figure size globally before the plot is created.
    plt.rcParams["figure.figsize"] = (12, 6)
    df.plot(x="SRA", y=['mapped'], kind="bar", color='royalblue')
    plt.title(png_name)

    if y_scale == 'log':
        plt.yscale('log')
    if not save_fig:
        return
    plt.savefig(f'{BASE_PATH}/general_plots/{png_name}', bbox_inches='tight')

In [None]:
def plot_flagtools_stackplots(frames, frames_ambig, file_name, plot_type, y_scale='log', save_fig=False):
    """Draw a stacked bar chart of mapped reads per SRA, split by reference.

    The stack order (bottom to top) is human, vero, manjav, then the
    ambiguous reads taken from ``frames_ambig[2]`` only.

    Args:
        frames: Three DataFrames (columns ``SRA``, ``mapped``, ``aln_ref``)
            in the order human, vero, manjav.
        frames_ambig: Ambiguous-read DataFrames in the same layout; only
            index 2 (``aln_ref`` ``'manjav'``) is plotted.
        file_name: Output file name, used only when ``save_fig`` is true.
        plot_type: ``'percent'`` rescales each layer to a percentage of the
            four-layer total per SRA; any other value plots raw counts.
        y_scale: ``'log'`` switches the y axis to a log scale.
        save_fig: When true, the figure is saved under
            ``{BASE_PATH}/general_plots/``.

    Raises:
        ValueError: If a frame is not for the expected reference, or the
            frames do not list the same SRAs in the same order.
    """
    human_df, vero_df, manjav_df = frames[0], frames[1], frames[2]
    ambig_df = frames_ambig[2]

    # Validate up front with real exceptions: asserts vanish under ``-O``,
    # and mismatched inputs would otherwise yield a silently wrong chart.
    checks = (
        ('frames[0]', human_df, 'human'),
        ('frames[1]', vero_df, 'vero'),
        ('frames[2]', manjav_df, 'manjav'),
        ('frames_ambig[2]', ambig_df, 'manjav'),
    )
    sra_order = human_df['SRA'].tolist()
    for label, frame, expected_ref in checks:
        found_ref = frame['aln_ref'][0]
        if found_ref != expected_ref:
            raise ValueError(
                f"{label} must hold '{expected_ref}' alignments, found '{found_ref}'"
            )
        if frame['SRA'].tolist() != sra_order:
            raise ValueError(f"{label} lists different SRAs than frames[0]")

    #SRA 	mapped 	properly paired 	aln_ref
    x1, y1 = human_df['SRA'], human_df['mapped']
    x2, y2 = vero_df['SRA'], vero_df['mapped']
    x3, y3 = manjav_df['SRA'], manjav_df['mapped']
    xa, ya = ambig_df['SRA'], ambig_df['mapped']

    if plot_type == 'percent':
        total = y1 + y2 + y3 + ya
        y1 = (y1 / total) * 100
        y2 = (y2 / total) * 100
        y3 = (y3 / total) * 100
        ya = (ya / total) * 100

    plt.rcParams["figure.figsize"] = (5, 6)
    # Each layer sits on top of the sum of the layers below it.
    plt.bar(x1, y1, color='#1D2F6F')
    plt.bar(x2, y2, bottom=y1, color='#6EAF46')
    plt.bar(x3, y3, bottom=y1 + y2, color='#FAC748')
    plt.bar(xa, ya, bottom=y1 + y2 + y3, color='#8390FA')
    if plot_type == 'percent':
        plt.ylabel('percent')
    else:
        plt.ylabel('counts')

    # Legend entries follow the order in which the layers were drawn.
    legend_order = ['human', 'vero', 'manis javanica', 'ambiguous']
    plt.legend(legend_order, loc=(1.05, 0.7))
    plt.tick_params(axis='x', rotation=90)
    plt.tight_layout()
    if y_scale == 'log':
        plt.yscale('log')
    if save_fig:
        plt.savefig(f'{BASE_PATH}/general_plots/{file_name}', bbox_inches='tight')

In [None]:
def stats_worflow(data_path, alignment_names, raw_aln_names, aln_type, cvbio_folder='cvbio'):
    """Run flagstat on every disambiguated/ambiguous BAM and write summary CSVs.

    For each ``(alignment_name, raw_aln_name)`` pair and each accession in the
    module-level ``SRAs`` list, ``samtools flagstat`` is run on
    ``{data_path}{sra}/{cvbio_folder}/disambiguated.{alignment_name}.bam`` and
    on the matching ``*.ambiguous.bam`` in its ``ambiguous-alignments/``
    subfolder. The mapped / properly-paired counts are then written with
    ``write_df``: one CSV keyed by ``alignment_names`` for the disambiguated
    BAMs and one keyed by ``raw_aln_names`` for the ambiguous BAMs.

    Args:
        data_path: Directory prefix (with trailing slash) holding one folder
            per SRA.
        alignment_names: Short reference names used in the disambiguated BAM
            file names.
        raw_aln_names: Reference file stems used in the ambiguous BAM file
            names; paired positionally with ``alignment_names``.
        aln_type: ``'reads'`` or ``'MEGAHITfinal'``; selects the ambiguous BAM
            naming scheme.
        cvbio_folder: Name of the per-SRA subfolder with the cvbio output.

    Raises:
        ValueError: If ``aln_type`` is not one of the supported values.
    """
    # aln_type -> (tag in the ambiguous BAM name, suffix after the reference stem)
    ambig_naming = {
        'reads': ('reads_fastp', '.fna_default_bwamem_gatk_sorted_marked.ambiguous.bam'),
        'MEGAHITfinal': ('megahit_final', '.fna_bwamem_gatk_sorted_marked.ambiguous.bam'),
    }
    # Fail before any samtools run rather than part-way through the first SRA.
    if aln_type not in ambig_naming:
        raise ValueError(
            f"unsupported aln_type '{aln_type}'; expected one of {sorted(ambig_naming)}"
        )
    aln_type_name, ambig_suffix = ambig_naming[aln_type]

    for alignment_name, raw_aln_name in zip(alignment_names, raw_aln_names):
        stats_l = []
        stats_ambig = []
        for sra in SRAs:
            bam_path = data_path + f'{sra}/{cvbio_folder}/'
            bam_f = f'disambiguated.{alignment_name}.bam'
            stats_file = bam_f.split('.bam')[0] + '_flagstats.txt'
            run_flagtools(bam_path + bam_f, bam_path + stats_file)
            mapped, properly_paired = get_flagtools_stats(bam_path + stats_file)
            stats_l.append([mapped, properly_paired, alignment_name])

            #ambiguous bams in subfolder
            ambig_path = bam_path + 'ambiguous-alignments/'
            ambig_f = f'{sra}_{aln_type_name}_{raw_aln_name}{ambig_suffix}'
            ambig_stats_f = ambig_f.split('.bam')[0] + '_flagstats.txt'
            run_flagtools(ambig_path + ambig_f, ambig_path + ambig_stats_f)
            ambig_mapped, ambig_properly_paired = get_flagtools_stats(ambig_path + ambig_stats_f)
            stats_ambig.append([ambig_mapped, ambig_properly_paired, alignment_name])

        # Only the CSV side effect is needed here; the returned frames are unused.
        write_df(stats_l, alignment_names, alignment_name, aln_type)
        write_df(stats_ambig, raw_aln_names, raw_aln_name, aln_type)

In [None]:
def plot_workflow(data_path, alignment_names, raw_aln_names, aln_type, dummy_sra='CRR477154', cvbio_folder='cvbio'):
    """Load the per-reference statistics CSVs from ``{BASE_PATH}/general_plots/``.

    File names follow the pattern
    ``disambiguate_{name}_{aln_type}_{all names joined by '_'}_basic_stats.csv``,
    built once from ``alignment_names`` (disambiguated counts) and once from
    ``raw_aln_names`` (ambiguous counts). The ``Unnamed: 0`` index column
    stored in each CSV is dropped.

    Args:
        data_path: Unused; files are read from the module-level ``BASE_PATH``.
        alignment_names: Short reference names of the disambiguated tables.
        raw_aln_names: Reference file stems of the ambiguous tables; paired
            positionally with ``alignment_names``.
        aln_type: Alignment type tag used in the file names.
        dummy_sra: Unused; kept for backward compatibility.
        cvbio_folder: Unused; kept for backward compatibility.

    Returns:
        Tuple ``(frames, frames_ambig)``: two lists of DataFrames in the order
        of ``alignment_names`` / ``raw_aln_names``.
    """
    plots_dir = f'{BASE_PATH}/general_plots'
    # The joined name lists are the same for every file, so build them once.
    aln_all = '_'.join(alignment_names)
    aln_all_a = '_'.join(raw_aln_names)

    frames = []
    frames_ambig = []
    for alignment, raw_aln_name in zip(alignment_names, raw_aln_names):
        out_name = 'disambiguate_' + alignment + '_' + aln_type + '_' + aln_all + '_basic_stats.csv'
        out_name_a = 'disambiguate_' + raw_aln_name + '_' + aln_type + '_' + aln_all_a + '_basic_stats.csv'

        # drop() returns a new frame, so no extra copy is needed.
        frames.append(pd.read_csv(f'{plots_dir}/{out_name}').drop(columns=['Unnamed: 0']))
        frames_ambig.append(pd.read_csv(f'{plots_dir}/{out_name_a}').drop(columns=['Unnamed: 0']))
    return frames, frames_ambig



In [None]:
def get_pct(y_arr, tot):
    """Return every value of ``y_arr`` as a percentage of ``tot``.

    Args:
        y_arr: Array-like object exposing ``tolist()``.
        tot: The total that corresponds to 100 percent.

    Returns:
        A plain list with ``(value / tot) * 100`` for each value, in order.
    """
    return [(value / tot) * 100 for value in y_arr.tolist()]

In [None]:
def calc_pct(frames, frames_ambig):
    """Express mapped-read counts per SRA as percentages under three totals.

    Args:
        frames: DataFrames for human, vero and manjav (in that order), each
            with ``SRA`` and ``mapped`` columns; rows are addressed by the
            labels 0..n-1.
        frames_ambig: Ambiguous-read DataFrames in the same order and layout.

    Returns:
        Tuple ``(df_mj_ambig, df_all_ambig, df_no_ambig)``:
        * ``df_mj_ambig`` -- shares of human + vero + manjav + manjav-ambiguous
          (columns ``SRA, human, vero, manjav, mj ambiguous``);
        * ``df_all_ambig`` -- shares of that total plus the human and vero
          ambiguous reads (last column ``all ambiguous``);
        * ``df_no_ambig`` -- shares of human + vero + manjav only.
    """
    sra_ids = frames[0]['SRA']
    human, vero, manjav = (frames[k]['mapped'] for k in range(3))
    human_ambig, vero_ambig, manjav_ambig = (frames_ambig[k]['mapped'] for k in range(3))

    rows_no_ambig = []
    rows_mj_ambig = []
    rows_all_ambig = []
    for i, sra in enumerate(sra_ids):
        hu, ve, mj = human[i], vero[i], manjav[i]
        mj_amb = manjav_ambig[i]

        # Denominator 1: uniquely assigned reads only.
        total_no_ambig = hu + ve + mj
        rows_no_ambig.append((
            sra,
            (hu / total_no_ambig) * 100,
            (ve / total_no_ambig) * 100,
            (mj / total_no_ambig) * 100,
        ))

        # Denominator 2: unique reads plus the manjav ambiguous reads.
        total_mj_ambig = hu + ve + mj + mj_amb
        rows_mj_ambig.append((
            sra,
            (hu / total_mj_ambig) * 100,
            (ve / total_mj_ambig) * 100,
            (mj / total_mj_ambig) * 100,
            (mj_amb / total_mj_ambig) * 100,
        ))

        # Denominator 3: additionally the human and vero ambiguous reads.
        total_all_ambig = total_mj_ambig + human_ambig[i] + vero_ambig[i]
        just_ambig = mj_amb + human_ambig[i] + vero_ambig[i]
        rows_all_ambig.append((
            sra,
            (hu / total_all_ambig) * 100,
            (ve / total_all_ambig) * 100,
            (mj / total_all_ambig) * 100,
            (just_ambig / total_all_ambig) * 100,
        ))

    df_no_ambig = pd.DataFrame(rows_no_ambig, columns=['SRA', 'human', 'vero', 'manjav'])
    df_mj_ambig = pd.DataFrame(rows_mj_ambig,
                               columns=['SRA', 'human', 'vero', 'manjav', 'mj ambiguous'])
    df_all_ambig = pd.DataFrame(rows_all_ambig,
                                columns=['SRA', 'human', 'vero', 'manjav', 'all ambiguous'])
    return df_mj_ambig, df_all_ambig, df_no_ambig

In [None]:
# Project accession; names the project directory under the assembly data root.
PROJECT_CODE='PRJCA002517'
# Project root. The functions in this notebook read and write their CSVs and
# figures in its general_plots/ subfolder.
BASE_PATH = f'/mnt/8TB_2/Data/Assembly/{PROJECT_CODE}'

# Reference genome directory and FASTA file names (presumably Malayan pangolin,
# human and Vero / African green monkey, going by the names -- confirm).
# None of these, nor ASSEMBLY, is referenced by the code visible in this section.
REF_BASE_PATH='/mnt/1TB_0/Data/Code/code/PRJNA901878/'
REFERENCE_MJ='YNU_ManJav_2.0.fna'
REFERENCE_H='GRCh38.p13.fna'
REFERENCE_V='Vero_WHO_p1.0.fna'
ASSEMBLY='bwamem'

In [None]:
# Run accessions to process. stats_worflow expects one <accession>/ folder per
# entry under its data path, and write_df uses this list as the SRA column.
SRAs=['CRR477154', 'CRR477155','CRR477156', 'CRR477157']


In [None]:
# Short reference names (used in the disambiguated BAM and CSV file names) and
# the matching reference file stems (used in the ambiguous BAM file names).
# The two lists are paired positionally.
alignment_names=['human','vero','manjav']
raw_aln_names=['GRCh38.p13','Vero_WHO_p1.0','YNU_ManJav_2.0']
# 'reads' or 'MEGAHITfinal'; selects the ambiguous BAM naming scheme in stats_worflow.
aln_type='reads'
# Both point at the general_plots folder; plot_path is not used by the code
# visible in this section.
data_path=f'{BASE_PATH}/general_plots/'
plot_path=f'{BASE_PATH}/general_plots/'
# Prefix prepended to the 'samtools' executable name in run_flagtools; empty
# means the executable is resolved by the normal command lookup.
SAMTOOLS_PATH=''

In [None]:
# Run samtools flagstat on every disambiguated and ambiguous BAM and write the
# per-reference summary CSVs.
# Signature: stats_worflow(data_path, alignment_names, raw_aln_names, aln_type, cvbio_folder='cvbio')
stats_worflow(BASE_PATH+'/', alignment_names, raw_aln_names, aln_type)

In [None]:
# Reload the summary CSVs: one DataFrame per reference for the disambiguated
# counts (frames) and one per reference for the ambiguous counts (frames_ambig).
frames, frames_ambig=plot_workflow(data_path, alignment_names, raw_aln_names, aln_type)

In [None]:
# Percentage tables under three denominators: with manjav-ambiguous reads, with
# all ambiguous reads, and with uniquely assigned reads only.
df_mj_ambig, df_all_ambig, df_no_ambig= calc_pct(frames, frames_ambig)

In [None]:
# Preview the table whose total includes the ambiguous reads of all three references.
df_all_ambig.head()

In [None]:
# Preview the table whose total includes only the manjav ambiguous reads.
df_mj_ambig.head()

In [None]:
# File name for exporting the df_mj_ambig percentage table.
csv_name='disambiguated_'+aln_type+'_'+'_'.join(alignment_names)+'_bwamem_stats_df.csv'
# Export is currently disabled (kept for reference):
#df_mj_ambig.to_csv(f'{data_path}/{csv_name}', index=False)

In [None]:
# File name for exporting the df_no_ambig percentage table (overwrites the
# csv_name set in the previous cell).
csv_name='disambiguated_'+aln_type+'_'+'_'.join(alignment_names)+'_no_ambig_bwamem_stats_df.csv'
# Export is currently disabled (kept for reference):
#df_no_ambig.to_csv(f'{data_path}/{csv_name}', index=False)

In [None]:
# Stacked bar chart of each reference's share of mapped reads per SRA.
plot_type='percent'
# The f-prefix belongs on the piece that contains {plot_type}; previously it sat
# on the constant 'disambiguated_' piece, so a literal "{plot_type}" ended up in
# the file name. A '_' separator is added so the name matches the other outputs.
plot_file_name='disambiguated_'+aln_type+'_'+'_'.join(alignment_names)+f'_{plot_type}_bwamem_stats.png'
# y_scale='' keeps a linear y axis; the figure is displayed but not saved.
plot_flagtools_stackplots(frames, frames_ambig, plot_file_name, plot_type, y_scale='', save_fig=False)