In [None]:
import pathlib
import subprocess
import os
import pandas as pd

In [None]:
# Widen the notebook container to 95% of the browser window.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
import warnings
# Silence every warning for the rest of the session. Both calls install an
# "ignore" action that applies to all warning categories.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

### Read alignment using Minimap2

In [None]:
def log_cmd(log_file, cmd, file_opt='a'):
    """Write a command string to a log file, followed by a newline.

    Parameters
    ----------
    log_file : str or path-like
        File that receives the command text.
    cmd : str
        Command line to record.
    file_opt : str, default 'a'
        Mode passed to ``open`` ('a' appends, 'w' overwrites).

    Raises
    ------
    RuntimeError
        If the log file cannot be opened or written.
    """
    try:
        # The context manager closes the handle even when a write fails.
        with open(log_file, file_opt) as handle:
            handle.write(cmd)
            handle.write('\n')
    except Exception as e:
        # Generic exceptions carry no cmd/returncode/output attributes, so
        # report the log path and the underlying error instead.
        raise RuntimeError(f"could not write command to log file '{log_file}': {e}") from e

In [None]:
def run_minimap2(fastq_1, fastq_2, alignment_path, sra, sra_r=None):
    """Align reads with minimap2 and write the SAM output plus a command log.

    The output name is built from ``alignment_path``, ``sra`` and the
    module-level ALIGN_OPTS_NAME / ALIGN_NAME; ``sra_r`` is appended to the
    name when it is truthy. The executed command is written to
    ``<output without .sam>_log.txt`` after the alignment succeeds.

    Returns
    -------
    str
        Path of the SAM file.

    Raises
    ------
    RuntimeError
        If the minimap2 command exits with a non-zero status.
    """
    out_file = (
        f"{alignment_path}{sra}_{ALIGN_OPTS_NAME}_{ALIGN_NAME}{sra_r}_minimap2.sam"
        if sra_r
        else f"{alignment_path}{sra}_{ALIGN_OPTS_NAME}_{ALIGN_NAME}_minimap2.sam"
    )
    # stdout of minimap2 is redirected into the SAM file by the shell.
    cmd = f"{MINIMAP2_PATH}minimap2 {MINIMAP_PARAMS} -a {REF_PATH+REF_NAME} {fastq_1} {fastq_2} >{out_file} "
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    log_cmd(out_file.split('.sam')[0] + '_log.txt', cmd, file_opt='w')
    return out_file

In [None]:
def gatk_sort(bwa_mem_out):
    """Coordinate-sort a SAM file with the SortSam tool from GATK_JAR.

    Returns the path of the sorted file, ``<input without .sam>_gatk_sorted.sam``.
    Raises RuntimeError when the java command exits with a non-zero status.
    """
    out_file = bwa_mem_out.split('.sam')[0] + '_gatk_sorted.sam'
    sort_cmd = f"java -jar {GATK_JAR}  SortSam INPUT={bwa_mem_out} OUTPUT={out_file} SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"
    try:
        subprocess.check_call(sort_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return out_file

In [None]:
def gatk_metrics(gatk_out):
    """Run MarkDuplicates from GATK_JAR on a sorted SAM file.

    Writes ``<input without .sam>_marked.bam`` and a metrics file
    ``<input without .sam>_metrics.txt``; returns the BAM path.
    Raises RuntimeError when the java command exits with a non-zero status.
    """
    gatk_file = gatk_out.split('.sam')[0]
    out_file, metrics_file = gatk_file + '_marked.bam', gatk_file + '_metrics.txt'
    mark_cmd = f"java -Xmx64G -jar {GATK_JAR}  MarkDuplicates INPUT={gatk_out} OUTPUT={out_file} METRICS_FILE={metrics_file} ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT"
    try:
        subprocess.check_call(mark_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return out_file

In [None]:
def index_bam(gatk_bam_file):
    """Index a BAM file with ``samtools index``.

    Raises RuntimeError when samtools exits with a non-zero status.
    """
    index_cmd = f"{SAMTOOLS_PATH}samtools index {gatk_bam_file}"
    try:
        subprocess.check_call(index_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))

In [None]:
def run_fastp(in_1, in_2):
    """Run fastp on a pair of FASTQ files.

    Each output name is the input name up to its first '.fq' with
    '_fastp.fq' appended. Returns the tuple ``(out_1, out_2)``.
    A failing fastp call raises subprocess.CalledProcessError.
    """
    print('processing with fastp')
    out_1, out_2 = (name.split('.fq')[0] + '_fastp.fq' for name in (in_1, in_2))
    subprocess.check_call(
        f"fastp --in1 {in_1} --in2 {in_2} --out1 {out_1} --out2 {out_2}",
        shell=True,
    )
    return out_1, out_2
    

In [None]:
def remove_file(file_to_del):
    """Delete a single file from disk.

    Parameters
    ----------
    file_to_del : str or path-like
        Path of the file to delete; a leading '~' is expanded.

    Raises
    ------
    OSError
        If the file cannot be removed (e.g. FileNotFoundError when it does
        not exist).
    """
    # Remove the file directly rather than interpolating the path into a
    # shell "rm" command, which broke on paths containing spaces or shell
    # metacharacters.
    os.remove(os.path.expanduser(file_to_del))
    

In [None]:
def create_bed(fasta_file, bed_file):
    """Write a BED file for ``fasta_file`` using ``faidx --transform bed``.

    The command's stdout is redirected into ``bed_file`` by the shell.
    A failing command raises subprocess.CalledProcessError.
    """
    subprocess.check_call(
        f'faidx --transform bed {fasta_file} > {bed_file}',
        shell=True,
    )

In [None]:
def run_bamdst(bed_file, out_path, bam_in):
    """Run bamdst on ``bam_in`` for the regions in ``bed_file``.

    Results are written into ``out_path``. A non-zero exit status is
    reported with print() and not re-raised.
    """
    bamdst_cmd = f'{BAMDST_PATH}bamdst -p {bed_file} -o {out_path} {bam_in}'
    try:
        subprocess.check_call(bamdst_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        print(message)

In [None]:
def run_bamstats(bam_file, stats_out):
    """Run BAMStats 1.25 on ``bam_file`` and write the report to ``stats_out``.

    The jar location (~/apps/BAMStats-1.25/) and the 48g heap size are
    hard-coded in the command. A non-zero exit status is reported with
    print() and not re-raised.
    """
    bamstats_cmd = f'java -Xmx48g -Djava.awt.headless=true -jar ~/apps/BAMStats-1.25/BAMStats-1.25.jar -i {bam_file} -m -q -o {stats_out}'
    try:
        subprocess.check_call(bamstats_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        print(message)

In [None]:
def bam_to_bed(gatk_bam_file):
    """Convert a BAM file to BED with ``bedtools bamtobed``.

    The BED file is written next to the input, with '.bed' replacing
    everything from the first '.bam' in the name. Nothing is returned.
    A failing command raises subprocess.CalledProcessError.
    """
    bed_file = gatk_bam_file.split('.bam')[0] + '.bed'
    subprocess.check_call(
        f'{BEDTOOLS_PATH}bedtools bamtobed -i {gatk_bam_file} >{bed_file}',
        shell=True,
    )

In [None]:
def run_samtools_stats(gatk_bam_file, ref_genome):
    """Write ``samtools stats`` output for a BAM file.

    The report goes to ``<input up to .bam>_samtools_stats.txt``;
    ``ref_genome`` is passed via ``--reference``.
    A failing command raises subprocess.CalledProcessError.
    """
    sam_out = gatk_bam_file.split('.bam')[0] + '_samtools_stats.txt'
    subprocess.check_call(
        f'{SAMTOOLS_PATH}samtools stats {gatk_bam_file} --reference {ref_genome} >{sam_out}',
        shell=True,
    )

In [None]:
def run_flagtools(gatk_bam_file):
    """Write ``samtools flagstat`` output for a BAM file.

    The report goes to ``<input up to .bam>_flagtools.txt``.
    A failing command raises subprocess.CalledProcessError.
    """
    flagstat_file = gatk_bam_file.split('.bam')[0] + '_flagtools.txt'
    subprocess.check_call(
        f'{SAMTOOLS_PATH}samtools flagstat {gatk_bam_file} >{flagstat_file}',
        shell=True,
    )

In [None]:
def run_idxstats(gatk_bam_file, flagstat_file):
    """Write ``samtools idxstats`` output for a BAM file to ``flagstat_file``.

    A non-zero exit status is reported with print() and not re-raised.
    """
    idxstats_cmd = f"{SAMTOOLS_PATH}samtools idxstats {gatk_bam_file} >{flagstat_file}"
    try:
        subprocess.check_call(idxstats_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        print(message)

In [None]:
def samtools_sort(file_to_sort, out_type='BAM'):
    """Sort an alignment file with ``samtools sort``.

    ``out_type`` is passed to ``-O``. The output is named
    ``<input up to .sam>_sorted.bam`` when ``out_type`` is 'BAM' and
    ``<input up to .sam>_sorted.sam`` otherwise; its path is returned.
    Raises RuntimeError when samtools exits with a non-zero status.
    """
    suffix = '_sorted.bam' if out_type=='BAM' else '_sorted.sam'
    out_file = file_to_sort.split('.sam')[0] + suffix
    sort_cmd = f'{SAMTOOLS_PATH}samtools sort -O {out_type} -o {out_file} {file_to_sort}'
    try:
        subprocess.check_call(sort_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return out_file

In [None]:
def samtoos_coverage(in_file):
    """Write ``samtools coverage`` output for ``in_file``.

    The report goes to ``<input up to .sam>_coverage.txt``. Nothing is
    returned. Raises RuntimeError when samtools exits with a non-zero status.
    """
    out_file = in_file.split('.sam')[0] + '_coverage.txt'
    coverage_cmd = f'{SAMTOOLS_PATH}samtools coverage {in_file} -o {out_file}'
    try:
        subprocess.check_call(coverage_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))

In [None]:
def prep_bed(create=True):
    """Return the BED path for the current reference, optionally creating it.

    The BED file lives in ``REF_PATH + 'bed/'`` (the directory is created if
    missing) and is named after REF_NAME up to its first '.fa', plus '.bed'.
    When ``create`` is true the file is (re)generated with create_bed().
    """
    bed_dir = REF_PATH + 'bed/'
    bed_file = bed_dir + REF_NAME.split('.fa')[0] + '.bed'
    pathlib.Path(bed_dir).mkdir(exist_ok=True)
    if create:
        create_bed(REF_PATH + REF_NAME, bed_file)
    return bed_file

In [None]:
def bam_to_fastq():
    """Convert the marked BAM of every accession in SRAs to FASTQ.

    For each accession the input is
    ``<sra>_reads_<ref>_<ALIGN_OPTS_NAME>_minimap2_gatk_sorted_marked.bam``
    inside ``PRJ_OUT_PATH/<sra>/minimap2/`` and the output has the same stem
    with ``_bedtools.fq`` appended; ``<ref>`` is REF_NAME up to its first
    '.fa'. A failing bedtools call raises subprocess.CalledProcessError.
    """
    ref = REF_NAME.split('.fa')[0]
    for sra in SRAs:
        alignment_path = PRJ_OUT_PATH + sra + '/minimap2/'
        # Example input name:
        # SRR10168373_reads_mitochondria_mamals_1_local_G_298_0_minimap2_gatk_sorted_marked.bam
        bam_file = f"{alignment_path}{sra}_reads_{ref}_{ALIGN_OPTS_NAME}_minimap2_gatk_sorted_marked.bam"
        fastq_file = f"{alignment_path}{sra}_reads_{ref}_{ALIGN_OPTS_NAME}_minimap2_gatk_sorted_marked_bedtools.fq"
        print(fastq_file)
        subprocess.check_call(
            f'{BEDTOOLS_PATH}bedtools bamtofastq -i {bam_file} -fq {fastq_file}',
            shell=True,
        )

In [None]:
def align_idx_bamdst(idxstat_file, bamdst_report):
    """Merge idxstats columns into a bamdst chromosomes report.

    The header row (first non-blank line) gets its first field renamed to
    'Accession' and two columns, 'ref_length' and 'N', inserted after it.
    A data row is kept only when its second or fifth tab-separated field is
    greater than zero. For a kept row, the second and third fields of the
    idxstats line with the same (stripped) first field are inserted after
    the name; a row without a matching idxstats line is written unchanged.
    Blank lines in either file are ignored.

    Parameters
    ----------
    idxstat_file : str
        Tab-separated idxstats file.
    bamdst_report : str
        Tab-separated bamdst report whose first line is a header.

    Returns
    -------
    str
        Path of the new report: ``bamdst_report`` up to its first '.txt',
        plus '_N.txt'.
    """
    # Index the idxstats rows by name once instead of rescanning the whole
    # file for every report row. The first occurrence of a name wins.
    idx_by_name = {}
    with open(idxstat_file, mode='r') as f_idx:
        for i_line in f_idx:
            i_words = i_line.split('\t')
            if len(i_words) >= 3:
                idx_by_name.setdefault(i_words[0].strip(), (i_words[1], i_words[2]))

    with open(bamdst_report, mode='r') as f_bamdst:
        bamdst_lines = f_bamdst.readlines()

    fixed_lines = []
    header_done = False
    for b_line in bamdst_lines:
        if not b_line.strip():
            # A blank line has no columns to test and would raise IndexError.
            continue
        b_words = b_line.split('\t')
        if not header_done:
            header_done = True
            b_words[0] = 'Accession'
            b_words.insert(1, 'ref_length')
            b_words.insert(2, 'N')
            fixed_lines.append('\t'.join(b_words))
        elif float(b_words[1]) > 0.0 or float(b_words[4]) > 0.0:
            match = idx_by_name.get(b_words[0].strip())
            if match is not None:
                b_words.insert(1, match[0])
                b_words.insert(2, match[1])
            fixed_lines.append('\t'.join(b_words))

    new_report = bamdst_report.split('.txt')[0] + '_N.txt'
    print(f'new_report: {new_report}')
    with open(new_report, "w") as f:
        f.writelines(fixed_lines)
    return new_report

In [None]:
def workflow_paired(sra_dir=False, bed_file=None):
    """Run the paired-end pipeline for every accession in ``SRAs``.

    For each accession: align both mates with minimap2, sort the result
    (samtools when SORT_TOOL is 'samtools', otherwise GATK SortSam followed
    by MarkDuplicates), index the BAM and write samtools stats, flagstat and
    idxstats reports. When ``bed_file`` is given, bamdst is run and its
    chromosomes report is merged with the idxstats output.

    FASTQ_FILES must hold the mate files as consecutive pairs in the same
    order as SRAs.

    Parameters
    ----------
    sra_dir : bool, default False
        If true, FASTQ files are read from ``BASE_PATH/<accession>/``,
        otherwise directly from ``BASE_PATH``.
    bed_file : str, optional
        BED file passed to bamdst; the bamdst steps are skipped when falsy.

    Raises
    ------
    FileNotFoundError
        If the first mate file is missing and CREATE_FASTP is false.
    RuntimeError
        Propagated from the alignment, sorting and indexing helpers.
    """
    # mkdir(parents=True, exist_ok=True) is a no-op for existing directories
    # and also creates missing parent directories.
    pathlib.Path(PRJ_OUT_PATH).mkdir(parents=True, exist_ok=True)
    for idx, sra in enumerate(SRAs):
        print(f'{sra}')
        fastq1=FASTQ_FILES[2*idx]
        fastq2=FASTQ_FILES[(2*idx)+1]
        if sra_dir:
            f1path=BASE_PATH+sra+'/'+fastq1
            f2path=BASE_PATH+sra+'/'+fastq2
        else:
            f1path=BASE_PATH+fastq1
            f2path=BASE_PATH+fastq2

        alignment_path=PRJ_OUT_PATH+sra+'/minimap2/'
        pathlib.Path(alignment_path).mkdir(parents=True, exist_ok=True)

        if not os.path.isfile(f1path):
            if not CREATE_FASTP:
                print(f'file doesnt exist but should: {f1path}')
                raise FileNotFoundError(f'file doesnt exist but should: {f1path}')
            # NOTE(review): fastp is invoked on the paths that were just found
            # to be missing - confirm which input names are intended here.
            f1path, f2path=run_fastp(f1path, f2path)

        # Align the mates; this call was missing, so sam_out was read before
        # it was ever assigned.
        sam_out=run_minimap2(f1path, f2path, alignment_path, sra)
        if SORT_TOOL=='samtools':
            bam_sorted=samtools_sort(sam_out, out_type='BAM')
            # The unsorted SAM is no longer needed once the sorted BAM exists.
            remove_file(sam_out)
        else:
            sam_out=gatk_sort(sam_out)
            bam_sorted=gatk_metrics(sam_out)
        index_bam(bam_sorted)

        run_samtools_stats(bam_sorted, REF_PATH+REF_NAME)
        run_flagtools(bam_sorted)

        # The idxstats name is derived by cutting the BAM name at
        # '<BAM_POSTFIX>.bam', so BAM_POSTFIX has to match the sort tool's suffix.
        idxstat_file=bam_sorted.split(f'{BAM_POSTFIX}.bam')[0]+'idxstats.txt'
        run_idxstats(bam_sorted, idxstat_file)

        # The BAMStats step is currently disabled; its output directory is
        # still created.
        out_path=alignment_path+f'bamstats_reads_{ALIGN_NAME}_{ALIGN_OPTS_NAME}/'
        pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

        if bed_file:
            out_path=alignment_path+f'bamdst_reads_{ALIGN_NAME}_{ALIGN_OPTS_NAME}/'
            print(f'out_path: {out_path}')
            pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
            run_bamdst(bed_file, out_path, bam_sorted)
            bamdst_data=out_path+'chromosomes.report'
            align_idx_bamdst(idxstat_file, bamdst_data)

In [None]:
def workflow_single(sra_dir=False, bed_file=None, sra_r=None):
    """Run the single-end pipeline for every accession in ``SRAsingle``.

    For each accession: align the reads with minimap2, sort the result
    (samtools when SORT_TOOL is 'samtools', otherwise GATK SortSam followed
    by MarkDuplicates), index the BAM and write the idxstats report. When
    ``bed_file`` is given, bamdst is run and its chromosomes report is merged
    with the idxstats output.

    FASTQ_SINGLE must list one FASTQ file per accession, in SRAsingle order.

    Parameters
    ----------
    sra_dir : bool, default False
        If true, FASTQ files are read from ``BASE_PATH/<accession>/``,
        otherwise directly from ``BASE_PATH``.
    bed_file : str, optional
        BED file passed to bamdst; the bamdst steps are skipped when falsy.
    sra_r : str, optional
        Tag forwarded to run_minimap2 and embedded in the bamdst directory
        name. With the default ``None`` that directory name contains the
        literal text 'None'.

    Raises
    ------
    RuntimeError
        Propagated from the alignment, sorting and indexing helpers.
    """
    # mkdir(parents=True, exist_ok=True) is a no-op for existing directories
    # and also creates missing parent directories.
    pathlib.Path(PRJ_OUT_PATH).mkdir(parents=True, exist_ok=True)
    for idx, sra in enumerate(SRAsingle):
        print(f'{sra}')
        fastq=FASTQ_SINGLE[idx]
        if sra_dir:
            f1path=BASE_PATH+sra+'/'+fastq
        else:
            f1path=BASE_PATH+fastq
        alignment_path=PRJ_OUT_PATH+sra+'/minimap2/'
        pathlib.Path(alignment_path).mkdir(parents=True, exist_ok=True)

        # fastp preprocessing is currently disabled for single-end input.
        # An empty second FASTQ argument makes minimap2 run on one file only.
        f2path=''
        sam_out=run_minimap2(f1path, f2path, alignment_path, sra, sra_r)
        if SORT_TOOL=='samtools':
            bam_sorted=samtools_sort(sam_out, out_type='BAM')
        else:
            sam_out=gatk_sort(sam_out)
            bam_sorted=gatk_metrics(sam_out)
        index_bam(bam_sorted)

        # The idxstats name is derived by cutting the BAM name at
        # '<BAM_POSTFIX>.bam', so BAM_POSTFIX has to match the sort tool's suffix.
        idxstat_file=bam_sorted.split(f'{BAM_POSTFIX}.bam')[0]+'idxstats.txt'
        run_idxstats(bam_sorted, idxstat_file)

        # The BAMStats step is currently disabled.
        if bed_file:
            out_path=alignment_path+f'bamdst_reads_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_{sra_r}se/'
            pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
            run_bamdst(bed_file, out_path, bam_sorted)
            bamdst_data=out_path+'chromosomes.report'
            align_idx_bamdst(idxstat_file, bamdst_data)

In [None]:
# Install locations of the external tools. Each *_PATH value is prepended
# directly to the executable name in the command strings, so a non-empty
# value has to end with '/'.
MINIMAP2_PATH='~/apps/minimap2-2.24_x64-linux/'
SAMTOOLS_PATH=''  # empty: the bare `samtools` command name is used
BAMDST_PATH='/mnt/3TB_0/Data/Code/external/bamdst/'
BEDTOOLS_PATH='~/apps/bedtools/'
GATK_PATH='~/apps/gatk-4.1.9.0/'
GATK_JAR=GATK_PATH+'gatk-package-4.1.9.0-local.jar'

### General Settings

In [None]:
# Reference and alignment settings read as module-level names by the helper
# and workflow functions (REF_PATH, REF_NAME, ALIGN_NAME, ALIGN_OPTS_NAME,
# MINIMAP_PARAMS, SORT_TOOL, BAM_POSTFIX, CREATE_FASTP).
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
TAXA_TAB_DEL=''
REF_NAME='GCF_000003025.6_Sscrofa11.1_genomic.fna'
REF_PATH='/mnt/1TB_0/Data/fasta/mammal_genomes/Sscrofa11_1/ncbi-genomes-2021-11-27/'
# Reference name without the '.fna' extension; used in output file names.
ALIGN_NAME=REF_NAME.split('.fna')[0]

NUM_THREADS=32
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False
ALIGNER='minimap2'

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
#to overcome 'sam has no header', use -I
# NOTE(review): the minimap2 manual spells the MD-tag and =/X CIGAR options as
# long options (--MD, --eqx); confirm the single-dash forms below are parsed
# as intended.
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

SORT_TOOL='samtools' #gatk, samtools

# Suffix (before '.bam') of the sorted BAM; the workflows cut the BAM name at
# this suffix to build the idxstats file name.
BAM_POSTFIX='sorted'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_{SORT_TOOL}_sorted.bam'

#### bed file/reference indexing

In [None]:
#bed_file=prep_bed(create=True)

### BioProject specific

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'

SRAs=['SRR22936497']


# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

### MJ (reference: YNU_ManJav_2.0)

In [None]:
# Reference and alignment settings for the YNU_ManJav_2.0 genome; read as
# module-level names by the helper and workflow functions.
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
TAXA_TAB_DEL=''
REF_NAME='GCF_014570535.1_YNU_ManJav_2.0_genomic.fna'
REF_PATH='/mnt/3TB_0/Data/fasta/mammal_genomes/'
# The reference ends in '.fna'; splitting on '.fa' never matched, so the
# extension leaked into every output file name.
ALIGN_NAME=REF_NAME.split('.fna')[0]

NUM_THREADS=32
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'
ALIGNER='minimap2'

# SORT_TOOL is not assigned here; the value left by an earlier cell stays in effect.
BAM_POSTFIX='sorted'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

#### bed file/reference indexing

In [None]:
#bed_file=prep_bed(create=True)

### MJ

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'

 
#SRAs=['SRR22936420','SRR22936419','SRR22936773', 'SRR22936421', 'SRR22936770', 'SRR22936422','SRR22936541', 'SRR22936544']

SRAs=['SRR22936541']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# The run itself is disabled in this cell.
#workflow_paired(sra_dir=False)

### Concatenated references: MJ, Sus, Human, Vero

In [None]:
# Settings for aligning against the concatenated reference sus_mj.fa.
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
TAXA_TAB_DEL=''
#REF_NAME='GCF_000003025.6_Sscrofa11.1_genomic.fna'
REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
REF_NAME='sus_mj.fa'
#REF_PATH='/mnt/3TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME=REF_NAME.split('.fa')[0]
ALIGNER='minimap2'

NUM_THREADS=32
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

# NOTE(review): SORT_TOOL is not assigned in this cell, so the value left by an
# earlier cell is used - confirm it produces the 'gatk_sorted_marked' suffix.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'

SRAs=['SRR22936420','SRR22936419','SRR22936773', 'SRR22936421', 'SRR22936770', 'SRR22936422','SRR22936541', 'SRR22936544','SRR22936497']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

In [None]:
# Settings for aligning against the concatenated reference sus_vero_mj_hu.fa.
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
TAXA_TAB_DEL=''
#REF_NAME='GCF_000003025.6_Sscrofa11.1_genomic.fna'
REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
REF_NAME='sus_vero_mj_hu.fa'
#REF_PATH='/mnt/3TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME=REF_NAME.split('.fa')[0]
ALIGNER='minimap2'

NUM_THREADS=32
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

# NOTE(review): SORT_TOOL is not assigned in this cell, so the value left by an
# earlier cell is used - confirm it produces the 'gatk_sorted_marked' suffix.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'
#,'SRR22936497'
SRAs=['SRR22936420','SRR22936419','SRR22936773', 'SRR22936421', 'SRR22936770', 'SRR22936422','SRR22936541', 'SRR22936544']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

### RhiPru_1.0 (TODO)

In [None]:
# Settings for aligning against the concatenated reference sus_rhipru.fa.
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
TAXA_TAB_DEL=''
REF_NAME='sus_rhipru.fa'
REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
ALIGN_NAME=REF_NAME.split('.fa')[0]
ALIGNER='minimap2'

NUM_THREADS=32
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

# NOTE(review): SORT_TOOL is not assigned in this cell, and 'sorted_marked'
# differs from the suffixes used in the other cells ('sorted',
# 'gatk_sorted_marked') - confirm the intended value.
BAM_POSTFIX='sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'


SRAs=['SRR22936497']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

### MJ, Vero, human - in series

In [None]:
# Settings for aligning against YNU_ManJav_2.0.fna with the GATK sort path.
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
#TAXA_TAB_DEL=''
#REF_NAME='vero_mj_hu.fa'
#REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
#ALIGN_NAME=REF_NAME.split('.fa')[0]
#ALIGNER='minimap2'

TAXA_TAB_DEL=''
REF_NAME='YNU_ManJav_2.0.fna'
REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
ALIGN_NAME=REF_NAME.split('.fna')[0]
ALIGNER='minimap2'

NUM_THREADS=40
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

SORT_TOOL='gatk' #gatk, samtools

# Matches the '_gatk_sorted_marked.bam' name produced by gatk_sort + gatk_metrics.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJCA002517'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/bigd/{PRJ}/'


SRAs=['CRR477154','CRR477155','CRR477156','CRR477157']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_f1_fastp.fq')
    FASTQ_FILES.append(f+'_r2_fastp.fq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

In [None]:
# Settings for aligning against GRCh38.p13.fna with the GATK sort path.
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
#TAXA_TAB_DEL=''
#REF_NAME='vero_mj_hu.fa'
#REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
#ALIGN_NAME=REF_NAME.split('.fa')[0]
#ALIGNER='minimap2'

TAXA_TAB_DEL=''
REF_NAME='GRCh38.p13.fna'
REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
ALIGN_NAME=REF_NAME.split('.fna')[0]
ALIGNER='minimap2'

NUM_THREADS=40
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

SORT_TOOL='gatk' #gatk, samtools

# Matches the '_gatk_sorted_marked.bam' name produced by gatk_sort + gatk_metrics.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJCA002517'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/bigd/{PRJ}/'


SRAs=['CRR477154','CRR477155','CRR477156','CRR477157']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_f1_fastp.fq')
    FASTQ_FILES.append(f+'_r2_fastp.fq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

In [None]:
# Settings for aligning against Vero_WHO_p1.0.fna with the GATK sort path.
#REF_NAME='PRJNA901878_ncbi_subset.fa'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/'
#TAXA_TAB_DEL='PRJNA901878_ncbi_subset.taxa'
#TAXA_TAB_DEL=''
#REF_NAME='vero_mj_hu.fa'
#REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
#ALIGN_NAME=REF_NAME.split('.fa')[0]
#ALIGNER='minimap2'

TAXA_TAB_DEL=''
REF_NAME='Vero_WHO_p1.0.fna'
REF_PATH='/mnt/3TB_0/Data/Code/code/PRJNA901878/'
ALIGN_NAME=REF_NAME.split('.fna')[0]
ALIGNER='minimap2'

NUM_THREADS=40
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

SORT_TOOL='gatk' #gatk, samtools

# Matches the '_gatk_sorted_marked.bam' name produced by gatk_sort + gatk_metrics.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJCA002517'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/bigd/{PRJ}/'


SRAs=['CRR477154','CRR477155','CRR477156','CRR477157']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_f1_fastp.fq')
    FASTQ_FILES.append(f+'_r2_fastp.fq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

In [None]:
### serial align

In [None]:
# Settings for aligning against the Sscrofa11.1 reference genome.
TAXA_TAB_DEL=''
REF_NAME='GCF_000003025.6_Sscrofa11.1_genomic.fna'
REF_PATH='/mnt/3TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME=REF_NAME.split('.fna')[0]
ALIGNER='minimap2'

NUM_THREADS=40
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

# NOTE(review): SORT_TOOL is not assigned in this cell, so the value left by an
# earlier cell is used - confirm it produces the 'gatk_sorted_marked' suffix.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'
#,'SRR22936497'
SRAs=['SRR22936420','SRR22936419','SRR22936773', 'SRR22936421', 'SRR22936770', 'SRR22936422','SRR22936541', 'SRR22936544']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

In [None]:
# Settings for aligning against the YNU_ManJav_2.0 reference genome.
TAXA_TAB_DEL=''
REF_NAME='GCF_014570535.1_YNU_ManJav_2.0_genomic.fna'
REF_PATH='/mnt/3TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME=REF_NAME.split('.fna')[0]
ALIGNER='minimap2'

NUM_THREADS=40
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

# NOTE(review): SORT_TOOL is not assigned in this cell, so the value left by an
# earlier cell is used - confirm it produces the 'gatk_sorted_marked' suffix.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'
#,'SRR22936497'
SRAs=['SRR22936420','SRR22936419','SRR22936773', 'SRR22936421', 'SRR22936770', 'SRR22936422','SRR22936541', 'SRR22936544']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

### RP (reference: RhiPru_1.0)

In [None]:
# Settings for aligning against the RhiPru_1.0 reference genome.
TAXA_TAB_DEL=''
REF_NAME='GCA_009823505.1_RhiPru_1.0_genomic.fna'
REF_PATH='/mnt/3TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME=REF_NAME.split('.fna')[0]
ALIGNER='minimap2'

NUM_THREADS=40
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

# NOTE(review): SORT_TOOL is not assigned in this cell, so the value left by an
# earlier cell is used - confirm it produces the 'gatk_sorted_marked' suffix.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'

SRAs=['SRR22936497']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)

In [None]:
# Settings for aligning against the YNU_ManJav_2.0 reference genome.
TAXA_TAB_DEL=''
REF_NAME='GCF_014570535.1_YNU_ManJav_2.0_genomic.fna'
REF_PATH='/mnt/3TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME=REF_NAME.split('.fna')[0]
ALIGNER='minimap2'

NUM_THREADS=40
LAYOUT='PAIRED'
KEEP_UNALIGNED=False
CREATE_FASTP=False

#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads
MINIMAP_PARAMS=f'-MD -c -eqx -x sr -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
ALIGN_OPTS_NAME='x_sr_secondary_no'

# NOTE(review): SORT_TOOL is not assigned in this cell, so the value left by an
# earlier cell is used - confirm it produces the 'gatk_sorted_marked' suffix.
BAM_POSTFIX='gatk_sorted_marked'
BAMSTATS_POST=f'_{ALIGN_NAME}_{ALIGN_OPTS_NAME}_bamstats.txt'
BAM_POST=f'_reads_{REF_NAME.split(".fa")[0]}_{ALIGN_OPTS_NAME}_{ALIGNER}_sorted.bam'

In [None]:
# Project-specific inputs: output root, FASTQ directory and the accessions to align.
PRJ='PRJNA901878'

PRJ_OUT_PATH=f'/mnt/4TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'

SRAs=['SRR22936497']

# workflow_paired reads FASTQ_FILES as consecutive (mate 1, mate 2) pairs in SRAs order.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_fastp.fastq')
    FASTQ_FILES.append(f+'_2_fastp.fastq')
    
# No bed_file is passed, so the bamdst step is skipped.
workflow_paired(sra_dir=False)