# Alignment

### Requirements
- bwa
- samtools
- seqkit
- Biopython

In [1]:
import os, sys
from pathlib import Path

import pandas as pd
import numpy as np

In [2]:
# Make the parent of the working directory importable before importing utils.
# NOTE(review): utils.py presumably lives in that parent directory — confirm.
# The original resolved the string "__main__" (the value of __name__) against
# the working directory, which only yields the working directory by accident;
# ask for it directly instead.
currentdir = os.path.realpath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if parentdir not in sys.path:
    sys.path.append(parentdir)

# This notebook uses parse_config, subprocs and execute_function_pool_args,
# which are not defined here and are presumably provided by this star import.
from utils import *

In [3]:
# Load the project configuration and take the data root from its 'root_dir' entry.
# parse_config is not defined in this notebook; presumably it comes from utils.
config_file = "config.toml"
config = parse_config(config_file)
root_dir = Path(config['root_dir'])

### Input

In [34]:
# Input: CSV of per-sample file paths, loaded into fpath_df in section 1.
trimmed_fpath_file = root_dir / 'trimmed_fpath.csv'
trimmed_fpath_file

PosixPath('data/trimmed_fpath.csv')

### Output

In [35]:
# Output: path of the reference FASTA; create its directory if it is missing.
# mkdir is called without parents=True, so root_dir itself must already exist.
ref_file = root_dir / 'reference' / 'ref.fa'
ref_file.parent.mkdir(exist_ok=True)
ref_file

PosixPath('data/reference/ref.fa')

In [36]:
# Output: CSV written in the final cell (input table plus a 'sorted_bam' column).
aligned_fpath_file = root_dir / 'aligned_fpath.csv'
aligned_fpath_file

PosixPath('data/aligned_fpath.csv')

## 1. Load file paths

In [37]:
# Load the per-sample path table; one row per running_id.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, which suggests
# the upstream CSV was saved with its index — consider index_col=0 or fixing the writer.
fpath_df = pd.read_csv(trimmed_fpath_file)
fpath_df

Unnamed: 0,running_id,raw_r1,raw_r2,num_seqs,sum_len,avg_len,error_rate,trimmed_r1,unpaired_r1,trimmed_r2,unpaired_r2,trimmomatic_log
0,SRR6046075,data/raw/SRR6046075_1.fastq.gz,data/raw/SRR6046075_2.fastq.gz,2150140,315603941,146.782973,0.004489,data/trimmed/SRR6046075_1_trimmed.fastq.gz,data/trimmed/SRR6046075_1_unpaired.fastq.gz,data/trimmed/SRR6046075_2_trimmed.fastq.gz,data/trimmed/SRR6046075_2_unpaired.fastq.gz,data/trimmed/SRR6046075.log
1,SRR6046701,data/raw/SRR6046701_1.fastq.gz,data/raw/SRR6046701_2.fastq.gz,2181260,329370260,151.0,0.00296,data/trimmed/SRR6046701_1_trimmed.fastq.gz,data/trimmed/SRR6046701_1_unpaired.fastq.gz,data/trimmed/SRR6046701_2_trimmed.fastq.gz,data/trimmed/SRR6046701_2_unpaired.fastq.gz,data/trimmed/SRR6046701.log
2,SRR6045735,data/raw/SRR6045735_1.fastq.gz,data/raw/SRR6045735_2.fastq.gz,2730788,412348988,151.0,0.004648,data/trimmed/SRR6045735_1_trimmed.fastq.gz,data/trimmed/SRR6045735_1_unpaired.fastq.gz,data/trimmed/SRR6045735_2_trimmed.fastq.gz,data/trimmed/SRR6045735_2_unpaired.fastq.gz,data/trimmed/SRR6045735.log
3,SRR6045327,data/raw/SRR6045327_1.fastq.gz,data/raw/SRR6045327_2.fastq.gz,2368378,348449021,147.125594,0.003844,data/trimmed/SRR6045327_1_trimmed.fastq.gz,data/trimmed/SRR6045327_1_unpaired.fastq.gz,data/trimmed/SRR6045327_2_trimmed.fastq.gz,data/trimmed/SRR6045327_2_unpaired.fastq.gz,data/trimmed/SRR6045327.log
4,SRR6045549,data/raw/SRR6045549_1.fastq.gz,data/raw/SRR6045549_2.fastq.gz,2200232,312069380,141.834761,0.004359,data/trimmed/SRR6045549_1_trimmed.fastq.gz,data/trimmed/SRR6045549_1_unpaired.fastq.gz,data/trimmed/SRR6045549_2_trimmed.fastq.gz,data/trimmed/SRR6045549_2_unpaired.fastq.gz,data/trimmed/SRR6045549.log
5,SRR6046105,data/raw/SRR6046105_1.fastq.gz,data/raw/SRR6046105_2.fastq.gz,2739142,397357271,145.066328,0.00721,data/trimmed/SRR6046105_1_trimmed.fastq.gz,data/trimmed/SRR6046105_1_unpaired.fastq.gz,data/trimmed/SRR6046105_2_trimmed.fastq.gz,data/trimmed/SRR6046105_2_unpaired.fastq.gz,data/trimmed/SRR6046105.log
6,SRR6046861,data/raw/SRR6046861_1.fastq.gz,data/raw/SRR6046861_2.fastq.gz,2046324,300436707,146.817761,0.005184,data/trimmed/SRR6046861_1_trimmed.fastq.gz,data/trimmed/SRR6046861_1_unpaired.fastq.gz,data/trimmed/SRR6046861_2_trimmed.fastq.gz,data/trimmed/SRR6046861_2_unpaired.fastq.gz,data/trimmed/SRR6046861.log
7,SRR6044910,data/raw/SRR6044910_1.fastq.gz,data/raw/SRR6044910_2.fastq.gz,1964346,275329828,140.163611,0.002015,data/trimmed/SRR6044910_1_trimmed.fastq.gz,data/trimmed/SRR6044910_1_unpaired.fastq.gz,data/trimmed/SRR6044910_2_trimmed.fastq.gz,data/trimmed/SRR6044910_2_unpaired.fastq.gz,data/trimmed/SRR6044910.log
8,SRR6045763,data/raw/SRR6045763_1.fastq.gz,data/raw/SRR6045763_2.fastq.gz,2311282,344349323,148.986287,0.011394,data/trimmed/SRR6045763_1_trimmed.fastq.gz,data/trimmed/SRR6045763_1_unpaired.fastq.gz,data/trimmed/SRR6045763_2_trimmed.fastq.gz,data/trimmed/SRR6045763_2_unpaired.fastq.gz,data/trimmed/SRR6045763.log
9,SRR6046306,data/raw/SRR6046306_1.fastq.gz,data/raw/SRR6046306_2.fastq.gz,2545170,339698387,133.467858,0.004654,data/trimmed/SRR6046306_1_trimmed.fastq.gz,data/trimmed/SRR6046306_1_unpaired.fastq.gz,data/trimmed/SRR6046306_2_trimmed.fastq.gz,data/trimmed/SRR6046306_2_unpaired.fastq.gz,data/trimmed/SRR6046306.log


## 2. Get reference
 - <i>Mycobacterium tuberculosis</i> H37Rv reference genome (NC_000962.3): https://www.ncbi.nlm.nih.gov/nuccore/NC_000962.3/

In [8]:
from Bio import Entrez
from Bio import SeqIO

def download_ref_file(
    accession_number,
    ref_file,
    email="your.email@example.com",
):
    """Fetch a nucleotide record from NCBI Entrez and save it as a FASTA file.

    Nothing is downloaded when ``ref_file`` already exists, so the cell can be
    re-run without hitting the network again.

    Parameters
    ----------
    accession_number : str
        Identifier passed to ``Entrez.efetch`` as ``id`` (db="nucleotide").
    ref_file : str or pathlib.Path
        Destination of the FASTA file. Its parent directory is not created here.
    email : str, optional
        Value assigned to ``Entrez.email`` before the request. The default is
        the placeholder the notebook originally hard-coded; pass a real contact
        address when running this for real.

    Returns
    -------
    None
    """
    # Accept plain strings as well as Path objects.
    ref_file = Path(ref_file)
    if ref_file.exists():
        return

    Entrez.email = email

    # Fetch: SeqIO.read expects exactly one record in the response.
    with Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta", retmode="text") as handle:
        record = SeqIO.read(handle, "fasta")

    # Save
    SeqIO.write(record, ref_file, "fasta")

In [9]:
# Fetch accession NC_000962.3 into ref_file (skipped when the file already exists).
download_ref_file('NC_000962.3', ref_file)

In [10]:
# Sanity-check the reference with `seqkit stat`; the command's captured stdout
# is bytes, hence the decode before printing.
print(subprocs(f"seqkit stat {ref_file}").stdout.decode())

file                   format  type  num_seqs    sum_len    min_len    avg_len    max_len
data/reference/ref.fa  FASTA   DNA          1  4,411,532  4,411,532  4,411,532  4,411,532



## 3. Indexing

In [11]:
# Shell command that runs `bwa index` on the reference FASTA.
cmd = f"bwa index {ref_file}"

In [12]:
# Run the indexing command; the result object is kept so its exit status can be inspected.
out = subprocs(cmd)

In [13]:
# Exit status of the indexing command (0 in the recorded run).
out.returncode

0

## 4. Perform Alignment

In [14]:
# Directory that receives one BAM per sample; created if missing.
# mkdir is called without parents=True, so root_dir must already exist.
sorted_bam_dir = root_dir / 'sorted_bam'
sorted_bam_dir.mkdir(exist_ok=True)
sorted_bam_dir

PosixPath('data/sorted_bam')

In [22]:
def align_with_bwa(running_id, trimmed_r1, trimmed_r2):
    """Align one paired-end sample with ``bwa mem`` and sort it with samtools.

    The result is written to ``sorted_bam_dir / f"{running_id}.bam"``. If that
    file already exists the alignment is skipped, so finished samples are not
    recomputed when the notebook is re-run.

    Parameters
    ----------
    running_id : str
        Sample identifier; used as the BAM file stem.
    trimmed_r1, trimmed_r2 : str or path-like
        Read files handed to ``bwa mem`` as the two paired inputs.

    Returns
    -------
    pathlib.Path
        Path of the sorted BAM file.

    Raises
    ------
    RuntimeError
        If the shell pipeline reports a non-zero exit status.
    """
    bam_file = sorted_bam_dir / f"{running_id}.bam"

    if not bam_file.exists():
        # Write to a temporary name and rename only on success. Redirecting
        # straight to bam_file would leave a truncated BAM behind after a crash
        # or interrupt, and the exists() guard above would then treat that
        # sample as finished on the next run.
        tmp_bam = sorted_bam_dir / f"{running_id}.tmp.bam"
        cmd = f'bwa mem -aM {ref_file} {trimmed_r1} {trimmed_r2} | samtools sort > {tmp_bam}'
        result = subprocs(cmd)

        # NOTE(review): for a shell pipeline the exit status is normally that of
        # the last command (samtools sort); a bwa failure may only surface if
        # subprocs enables pipefail — confirm against utils.py.
        if result.returncode != 0:
            if tmp_bam.exists():
                tmp_bam.unlink()
            raise RuntimeError(
                f"Alignment failed for {running_id} (exit status {result.returncode})"
            )

        tmp_bam.replace(bam_file)

    return bam_file

In [23]:
# Run align_with_bwa once per row of (running_id, trimmed_r1, trimmed_r2).
# NOTE(review): execute_function_pool_args is not defined in this notebook
# (presumably from utils); the trailing 4 looks like the number of parallel
# workers and `elp` like the elapsed time — confirm against utils.py.
# Note that `out` is rebound here; earlier it held the result of the indexing command.
out, elp = execute_function_pool_args(
    align_with_bwa,
    fpath_df[['running_id', 'trimmed_r1', 'trimmed_r2']].values,
    4,
)

In [21]:
# The `elp` lookup is commented out and a literal value is shown instead,
# presumably the elapsed time (seconds?) recorded from an earlier alignment
# run — TODO confirm, or restore `elp` so the value reflects the current run.
# elp
495.32800006866455

495.32800006866455

In [29]:
# Collect the per-sample BAM paths into a Series named after the column it will become.
sorted_bams = pd.Series(out, name='sorted_bam')
sorted_bams

0    data/sorted_bam/SRR6046075.bam
1    data/sorted_bam/SRR6046701.bam
2    data/sorted_bam/SRR6045735.bam
3    data/sorted_bam/SRR6045327.bam
4    data/sorted_bam/SRR6045549.bam
5    data/sorted_bam/SRR6046105.bam
6    data/sorted_bam/SRR6046861.bam
7    data/sorted_bam/SRR6044910.bam
8    data/sorted_bam/SRR6045763.bam
9    data/sorted_bam/SRR6046306.bam
Name: sorted_bam, dtype: object

## 5. Merge and save

In [38]:
# Append the BAM paths as a new 'sorted_bam' column next to the input columns.
# NOTE(review): this pairs rows by index label, so it relies on `out` being in the
# same order as the rows of fpath_df — confirm the pool helper preserves order.
new_fpath_df = pd.concat([fpath_df, sorted_bams], axis=1)
new_fpath_df

Unnamed: 0,running_id,raw_r1,raw_r2,num_seqs,sum_len,avg_len,error_rate,trimmed_r1,unpaired_r1,trimmed_r2,unpaired_r2,trimmomatic_log,sorted_bam
0,SRR6046075,data/raw/SRR6046075_1.fastq.gz,data/raw/SRR6046075_2.fastq.gz,2150140,315603941,146.782973,0.004489,data/trimmed/SRR6046075_1_trimmed.fastq.gz,data/trimmed/SRR6046075_1_unpaired.fastq.gz,data/trimmed/SRR6046075_2_trimmed.fastq.gz,data/trimmed/SRR6046075_2_unpaired.fastq.gz,data/trimmed/SRR6046075.log,data/sorted_bam/SRR6046075.bam
1,SRR6046701,data/raw/SRR6046701_1.fastq.gz,data/raw/SRR6046701_2.fastq.gz,2181260,329370260,151.0,0.00296,data/trimmed/SRR6046701_1_trimmed.fastq.gz,data/trimmed/SRR6046701_1_unpaired.fastq.gz,data/trimmed/SRR6046701_2_trimmed.fastq.gz,data/trimmed/SRR6046701_2_unpaired.fastq.gz,data/trimmed/SRR6046701.log,data/sorted_bam/SRR6046701.bam
2,SRR6045735,data/raw/SRR6045735_1.fastq.gz,data/raw/SRR6045735_2.fastq.gz,2730788,412348988,151.0,0.004648,data/trimmed/SRR6045735_1_trimmed.fastq.gz,data/trimmed/SRR6045735_1_unpaired.fastq.gz,data/trimmed/SRR6045735_2_trimmed.fastq.gz,data/trimmed/SRR6045735_2_unpaired.fastq.gz,data/trimmed/SRR6045735.log,data/sorted_bam/SRR6045735.bam
3,SRR6045327,data/raw/SRR6045327_1.fastq.gz,data/raw/SRR6045327_2.fastq.gz,2368378,348449021,147.125594,0.003844,data/trimmed/SRR6045327_1_trimmed.fastq.gz,data/trimmed/SRR6045327_1_unpaired.fastq.gz,data/trimmed/SRR6045327_2_trimmed.fastq.gz,data/trimmed/SRR6045327_2_unpaired.fastq.gz,data/trimmed/SRR6045327.log,data/sorted_bam/SRR6045327.bam
4,SRR6045549,data/raw/SRR6045549_1.fastq.gz,data/raw/SRR6045549_2.fastq.gz,2200232,312069380,141.834761,0.004359,data/trimmed/SRR6045549_1_trimmed.fastq.gz,data/trimmed/SRR6045549_1_unpaired.fastq.gz,data/trimmed/SRR6045549_2_trimmed.fastq.gz,data/trimmed/SRR6045549_2_unpaired.fastq.gz,data/trimmed/SRR6045549.log,data/sorted_bam/SRR6045549.bam
5,SRR6046105,data/raw/SRR6046105_1.fastq.gz,data/raw/SRR6046105_2.fastq.gz,2739142,397357271,145.066328,0.00721,data/trimmed/SRR6046105_1_trimmed.fastq.gz,data/trimmed/SRR6046105_1_unpaired.fastq.gz,data/trimmed/SRR6046105_2_trimmed.fastq.gz,data/trimmed/SRR6046105_2_unpaired.fastq.gz,data/trimmed/SRR6046105.log,data/sorted_bam/SRR6046105.bam
6,SRR6046861,data/raw/SRR6046861_1.fastq.gz,data/raw/SRR6046861_2.fastq.gz,2046324,300436707,146.817761,0.005184,data/trimmed/SRR6046861_1_trimmed.fastq.gz,data/trimmed/SRR6046861_1_unpaired.fastq.gz,data/trimmed/SRR6046861_2_trimmed.fastq.gz,data/trimmed/SRR6046861_2_unpaired.fastq.gz,data/trimmed/SRR6046861.log,data/sorted_bam/SRR6046861.bam
7,SRR6044910,data/raw/SRR6044910_1.fastq.gz,data/raw/SRR6044910_2.fastq.gz,1964346,275329828,140.163611,0.002015,data/trimmed/SRR6044910_1_trimmed.fastq.gz,data/trimmed/SRR6044910_1_unpaired.fastq.gz,data/trimmed/SRR6044910_2_trimmed.fastq.gz,data/trimmed/SRR6044910_2_unpaired.fastq.gz,data/trimmed/SRR6044910.log,data/sorted_bam/SRR6044910.bam
8,SRR6045763,data/raw/SRR6045763_1.fastq.gz,data/raw/SRR6045763_2.fastq.gz,2311282,344349323,148.986287,0.011394,data/trimmed/SRR6045763_1_trimmed.fastq.gz,data/trimmed/SRR6045763_1_unpaired.fastq.gz,data/trimmed/SRR6045763_2_trimmed.fastq.gz,data/trimmed/SRR6045763_2_unpaired.fastq.gz,data/trimmed/SRR6045763.log,data/sorted_bam/SRR6045763.bam
9,SRR6046306,data/raw/SRR6046306_1.fastq.gz,data/raw/SRR6046306_2.fastq.gz,2545170,339698387,133.467858,0.004654,data/trimmed/SRR6046306_1_trimmed.fastq.gz,data/trimmed/SRR6046306_1_unpaired.fastq.gz,data/trimmed/SRR6046306_2_trimmed.fastq.gz,data/trimmed/SRR6046306_2_unpaired.fastq.gz,data/trimmed/SRR6046306.log,data/sorted_bam/SRR6046306.bam


In [39]:
# Persist the extended path table without writing the DataFrame index.
new_fpath_df.to_csv(aligned_fpath_file, index=False)