# Merge metagenomic reads and align to reference

In [1]:
import os 
from Bio import SeqIO
import pandas as pd
import seaborn as sns

## Merge

In [2]:
# Directory searched for the '*_1.alphatecti.fastq' / '*_2.alphatecti.fastq' read files
fastq_dir = '../data/kraken/data/kraken_results/fastq/'

In [3]:
# List the directory once, then keep the R1 and R2 file names in two
# independently sorted lists. Mate files differ only in the _1/_2 suffix, so
# sorting puts them at matching indexes -- assuming every run has both mates,
# which is not checked here.
fastq_names = os.listdir(fastq_dir)

r1s = sorted(name for name in fastq_names if name.endswith('_1.alphatecti.fastq'))
r2s = sorted(name for name in fastq_names if name.endswith('_2.alphatecti.fastq'))

In [4]:
# Tab-separated metadata table; its 'run_acc' and 'bioproject' columns are
# used to build the accession -> bioproject lookup.
metadata_path = '../data/kraken/data/wastewater_metadata.tsv'
metadata = pd.read_csv(metadata_path, delimiter='\t')

In [6]:
# Lookup table: run accession -> bioproject (later rows win on duplicates).
bioproj_dict = {
    run_acc: bioproject
    for run_acc, bioproject in zip(metadata['run_acc'], metadata['bioproject'])
}
# Two extra keys that are not taken from the metadata table.
bioproj_dict.update({'DIN': 'baymlab', 'DIS': 'baymlab'})

In [7]:
def _label_reads(fastq_names):
    """Parse each FASTQ file and set every record's description to its bioproject.

    The accession is the part of the file name before the first underscore and
    is looked up in ``bioproj_dict``. Both are constant for a whole file, so
    they are resolved once per file instead of once per record.

    Args:
        fastq_names: file names (relative to ``fastq_dir``) to read, in order.

    Returns:
        A list of all records from all files, in file order then record order.

    Raises:
        KeyError: if a file's accession is not a key of ``bioproj_dict``
            (raised before the file is parsed).
    """
    labelled = []
    for fastq_name in fastq_names:
        bioproject = bioproj_dict[fastq_name.split('_')[0]]
        fastq_path = os.path.join(fastq_dir, fastq_name)
        for record in SeqIO.parse(fastq_path, 'fastq'):
            record.description = bioproject
            labelled.append(record)
    return labelled


# R1 and R2 are processed identically; list order follows r1s / r2s.
new_records_r1 = _label_reads(r1s)
new_records_r2 = _label_reads(r2s)

In [8]:
#SeqIO.write(new_records_r1, '../data/map_kraken/all_reads_r1.fastq', 'fastq')
#SeqIO.write(new_records_r2, '../data/map_kraken/all_reads_r2.fastq', 'fastq')

## Search

In [9]:
# merge all prds
#
# Collect the records of every file in `dire` whose name starts with 'PRD',
# renaming each record to the file name up to '.fasta' and blanking its
# name/description.

dire = '../../../genomes/data/assemblies_oriented/'
all_prds = []
# os.listdir returns entries in arbitrary order; sort so the merged reference
# has a reproducible record order (the read files above are sorted as well).
for file in sorted(os.listdir(dire)):
    if not file.startswith('PRD'):
        continue
    path = os.path.join(dire, file)
    # The new id depends only on the file, so compute it once per file.
    # NOTE(review): a file holding several records would give them all the
    # same id -- presumably each assembly is a single sequence; confirm.
    name = file.split('.fasta')[0]
    for record in SeqIO.parse(path, 'fasta'):
        record.id = name
        record.name = ''
        record.description = ''
        all_prds.append(record)

#SeqIO.write(all_prds, '../data/map_kraken/all_prds.fasta', 'fasta')

In [10]:
# Reference to map against: currently the single prd1.fasta; the merged
# all_prds.fasta alternative is kept on the next line, disabled.
#ref = '../data/map_kraken/all_prds.fasta'
ref = '../data/map_kraken/prd1.fasta'
# Merged R1 / R2 read files (the same paths used by the disabled
# SeqIO.write calls in the merge section).
query_r1 = '../data/map_kraken/all_reads_r1.fastq'
query_r2 = '../data/map_kraken/all_reads_r2.fastq'

### minimap

In [11]:
# Shell command mapping the two read files against `ref` with the minimap2
# 'sr' preset, redirecting the SAM output to `sam`. Execution is disabled.
sam = '../data/map_kraken/mm.sam'

cmd = 'minimap2 -ax sr {} {} {} > {}'.format(ref, query_r1, query_r2, sam)
#! $cmd

# Subset

In [12]:
# Shell command writing the alignments in mm.bam that overlap region `p2`
# to mm.p2.sam. Execution is disabled.
# NOTE(review): no visible cell creates mm.bam (minimap2 above writes mm.sam);
# presumably it is converted, sorted and indexed elsewhere -- confirm.
bam = '../data/map_kraken/mm.bam'
p2 = "NC_001421.2:3128-4903"
# A BAM output path was used previously; SAM text is written instead.
#out = '../data/map_kraken/mm.p2.bam'
out = '../data/map_kraken/mm.p2.sam'

cmd = ' '.join(['samtools', 'view', bam, p2, '>', out])
#! $cmd

In [13]:
# Shell command writing the alignments in mm.bam that overlap region `p3`
# to mm.p3.sam. Execution is disabled.
# NOTE(review): no visible cell creates mm.bam (minimap2 above writes mm.sam);
# presumably it is converted, sorted and indexed elsewhere -- confirm.
bam = '../data/map_kraken/mm.bam'
p3 = "NC_001421.2:8595-9782"
out = '../data/map_kraken/mm.p3.sam'

cmd = ' '.join(['samtools', 'view', bam, p3, '>', out])
#! $cmd

## Label origin

In [14]:
# Read the p2 alignments and attach a 'CO:Z:<bioproject>' column per read.
import csv

p2_sam = '../data/map_kraken/mm.p2.sam'

# Header lines are skipped by position rather than with comment='@':
# pandas' `comment` also cuts a line at any later '@', and '@' is a valid
# base-quality character, so affected records would lose the rest of their
# quality string and every optional tag. Header lines only occur at the top
# of a SAM file, so counting the leading '@' lines is sufficient.
n_header = 0
with open(p2_sam) as handle:
    for line in handle:
        if not line.startswith('@'):
            break
        n_header += 1

# QUOTE_NONE: '"' is also a valid quality character and must not be treated
# as a CSV quote.
df_p2 = pd.read_csv(p2_sam, sep='\t', header=None, skiprows=n_header,
                    quoting=csv.QUOTE_NONE)

# Read name up to the first '.' is looked up as a run accession; names with
# no match are labelled 'baymlab'.
nameser = df_p2[0].str.split('.', n=1).str[0].map(bioproj_dict)
nameser = nameser.fillna('baymlab')

# NOTE(review): column label 21 is hard-coded; if the file ever has more than
# 21 tab-separated fields this overwrites an existing optional tag -- confirm.
df_p2[21] = 'CO:Z:' + nameser

#df_p2.to_csv('../data/map_kraken/mm.p2.edit.sam', sep='\t', header=None, index=None)

In [15]:
# Read the p3 alignments and attach a 'CO:Z:<bioproject>' column per read.
import csv

p3_sam = '../data/map_kraken/mm.p3.sam'

# Header lines are skipped by position rather than with comment='@':
# pandas' `comment` also cuts a line at any later '@', and '@' is a valid
# base-quality character, so affected records would lose the rest of their
# quality string and every optional tag. Header lines only occur at the top
# of a SAM file, so counting the leading '@' lines is sufficient.
n_header = 0
with open(p3_sam) as handle:
    for line in handle:
        if not line.startswith('@'):
            break
        n_header += 1

# QUOTE_NONE: '"' is also a valid quality character and must not be treated
# as a CSV quote.
df_p3 = pd.read_csv(p3_sam, sep='\t', header=None, skiprows=n_header,
                    quoting=csv.QUOTE_NONE)

# Read name up to the first '.' is looked up as a run accession; names with
# no match are labelled 'baymlab'.
nameser = df_p3[0].str.split('.', n=1).str[0].map(bioproj_dict)
nameser = nameser.fillna('baymlab')

# NOTE(review): column label 21 is hard-coded; if the file ever has more than
# 21 tab-separated fields this overwrites an existing optional tag -- confirm.
df_p3[21] = 'CO:Z:' + nameser

#df_p3.to_csv('../data/map_kraken/mm.p3.edit.sam', sep='\t', header=None, index=None)