This script downloads metagenomic datasets useds by Pasoli et al. 2019 for their lab repository. This fastas unzipped take up roughly 2.3T. gyrb reads are filtered out using a blastn search by a fasta containing ASVs from the amplicon datasets and reads from the gyrb gtdbtk reference fasta. Reads are output into an ASV table where the columns represent the sequences and the rows represent datasets, and the values represent the number of samples that particular read was found. This enables the metagenomic data to be merged with the amplicon data. Although it should be noted that the read numbers between the two types of datasets are incomparable, and should be used to indicate presene/absence in various sample types.

Inputs: list of metagenomic datasets, Bt-ASV fasta

Output: ASV table of metagenomic reads to be merged with amplicon datasets

In [4]:
import os
import pandas as pd
import glob
import numpy as np
from Bio import SeqIO
from Bio import Seq
os.getcwd()

'/Volumes/AHN/captive_ape_microbiome/scripts/gyrb/processing/metagenomic_samples'

### Overview of inputs/outpus
due to the size of the data file only the results files will be permanently saved.  

In [10]:
#root_dir = '/Volumes/APE_MacPro_External_2/alex/captive_ape_microbiome'

root_dir = '/Volumes/AHN/captive_ape_microbiome/'
os.chdir(root_dir)
#analysis_dir = root_dir + '/results/gyrb_metagenomic'
analysis_dir = 'results/gyrb/processing/metagenomic_samples/'
metagenomic_https_file = 'metadata_http_to_dnload.txt'
metagenomic_samples_file = 'metadata_all_samples.txt'
ASV_fasta =  'Bt_ASV_gtdbtk.fasta'

os.chdir(analysis_dir)
os.system('mkdir data') #
os.system('mkdir sample_files')
os.system('mkdir blastout')
os.system('mkdir parse_blast')
os.system('mkdir sseq_adj')
os.system('mkdir final_outputs')

256

In [11]:
#list all metagenomic datasets
metagenomic_datasets = pd.read_csv(metagenomic_https_file,header=None)
metagenomic_datasets.columns = ['https']
metagenomic_datasets['filename'] = metagenomic_datasets['https'].apply(lambda x: x.split('/')[-1])
metagenomic_datasets['folder'] = metagenomic_datasets['filename'].apply(lambda x: x.split('.')[0])
metagenomic_datasets.head()
metagenomic_datasets.to_csv('metadata_datasets.txt',sep='\t',index=False)
metagenomic_datasets.head()

Unnamed: 0,https,filename,folder
0,https://www.dropbox.com/s/njbwzpazz4a1dy1/Asni...,AsnicarF_2017.tar.bz2,AsnicarF_2017
1,https://www.dropbox.com/s/n6867hhmo12xnwf/Back...,BackhedF_2015.tar.bz2,BackhedF_2015
2,https://www.dropbox.com/s/ac6cpq50pt5mqj6/Beng...,Bengtsson-PalmeJ_2015.tar.bz2,Bengtsson-PalmeJ_2015
3,https://www.dropbox.com/s/uj9cy9914l0cq3x/Brit...,BritoIL_2016.tar.bz2,BritoIL_2016
4,https://www.dropbox.com/s/dtxmz1apdgtf3d1/Cast...,Castro-NallarE_2015.tar.bz2,Castro-NallarE_2015


### Download datasets

In [11]:
def dnload_file(http): 
    """
    downloads and expands metagenomic dataset from Passoli repo, generates lists of sample files in the data folder
    """
    filename = http.split('/')[-1]
    folder = filename.split('.')[0]
    print(http,filename,folder)
    
    os.system(f'wget {http} --directory-prefix data/')
    os.system(f'tar -vxf data/{filename} -C data/ ')
    os.system(f'ls data/{folder}/*.fa > sample_files/{folder}_files.txt')  
#dnload_file('https://www.dropbox.com/s/njbwzpazz4a1dy1/AsnicarF_2017.tar.bz2')


#list datasets already downloaded
datasets_downloaded = [f.split('.')[0] for f in os.listdir('data/') if f.endswith('bz2')]
print('datasets already downloaded')
print(datasets_downloaded)

#determine datasets that need to be downloads
datasets_to_do = metagenomic_datasets[~metagenomic_datasets['folder'].isin(datasets_downloaded)]
print('\ndatasets left to download')
print(datasets_to_do)
for http in datasets_to_do['https']:
    print(http)
    #dnload_file(http)


datasets already downloaded
['AsnicarF_2017', 'BackhedF_2015', 'Bengtsson-PalmeJ_2015', 'BritoIL_2016', 'Castro-NallarE_2015', 'ChengpingW_2017', 'ChngKR_2016', 'CM_cf', 'CM_madagascar', 'CM_periimplantitis', 'CosteaPI_2017', 'DavidLA_2015', 'FengQ_2015', 'FerrettiP_2018', 'GeversD_2014', 'HanniganGD_2017', 'HeQ_2017', 'HMP_2012', 'IjazUZ_2017', 'KarlssonFH_2013', 'KosticAD_2015', 'LeChatelierE_2013', 'LiJ_2014', 'LiJ_2017', 'LiSS_2016', 'LiuW_2016', 'LomanNJ_2013', 'LoombaR_2017', 'LouisS_2016', 'NielsenHB_2014', 'Obregon-TitoAJ_2015', 'OhJ_2014', 'OlmMR_2017', 'QinJ_2012', 'QinN_2014', 'RampelliS_2015', 'RaymondF_2016', 'SchirmerM_2016', 'SmitsSA_2017', 'VatanenT_2016', 'VincentC_2016', 'VogtmannE_2016', 'WenC_2017', 'XieH_2016', 'YuJ_2015', 'ZeeviD_2015', 'ZellerG_2014']

datasets left to download
Empty DataFrame
Columns: [https, filename, folder]
Index: []


### helper scripts big to small for filtering gyrb reads

In [10]:

def parse_blast(blast_results_file):
    """
    convert raw blast results in a table, selecting only one (the best) hit per metagenomic read
    this blastn table is used by future scripts (prep_df and ) to find the metagenomic fasta, determine the read overlap, 
    """
    #this clause checks to see whether the blastn search of a given metagenomic sample fasta returned any hits
    try:
        blast_results=pd.read_csv(blast_results_file, sep='\t', comment='#',header=None)     
    except:
        print(blast_results_file,'no hits')
        return
    
    qlen_cutoff = .8 #metagenomic reads must have 80% alignment length
    blast_results=pd.read_csv(blast_results_file, sep='\t', comment='#',header=None)
    blast_results.columns=['ASV','sacc','pident','length','qlen','qstart','qend','sstart','send','evalue','sseq']   
    blast_results=blast_results[blast_results['length']/blast_results['qlen'] >= qlen_cutoff]
    blast_results=blast_results.sort_values('evalue')  
    blast_results=blast_results.groupby('sacc').head(1) #get top hit

    sample_name=blast_results_file.split('/')[-1].split('_Bt.txt')[0]
    blast_results['sample_name']=sample_name
    folder=blast_results_file.split('/')[1]
    blast_results['folder']=folder
    outfile = blast_results_file.replace('blastout','parse_blast')
    blast_results.to_csv(outfile,sep='\t',index=False)
    return(blast_results)


def prep_df(df):  
    """
    blastn may return a hit that is the full length of the ASV. So one can't just take the subject sequence
    This scripts determine whether the blast hit spans the full length of the ASV. If it doesn't the read is labeled as long or short
    Long hit means that the ASV had gaps in the alignment, Short hit means that the alignment score fell at the ends of the read
    Additionally, hits be need to reverse complemented
    
    This script determine the metagenomic reads starting position based on where the subject/metagenomic reads aligned with the 
    ASV/query reads. i.e. if ASV start = 3 and the metagenomic reads start is 250000, the adjusted metagenomic start is  250000- 3. 
    This is complicated slightly if the read is in the rc orientation, then the ASV start is added to the metagenomic start.
    
    The final step of this script is to return upstream and downsteam positions that can be used to excise the hit from the metagenomic fasta 
    regardless of read orientation.
    """
    
    df['overlap'] = np.where(df['qlen']==df['length'], 'exact', #or if not exact must be long or short
                             np.where(df['qlen']<df['length'], 'long','short'))
    df['orientation'] = np.where(df['send']>df['sstart'], 'in_frame', 'reverse_complement')
    df['sstart_adj'] = np.where(df['orientation']=='in_frame',
                                             df['sstart'] - df['qstart'],
                                             df['sstart'] + df['qstart'] - 1)
    df['send_adj'] = np.where(df['orientation']=='in_frame',
                                             df['sstart_adj'] + df['qlen'],
                                             df['sstart_adj'] - df['qlen'])
    df['upstream'] = df.apply(lambda row:
                                  str(min([int(row['sstart_adj']),int(row['send_adj'])])),
                                  axis=1)
    df['downstream'] = df.apply(lambda row:
                                  str(max([int(row['sstart_adj']),int(row['send_adj'])])),
                                  axis=1)
    return(df)

def fetch_sseq_adj(row):
    """
    This script takes in a row of the blastn hit table after its been modified by prep_df.
    It determines whether blastn hit was full-length or whether is was shorter or longer than the ASV seqs
    It checks to see whether the adjusted metagenomic read start is still in bounds. For instance, if a hit 
    has a metagenomic read start of 10 but a query/ASV start position of 30 this would be out of bounds, indicated
    by a negative upstream value.
    """
    downstream = int(row['downstream'])
    upstream = int(row['upstream'])
    
    if row['length']==row['qlen']: #exact seq match
        return(pd.Series(['NA',row['sseq']]))
    elif upstream < 0:
        return(pd.Series(['NA','outOfBounds']))
    else:
        #go fishing 
        try:
            fasta_path = row['folder']+'/'+row['sample_name']+'.fa'
            seq_dict = SeqIO.to_dict(SeqIO.parse('data/'+fasta_path,'fasta'))
            contig = seq_dict[row['sacc']].seq
            len_cont = len(str(contig))
            if downstream >len_cont: #means contig is too short, hit goes over end
                return(pd.Series([len_cont,'outOfBounds']))
            else:
                sseq_adj = str(seq_dict[row['sacc']].seq)[upstream:downstream]
                if row['orientation']=='reverse_complement':
                    sseq_adj = Seq.reverse_complement(sseq_adj)     
                return(pd.Series([len_cont,sseq_adj]))
        except:
            return('NA','contig_not_found')   
    
def filter_gyrb_reads(folder):     
    """
    the big kahuna. It returns a table with all the gyrb hits for all sample fastas within a folder.
    
    this script puts it all together. Given is a dataset folder that needs to have gyrb reads exacted
    First, all fastas within the folder are listed, then one-by-one fastas are made into blastdbs and searched with the 
    ASV.fasta. Results from on the samples within a folder are parsed (parse_blast function) concatenated into one dataframe.
    To determine which reads are full length and/or reverse complement the dataframe is processed the with prep_df function.
    fetch_sseq_adj then uses the info to identify adjusted reads that can be pulled from the metagenomic fasta or 
    whether these reads are out of bounds.
    
    
    """
    fastas = [f for f in os.listdir('data/'+folder) if f.endswith('.fa')]
    print(fastas)
    for fasta in fastas:
        fasta_name = fasta.split('.fa')[0]
        #os.system(f'makeblastdb -in data/{folder}/{fasta} -dbtype nucl')
        os.system(f'blastn -query {ASV_fasta} -db data/{folder}/{fasta}  \
                    -outfmt "7 qseqid sseqid pident length qlen qstart qend sstart send evalue sseq" \
                    -out blastout/{folder}/{fasta_name}_Bt.txt')
    
    all_files = glob.glob(f"blastout/{folder}/*.txt")   
    df_from_each_file = [parse_blast(f) for f in all_files]
    df = pd.concat(df_from_each_file, ignore_index=True)   
    
    df = prep_df(df)
    df.to_csv(folder+'.txt',sep='\t',index=False)
    df[['contig_len','sseq_adj']] = df.apply(lambda row: fetch_sseq_adj(row),axis=1)
    df['sseq_same']= df['sseq']==df['sseq_adj']
    df['sseq_adj_len'] = df['sseq_adj'].apply(lambda x: len(str(x)))
    df = df[['ASV','sacc','sample_name','folder','pident','sseq_adj_len','evalue','overlap','orientation','qstart','qend','sstart','send','sstart_adj','send_adj','upstream','downstream','sseq_same','sseq','sseq_adj']]
    print(len(df),'gyrb hits in dataset',folder)
    df.to_csv('sseq_adj/'+folder+'.txt',sep='\t',index=False)
    
#filter_gyrb_reads('AsnicarF_2017')

['AsnicarF_2017__MV_FEI1_t1Q14.fa', 'AsnicarF_2017__MV_FEI2_t1Q14.fa', 'AsnicarF_2017__MV_FEI3_t1Q14.fa', 'AsnicarF_2017__MV_FEI4_t1Q14.fa', 'AsnicarF_2017__MV_FEI4_t2Q15.fa', 'AsnicarF_2017__MV_FEI5_t1Q14.fa', 'AsnicarF_2017__MV_FEI5_t2Q14.fa', 'AsnicarF_2017__MV_FEI5_t3Q15.fa', 'AsnicarF_2017__MV_FEM1_t1Q14.fa', 'AsnicarF_2017__MV_FEM2_t1Q14.fa', 'AsnicarF_2017__MV_FEM3_t1Q14.fa', 'AsnicarF_2017__MV_FEM4_t1Q14.fa', 'AsnicarF_2017__MV_FEM4_t2Q15.fa', 'AsnicarF_2017__MV_FEM5_t1Q14.fa', 'AsnicarF_2017__MV_FEM5_t2Q14.fa', 'AsnicarF_2017__MV_FEM5_t3Q15.fa', 'AsnicarF_2017__MV_MIM2_t1M14.fa', 'AsnicarF_2017__MV_MIM3_t1M14.fa', 'AsnicarF_2017__MV_MIM4_t2F15.fa', 'AsnicarF_2017__MV_MIM5_t2M14.fa', 'AsnicarF_2017__MV_MIM5_t3F15.fa']
blastout/AsnicarF_2017/AsnicarF_2017__MV_MIM2_t1M14_Bt.txt no hits
blastout/AsnicarF_2017/AsnicarF_2017__MV_MIM3_t1M14_Bt.txt no hits
blastout/AsnicarF_2017/AsnicarF_2017__MV_MIM4_t2F15_Bt.txt no hits
blastout/AsnicarF_2017/AsnicarF_2017__MV_MIM5_t2M14_Bt.txt no h

### Filter metagenomic fastas for gyrb hits

In [15]:
#list datasets that have already been filtered
datasets_complete = [f.split('.')[0] for f in os.listdir('sseq_adj/')]
print('\ndatasets already filtered')
print(datasets_complete)

datasets_to_do = set(datasets_downloaded) - set(datasets_complete)
print('\ndatasets left to filter')
print(datasets_to_do)


datasets already filtered
['', 'AsnicarF_2017', 'BackhedF_2015', 'Bengtsson-PalmeJ_2015', 'BritoIL_2016', 'Castro-NallarE_2015', 'ChengpingW_2017', 'ChngKR_2016', 'CM_cf', 'CM_madagascar', 'CM_periimplantitis', 'CosteaPI_2017', 'DavidLA_2015', 'FengQ_2015', 'FerrettiP_2018', 'GeversD_2014', 'HanniganGD_2017', 'HeQ_2017', 'HMP_2012', 'IjazUZ_2017', 'KarlssonFH_2013', 'KosticAD_2015', 'LeChatelierE_2013', 'LiJ_2014', 'LiJ_2017', 'LiSS_2016', 'LiuW_2016', 'LomanNJ_2013', 'LoombaR_2017', 'LouisS_2016', 'NielsenHB_2014', 'Obregon-TitoAJ_2015', 'OhJ_2014', 'OlmMR_2017', 'QinJ_2012', 'QinN_2014', 'RampelliS_2015', 'RaymondF_2016', 'SchirmerM_2016', 'SmitsSA_2017', 'VatanenT_2016', 'VincentC_2016', 'VogtmannE_2016', 'WenC_2017', 'XieH_2016', 'YuJ_2015', 'ZeeviD_2015', 'ZellerG_2014']

datasets left to filter
set()


In [None]:
#prep for parallel processing
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

#filter out gyrb seqs
pool = mp.Pool(mp.cpu_count())  
results = pool.map_async(filter_gyrb_reads, [folder for folder in datasets_to_do])
pool.close()  

### Merge gyrb hits across datasets and add sample metadata

In [14]:
#merge results from complete datasets
dataset_gyrb_hits =  glob.glob(f"sseq_adj/*.txt")   
df_from_each_file = [pd.read_csv(f,sep='\t') for f in dataset_gyrb_hits]
datasets_merged = pd.concat(df_from_each_file, ignore_index=True)  
datasets_merged.to_csv('final_outputs/gyrb_hits_all_samples.txt',sep='\t',index=False)

#filter out reads that are not full length or have ambig character
datasets_merged = datasets_merged[datasets_merged['sseq_adj']!='outOfBounds']
datasets_merged = datasets_merged[~datasets_merged['sseq_adj'].str.contains('-')]

#print(datasets_merged.head())

#load metadata for samples
metadata = pd.read_csv(metagenomic_samples_file,sep='\t')
#print(metadata.head())
metadata['sample_name'] = metadata['Study']+'__'+metadata['Sample ID']

print("are there any samples that aren't in the metadata file?")
print(set(datasets_merged['sample_name'])-set(metadata['sample_name']))

print("how many samples returned gyrb hits?")
print(len(set(datasets_merged['sample_name'])))

df = datasets_merged.merge(metadata,on='sample_name')
df['Westernized_edit']=df['Westernized'].apply(lambda x: x.replace('Yes','western').replace('No','nonwestern'))
df['sample_group'] = df['folder']+'__'+df['Westernized_edit']
df['count'] = 1 #read detect in that one sample

print(df.head())

df_select = df.groupby(['sample_name','sseq_adj'])['count'].sum().reset_index()
seqtab = df_select.pivot(index='sample_name',columns='sseq_adj',values='count').fillna(0)  
seqtab.to_csv('final_outputs/gyrb_metagenomic_seqtab.txt',sep='\t')

print('how many unique gyrb seqs were filtered out of metagenomic samples?')
print(len(seqtab.columns))

print('gives the number of samples each read is found in by percentile')
print(seqtab.sum(axis=0).quantile([.10,.50,.70,.80,.90,.95,.99]))

are there any samples that aren't in the metadata file?
set()
how many samples returned gyrb hits?
7300
               ASV                               sacc  \
0  GCF_000012825.1  NODE_219_length_15742_cov_58.1685   
1  GCF_000012825.1  NODE_31_length_121814_cov_92.2338   
2  GCF_000265365.1    NODE_554_length_8127_cov_3.2111   
3  GCF_000265365.1   NODE_49_length_87972_cov_16.9398   
4  GCF_000156075.1  NODE_5293_length_3058_cov_4.31402   

                    sample_name         folder  pident  sseq_adj_len  \
0  AsnicarF_2017__MV_FEI4_t1Q14  AsnicarF_2017   100.0           250   
1  AsnicarF_2017__MV_FEI4_t2Q15  AsnicarF_2017   100.0           250   
2  AsnicarF_2017__MV_FEI5_t1Q14  AsnicarF_2017    99.6           250   
3  AsnicarF_2017__MV_FEI5_t2Q14  AsnicarF_2017    99.6           250   
4  AsnicarF_2017__MV_FEI5_t3Q15  AsnicarF_2017   100.0           250   

          evalue overlap orientation  qstart  ...  Body Site  Body Subsite  \
0  2.060000e-130   exact    in_frame      

### Output fasta of unqiue metagenomic reads
use first sample as header

In [113]:
df_unique = df.groupby('sseq_adj').head(1).reset_index()
print(len(df_unique))
with open('final_outputs/gyrb_metagenomic_unique.fasta' ,'w') as f:
    for index,row in df_unique.iterrows():
        header='>'+str(row['sample_name'])+'\n'
        seq=row['sseq_adj']+'\n'
        f.write(header)
        f.write(seq)

7124


7124
0.10      1.00
0.50      1.00
0.70      2.00
0.80      3.00
0.90      7.00
0.95     17.00
0.99    143.31
dtype: float64
