Generates a metadata sheet that includes more than 9,000 individual metagenomic samples. Reformats the metadata from Pasolli et al. to be compatible with this study, then double-checks that no samples in the ASV sequence table are missing from the metadata.

In [37]:
import os
import shutil

import numpy as np
import pandas as pd

In [38]:
# Input: metadata table for the amplicon samples (loaded as `amplicon_metadata`).
amplicon_metadata_file = 'metadata/metadata_Bt_samples_gyrb.txt'
# Input: raw per-sample metadata for the metagenomic samples.
metagenomic_metadata_file = 'metadata/passoli_metadata.txt'
# Output: metagenomic metadata reformatted to the amplicon column layout.
# It has its own name so that it does not overwrite the raw input path above.
metagenomic_metadata_formatted_file = 'metadata/passoli_metadata_formatted.txt'

In [40]:
# Work from the project root so that the relative metadata/ and results/ paths
# used by the notebook resolve. The original location remains the default;
# set CAPTIVE_APE_PROJECT_DIR to run the notebook from a different checkout.
project_dir = os.environ.get('CAPTIVE_APE_PROJECT_DIR', '/Volumes/AHN/captive_ape_microbiome')
os.chdir(project_dir)

# Load the tab-separated amplicon metadata and preview its first rows.
amplicon_metadata = pd.read_csv(amplicon_metadata_file, sep='\t')
amplicon_metadata.head()

Unnamed: 0,X.SampleID,BarcodeSequence,LinkerPrimerSequence,Old_SampleID,dataset,genus_sp,subspecies,common_name,uniqueid,site,site_code,country,captivity_status,amplicon,sample,Description
0,wd.bon.IK.KSG2876.Bt,TCACCTCATACGGCTA,CAAGCAGAAGACGGCATACGAGAT,83,gyrb_moeller_wild,Pan_paniscus,Pan_paniscus,bonobo,KSG2876,DRC.IK,IK,DRC,wild,Bt,wd.bon.IK.KSG2876,wild_bonobo
1,wd.bon.IK.KSG2883.Bt,GCTTGCTTCGTTGCAA,CAAGCAGAAGACGGCATACGAGAT,97,gyrb_moeller_wild,Pan_paniscus,Pan_paniscus,bonobo,KSG2883,DRC.IK,IK,DRC,wild,Bt,wd.bon.IK.KSG2883,wild_bonobo
2,wd.bon.IK.KSG2877.Bt,TCACCTCACGTTGCAA,CAAGCAGAAGACGGCATACGAGAT,98,gyrb_moeller_wild,Pan_paniscus,Pan_paniscus,bonobo,KSG2877,DRC.IK,IK,DRC,wild,Bt,wd.bon.IK.KSG2877,wild_bonobo
3,wd.bon.IK.KSG2879.Bt,GCTTGCTTCTGAAGTC,CAAGCAGAAGACGGCATACGAGAT,112,gyrb_moeller_wild,Pan_paniscus,Pan_paniscus,bonobo,KSG2879,DRC.IK,IK,DRC,wild,Bt,wd.bon.IK.KSG2879,wild_bonobo
4,wd.bon.IK.KSG2882.Bt,TCACCTCACTGAAGTC,CAAGCAGAAGACGGCATACGAGAT,113,gyrb_moeller_wild,Pan_paniscus,Pan_paniscus,bonobo,KSG2882,DRC.IK,IK,DRC,wild,Bt,wd.bon.IK.KSG2882,wild_bonobo


In [41]:
# Load the tab-separated table at `metagenomic_metadata_file` and preview its first rows.
metagenomic_metadata = pd.read_table(metagenomic_metadata_file)
metagenomic_metadata.head()

Unnamed: 0,Study,Sample ID,# Reads,# Bases,Median Read Length,Body Site,Body Subsite,Age (years),Age Category,Infant Age (days),Westernized,Country
0,AsnicarF_2017,MV_FEI1_t1Q14,27553455,2748531713,100.0,Stool,,1,Newborn,90.0,Yes,ITA
1,AsnicarF_2017,MV_FEI2_t1Q14,32197545,3210606434,100.0,Stool,,1,Newborn,90.0,Yes,ITA
2,AsnicarF_2017,MV_FEI3_t1Q14,32310216,3222955818,100.0,Stool,,1,Newborn,90.0,Yes,ITA
3,AsnicarF_2017,MV_FEI4_t1Q14,14270590,1423299652,100.0,Stool,,1,Newborn,,Yes,ITA
4,AsnicarF_2017,MV_FEI4_t2Q15,37677351,3735487055,100.0,Stool,,1,Newborn,,Yes,ITA


In [26]:
# Reformat the metagenomic metadata into the same column layout as the amplicon
# metadata so that the two tables can be stacked row-wise.
metadata_columns = ['X.SampleID', 'BarcodeSequence', 'LinkerPrimerSequence', 'Old_SampleID',
                    'dataset', 'genus_sp', 'subspecies', 'common_name', 'uniqueid', 'site',
                    'site_code', 'country', 'captivity_status', 'amplicon', 'sample',
                    'Description']

# The Westernized flag decides which human group a sample is assigned to;
# anything other than 'Yes' is labelled non-western.
human_group = np.where(metagenomic_metadata['Westernized'] == 'Yes',
                       'western_human', 'non_western_human')

metagenomic_metadata = metagenomic_metadata.assign(**{
    # Sample IDs are '<Study>__<Sample ID>'. No '.' or '-' stripping is applied:
    # the ASV table columns (e.g. 'ZellerG_2014__CCMD88272491ST-21-0') keep these
    # characters, so removing them here would break the ID match downstream.
    'X.SampleID': metagenomic_metadata['Study'] + '__' + metagenomic_metadata['Sample ID'],
    # Amplicon-only fields have no metagenomic equivalent.
    'BarcodeSequence': 'NA',
    'LinkerPrimerSequence': 'NA',
    'Old_SampleID': metagenomic_metadata['Sample ID'],
    'dataset': metagenomic_metadata['Study'],
    # All metagenomic samples are recorded as human.
    'genus_sp': 'Homo_sapiens',
    'subspecies': 'Homo_sapiens',
    'common_name': 'human',
    'uniqueid': 'NA',
    # Country is the only location information available, so it fills all three fields.
    'site': metagenomic_metadata['Country'],
    'site_code': metagenomic_metadata['Country'],
    'country': metagenomic_metadata['Country'],
    'captivity_status': human_group,
    'amplicon': 'META',
    'sample': metagenomic_metadata['Sample ID'],
    'Description': human_group,
})

metagenomic_metadata_select = metagenomic_metadata[metadata_columns]

# Write the formatted table without the DataFrame index so that the file holds
# exactly the columns listed above.
metagenomic_metadata_select.to_csv('metadata/passoli_metadata_formatted.txt', sep='\t', index=False)

# Preview last so that the notebook actually renders it.
metagenomic_metadata_select.head()

In [28]:
# Build the combined metadata sheet: a copy of the amplicon metadata followed
# by the formatted metagenomic rows (header line skipped).
combined_metadata_path = 'metadata/metadata_Bt_samples_gyrb_passoli.txt'

# shutil.copyfile is portable and raises on failure, whereas the exit status of
# a shelled-out `cp` would be ignored and a failed copy would go unnoticed.
shutil.copyfile('metadata/metadata_Bt_samples_gyrb.txt', combined_metadata_path)

# If the copied file does not end with a newline, the first appended row would
# be glued onto the last amplicon row, so a separator is added when needed.
with open(combined_metadata_path, 'rb') as copied:
    copied_bytes = copied.read()
needs_newline = bool(copied_bytes) and not copied_bytes.endswith(b'\n')

with open(combined_metadata_path, 'a') as combined_out, \
        open('metadata/passoli_metadata_formatted.txt', 'r') as formatted_in:
    if needs_newline:
        combined_out.write('\n')
    formatted_in.readline()  # skip the header; the amplicon copy already has one
    combined_out.writelines(formatted_in)

In [36]:
# Reload the combined metadata sheet and the ASV count table (both tab-separated).
all_samples_metadata = pd.read_table('metadata/metadata_Bt_samples_gyrb_passoli.txt')
all_samples_metadata.tail()  # not the cell's last expression, so it is not rendered
asv_tab = pd.read_table('results/gyrb/inputs/ASVs_filtered_counts.tsv')
asv_tab.head()

Unnamed: 0.1,Unnamed: 0,wd.chi.GM.9.Bt,wd.gor.CP.CR1426.Bt,wd.gor.CP.CR5756.Bt,wd.gor.CP.CR6097.Bt,wd.gor.CP.CR2138.Bt,we.hum.CT.B11.Bt,we.hum.CT.C12.Bt,wd.chi.GM.341.Bt,wd.chi.GM.76.Bt,...,ZellerG_2014__CCMD88272491ST-21-0,ZellerG_2014__CCMD90311071ST-21-0,ZellerG_2014__CCMD93344354ST-21-0,ZellerG_2014__CCMD95433940ST-21-0,ZellerG_2014__CCMD96553385ST-21-0,ZellerG_2014__MMPU29365221ST,ZellerG_2014__MMPU68403337ST,ZellerG_2014__MMPU72854103ST,ZellerG_2014__MMPU84450604ST,ZellerG_2014__MMPU99077057ST
0,ASV_1,0,0,0,0,0,0,1,0,0,...,1,1,0,0,1,1,0,0,1,1
1,ASV_2,0,0,0,0,0,1,1,0,0,...,1,1,1,0,1,1,0,1,0,1
2,ASV_3,0,0,0,0,0,1,1,0,0,...,0,1,1,0,0,1,1,1,1,1
3,ASV_4,0,0,0,0,0,0,1,0,0,...,1,0,1,1,0,0,1,0,1,0
4,ASV_5,0,0,0,0,0,1,0,1,1,...,0,0,1,0,0,0,0,0,0,0


In [35]:
# ASV table columns that have no matching X.SampleID in the combined metadata.
set(asv_tab.columns).difference(all_samples_metadata['X.SampleID'])

{'Unnamed: 0'}

In [51]:
# Keep the ASV identifier column plus the five sample columns of interest.
strain_samples = asv_tab.loc[:, [
    'Unnamed: 0',
    'wd.chi.GM.1203dup.Bt',
    'cp.chi.HOUZ.strains.Bt',
    'cp.ora.HOUZ.strains.Bt',
    'cp.gor.COLZ.strains.Bt',
    'cp.gor.HOUZ.strains.Bt',
]]


In [64]:
# ASV identifiers whose total across the four '*.strains.Bt' columns is non-zero.
# skipna=False keeps NaN propagation identical to adding the columns directly.
ASVs = strain_samples.loc[
    strain_samples[['cp.chi.HOUZ.strains.Bt',
                    'cp.ora.HOUZ.strains.Bt',
                    'cp.gor.COLZ.strains.Bt',
                    'cp.gor.HOUZ.strains.Bt']].sum(axis=1, skipna=False).ne(0),
    'Unnamed: 0',
]

# Full ASV-table rows (all sample columns) for those identifiers, saved to CSV.
asvs_in_strain_samples = asv_tab[asv_tab['Unnamed: 0'].isin(ASVs)]
asvs_in_strain_samples.to_csv('asvs_in_strain_samples.csv')

In [50]:
# Print each selected sample column's total, followed by a blank line.
for sample_name in (
    'wd.chi.GM.1203dup.Bt',
    'cp.chi.HOUZ.strains.Bt',
    'cp.ora.HOUZ.strains.Bt',
    'cp.gor.COLZ.strains.Bt',
    'cp.gor.HOUZ.strains.Bt',
):
    print(sample_name, strain_samples[sample_name].sum(skipna=False), end='\n\n')

wd.chi.GM.1203dup.Bt 17
cp.chi.HOUZ.strains.Bt 13
cp.ora.HOUZ.strains.Bt 4
cp.gor.COLZ.strains.Bt 6
cp.gor.HOUZ.strains.Bt 3


In [53]:
# Write the strain_samples subset to test.csv (relative path, DataFrame index included).
# NOTE(review): this looks like a scratch export for manual inspection — confirm whether it is still needed.
strain_samples.to_csv('test.csv')