In [1]:
%matplotlib inline
import pandas as pd

import os

# Load in SRA metadata and search results

### Load in SRA metadata info:

This is the 'run info' that you can download from NCBI in bulk; it's got one entry for every accession, approximately.

In [2]:
# Load the bulk SRA RunInfo table (gzip-compressed CSV; pandas infers the
# compression from the '.gz' suffix).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the saved output of this cell looks
# like the tail of a warning echoing this line - presumably the mixed-dtype
# DtypeWarning that chunked inference produces; confirm on a fresh run.
run_info = pd.read_csv('../../big.runinfo.csv.gz', low_memory=False)

  run_info = pd.read_csv('../../big.runinfo.csv.gz')


The two most important columns for our purposes are 'Run' and 'ScientificName':

In [3]:
# list every RunInfo column, then preview the two columns used below
print(run_info.columns)

run_info.loc[:, ['Run', 'ScientificName']].head(5)

Index(['Run', 'ReleaseDate', 'LoadDate', 'spots', 'bases', 'spots_with_mates',
       'avgLength', 'size_MB', 'AssemblyName', 'download_path', 'Experiment',
       'LibraryName', 'LibraryStrategy', 'LibrarySelection', 'LibrarySource',
       'LibraryLayout', 'InsertSize', 'InsertDev', 'Platform', 'Model',
       'SRAStudy', 'BioProject', 'Study_Pubmed_id', 'ProjectID', 'Sample',
       'BioSample', 'SampleType', 'TaxID', 'ScientificName', 'SampleName',
       'g1k_pop_code', 'source', 'g1k_analysis_group', 'Subject_ID', 'Sex',
       'Disease', 'Tumor', 'Affection_Status', 'Analyte_Type',
       'Histological_Type', 'Body_Site', 'CenterName', 'Submission',
       'dbgap_study_accession', 'Consent', 'RunHash', 'ReadHash'],
      dtype='object')


Unnamed: 0,Run,ScientificName
0,SRR18036904,bovine metagenome
1,SRR18036905,bovine metagenome
2,SRR18036906,bovine metagenome
3,SRR18036907,bovine metagenome
4,SRR18036908,bovine metagenome


In [4]:
# number of rows in the RunInfo table (~700,000)
run_info.shape[0]

702013

In [5]:
# narrow the RunInfo table to the accession and its scientific name
run_info2 = run_info.loc[:, ['Run', 'ScientificName']]

### Now, load in the stamps MAGsearch results


In [6]:
# the results file uses single quotes as its CSV quote character
magsearch_df = pd.read_csv(filepath_or_buffer='stamps.csv', quotechar="'")
# number of search hits, followed by a preview of the first rows
print(magsearch_df.shape[0])
magsearch_df.head(5)

118541


Unnamed: 0,query,Run,containment
0,S26,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.012544
1,S26,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.013018
2,S26,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.012071
3,FV_DSM_15829_genome,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.397849
4,C3_T13_0,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.013866


# Make the results more human-readable and add SRA metadata info

### First, we need to take the filenames in the 'Run' column and turn them into accessions.

In [7]:
def extract_run_acc(x):
    """Return the file name of path `x` without its directory or '.sig' suffix.

    Raises AssertionError (carrying the offending extension) when the file
    name does not end in '.sig'.
    """
    # drop the directory part, then split off the final extension
    stem, suffix = os.path.splitext(os.path.basename(x))
    # every path passed in is expected to be a '.sig' file
    assert suffix == '.sig', suffix
    return stem

# this can be used in case we have .gz, .fasta, .fa, etc in the query filename
def remove_extension(x, extensions=('.gz', '.fasta', '.fa', '.fna')):
    """Return the base name of `x` with known sequence-file suffixes removed.

    The directory part is dropped, then trailing extensions are peeled off
    one at a time for as long as they appear in `extensions`
    (e.g. 'genome.fa.gz' -> 'genome').

    Only the listed extensions are removed. The previous version returned the
    stem computed *after* the loop had stopped, so it also dropped the first
    extension that was NOT in the list - e.g. 'GCF_000005845.2.fna.gz' lost
    its '.2' version suffix.

    Parameters
    ----------
    x : str
        File name or path.
    extensions : collection of str, optional
        Extensions (including the leading dot) that should be stripped.

    Returns
    -------
    str
        The base name without any trailing extension from `extensions`.
    """
    name = os.path.basename(x)
    stem, ext = os.path.splitext(name)
    # `ext` is '' once nothing is left to strip; the explicit truthiness check
    # also prevents an endless loop if '' is ever passed in `extensions`
    while ext and ext in extensions:
        name = stem
        stem, ext = os.path.splitext(name)
    return name

# Replace each signature file path in 'Run' with its bare accession.
# NOTE: this overwrites the column in place, so re-running the cell fails the
# '.sig' assertion in extract_run_acc (the values no longer carry a suffix).
magsearch_df['Run'] = magsearch_df['Run'].map(extract_run_acc)
magsearch_df.head(5)

Unnamed: 0,query,Run,containment
0,S26,SRR7479650,0.012544
1,S26,ERR771002,0.013018
2,S26,ERR3573764,0.012071
3,FV_DSM_15829_genome,ERR3500860,0.397849
4,C3_T13_0,SRR6466471,0.013866


### Now we can correlate magsearch results with SRA RunInfo

In [8]:
# preview the RunInfo subset keyed by accession (result is displayed, not stored)
run_info2.set_index(keys='Run')

Unnamed: 0_level_0,ScientificName
Run,Unnamed: 1_level_1
SRR18036904,bovine metagenome
SRR18036905,bovine metagenome
SRR18036906,bovine metagenome
SRR18036907,bovine metagenome
SRR18036908,bovine metagenome
...,...
SRR11108097,gut metagenome
SRR8144073,Uvigerina striata
SRR8144074,Uvigerina striata
SRR8144079,Valvulineria inflata


In [9]:
# left-join ScientificName onto the search hits, matching on the Run accession;
# hits whose accession is absent from the RunInfo table get a null name
magsearch2_df = (
    magsearch_df
    .set_index('Run')
    .join(run_info2.set_index('Run')['ScientificName'], how='left')
)
magsearch2_df.head(5)

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRR001455,S26,0.018225,soil metagenome
DRR001456,S26,0.012544,soil metagenome
DRR001457,S26,0.014201,soil metagenome
DRR001458,S26,0.013491,soil metagenome
DRR001459,S26,0.017278,soil metagenome


### Subset to just SRA results with good scientific names

In [11]:
# count the search hits for which the join found no ScientificName
null_df = magsearch2_df.loc[magsearch2_df['ScientificName'].isna()]
print(null_df.shape[0])

13596


In [12]:
# keep only the hits that did get a ScientificName from the join
magsearch3_df = magsearch2_df.loc[magsearch2_df['ScientificName'].notnull()]
# share of hits that survived, as a percentage
perc_non_null = magsearch3_df.shape[0]/magsearch2_df.shape[0]*100
print(f"Of {len(magsearch2_df)} MAGsearch results, {len(magsearch3_df)} have non-null metadata ({perc_non_null:.2f}%)")
magsearch3_df.head(5)

Of 118541 MAGsearch results, 104945 have non-null metadata (88.53%)


Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRR001455,S26,0.018225,soil metagenome
DRR001456,S26,0.012544,soil metagenome
DRR001457,S26,0.014201,soil metagenome
DRR001458,S26,0.013491,soil metagenome
DRR001459,S26,0.017278,soil metagenome


In [13]:
# report how many distinct query names appear among the hits that have a ScientificName
print(f'{len(set(magsearch3_df["query"]))} independent queries in results')

4 independent queries in results


In [14]:
# number of hits per query, most frequent first
magsearch3_df["query"].value_counts(sort=True)

S26                     40004
FV_DSM_15829_genome     26336
C3_T13_0                24175
FV_PB189-T1-4_genome    14430
Name: query, dtype: int64

### Split Results by Query

We have two very different sets of queries! Let's split our results into a dataframe that only contains the marine queries.
marine_queries = ["S26", "C3_T13_0"]

vg_queries = ["FV_DSM_15829_genome", "FV_PB189-T1-4_genome"]

In [16]:
# the two marine queries; the FV_* queries ("vg_queries") are not split out here
marine_queries = ["S26", "C3_T13_0"]

# restrict the annotated hits to the marine queries only
marine_df = magsearch3_df.loc[magsearch3_df["query"].isin(marine_queries)]

In [17]:
# sanity check: only the marine query names should remain
pd.unique(marine_df["query"])

array(['S26', 'C3_T13_0'], dtype=object)

In [18]:
# preview the first rows of the marine subset
marine_df.head(5)

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRR001455,S26,0.018225,soil metagenome
DRR001456,S26,0.012544,soil metagenome
DRR001457,S26,0.014201,soil metagenome
DRR001458,S26,0.013491,soil metagenome
DRR001459,S26,0.017278,soil metagenome


# Start looking at the results!


In [19]:
# the 20 most frequent ScientificName values among the marine hits
marine_df["ScientificName"].value_counts().head(20)

soil metagenome                   12975
marine metagenome                  7671
metagenome                         5904
wastewater metagenome              4551
sediment metagenome                3475
freshwater metagenome              2907
activated sludge metagenome        1944
gut metagenome                     1787
aquatic metagenome                 1518
seawater metagenome                1407
peat metagenome                    1188
freshwater sediment metagenome      996
rhizosphere metagenome              956
bioreactor metagenome               950
lake water metagenome               933
bovine gut metagenome               914
biofilm metagenome                  783
marine sediment metagenome          727
groundwater metagenome              596
root metagenome                     575
Name: ScientificName, dtype: int64

## Sort Results by Containment

The default threshold for containment is 0.01, which means ~1% of the query genome needs to be found in the metagenome for it to be reported. That's not very stringent!

First, let's look at the SRA runs that had the **best** containment of our queries:

In [20]:
# the 20 marine hits with the highest containment, best first
marine_df.sort_values(by='containment', ascending=False).head(20)

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR2094172,S26,0.461775,metagenome
SRR6675345,S26,0.384852,gut metagenome
SRR12112861,S26,0.285207,seawater metagenome
ERR4674711,S26,0.281183,marine metagenome
ERR4592245,S26,0.267692,seagrass metagenome
ERR2094169,S26,0.250414,metagenome
SRR12918224,S26,0.244734,soil metagenome
ERR2094168,S26,0.23929,metagenome
SRR12112859,S26,0.229822,seawater metagenome
SRR6877521,S26,0.228166,biofilm metagenome


## Filter Results by Containment

We've found (rule of thumb) that 0.2 is a decent value - 20% - indicating some level of stringency. Let's take a look -

In [22]:
# keep only the marine hits above the 0.2 (20%) containment rule of thumb
marine_df2 = marine_df.loc[marine_df['containment'] > 0.2]

# summarise each (label, frame) pair: hit count, distinct queries,
# distinct metagenome accessions and the 20 most common ScientificNames
for name, df in [("marine", marine_df2)]:
    print('query type:', name)
    print('total matches:', df.shape[0])
    print('query:', len(set(df["query"])))
    print('metagenomes:', len(set(df.index)))
    print("\n")
    print(df["ScientificName"].value_counts().head(20), "\n\n")

query type: marine
total matches: 24
query: 1
metagenomes: 24


metagenome                      5
aquatic metagenome              5
biofilm metagenome              3
macroalgae metagenome           2
seawater metagenome             2
hydrothermal vent metagenome    2
seagrass metagenome             1
marine metagenome               1
soil metagenome                 1
marine sediment metagenome      1
gut metagenome                  1
Name: ScientificName, dtype: int64 




In [23]:
# show the marine hits that passed the containment > 0.2 filter
marine_df2

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR2094168,S26,0.23929,metagenome
ERR2094169,S26,0.250414,metagenome
ERR2094172,S26,0.461775,metagenome
ERR2179510,S26,0.208521,metagenome
ERR2179512,S26,0.20426,metagenome
ERR4592244,S26,0.203077,macroalgae metagenome
ERR4592245,S26,0.267692,seagrass metagenome
ERR4592246,S26,0.203314,macroalgae metagenome
ERR4674711,S26,0.281183,marine metagenome
SRR12112859,S26,0.229822,seawater metagenome


In [25]:
# all marine hits for the C3_T13_0 query, at any containment
c3 = marine_df.loc[marine_df['query'].eq("C3_T13_0")]
c3

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRR001464,C3_T13_0,0.010631,soil metagenome
DRR012573,C3_T13_0,0.012711,soil metagenome
DRR012574,C3_T13_0,0.015253,soil metagenome
DRR041863,C3_T13_0,0.018489,museum specimen metagenome
DRR041864,C3_T13_0,0.020106,museum specimen metagenome
...,...,...,...
SRR9998341,C3_T13_0,0.022880,soil metagenome
SRR9998342,C3_T13_0,0.024266,soil metagenome
SRR9998343,C3_T13_0,0.023804,soil metagenome
SRR9998344,C3_T13_0,0.019875,soil metagenome


In [27]:
# the 30 C3_T13_0 hits with the highest containment, best first
c3.sort_values(by='containment', ascending=False).head(30)

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR4592245,C3_T13_0,0.165473,seagrass metagenome
SRR12918224,C3_T13_0,0.165242,soil metagenome
SRR6877521,C3_T13_0,0.147446,biofilm metagenome
ERR4592244,C3_T13_0,0.129882,macroalgae metagenome
ERR4592246,C3_T13_0,0.128727,macroalgae metagenome
SRR14699981,C3_T13_0,0.118558,marine sediment metagenome
SRR8497107,C3_T13_0,0.11486,biofilm metagenome
SRR12681089,C3_T13_0,0.113011,plant metagenome
ERR2179512,C3_T13_0,0.113011,metagenome
SRR7207567,C3_T13_0,0.113011,hydrothermal vent metagenome
