In [1]:
%matplotlib inline
import pandas as pd

import os

# Load in SRA metadata and search results

### Load in SRA metadata info:

This is the 'run info' table that you can download from NCBI in bulk; it has approximately one entry per accession.

In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns cannot end up with mixed types
# (the situation pandas reports with a DtypeWarning on large, wide CSVs).
run_info = pd.read_csv('../big.runinfo.csv.gz', low_memory=False)

  run_info = pd.read_csv('../big.runinfo.csv.gz')


The two most important columns for our purposes are 'Run' and 'ScientificName':

In [3]:
# list every column available in the run info table
print(run_info.columns)

# preview the two columns used in the rest of this notebook
run_info[['Run', 'ScientificName']].head()

Index(['Run', 'ReleaseDate', 'LoadDate', 'spots', 'bases', 'spots_with_mates',
       'avgLength', 'size_MB', 'AssemblyName', 'download_path', 'Experiment',
       'LibraryName', 'LibraryStrategy', 'LibrarySelection', 'LibrarySource',
       'LibraryLayout', 'InsertSize', 'InsertDev', 'Platform', 'Model',
       'SRAStudy', 'BioProject', 'Study_Pubmed_id', 'ProjectID', 'Sample',
       'BioSample', 'SampleType', 'TaxID', 'ScientificName', 'SampleName',
       'g1k_pop_code', 'source', 'g1k_analysis_group', 'Subject_ID', 'Sex',
       'Disease', 'Tumor', 'Affection_Status', 'Analyte_Type',
       'Histological_Type', 'Body_Site', 'CenterName', 'Submission',
       'dbgap_study_accession', 'Consent', 'RunHash', 'ReadHash'],
      dtype='object')


Unnamed: 0,Run,ScientificName
0,SRR18036904,bovine metagenome
1,SRR18036905,bovine metagenome
2,SRR18036906,bovine metagenome
3,SRR18036907,bovine metagenome
4,SRR18036908,bovine metagenome


In [4]:
# there are ~700,000 entries:
len(run_info)

702013

In [5]:
# narrow the run info down to the accession and its scientific name
run_info2 = run_info.loc[:, ['Run', 'ScientificName']]

### Now, load in the stamps MAGsearch results


In [6]:
# quotechar="'" makes pandas treat single quotes, not the default double
# quote, as the quoting character when parsing fields
# NOTE(review): presumably the search output wraps fields in single quotes — confirm against the file
magsearch_df = pd.read_csv('../output.magsearch.k31/results/stamps.csv', quotechar="'")
# number of rows (one per reported query/Run match)
print(len(magsearch_df))
magsearch_df.head()

5527


Unnamed: 0,query,Run,containment
0,FV_DSM_15829_genome,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.038168
1,FV_DSM_15829_genome,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.091603
2,FV_PB189-T1-4_genome,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.037313
3,FV_DSM_15829_genome,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.351145
4,FV_PB189-T1-4_genome,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.149254


# Make the results more human-readable and add SRA metadata info

### First, we need to take the filenames in the 'Run' column and turn them into accessions.

In [7]:
def extract_run_acc(x):
    """Return the run accession encoded in a signature file path.

    The directory part of `x` is dropped and the remaining filename is split
    into stem and extension; the stem is returned. The extension must be
    exactly '.sig', otherwise an AssertionError carrying the offending
    extension is raised.
    """
    accession, suffix = os.path.splitext(os.path.basename(x))
    assert suffix == '.sig', suffix
    return accession

# this can be used in case we have .gz, .fasta, .fa, etc in the query filename
def remove_extension(x):
    """Strip the directory and any known sequence-file suffixes from a path.

    Trailing '.gz', '.fasta', '.fa' and '.fna' suffixes are removed from the
    base filename one at a time, so 'dir/genome.fa.gz' becomes 'genome'.
    Any other suffix is left in place, which keeps dotted names such as
    'GCF_000005845.2.fna.gz' -> 'GCF_000005845.2' intact.
    """
    name = os.path.basename(x)
    stem, ext = os.path.splitext(name)
    while ext in ('.gz', '.fasta', '.fa', '.fna'):
        name = stem
        stem, ext = os.path.splitext(name)
    # return `name`, not `stem`: the last splitext call has already split one
    # more (unrecognised) suffix off `stem`, which must not be dropped
    return name

# replace each signature path in 'Run' with its bare accession.
# NOTE: this overwrites the column, so the cell cannot be re-run on its own:
# the accessions no longer end in '.sig' and extract_run_acc would assert.
magsearch_df['Run'] = magsearch_df['Run'].map(extract_run_acc)
magsearch_df.head()

Unnamed: 0,query,Run,containment
0,FV_DSM_15829_genome,ERR4333983,0.038168
1,FV_DSM_15829_genome,SRR17547361,0.091603
2,FV_PB189-T1-4_genome,SRR17547361,0.037313
3,FV_DSM_15829_genome,ERR2014370,0.351145
4,FV_PB189-T1-4_genome,ERR2014370,0.149254


### Now we can correlate magsearch results with SRA RunInfo

In [8]:
# preview the run info indexed by accession, the form used for the join below
run_info2.set_index('Run')

Unnamed: 0_level_0,ScientificName
Run,Unnamed: 1_level_1
SRR18036904,bovine metagenome
SRR18036905,bovine metagenome
SRR18036906,bovine metagenome
SRR18036907,bovine metagenome
SRR18036908,bovine metagenome
...,...
SRR11108097,gut metagenome
SRR8144073,Uvigerina striata
SRR8144074,Uvigerina striata
SRR8144079,Valvulineria inflata


In [9]:
# attach ScientificName to every match by joining on the Run accession;
# join() defaults to a left join, so matches whose accession is missing from
# the run info are kept, with ScientificName set to NaN
magsearch2_df = magsearch_df.set_index('Run').join(run_info2.set_index('Run')['ScientificName'])
magsearch2_df.head()

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRR042391,FV_DSM_15829_genome,0.030534,
DRR042545,FV_DSM_15829_genome,0.022901,
DRR042550,FV_DSM_15829_genome,0.022901,
DRR042658,FV_DSM_15829_genome,0.015267,
DRR086622,S26,0.020896,marine metagenome


### Subset to just SRA results with good scientific names

In [10]:
# count the matches that did not pick up a ScientificName in the join
null_df = magsearch2_df[magsearch2_df['ScientificName'].isna()]
print(len(null_df))

601


In [11]:
# keep only the matches that did pick up a ScientificName
has_name = magsearch2_df['ScientificName'].notnull()
magsearch3_df = magsearch2_df[has_name]

# report how much of the result set survives
perc_non_null = len(magsearch3_df)/len(magsearch2_df)*100
print(f"Of {len(magsearch2_df)} MAGsearch results, {len(magsearch3_df)} have non-null metadata ({perc_non_null:.2f}%)")
magsearch3_df.head()

Of 5527 MAGsearch results, 4926 have non-null metadata (89.13%)


Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRR086622,S26,0.020896,marine metagenome
DRR086623,S26,0.01814,marine metagenome
DRR086624,S26,0.019977,marine metagenome
DRR086640,S26,0.017451,marine metagenome
DRR086642,S26,0.024569,marine metagenome


In [12]:
# number of distinct query genomes that have at least one annotated match
n_queries = len(set(magsearch3_df["query"]))
print(f'{n_queries} independent queries in results')

4 independent queries in results


In [13]:
# how many matches do we have for each query? (most frequent first)
magsearch3_df["query"].value_counts()

FV_DSM_15829_genome     3083
FV_PB189-T1-4_genome     890
S26                      841
C3_T13_0                 112
Name: query, dtype: int64

### Split Results by Query

We have two very different kinds of queries! Let's split our results into a dataframe that only contains the marine queries.
marine_queries = ["S26", "C3_T13_0"]

vg_queries = ["FV_DSM_15829_genome", "FV_PB189-T1-4_genome"]

In [14]:
# the two marine queries we want to keep
marine_queries = ["S26", "C3_T13_0"]
# the remaining queries could be selected the same way:
#vg_queries = ["FV_DSM_15829_genome", "FV_PB189-T1-4_genome"]

# boolean mask over the annotated matches, then subset
is_marine = magsearch3_df["query"].isin(marine_queries)
marine_df = magsearch3_df[is_marine]
#vg_df = magsearch3_df[magsearch3_df["query"].isin(vg_queries)]

In [15]:
# sanity check: only the marine queries should be left after the split
marine_df["query"].unique()

array(['S26', 'C3_T13_0'], dtype=object)

In [16]:
# preview the marine-only subset
marine_df.head()

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DRR086622,S26,0.020896,marine metagenome
DRR086623,S26,0.01814,marine metagenome
DRR086624,S26,0.019977,marine metagenome
DRR086640,S26,0.017451,marine metagenome
DRR086642,S26,0.024569,marine metagenome


# Start looking at the results!


In [17]:
# the 20 most frequent ScientificNames among the marine matches
marine_df["ScientificName"].value_counts().head(20)

marine metagenome             167
biofilm metagenome            137
metagenome                    131
seawater metagenome            87
sediment metagenome            73
aquatic metagenome             64
marine sediment metagenome     34
oyster metagenome              28
synthetic metagenome           19
estuary metagenome             19
soil metagenome                16
wastewater metagenome          15
microbial mat metagenome       15
viral metagenome               12
salt marsh metagenome          11
coral reef metagenome           9
Escherichia                     9
freshwater metagenome           9
plastic metagenome              6
plant metagenome                6
Name: ScientificName, dtype: int64

## Sort Results by Containment

The default threshold for containment is 0.01, which means ~1% of the query genome needs to be found in the metagenome for it to be reported. That's not very stringent!

First, let's look at the SRA runs that had the **best** containment of our queries:

In [18]:
# the 20 marine matches with the highest containment, best first
marine_df.sort_values('containment', ascending=False).head(20)

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR2094172,S26,0.339839,metagenome
SRR6675345,S26,0.245924,gut metagenome
SRR12112861,S26,0.177268,seawater metagenome
ERR4674711,S26,0.166475,marine metagenome
ERR2094169,S26,0.143054,metagenome
SRR12112859,S26,0.134558,seawater metagenome
ERR2094168,S26,0.133639,metagenome
ERR2094175,S26,0.11504,metagenome
SRR3405417,S26,0.107003,synthetic metagenome
SRR3405420,S26,0.09667,synthetic metagenome


## Filter Results by Containment

As a rule of thumb, we've found that a containment of 0.2 (20% of the query genome found in the metagenome) is a reasonably stringent cutoff. Let's take a look:

In [19]:
# keep only the matches with containment above the 0.2 rule-of-thumb cutoff
marine_df2 = marine_df.loc[marine_df['containment'] > 0.2]

# summarise what survived the filter
for name, df in {"marine": marine_df2}.items():
    top_names = df["ScientificName"].value_counts().head(20)
    print('query type:', name)
    print('total matches:', len(df))
    print('query:', len(set(df["query"])))
    print('metagenomes:', len(set(df.index)))
    print("\n")
    print(top_names, "\n\n")

query type: marine
total matches: 2
query: 1
metagenomes: 2


metagenome        1
gut metagenome    1
Name: ScientificName, dtype: int64 




In [20]:
# show the matches that passed the 0.2 containment cutoff
marine_df2

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR2094172,S26,0.339839,metagenome
SRR6675345,S26,0.245924,gut metagenome


In [25]:
# only two matches passed the 0.2 cutoff, so relax the threshold to 0.05
marine_df3 = marine_df.loc[marine_df['containment'] > 0.05]

# summarise what survived the looser filter
for name, df in {"marine": marine_df3}.items():
    top_names = df["ScientificName"].value_counts().head(20)
    print('query type:', name)
    print('total matches:', len(df))
    print('query:', len(set(df["query"])))
    print('metagenomes:', len(set(df.index)))
    print("\n")
    print(top_names, "\n\n")

query type: marine
total matches: 58
query: 1
metagenomes: 58


aquatic metagenome       18
seawater metagenome      13
synthetic metagenome      9
metagenome                8
biofilm metagenome        5
seagrass metagenome       1
marine metagenome         1
sponge metagenome         1
freshwater metagenome     1
gut metagenome            1
Name: ScientificName, dtype: int64 




In [26]:
# save the 5% containment results to a file; the Run accession is the
# DataFrame index, so to_csv writes it out as the first column
marine_df3.to_csv("marine_metagenome_results.0.05.containment.csv")

In [None]:
# look at just one genome...

In [27]:
# restrict the marine matches to the single query C3_T13_0
is_c3 = marine_df['query'].eq("C3_T13_0")
c3 = marine_df[is_c3]
c3

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ERR1992794,C3_T13_0,0.011612,synthetic metagenome
ERR1992808,C3_T13_0,0.013661,synthetic metagenome
ERR2092773,C3_T13_0,0.010018,metagenome
ERR2092774,C3_T13_0,0.013661,metagenome
ERR2092775,C3_T13_0,0.013206,metagenome
...,...,...,...
SRR8497110,C3_T13_0,0.018898,biofilm metagenome
SRR8497111,C3_T13_0,0.017760,biofilm metagenome
SRR8497112,C3_T13_0,0.023679,biofilm metagenome
SRR9037724,C3_T13_0,0.020947,aquatic metagenome


In [28]:
# the 30 C3_T13_0 matches with the highest containment, best first
c3.sort_values('containment', ascending=False).head(30)

Unnamed: 0_level_0,query,containment,ScientificName
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SRR8497107,C3_T13_0,0.030282,biofilm metagenome
SRR16548364,C3_T13_0,0.030055,aquatic metagenome
SRR16548371,C3_T13_0,0.030055,aquatic metagenome
SRR16548366,C3_T13_0,0.028689,aquatic metagenome
ERR4592245,C3_T13_0,0.028005,seagrass metagenome
SRR16548368,C3_T13_0,0.027778,aquatic metagenome
SRR16548367,C3_T13_0,0.024135,aquatic metagenome
SRR8497102,C3_T13_0,0.023907,biofilm metagenome
SRR8497112,C3_T13_0,0.023679,biofilm metagenome
SRR16548375,C3_T13_0,0.023224,aquatic metagenome
