In [4]:
import pandas as pd

In [5]:
# Display the installed pandas version so the outputs below can be tied to it.
pd.__version__

'1.2.4'

## Assess SRA Search Results

### Load Results

In [9]:
%%bash

# One-time download of the k=21 search results file (uncomment to fetch it):
#curl -L https://osf.io/n8zga/download -o ../output.genome-magsearch/results/jb.k21.csv.gz
# List the results directory to show which result files are available.
ls ../output.genome-magsearch/results

jb.k21.csv.gz
jb.k31.csv
jb.k31.csv.gz


In [12]:
# Load the k=21 SRA search results into a DataFrame.
# header=0 together with names=... replaces the file's own header row
# with the three column names given here; single quotes are the quote character.
ms21 = pd.read_csv(
    "../output.genome-magsearch/results/jb.k21.csv.gz",
    names=["search_genome", "metagenome", "containment"],
    header=0,
    quotechar="'",
    sep=",",
)
ms21

Unnamed: 0,search_genome,metagenome,containment
0,2523533589,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.043357
1,2561511076,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.010787
2,2563366544,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.010702
3,2630968516,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.012132
4,2630968951,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.010401
...,...,...,...
4496729,2534682289,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.011494
4496730,2558860163,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.017456
4496731,2561511058,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.015075
4496732,2561511071,/group/ctbrowngrp/irber/data/wort-data/wort-sr...,0.010786


In [13]:
# Fix names so it's easier to query.

# Drop a pair of single quotes wrapping the search genome name, keeping the text between them.
ms21['search_genome'] = ms21['search_genome'].str.replace(r"'(?P<id>.*)'", lambda m: m.group("id"), regex=True)

# Reduce each metagenome path to the file's base name: everything after the
# last "/" and before ".sig". The dot is escaped so that only a literal
# ".sig" suffix is matched (an unescaped "." would match any character).
ms21['metagenome'] = ms21['metagenome'].str.replace(r".*/(?P<id>.*)\.sig.*", lambda m: m.group("id"), regex=True)

# The accession is the first space-separated token of the search genome name.
# `n` is passed by keyword: pandas 2.0 made it keyword-only, so the positional
# form `split(" ", 1, ...)` raises TypeError there. The keyword form also works
# on older pandas releases.
ms21['accession'] = ms21['search_genome'].str.split(" ", n=1, expand=True)[0]
ms21

Unnamed: 0,search_genome,metagenome,containment,accession
0,2523533589,SRR532530,0.043357,2523533589
1,2561511076,SRR12927917,0.010787,2561511076
2,2563366544,ERR1913926,0.010702,2563366544
3,2630968516,ERR1913926,0.012132,2630968516
4,2630968951,SRR5131961,0.010401,2630968951
...,...,...,...,...
4496729,2534682289,SRR10692326,0.011494,2534682289
4496730,2558860163,SRR10692326,0.017456,2558860163
4496731,2561511058,SRR10692326,0.015075,2561511058
4496732,2561511071,SRR10692326,0.010786,2561511071


In [14]:
# Report how many search genomes and metagenomes clear the 50% containment cutoff.
search_ksize = 21
above_half = ms21['containment'] > 0.5  # boolean row mask, shared by both filtered counts

num_search_genomes = len(ms21["search_genome"].unique())
num_search_genomes_with_matches = len(ms21.loc[above_half, "search_genome"].unique())
num_unique_metagenome_matches = len(ms21.loc[above_half, "metagenome"].unique())

# One print call, one summary line per argument.
print(
    f"Search ksize: {search_ksize}",
    f"# Search genomes: {num_search_genomes}",
    f"# Search genomes with 'good' metagenome matches: {num_search_genomes_with_matches}",
    f"# Metagenomes with >50% containment of at least one search genome: {num_unique_metagenome_matches}",
    sep="\n",
)

Search ksize: 21
# Search genomes: 95
# Search genomes with 'good' metagenome matches: 35
# Metagenomes with >50% containment of at least one search genome: 493


In [15]:
# Out of curiosity, count the metagenomes that clear a looser 30% containment cutoff.
search_ksize = 21
above_30pct = ms21['containment'] > 0.3  # boolean row mask
c30_matches = len(ms21.loc[above_30pct, "metagenome"].unique())
print(
    f"Search ksize: {search_ksize}",
    f"# Metagenomes with >30% containment of at least one search genome: {c30_matches}",
    sep="\n",
)

Search ksize: 21
# Metagenomes with >30% containment of at least one search genome: 848


## Explore & Summarize >50% containment results

In [16]:
# Keep only the rows above 50% containment, ordered from highest to lowest
# containment and re-indexed from 0, so the top metagenome matches come first.
ms21_c50 = (
    ms21.loc[ms21['containment'] > 0.5]
    .sort_values(by="containment", ascending=False, ignore_index=True)
)
ms21_c50

Unnamed: 0,search_genome,metagenome,containment,accession
0,637000176,ERR3377554,1.000000,637000176
1,2597490271,SRR2133847,1.000000,2597490271
2,ERR4765901_1,ERR4765901,1.000000,ERR4765901_1
3,637000176,ERR3386158,1.000000,637000176
4,637000176,ERR3415696,1.000000,637000176
...,...,...,...,...
524,2667527558,SRR8369521,0.503226,2667527558
525,2757320367,SRR12349262,0.503089,2757320367
526,2561511236,SRR5371507,0.503016,2561511236
527,648028044,SRR10136702,0.502523,648028044


## Save some better formatted results

In [17]:
# save sorted containment-50 csvs
!mkdir -p ../output.genome-magsearch/processed_results

ms21_c50.to_csv("../output.genome-magsearch/processed_results/jb.sra-search.k21-c50.csv", index=False)

In [18]:
# save list of all unique matched metagenomes
m21_metagenomes = ms21_c50["metagenome"].unique()
!mkdir -p ../output.magsearch/processed_results
with open('../output.genome-magsearch/processed_results/ms21_c50.unique-metagenomes.txt', 'w') as out:
    for mg in list(m21_metagenomes):
        out.write(f"{mg}\n")
m21_metagenomes

array(['ERR3377554', 'SRR2133847', 'ERR4765901', 'ERR3386158',
       'ERR3415696', 'SRR4046667', 'ERR3370534', 'ERR3348307',
       'ERR3325125', 'ERR3140035', 'ERR3385286', 'ERR3374312',
       'ERR3348960', 'ERR3363815', 'ERR3365599', 'ERR4765900',
       'ERR3444455', 'ERR3140034', 'ERR3427528', 'SRR4046666',
       'ERR3458908', 'ERR3404460', 'ERR3140033', 'ERR4765898',
       'ERR3318558', 'ERR3485549', 'ERR3404461', 'ERR3386235',
       'ERR3140022', 'ERR4765902', 'ERR4765899', 'ERR3140021',
       'ERR3140023', 'SRR5416892', 'ERR3139975', 'ERR2856160',
       'SRR8369453', 'SRR12635561', 'ERR3386233', 'ERR2856158',
       'ERR3139973', 'ERR2856159', 'ERR3139974', 'SRR9841585',
       'ERR3450189', 'SRR8369705', 'SRR8369523', 'SRR8369653',
       'SRR8369569', 'SRR8369694', 'ERR5094874', 'SRR13206618',
       'SRR8369545', 'SRR8369423', 'SRR12635522', 'SRR8369395',
       'SRR8369531', 'SRR8369497', 'SRR12635555', 'SRR8369426',
       'SRR13178630', 'SRR13203221', 'SRR8369679', 