# Comparison of number of reference and predicted plasmids

*Summary:*   
Recycler was the only tool to predict fewer plasmids than expected (108 vs. 147). 
plasmidSPAdes (166) and HyAsP bins (MOB-database: 156, NCBI-database: 202) were relatively close to the expected number of reference plasmids. 
MOB-recon and HyAsP plasmids (MOB-database) each predicted almost 300 plasmids (287 and 293, respectively).
The number of predictions by HyAsP plasmids (NCBI-database: 490) and the number of questionable plasmids on both databases (MOB-database: 364, NCBI-database: 945) were very high, indicating the importance of the postprocessing step of HyAsP to determine the putative plasmids and the benefits of the optional binning step.

In [3]:
import glob
import os.path
import subprocess

import pandas as pd
from Bio import SeqIO

pd.options.display.max_rows = None

In [5]:
# references and plasmids created by HyAsP or Recycler ('putative_plasmids.fasta', 'questionable_plasmids.fasta', 'assembly_graph.cycs.fasta')
def count_plasmids(plasmid_file):
    """Return the number of FASTA records in *plasmid_file*.

    Every record yielded by the FASTA parser is counted once; the record
    contents are not inspected.
    """
    with open(plasmid_file, 'r') as handle:
        return sum(1 for _ in SeqIO.parse(handle, 'fasta'))

# plasmid bins created by HyAsP ('plasmid_bins_putative.csv', when the binning option is activated)
def count_hyasp_bins(bin_list_file):
    """Return the number of lines in *bin_list_file*.

    Each line of the file is counted once; no line is skipped or filtered.
    """
    with open(bin_list_file, 'r') as handle:
        return sum(1 for _ in handle)

# plasmids created by MOB-recon (plasmid FASTA files 'plasmid_*.fasta')
def count_mob_recon(predictions_files):
    """Return the number of items in the iterable *predictions_files*.

    Each item is counted as one predicted plasmid; the items themselves are
    not opened or inspected.
    """
    return sum(1 for _ in predictions_files)

# plasmids created by plasmidSPAdes ('scaffolds.fasta')
def count_plasmidspades(predictions_file):
    """Return the number of distinct components among the scaffolds in *predictions_file*.

    The ids of all FASTA records are reduced to the text following their last
    'component_' marker, and the number of distinct values is returned.
    Returns 0 when *predictions_file* is not an existing file.
    """
    if not os.path.isfile(predictions_file):
        return 0

    with open(predictions_file, 'r') as in_file:
        # rpartition keeps an id whole when the marker is missing, instead of
        # slicing it at an arbitrary offset derived from rfind() returning -1.
        components = {
            record.id.rpartition('component_')[2]
            for record in SeqIO.parse(in_file, 'fasta')
        }
    return len(components)

## MOB-database

In [9]:
# Count the reference plasmids and the predictions of every tool for each test
# sample of the MOB-database run. One [sample_id, tool, num_plasmids] row is
# appended per sample, tool by tool, and the rows are collected in mob_counts.
run = 'mob_filtered'
test_ids = [1,5,15,16,18,19,23,24,25,26,27,28,30,31,32,33,34,35,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,52,55,56,62,63,64,65,66,76,85,86,87,102,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,129,133]

contents = []

# reference plasmids
for sid in test_ids:
    references_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/eval/plasmids.fasta' % (run, sid)

    contents.append([sid, 'references', count_plasmids(references_file)])

# HyAsP (putative plasmids)
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/greedy/plasmids/greedy/putative_plasmids.fasta' % (run, sid)

    contents.append([sid, 'HyAsP_putative', count_plasmids(predictions_file)])

# HyAsP (questionable plasmids)
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/greedy/plasmids/greedy/questionable_plasmids.fasta' % (run, sid)

    contents.append([sid, 'HyAsP_questionable', count_plasmids(predictions_file)])

# HyAsP (bins)
bf = 2.5
for sid in test_ids:
    # '%f' renders bf with six decimals, i.e. the directory is 'bins_2.500000'
    out_dir = '/project/6007976/wg-anoph/Plasmids-Assembly/revision1/binning/hyasp_binning_option/%s/sample_%i/bins_%f' % (run, sid, bf)
    bin_list_file = '%s/plasmid_bins_putative.csv' % out_dir

    contents.append([sid, 'HyAsP_putative_bins', count_hyasp_bins(bin_list_file)])

# MOB-recon
for sid in test_ids:
    out_dir = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i' % (run, sid)

    # Expand the pattern in-process instead of running `ls` through a shell:
    # no output parsing, and an unmatched pattern simply yields an empty list.
    predictions_files = glob.glob('%s/mob_recon/plasmid_*.fasta' % out_dir)

    contents.append([sid, 'MOB-recon', count_mob_recon(predictions_files)])

# plasmidSPAdes
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/plasmidspades/scaffolds.fasta' % (run, sid)

    contents.append([sid, 'plasmidSPAdes', count_plasmidspades(predictions_file)])

# Recycler (the path depends only on the sample id, not on the run)
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/revision1/recycler_test/recycler_outputs/plasmids/%i_assembly_graph.cycs.fasta' % sid

    contents.append([sid, 'Recycler', count_plasmids(predictions_file)])

mob_counts = pd.DataFrame(contents, columns = ['sample_id', 'tool', 'num_plasmids'])

In [12]:
# total number of plasmids per tool, summed over all samples
mob_counts.groupby('tool')['num_plasmids'].sum().reset_index()

Unnamed: 0,tool,num_plasmids
0,HyAsP_putative,293
1,HyAsP_putative_bins,156
2,HyAsP_questionable,364
3,MOB-recon,287
4,Recycler,108
5,plasmidSPAdes,166
6,references,147


## NCBI-database

In [13]:
# Count the reference plasmids and the predictions of every tool for each test
# sample of the NCBI-database run. One [sample_id, tool, num_plasmids] row is
# appended per sample, tool by tool, and the rows are collected in ncbi_counts.
run = 'ncbi_filtered'
test_ids = [1,5,15,16,18,19,23,24,25,26,27,28,30,31,32,33,34,35,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,52,55,56,62,63,64,65,66,76,85,86,87,102,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,129,133]

contents = []

# reference plasmids
for sid in test_ids:
    references_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/eval/plasmids.fasta' % (run, sid)

    contents.append([sid, 'references', count_plasmids(references_file)])

# HyAsP (putative plasmids)
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/greedy/plasmids/greedy/putative_plasmids.fasta' % (run, sid)

    contents.append([sid, 'HyAsP_putative', count_plasmids(predictions_file)])

# HyAsP (questionable plasmids)
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/greedy/plasmids/greedy/questionable_plasmids.fasta' % (run, sid)

    contents.append([sid, 'HyAsP_questionable', count_plasmids(predictions_file)])

# HyAsP (bins)
bf = 2.5
for sid in test_ids:
    # '%f' renders bf with six decimals, i.e. the directory is 'bins_2.500000'
    out_dir = '/project/6007976/wg-anoph/Plasmids-Assembly/revision1/binning/hyasp_binning_option/%s/sample_%i/bins_%f' % (run, sid, bf)
    bin_list_file = '%s/plasmid_bins_putative.csv' % out_dir

    contents.append([sid, 'HyAsP_putative_bins', count_hyasp_bins(bin_list_file)])

# MOB-recon
for sid in test_ids:
    out_dir = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i' % (run, sid)

    # Expand the pattern in-process instead of running `ls` through a shell:
    # no output parsing, and an unmatched pattern simply yields an empty list.
    predictions_files = glob.glob('%s/mob_recon/plasmid_*.fasta' % out_dir)

    contents.append([sid, 'MOB-recon', count_mob_recon(predictions_files)])

# plasmidSPAdes
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/exp/2018-10-29__analyses_greedy_paper/analysis_%s/sample_%i/plasmidspades/scaffolds.fasta' % (run, sid)

    contents.append([sid, 'plasmidSPAdes', count_plasmidspades(predictions_file)])

# Recycler (the path depends only on the sample id, not on the run)
for sid in test_ids:
    predictions_file = '/project/6007976/wg-anoph/Plasmids-Assembly/revision1/recycler_test/recycler_outputs/plasmids/%i_assembly_graph.cycs.fasta' % sid

    contents.append([sid, 'Recycler', count_plasmids(predictions_file)])

ncbi_counts = pd.DataFrame(contents, columns = ['sample_id', 'tool', 'num_plasmids'])

In [14]:
# total number of plasmids per tool, summed over all samples
ncbi_counts.groupby('tool')['num_plasmids'].sum().reset_index()

Unnamed: 0,tool,num_plasmids
0,HyAsP_putative,490
1,HyAsP_putative_bins,202
2,HyAsP_questionable,945
3,MOB-recon,286
4,Recycler,108
5,plasmidSPAdes,166
6,references,147
