In [64]:
import os
import sys
import glob
import scipy
import skbio
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
sns.set_style('whitegrid')
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42

from IPython.display import display, HTML
import tRep

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 200

### Load relevant tables

In [65]:
Gdb = pd.read_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/metadata/viral_name_df.csv')
Rdb = pd.read_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/waster_water_readDepths_formatted.csv')
Mdb = pd.read_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/metadata/sample_metadata_v2_sample_name_adjusted.csv')

In [66]:
genome2description = dict(zip(Gdb['viral_NCBI_refname'],Gdb['description']))

sample2core = dict(zip(Mdb['sample'],Mdb['core']))
sample2description = dict(zip(Mdb['sample'],Mdb['sample_complete_description']))
sample2enrichment = dict(zip(Mdb['sample'],Mdb['sequencing']))
sample2method = dict(zip(Mdb['sample'],Mdb['method']))

### Load IS genome_info tables

In [67]:
tables = []
for gw in glob.glob('/groups/banfield/projects/human/data3/clou/wastewater/Mapping_to_viral_genomes/inStrain/all_viral_genomes-*.IS/output/*_genome_info.tsv'):
    df = pd.read_csv(gw,sep='\t')
    sample_name = os.path.basename(gw).split('vs-')[1].split('.shrink')[0]
    df.insert(0,'sample',sample_name)
    tables.append(df)

ISdb = pd.concat(tables)
ISdb['sample_complete_description'] = ISdb['sample'].map(sample2description)
ISdb['core'] = ISdb['sample'].map(sample2core)
ISdb['method'] = ISdb['sample'].map(sample2method)
ISdb['method'] = ['amicon' if 'amicon' in x else 'MOS & COL' for x in ISdb['method']]
ISdb['enrichment'] = ISdb['sample'].map(sample2enrichment)
ISdb['subgroup'] = ['MIGS_'+x if y == 'MIGS' else "Illumina_"+z for x,y,z in zip(ISdb['method'],ISdb['core'],ISdb['enrichment'])]

ISdb = ISdb[['sample', 'sample_complete_description','core','method','enrichment','subgroup',\
             'genome', 'length', 'coverage', 'breadth',
             'popANI_reference','conANI_reference','filtered_read_pair_count', 'reads_unfiltered_pairs',
             'reads_mean_PID', 'divergent_site_count', 'reads_unfiltered_reads','nucl_diversity', 
             'true_scaffolds', 'detected_scaffolds','coverage_median', 'coverage_std', 'coverage_SEM', 
             'breadth_minCov','breadth_expected', 'nucl_diversity_rarefied', 'iRep', 'iRep_GC_corrected', \
             'linked_SNV_count','SNV_distance_mean', 'r2_mean', 'd_prime_mean',\
             'consensus_divergent_sites', 'population_divergent_sites', 'SNS_count','SNV_count']]

ISdb = ISdb.sort_values('sample').reset_index(drop=True)
ISdb


Unnamed: 0,sample,sample_complete_description,core,method,enrichment,subgroup,genome,length,coverage,breadth,popANI_reference,conANI_reference,filtered_read_pair_count,reads_unfiltered_pairs,reads_mean_PID,divergent_site_count,reads_unfiltered_reads,nucl_diversity,true_scaffolds,detected_scaffolds,coverage_median,coverage_std,coverage_SEM,breadth_minCov,breadth_expected,nucl_diversity_rarefied,iRep,iRep_GC_corrected,linked_SNV_count,SNV_distance_mean,r2_mean,d_prime_mean,consensus_divergent_sites,population_divergent_sites,SNS_count,SNV_count
0,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Youcai_mosaic_virus-tax228578-GCF_000852505.1_ViralProj14869_genomic.fna,6303.0,0.048707,0.023322,0.000000,0.000000,3,3,0.972819,0.0,6,,1.0,1.0,0,0.337108,0.004316,0.000000,0.042097,,,False,,,,,0.0,0.0,0.0,0.0
1,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Wuhan_insect_virus_23_strain_WHCCII13263-tax1923727-GCF_001921615.1_ViralMultiSegProj358547_genomic.fna,2858.0,0.939468,0.479356,1.000000,1.000000,21,28,0.969949,0.0,56,0.000000,2.0,2.0,1,1.296242,0.026151,0.013296,0.563755,0.0,,False,,,,,0.0,0.0,0.0,0.0
2,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Bovine_coronavirus-tax11128-GCF_000862505.1_ViralProj15385_genomic.fna,31028.0,0.004093,0.004093,0.000000,0.000000,1,6,0.969073,0.0,12,,1.0,1.0,0,0.064052,0.000365,0.000000,0.003608,,,False,,,,,0.0,0.0,0.0,0.0
3,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Cactus_virus_X-tax112227-GCF_000856405.1_ViralProj14996_genomic.fna,6614.0,0.040067,0.040067,0.000000,0.000000,2,2,0.996599,0.0,4,,1.0,1.0,0,0.199020,0.002485,0.000000,0.034760,,,False,,,,,0.0,0.0,0.0,0.0
4,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Chayote_mosaic_virus-tax71030-GCF_000862665.1_ViralProj15420_genomic.fna,6364.0,0.039912,0.028127,0.000000,0.000000,2,7,0.909987,0.0,14,,1.0,1.0,0,0.252673,0.003219,0.000000,0.034629,,,False,,,,,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5439,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Lettuce_big_vein_associated_varicosavirus-tax1985698-GCF_000881815.1_ViralMultiSegProj32725_genomic.fna,12878.0,0.011725,0.011725,0.000000,0.000000,1,1,0.973510,0.0,8,,2.0,1.0,0,0.109338,0.000979,0.000000,0.010300,,,False,,,,,0.0,0.0,0.0,0.0
5440,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Lactobacillus_phage_JCL1032-tax37105-GCF_000902335.1_ViralProj181076_genomic.fna,49433.0,0.011005,0.001780,0.953125,0.953125,4,4,0.960526,3.0,28,0.000000,1.0,1.0,0,0.267160,0.001204,0.001295,0.009670,0.0,,False,,,,,3.0,3.0,3.0,0.0
5441,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Klebsiella_phage_4_LV_2017-tax1960658-GCF_002619645.1_ASM261964v1_genomic.fna,33540.0,0.027072,0.004532,0.986842,0.986842,6,10,0.971053,2.0,30,0.000000,1.0,1.0,0,0.402574,0.002205,0.004532,0.023621,0.0,,False,,,,,2.0,2.0,2.0,0.0
5442,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Salmonella_phage_RE_2010-tax929814-GCF_000903195.1_ViralProj181070_genomic.fna,34117.0,0.012428,0.003547,1.000000,1.000000,3,6,0.971390,0.0,23,0.000000,1.0,1.0,0,0.215198,0.001169,0.000879,0.010914,0.0,,False,,,,,0.0,0.0,0.0,0.0


In [68]:
print(len(ISdb['sample'].unique()))
print(len(ISdb['genome'].unique()))

34
1169


In [69]:
# save
ISdb.to_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/Wastewater_DeltaCoverage_All_Viral_Genomes_v1.csv',\
            index=False)

### Add relative abundance

In [70]:
ISdb2 = ISdb.copy()
ISdb2['sample_read_bases'] = ISdb2['sample'].map(Rdb.set_index('sample')['read_bases'].to_dict())
ISdb2['sample_reads'] = ISdb2['sample'].map(Rdb.set_index('sample')['reads'].to_dict())
ISdb2['relative_abundance (%)'] = [(r*2/tr)*100 for r,tr in zip(ISdb2['filtered_read_pair_count'], ISdb2['sample_reads'])]

ISdb2 = ISdb2[['sample', 'sample_complete_description','core','method','enrichment','subgroup',\
               'genome', 'length', 'relative_abundance (%)','breadth','coverage','sample_read_bases','sample_reads',
               'popANI_reference','conANI_reference','filtered_read_pair_count', 'reads_unfiltered_pairs',
               'reads_mean_PID', 'divergent_site_count', 'reads_unfiltered_reads','nucl_diversity', 
               'true_scaffolds', 'detected_scaffolds','coverage_median', 'coverage_std', 'coverage_SEM', 
               'breadth_minCov','breadth_expected', 'nucl_diversity_rarefied', 'iRep', 'iRep_GC_corrected', \
               'linked_SNV_count','SNV_distance_mean', 'r2_mean', 'd_prime_mean',\
               'consensus_divergent_sites', 'population_divergent_sites', 'SNS_count','SNV_count']]

ISdb2 = ISdb2.sort_values('sample').reset_index(drop=True)
ISdb2

Unnamed: 0,sample,sample_complete_description,core,method,enrichment,subgroup,genome,length,relative_abundance (%),breadth,coverage,sample_read_bases,sample_reads,popANI_reference,conANI_reference,filtered_read_pair_count,reads_unfiltered_pairs,reads_mean_PID,divergent_site_count,reads_unfiltered_reads,nucl_diversity,true_scaffolds,detected_scaffolds,coverage_median,coverage_std,coverage_SEM,breadth_minCov,breadth_expected,nucl_diversity_rarefied,iRep,iRep_GC_corrected,linked_SNV_count,SNV_distance_mean,r2_mean,d_prime_mean,consensus_divergent_sites,population_divergent_sites,SNS_count,SNV_count
0,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Youcai_mosaic_virus-tax228578-GCF_000852505.1_ViralProj14869_genomic.fna,6303.0,0.007302,0.023322,0.048707,6107540,82174,0.000000,0.000000,3,3,0.972819,0.0,6,,1.0,1.0,0,0.337108,0.004316,0.000000,0.042097,,,False,,,,,0.0,0.0,0.0,0.0
1,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Melon_necrotic_spot_virus-tax11987-GCF_000865645.1_ViralProj15502_genomic.fna,4266.0,0.002434,0.031177,0.031177,6107540,82174,0.000000,0.000000,1,1,0.979866,0.0,2,,1.0,1.0,0,0.177877,0.002790,0.000000,0.027154,,,False,,,,,0.0,0.0,0.0,0.0
2,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Tropical_soda_apple_mosaic_virus-tax327387-GCF_001654245.1_ViralProj322841_genomic.fna,6350.0,0.019471,0.084094,0.149921,6107540,82174,0.000000,0.000000,8,8,0.989111,0.0,16,,1.0,1.0,0,0.542597,0.006920,0.000000,0.123992,,,False,,,,,0.0,0.0,0.0,0.0
3,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Tomato_mosaic_virus-tax12253-GCF_000853705.1_ViralProj14926_genomic.fna,6383.0,0.116825,0.436002,0.950337,6107540,82174,1.000000,0.995833,48,67,0.974626,1.0,136,0.003333,1.0,1.0,0,1.602238,0.020378,0.037600,0.567921,0.000000,,False,,,,,1.0,0.0,0.0,1.0
4,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Tomato_brown_rugose_fruit_virus-tax1761477-GCF_001461485.1_ViralProj304915_genomic.fna,6393.0,1.591744,0.936806,13.321758,6107540,82174,0.999802,0.999207,654,743,0.990614,33.0,1524,0.004306,1.0,1.0,11,11.393603,0.144792,0.788675,0.999992,0.000000,1.607168,False,20.0,7.950,0.515256,0.97197,4.0,1.0,1.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5439,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Aeromonas_phage_Aes012-tax1198014-GCF_000907075.1_ViralProj195532_genomic.fna,161978.0,0.000500,0.001852,0.031294,1010081054,14005704,0.993333,0.993333,35,108,0.995379,2.0,304,0.000332,1.0,1.0,0,0.730426,0.001816,0.001852,0.027255,0.000000,,False,,,,,2.0,2.0,2.0,0.0
5440,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Aeromonas_phage_AS_gz-tax2026082-GCF_002629085.1_ASM262908v1_genomic.fna,162422.0,0.000071,0.000782,0.004642,1010081054,14005704,1.000000,1.000000,5,75,0.995129,0.0,289,0.000000,1.0,1.0,0,0.174726,0.000434,0.000782,0.004091,0.000000,,False,,,,,0.0,0.0,0.0,0.0
5441,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Cucumber_green_mottle_mosaic_virus_strain_SH-tax12235-GCF_000849225.1_ViralProj14681_genomic.fna,6424.0,0.019763,0.855542,31.806351,1010081054,14005704,0.985368,0.984983,1384,1793,0.985378,92.0,4166,0.001427,1.0,1.0,21,34.210298,0.433668,0.808531,1.000000,0.000944,,False,8.0,27.875,0.594470,1.00000,78.0,76.0,76.0,16.0
5442,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Corynebacterium_phage_SamW-tax2301601-GCF_003601335.1_ASM360133v1_genomic.fna,44609.0,0.000014,0.003385,0.003385,1010081054,14005704,0.000000,0.000000,1,1,0.973510,0.0,4,,1.0,1.0,0,0.058212,0.000276,0.000000,0.002984,,,False,,,,,0.0,0.0,0.0,0.0


In [71]:
# filtered by breadth

ISdb2_filtered = ISdb2[ISdb2['breadth']>=0.1].reset_index(drop=True)
ISdb2_filtered

Unnamed: 0,sample,sample_complete_description,core,method,enrichment,subgroup,genome,length,relative_abundance (%),breadth,coverage,sample_read_bases,sample_reads,popANI_reference,conANI_reference,filtered_read_pair_count,reads_unfiltered_pairs,reads_mean_PID,divergent_site_count,reads_unfiltered_reads,nucl_diversity,true_scaffolds,detected_scaffolds,coverage_median,coverage_std,coverage_SEM,breadth_minCov,breadth_expected,nucl_diversity_rarefied,iRep,iRep_GC_corrected,linked_SNV_count,SNV_distance_mean,r2_mean,d_prime_mean,consensus_divergent_sites,population_divergent_sites,SNS_count,SNV_count
0,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Tomato_mosaic_virus-tax12253-GCF_000853705.1_ViralProj14926_genomic.fna,6383.0,0.116825,0.436002,0.950337,6107540,82174,1.000000,0.995833,48,67,0.974626,1.0,136,0.003333,1.0,1.0,0,1.602238,0.020378,0.037600,0.567921,0.000000,,False,,,,,1.0,0.0,0.0,1.0
1,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Tomato_brown_rugose_fruit_virus-tax1761477-GCF_001461485.1_ViralProj304915_genomic.fna,6393.0,1.591744,0.936806,13.321758,6107540,82174,0.999802,0.999207,654,743,0.990614,33.0,1524,0.004306,1.0,1.0,11,11.393603,0.144792,0.788675,0.999992,0.000000,1.607168,False,20.0,7.950000,0.515256,0.971970,4.0,1.0,1.0,32.0
2,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,29903.0,1.445713,0.704979,2.508344,6107540,82174,0.999827,0.999482,594,662,0.988274,8.0,1377,0.004077,1.0,1.0,1,3.323712,0.019285,0.193860,0.890832,0.000000,,False,,,,,3.0,1.0,1.0,7.0
3,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Pepper_mild_mottle_virus_strain_S-tax12239-GCF_000859645.1_ViralProj15148_genomic.fna,6357.0,0.058413,0.339626,0.495989,6107540,82174,0.000000,0.000000,24,30,0.974398,0.0,61,,1.0,1.0,0,0.776961,0.009903,0.000000,0.354647,,,False,,,,,0.0,0.0,0.0,0.0
4,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,Cucumber_green_mottle_mosaic_virus_strain_SH-tax12235-GCF_000849225.1_ViralProj14681_genomic.fna,6424.0,0.029206,0.149440,0.259496,6107540,82174,1.000000,1.000000,12,13,0.981476,0.0,26,0.000000,1.0,1.0,0,0.770032,0.009761,0.008095,0.204779,0.000000,,False,,,,,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,29903.0,14.236542,0.701970,4979.995853,1010081054,14005704,0.998414,0.998126,996964,1043166,0.997563,72.0,3061737,0.001374,1.0,1.0,2962,6815.093249,39.543851,0.695984,1.000000,0.001237,2.546182,False,57.0,44.666667,0.517487,0.904007,39.0,33.0,33.0,39.0
689,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Tomato_brown_rugose_fruit_virus-tax1761477-GCF_001461485.1_ViralProj304915_genomic.fna,6393.0,0.028374,0.877366,45.930862,1010081054,14005704,0.998573,0.997681,1987,2632,0.997079,30.0,5454,0.001613,1.0,1.0,37,43.645683,0.554659,0.877053,1.000000,0.001194,,False,6.0,6.166667,0.573476,1.000000,13.0,8.0,8.0,22.0
690,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,Pepper_mild_mottle_virus_strain_S-tax12239-GCF_000859645.1_ViralProj15148_genomic.fna,6357.0,0.001885,0.229353,3.068429,1010081054,14005704,0.991555,0.991555,132,214,0.991006,14.0,475,0.000455,1.0,1.0,0,6.690919,0.085278,0.223533,0.933425,0.000000,,False,1.0,46.000000,1.000000,1.000000,12.0,12.0,12.0,2.0
691,pSQ2,pSQ2_ILLUMINA,ILLUMINA,MOS & COL,enriched,Illumina_enriched,uncultured_crAssphage-tax1211417-GCF_000922395.1_ViralProj259336_genomic.fna,97065.0,0.011467,0.122907,1.223974,1010081054,14005704,0.987179,0.986958,803,1006,0.985908,125.0,2573,0.000986,1.0,1.0,0,4.188498,0.013458,0.093216,0.660665,0.000000,,False,1.0,108.000000,1.000000,1.000000,118.0,116.0,116.0,9.0


In [72]:
print(len(ISdb2['sample'].unique()))
print(len(ISdb2['genome'].unique()))
print(len(ISdb2_filtered['sample'].unique()))
print(len(ISdb2_filtered['genome'].unique()))

34
1169
34
131


In [73]:
# save
ISdb2.to_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/Wastewater_DeltaCoverage_All_Viral_Genomes_v2_AddRelativeAbundance.csv',\
            index=False)
ISdb2_filtered.to_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/Wastewater_DeltaCoverage_All_Viral_Genomes_v2b_SetMinBreadth.csv',\
                     index=False)

### Load IS_mapping tables

In [74]:
tables = []
for IS_m in glob.glob('/groups/banfield/projects/human/data3/clou/wastewater/Mapping_to_viral_genomes/inStrain/all_viral_genomes-vs-*.IS/output/all_viral_genomes-vs-*.IS_mapping_info.tsv'):
    df = pd.read_csv(IS_m,sep='\t',skiprows=1).head(1)
    sample_name = os.path.basename(IS_m).split('-vs-')[1].split('.shrink')[0]
    df.insert(0,'sample',sample_name)
    tables.append(df)
    
RMdb = pd.concat(tables)
RMdb['sample_complete_description'] = RMdb['sample'].map(sample2description)
RMdb['core'] = RMdb['sample'].map(sample2core)
RMdb['method'] = RMdb['sample'].map(sample2method)
RMdb['method'] = ['amicon' if 'amicon' in x else 'MOS & COL' for x in RMdb['method']]
RMdb['enrichment'] = RMdb['sample'].map(sample2enrichment)
RMdb['subgroup'] = ['MIGS_'+x if y == 'MIGS' else "Illumina_"+z for x,y,z in zip(RMdb['method'],RMdb['core'],RMdb['enrichment'])]

RMdb = RMdb[['sample', 'sample_complete_description','core','method','enrichment','subgroup',\
              'filtered_pairs','unfiltered_reads','unfiltered_pairs','pass_pairing_filter',
             'pass_min_read_ani', 'unfiltered_priority_reads', 'filtered_singletons',
             'mean_insert_distance', 'mean_mistmaches', 'pass_max_insert',
             'unfiltered_singletons', 'mean_PID', 'pass_min_insert',
              'mean_mapq_score', 'filtered_priority_reads',
             'mean_pair_length', 'pass_min_mapq', 'median_insert']]

RMdb = RMdb.sort_values('sample').reset_index(drop=True)
RMdb

Unnamed: 0,sample,sample_complete_description,core,method,enrichment,subgroup,filtered_pairs,unfiltered_reads,unfiltered_pairs,pass_pairing_filter,pass_min_read_ani,unfiltered_priority_reads,filtered_singletons,mean_insert_distance,mean_mistmaches,pass_max_insert,unfiltered_singletons,mean_PID,pass_min_insert,mean_mapq_score,filtered_priority_reads,mean_pair_length,pass_min_mapq,median_insert
0,5_13_A_S1,5_13_A_S1_MIGS,MIGS,amicon,enriched,MIGS_amicon,1381,3313,1606,1606,1506.0,0,0,163.290162,1.908468,1598.0,101,0.987174,1499.0,39.1868,0,148.517435,1556.0,153.980075
1,5_13_C_S2,5_13_C_S2_MIGS,MIGS,amicon,enriched,MIGS_amicon,1076,2448,1199,1199,1146.0,0,0,207.155963,1.323603,1186.0,50,0.991113,1140.0,39.793161,0,148.83653,1171.0,186.267723
2,5_19_A_S4,5_19_A_S4_MIGS,MIGS,amicon,enriched,MIGS_amicon,8235,27313,12926,12926,10659.0,0,0,181.58951,3.669813,12889.0,1461,0.975357,12339.0,29.266362,0,148.754913,9471.0,172.181649
3,5_19_F_S3,5_19_F_S3_MIGS,MIGS,amicon,enriched,MIGS_amicon,24688,52292,25784,25784,25317.0,0,0,236.19516,0.620773,25596.0,724,0.995832,25426.0,40.722813,0,149.057594,25586.0,206.54433
4,5_28_A_S28,5_28_A_S28_MIGS,MIGS,amicon,enriched,MIGS_amicon,15543,37725,18367,18367,17026.0,0,0,185.489301,2.153318,18238.0,991,0.985212,17250.0,38.384603,0,145.459248,17386.0,173.818588
5,5_28_E_S26,5_28_E_S26_MIGS,MIGS,amicon,enriched,MIGS_amicon,17613,44947,21749,21749,20050.0,0,0,184.522599,2.243965,21622.0,1449,0.984538,19890.0,37.717412,0,145.062302,20323.0,165.175778
6,5_28_I_S25,5_28_I_S25_MIGS,MIGS,amicon,enriched,MIGS_amicon,19995,66690,31436,31436,25935.0,0,0,193.277294,3.649097,31269.0,3818,0.974949,29383.0,30.46733,0,145.380137,23812.0,178.327904
7,630-N,630-N_ILLUMINA,ILLUMINA,MOS & COL,unenriched,Illumina_unenriched,86312,294303,125967,125967,100799.0,0,0,265.029539,4.535307,124769.0,42369,0.970162,119073.0,30.69912,0,152.0,99942.0,214.164337
8,630-S1,630-S1_ILLUMINA,ILLUMINA,MOS & COL,unenriched,Illumina_unenriched,78763,314509,125337,125337,96115.0,0,0,281.792647,4.905511,122077.0,63835,0.967727,111917.0,30.117499,0,152.0,97192.0,180.354269
9,630-S2,630-S2_ILLUMINA,ILLUMINA,MOS & COL,unenriched,Illumina_unenriched,70088,248587,94821,94821,80458.0,0,0,224.799844,3.60726,93646.0,58945,0.976268,83223.0,34.493446,0,152.0,81965.0,154.877379


In [75]:
RMdb.to_csv('/groups/banfield/projects/human/data3/clou/wastewater/Datasheets/Wastewater_ReadMapping_All_Viral_Genomes_v1.csv',\
            index=False)

In [63]:
test = pd.read_csv('/groups/banfield/projects/human/data3/clou/wastewater/Mapping_to_viral_genomes/inStrain/all_viral_genomes-vs-6_02_Sc_S29.shrink.sort.bam.IS/output/all_viral_genomes-vs-6_02_Sc_S29.shrink.sort.bam.IS_mapping_info.tsv',\
                  sep='\t',skiprows=1)
test_nonan = test.dropna()
test_nonan

Unnamed: 0,scaffold,pass_pairing_filter,filtered_pairs,unfiltered_reads,pass_max_insert,mean_pair_length,mean_mistmaches,filtered_singletons,unfiltered_priority_reads,median_insert,pass_min_read_ani,unfiltered_pairs,pass_min_insert,mean_mapq_score,filtered_priority_reads,mean_insert_distance,mean_PID,unfiltered_singletons,pass_min_mapq
0,all_scaffolds,8872,5356,19227,8833.0,145.432710,4.091186,0,0,217.145514,7037.0,8872,8393.0,28.297453,0,246.735798,0.971912,1483,6354.0
54,NC_047783.1,1,0,2,1.0,147.000000,9.000000,0,0,178.000000,0.0,1,1.0,0.000000,0,178.000000,0.938776,0,0.0
56,NC_042138.1,6,0,12,6.0,146.000000,2.166667,0,0,353.000000,5.0,6,6.0,1.666667,0,305.666667,0.985353,0,0.0
62,NC_020488.1,4,0,8,4.0,147.000000,11.750000,0,0,250.500000,0.0,4,4.0,11.500000,0,250.500000,0.920068,0,2.0
81,NC_048189.1,2,0,4,2.0,146.500000,16.000000,0,0,103.000000,0.0,2,1.0,0.000000,0,103.000000,0.890807,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12018,NC_008208.1,2,0,5,2.0,146.500000,2.000000,0,0,194.000000,2.0,2,2.0,11.500000,0,194.000000,0.986325,1,0.0
12021,NC_028750.1,15,0,30,15.0,141.800000,1.000000,0,0,200.000000,15.0,15,13.0,2.333333,0,188.800000,0.993132,0,0.0
12023,NC_012042.1,2080,645,4790,2077.0,146.074038,6.921154,0,0,229.000000,1230.0,2080,2036.0,16.303846,0,241.833654,0.952634,630,944.0
12027,NC_028478.1,2293,2038,4659,2278.0,145.056258,1.423463,0,0,211.000000,2195.0,2293,2147.0,40.491060,0,232.317488,0.990148,73,2273.0
