In [73]:
import os
import sys
import glob
import scipy
import skbio
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
sns.set_style('white')
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42

from IPython.display import display, HTML
import tRep

import warnings
# Silence every warning raised from this point on.
warnings.filterwarnings(action="ignore")

# Raise pandas' display limits so wide/long tables are shown with less truncation.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 200)

### Load relevant metadata tables

In [74]:
# Coverage and read-mapping tables stored under tables/IS_outputs.
# Each path is read with pd.read_csv using its default settings.
Ndb, Ndb_raw, RMdb = map(pd.read_csv, (
    '/data/viral_genome_analysis/tables/IS_outputs/Wastewater_DeltaCoverage_All_Viral_Genomes_v2b_SetMinBreadth.csv',
    '/data/viral_genome_analysis/tables/IS_outputs/Wastewater_DeltaCoverage_All_Viral_Genomes_v2_AddRelativeAbundance.csv',
    '/data/viral_genome_analysis/tables/IS_outputs/Wastewater_ReadMapping_All_Viral_Genomes_v1.csv',
))

# Genome-name, read-depth and sample metadata tables stored under tables/metadata.
Gdb, Rdb, Mdb = map(pd.read_csv, (
    '/data/viral_genome_analysis/tables/metadata/viral_name_df.csv',
    '/data/viral_genome_analysis/tables/metadata/waster_water_readDepths_formatted.csv',
    '/data/viral_genome_analysis/tables/metadata/sample_metadata_v2_sample_name_adjusted.csv',
))

### Load the viruses-of-interest table

In [75]:
# Tab-separated table that maps each virus common name to a reference genome file name.
Vdb = pd.read_csv(
    '/data/viral_genome_analysis/tables/virus_of_interests/viruses.txt',
    sep='\t',
)
# Display the table ordered by common name, descending (Vdb itself is not modified).
Vdb.sort_values(by='common_name', ascending=False)

Unnamed: 0,common_name,genome
18,zika virus,Zika_virus_strain_MR_766-tax64320-GCF_000882815.3_ViralProj36615_genomic.fna
19,zika virus,Zika_virus_strain_Natal_RGN-tax64320-GCF_002366285.1_ViralProj411812_genomic.fna
2,rotavirus,Adult_diarrheal_rotavirus_strain_J19-tax335103-GCF_000864245.1_ViralMultiSegProj16144_genomic.fna
3,rotavirus,Human_rotavirus_B_strain_Bang373-tax10942-GCF_000907835.1_ViralMultiSegProj209367_genomic.fna
21,rhinovirus,Rhinovirus_A-tax147711-GCF_000862245.1_ViralProj15330_genomic.fna
23,rhinovirus,Rhinovirus_C_strain_024-tax463676-GCF_000872325.1_ViralProj27901_genomic.fna
22,rhinovirus,Rhinovirus_B14-tax12131-GCF_000861265.1_ViralProj15309_genomic.fna
26,respiratory syncytial virus,Human_orthopneumovirus-tax11250-GCF_002815475.1_ASM281547v1_genomic.fna
25,respiratory syncytial virus,Human_orthopneumovirus_strain_B1-tax11250-GCF_000855545.1_ViralProj15003_genomic.fna
24,respiratory syncytial virus,Respiratory_syncytial_virus_strain_S2_ts1C-tax12814-GCF_000856445.1_ViralProj15004_genomic.fna


In [76]:
# Number of distinct genome file names in the viruses-of-interest table.
# dropna=False counts a missing value as one entry, matching len(unique()).
print(Vdb['genome'].nunique(dropna=False))

47


### Check for naming inconsistencies between the virus metadata and the virus genome files

In [77]:
# Collect the file name of every individual viral genome FASTA (*fna) in the
# reference directory. glob.glob() returns matches in arbitrary order, so the
# paths are sorted to keep the table (and any CSV written from it) reproducible.
genome_paths = sorted(
    glob.glob('/groups/banfield/projects/human/data3/clou/wastewater/viral_complete_genomes/indi_genomes/*fna')
)

# Build the column explicitly so a 'genome' column exists even when nothing
# matches; otherwise later gdb['genome'] lookups would raise KeyError.
table = {'genome': [os.path.basename(path) for path in genome_paths]}
gdb = pd.DataFrame(table)

# Number of genome files found.
len(gdb)

9504

In [None]:
# Save the genome file-name table as CSV, without the index column.
gdb.to_csv(
    '/data/viral_genome_analysis/tables/metadata/all_virus_genomes_name.csv',
    index=False,
)

In [78]:
# Keep only the genome files whose names appear in the viruses-of-interest table.
is_virus_of_interest = gdb['genome'].isin(Vdb['genome'].tolist())
gdb_selected = gdb.loc[is_virus_of_interest]

# Number of distinct genome names that matched.
gdb_selected['genome'].nunique(dropna=False)

47

In [79]:
# Genome names listed in Vdb that have no matching genome file name;
# an empty set means the two naming schemes agree.
set(Vdb['genome']).difference(gdb_selected['genome'])

set()

### Now find these viruses in the samples

In [80]:
# Rows of the raw coverage table whose genome is one of the viruses of interest.
genomes_of_interest = Vdb['genome'].tolist()
Ndb_raw_selected = Ndb_raw.loc[Ndb_raw['genome'].isin(genomes_of_interest)]
Ndb_raw_selected

Unnamed: 0,sample,sample_complete_description,core,method,enrichment,subgroup,genome,length,relative_abundance (%),breadth,coverage,sample_read_bases,sample_reads,popANI_reference,conANI_reference,filtered_read_pair_count,reads_unfiltered_pairs,reads_mean_PID,divergent_site_count,reads_unfiltered_reads,nucl_diversity,true_scaffolds,detected_scaffolds,coverage_median,coverage_std,coverage_SEM,breadth_minCov,breadth_expected,nucl_diversity_rarefied,iRep,iRep_GC_corrected,linked_SNV_count,SNV_distance_mean,r2_mean,d_prime_mean,consensus_divergent_sites,population_divergent_sites,SNS_count,SNV_count
56,5_19_A_S4,5_19_A_S4_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_coronavirus_OC43_strain_ATCC_VR_759-tax31631-GCF_003972325.1_ASM397232v1_genomic.fna,30741.0,0.000583,0.004294,0.004294,25431914,343252,0.0,0.0,1,10,0.975216,0.0,21,,1.0,1.0,0,0.0656,0.000375,0.0,0.003784,,,False,,,,,0.0,0.0,0.0,0.0
57,5_19_A_S4,5_19_A_S4_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_4_NI_strain_HBoV4_NI_385-tax1511883-GCF_000886375.1_ViralProj38243_genomic.fna,5104.0,0.005827,0.090909,0.260188,25431914,343252,0.977011,0.977011,10,601,0.954404,2.0,1364,0.002815,1.0,1.0,0,1.111483,0.015873,0.017045,0.205265,0.0,,False,,,,,2.0,2.0,2.0,0.0
58,5_19_A_S4,5_19_A_S4_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,5242.0,0.76154,0.93781,32.860168,25431914,343252,0.998735,0.99726,1307,1788,0.981823,37.0,3754,0.004135,1.0,1.0,35,19.942285,0.280877,0.904998,1.0,0.00311,,False,15.0,25.066667,0.735926,1.0,13.0,6.0,6.0,31.0
59,5_19_A_S4,5_19_A_S4_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_2c_PK-tax1511882-GCF_000882675.1_ViralProj33891_genomic.fna,5196.0,0.583245,0.657236,27.008468,25431914,343252,0.970803,0.963119,1001,3678,0.952956,150.0,8036,0.013569,1.0,1.0,6,42.404268,0.599987,0.500962,1.0,0.016049,,False,244.0,52.733607,0.353956,0.949669,96.0,76.0,70.0,80.0
154,5_28_A_S28,5_28_A_S28_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,5242.0,0.003271,0.186952,0.371423,71023874,978422,0.0,0.0,16,23,0.966985,0.0,55,,1.0,1.0,0,0.921035,0.012972,0.0,0.279613,,,False,,,,,0.0,0.0,0.0,0.0
194,5_28_E_S26,5_28_E_S26_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_2c_PK-tax1511882-GCF_000882675.1_ViralProj33891_genomic.fna,5196.0,0.044986,0.662433,5.872787,72838764,1009198,0.969231,0.965242,227,791,0.953807,63.0,1725,0.004304,1.0,1.0,2,9.333466,0.132061,0.33776,0.994404,0.0,,False,,,,,61.0,54.0,53.0,10.0
214,5_28_E_S26,5_28_E_S26_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,5242.0,0.05549,0.871232,6.680465,72838764,1009198,1.0,0.998742,280,388,0.982258,15.0,806,0.003648,1.0,1.0,7,4.854814,0.068378,0.606639,0.997257,0.0,,False,,,,,4.0,0.0,0.0,15.0
217,5_28_E_S26,5_28_E_S26_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_4_NI_strain_HBoV4_NI_385-tax1511883-GCF_000886375.1_ViralProj38243_genomic.fna,5104.0,0.000198,0.018809,0.018809,72838764,1009198,0.0,0.0,1,124,0.959198,0.0,271,,1.0,1.0,0,0.138538,0.001979,0.0,0.016471,,,False,,,,,0.0,0.0,0.0,0.0
311,5_28_I_S25,5_28_I_S25_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_4_NI_strain_HBoV4_NI_385-tax1511883-GCF_000886375.1_ViralProj38243_genomic.fna,5104.0,0.00641,0.148315,1.019592,89771642,1248062,0.981595,0.972393,40,1525,0.952458,14.0,3533,0.014743,1.0,1.0,0,3.630397,0.051847,0.063871,0.593552,0.0,,False,,,,,9.0,6.0,5.0,9.0
312,5_28_I_S25,5_28_I_S25_MIGS,MIGS,amicon,enriched,MIGS_amicon,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,5242.0,0.111533,0.955361,16.937428,89771642,1248062,1.0,0.999093,696,1216,0.973594,32.0,2640,0.004902,1.0,1.0,17,10.244754,0.144292,0.841091,1.0,0.0,,False,6.0,17.666667,0.802174,1.0,4.0,0.0,0.0,32.0


In [81]:
# Distinct viruses of interest that appear in the coverage table:
# print how many there are, then the genome names themselves.
detected_genomes = Ndb_raw_selected['genome'].unique()
print(len(detected_genomes))
print(detected_genomes)

5
['Human_coronavirus_OC43_strain_ATCC_VR_759-tax31631-GCF_003972325.1_ASM397232v1_genomic.fna'
 'Human_bocavirus_4_NI_strain_HBoV4_NI_385-tax1511883-GCF_000886375.1_ViralProj38243_genomic.fna'
 'Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna'
 'Human_bocavirus_2c_PK-tax1511882-GCF_000882675.1_ViralProj33891_genomic.fna'
 'Norovirus_GI_strain_Hu_JP_2000_GI_6_PNA1_WUG1-tax122928-GCF_008703985.1_ASM870398v1_genomic.fna']


In [82]:
# Columns kept for the compact per-sample summary of the viruses of interest.
summary_columns = [
    'sample', 'core', 'method', 'enrichment', 'genome',
    'relative_abundance (%)', 'breadth', 'coverage',
    'filtered_read_pair_count', 'reads_unfiltered_pairs',
]

# Select those columns and renumber the rows from 0.
Ndb_raw_selected_simple = Ndb_raw_selected[summary_columns].reset_index(drop=True)
Ndb_raw_selected_simple

Unnamed: 0,sample,core,method,enrichment,genome,relative_abundance (%),breadth,coverage,filtered_read_pair_count,reads_unfiltered_pairs
0,5_19_A_S4,MIGS,amicon,enriched,Human_coronavirus_OC43_strain_ATCC_VR_759-tax31631-GCF_003972325.1_ASM397232v1_genomic.fna,0.000583,0.004294,0.004294,1,10
1,5_19_A_S4,MIGS,amicon,enriched,Human_bocavirus_4_NI_strain_HBoV4_NI_385-tax1511883-GCF_000886375.1_ViralProj38243_genomic.fna,0.005827,0.090909,0.260188,10,601
2,5_19_A_S4,MIGS,amicon,enriched,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,0.76154,0.93781,32.860168,1307,1788
3,5_19_A_S4,MIGS,amicon,enriched,Human_bocavirus_2c_PK-tax1511882-GCF_000882675.1_ViralProj33891_genomic.fna,0.583245,0.657236,27.008468,1001,3678
4,5_28_A_S28,MIGS,amicon,enriched,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,0.003271,0.186952,0.371423,16,23
5,5_28_E_S26,MIGS,amicon,enriched,Human_bocavirus_2c_PK-tax1511882-GCF_000882675.1_ViralProj33891_genomic.fna,0.044986,0.662433,5.872787,227,791
6,5_28_E_S26,MIGS,amicon,enriched,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,0.05549,0.871232,6.680465,280,388
7,5_28_E_S26,MIGS,amicon,enriched,Human_bocavirus_4_NI_strain_HBoV4_NI_385-tax1511883-GCF_000886375.1_ViralProj38243_genomic.fna,0.000198,0.018809,0.018809,1,124
8,5_28_I_S25,MIGS,amicon,enriched,Human_bocavirus_4_NI_strain_HBoV4_NI_385-tax1511883-GCF_000886375.1_ViralProj38243_genomic.fna,0.00641,0.148315,1.019592,40,1525
9,5_28_I_S25,MIGS,amicon,enriched,Human_bocavirus_3_strain_W471-tax638313-GCF_000882855.1_ViralProj37291_genomic.fna,0.111533,0.955361,16.937428,696,1216


In [83]:
# Save the compact viruses-of-interest summary as CSV, without the index column.
Ndb_raw_selected_simple.to_csv(
    '/data/viral_genome_analysis/tables/virus_of_interests/Wastewater_virus_of_interest_breadth_read_count_per_sample.csv',
    index=False,
)

### Find SARS-CoV-2 in all samples

In [84]:
# Rows of the raw coverage table whose genome name contains the SARS-CoV-2
# reference name, renumbered from 0.
is_sars_cov2 = Ndb_raw['genome'].str.contains('Severe_acute_respiratory_syndrome_coronavirus_2')
Ndb_raw_covid19 = Ndb_raw.loc[is_sars_cov2].reset_index(drop=True)

# Compact view: sample, genome, breadth, coverage and filtered read-pair count.
covid19_columns = ['sample', 'genome', 'breadth', 'coverage', 'filtered_read_pair_count']
Ndb_raw_covid19_simple = Ndb_raw_covid19[covid19_columns]
Ndb_raw_covid19_simple

Unnamed: 0,sample,genome,breadth,coverage,filtered_read_pair_count
0,5_13_A_S1,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.704979,2.508344,594
1,5_13_C_S2,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.94422,3.716851,821
2,5_19_A_S4,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.997927,14.679631,3325
3,5_19_F_S3,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.998228,111.80925,24053
4,5_28_A_S28,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.231616,0.470789,115
5,5_28_E_S26,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.783333,3.045781,734
6,5_28_I_S25,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.994315,12.160419,2883
7,630-S1,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.012005,0.014547,4
8,630-S2,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.00505,0.00505,1
9,6_02_Ac_S31,Severe_acute_respiratory_syndrome_coronavirus_2-tax2697049-GCF_009858895.2_ASM985889v3_genomic.fna,0.135873,0.215029,50


In [85]:
# Number of rows in the SARS-CoV-2 table.
Ndb_raw_covid19_simple.shape[0]

32

In [86]:
# Save the SARS-CoV-2 table as CSV, without the index column.
Ndb_raw_covid19_simple.to_csv(
    '/data/viral_genome_analysis/tables/virus_of_interests/Wastewater_sars_cov2.csv',
    index=False,
)