In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

## Build Theoretical Distributions for all datasets

Mock datasets and sequencing types
- Zymo-D6331
    - HiFi-Zymo-D6331
- Zymo-D6300
    - Illumina-Zymo-D6300
    - ONT-Q20-Zymo-D6300
    - ONT-R10-Zymo-D6300
    - SR-Sim-Zymo-D6300
- ATCC-MSA1003
    - HiFi-ATCC-MSA1003
    - Illumina-ATCC-MSA1003
    - SR-Sim-ATCC-MSA1003

all read sets:
 - HiFi-Zymo-D6331
 - HiFi-ATCC-MSA1003
 - Illumina-ATCC-MSA1003
 - SR-Sim-ATCC-MSA1003
 - Illumina-Zymo-D6300
 - ONT-Q20-Zymo-D6300
 - ONT-R10-Zymo-D6300
 - SR-Sim-Zymo-D6300


### Zymo-D6331

info: https://files.zymoresearch.com/protocols/_d6331_zymobiomics_gut_microbiome_standard.pdf

![image.png](attachment:705deae4-7a62-4259-8b0b-c63d63d42cc3.png)


In [2]:
# need to account for some differences in taxonomy for one of the Zymo species
# here, Lactobacillus fermentum = Limosilactobacillus fermentum
zymo_species = ['Faecalibacterium prausnitzii','Veillonella rogosae',
                'Roseburia hominis','Bacteroides fragilis','Escherichia coli',
                'Prevotella corporis','Bifidobacterium adolescentis',
                'Fusobacterium nucleatum','Limosilactobacillus fermentum',
                'Clostridioides difficile','Akkermansia muciniphila',
                'Methanobrevibacter smithii','Salmonella enterica',
                'Enterococcus faecalis','Clostridium perfringens',
                'Candida albicans','Saccharomyces cerevisiae']

# remove Veillonella rogosae and Prevotella corporis
zymo_species_adjusted = ['Faecalibacterium prausnitzii',
                'Roseburia hominis','Bacteroides fragilis',
                'Escherichia coli','Bifidobacterium adolescentis',
                'Fusobacterium nucleatum','Limosilactobacillus fermentum',
                'Clostridioides difficile','Akkermansia muciniphila',
                'Methanobrevibacter smithii','Salmonella enterica',
                'Enterococcus faecalis','Clostridium perfringens',
                'Candida albicans','Saccharomyces cerevisiae']


zymo_genera = ['Faecalibacterium','Veillonella',
               'Roseburia','Bacteroides','Escherichia',
               'Prevotella','Bifidobacterium',
               'Fusobacterium','Limosilactobacillus',
               'Clostridioides','Akkermansia',
               'Methanobrevibacter','Salmonella',
               'Enterococcus','Clostridium',
               'Candida','Saccharomyces']

theoretical_dists = [0.14, 0.14, 0.14, 0.14, 0.14, 
                     0.06, 0.06, 0.06, 0.06, 
                     0.015, 0.015, 0.001, 
                     0.0001, 0.00001, 0.000001, 
                     0.015, 0.014]#, 0]

theoretical_dists_adjusted = [0.14, 0.14, 0.14, 0.14, 0.14, 
                     0.06, 0.06, 0.06, 0.06, 
                     0.015, 0.015, 0.001, 
                     0.0001, 0.00001, 0.000001, 
                     0.015, 0.014]

HiFi_Zymo_D6331_reads = 1978852

In [3]:
species_D6331 = pd.DataFrame(theoretical_dists, index=zymo_species, columns=["Theoretical Distribution"])
species_D6331['HiFi_Zymo_D6331_theoretical_reads'] = species_D6331['Theoretical Distribution'] * HiFi_Zymo_D6331_reads

species_D6331
#species_D6331_adjusted = pd.DataFrame(theoretical_dists_adjusted, index=zymo_species, columns=["Theoretical Distribution"])

Unnamed: 0,Theoretical Distribution,HiFi_Zymo_D6331_theoretical_reads
Faecalibacterium prausnitzii,0.14,277039.28
Veillonella rogosae,0.14,277039.28
Roseburia hominis,0.14,277039.28
Bacteroides fragilis,0.14,277039.28
Escherichia coli,0.14,277039.28
Prevotella corporis,0.06,118731.12
Bifidobacterium adolescentis,0.06,118731.12
Fusobacterium nucleatum,0.06,118731.12
Limosilactobacillus fermentum,0.06,118731.12
Clostridioides difficile,0.015,29682.78


In [4]:
genus_D6331 = pd.DataFrame(theoretical_dists, index=zymo_genera, columns=["Theoretical Distribution"])
genus_D6331['HiFi_Zymo_D6331_theoretical_reads'] = genus_D6331['Theoretical Distribution'] * HiFi_Zymo_D6331_reads

genus_D6331

Unnamed: 0,Theoretical Distribution,HiFi_Zymo_D6331_theoretical_reads
Faecalibacterium,0.14,277039.28
Veillonella,0.14,277039.28
Roseburia,0.14,277039.28
Bacteroides,0.14,277039.28
Escherichia,0.14,277039.28
Prevotella,0.06,118731.12
Bifidobacterium,0.06,118731.12
Fusobacterium,0.06,118731.12
Limosilactobacillus,0.06,118731.12
Clostridioides,0.015,29682.78


In [5]:
## write to theoretical dist files

species_D6331.to_csv("HiFi_Zymo_D6331.theoretical-distrib.species.csv", index_label="Taxon" )
genus_D6331.to_csv("HiFi_Zymo_D6331.theoretical-distrib.genus.csv", index_label = "Taxon")

### Zymo-D6300

datasheet: https://files.zymoresearch.com/datasheets/ds1706_zymobiomics_microbial_community_standards_data_sheet.pdf

![image.png](attachment:c393fee6-bb83-4a0e-8de4-0c486c0c275e.png)

In [6]:
# need to account for some differences in taxonomy for one of the Zymo species
# here, Lactobacillus fermentum = Limosilactobacillus fermentum
# Bacillus subtilis = Bacillus spizizenii 
zymo_species = ['Pseudomonas aeruginosa','Escherichia coli',
                'Salmonella enterica','Limosilactobacillus fermentum',
                'Enterococcus faecalis','Staphylococcus aureus',
                'Listeria monocytogenes','Bacillus subtilis',
                'Saccharomyces cerevisiae','Cryptococcus neoformans']

zymo_genera = ['Pseudomonas','Escherichia',
               'Salmonella','Limosilactobacillus',
               'Enterococcus','Staphylococcus',
               'Listeria','Bacillus',
               'Saccharomyces','Cryptococcus']

theoretical_dists = [0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.02, 0.02]#, 0]

# diff read sets
#label = "ILM_ZymoD6300"
Illumina_Zymo_D6300_total_reads = 20000000
ONT_Q20_ZymoD6300_total_reads = 2000000
ONT_R10_ZymoD6300_total_reads = 275318
ONTShortReads_ZymoD6300_total_reads = 20000000



In [7]:
species_D6300 = pd.DataFrame(theoretical_dists, index=zymo_species, columns=["Theoretical Distribution"])
species_D6300['Illumina_Zymo_D6300_theoretical_reads'] = species_D6300['Theoretical Distribution'] * Illumina_Zymo_D6300_total_reads
species_D6300['ONT_Q20_Zymo_D6300_full_theoretical_reads'] = species_D6300['Theoretical Distribution'] * ONT_Q20_ZymoD6300_total_reads
species_D6300['ONT_R10_Zymo_D6300_theoretical_reads'] = species_D6300['Theoretical Distribution'] * ONT_R10_ZymoD6300_total_reads
species_D6300['ONT_Q20_Zymo_D6300_short_theoretical_reads'] = species_D6300['Theoretical Distribution'] * ONTShortReads_ZymoD6300_total_reads

species_D6300

Unnamed: 0,Theoretical Distribution,Illumina_Zymo_D6300_theoretical_reads,ONT_Q20_Zymo_D6300_full_theoretical_reads,ONT_R10_Zymo_D6300_theoretical_reads,ONT_Q20_Zymo_D6300_short_theoretical_reads
Pseudomonas aeruginosa,0.12,2400000.0,240000.0,33038.16,2400000.0
Escherichia coli,0.12,2400000.0,240000.0,33038.16,2400000.0
Salmonella enterica,0.12,2400000.0,240000.0,33038.16,2400000.0
Limosilactobacillus fermentum,0.12,2400000.0,240000.0,33038.16,2400000.0
Enterococcus faecalis,0.12,2400000.0,240000.0,33038.16,2400000.0
Staphylococcus aureus,0.12,2400000.0,240000.0,33038.16,2400000.0
Listeria monocytogenes,0.12,2400000.0,240000.0,33038.16,2400000.0
Bacillus subtilis,0.12,2400000.0,240000.0,33038.16,2400000.0
Saccharomyces cerevisiae,0.02,400000.0,40000.0,5506.36,400000.0
Cryptococcus neoformans,0.02,400000.0,40000.0,5506.36,400000.0


In [8]:
genus_D6300 = pd.DataFrame(theoretical_dists, index=zymo_genera, columns=["Theoretical Distribution"])
genus_D6300['Illumina_Zymo_D6300_theoretical_reads'] = genus_D6300['Theoretical Distribution'] * Illumina_Zymo_D6300_total_reads
genus_D6300['ONT_Q20_Zymo_D6300_full_theoretical_reads'] = genus_D6300['Theoretical Distribution'] * ONT_Q20_ZymoD6300_total_reads
genus_D6300['ONT_R10_Zymo_D6300_theoretical_reads'] = genus_D6300['Theoretical Distribution'] * ONT_R10_ZymoD6300_total_reads
genus_D6300['ONT_Q20_Zymo_D6300_short_theoretical_reads'] = genus_D6300['Theoretical Distribution'] * ONTShortReads_ZymoD6300_total_reads
genus_D6300

Unnamed: 0,Theoretical Distribution,Illumina_Zymo_D6300_theoretical_reads,ONT_Q20_Zymo_D6300_full_theoretical_reads,ONT_R10_Zymo_D6300_theoretical_reads,ONT_Q20_Zymo_D6300_short_theoretical_reads
Pseudomonas,0.12,2400000.0,240000.0,33038.16,2400000.0
Escherichia,0.12,2400000.0,240000.0,33038.16,2400000.0
Salmonella,0.12,2400000.0,240000.0,33038.16,2400000.0
Limosilactobacillus,0.12,2400000.0,240000.0,33038.16,2400000.0
Enterococcus,0.12,2400000.0,240000.0,33038.16,2400000.0
Staphylococcus,0.12,2400000.0,240000.0,33038.16,2400000.0
Listeria,0.12,2400000.0,240000.0,33038.16,2400000.0
Bacillus,0.12,2400000.0,240000.0,33038.16,2400000.0
Saccharomyces,0.02,400000.0,40000.0,5506.36,400000.0
Cryptococcus,0.02,400000.0,40000.0,5506.36,400000.0


In [9]:
## write to theoretical dist files

species_D6300.to_csv("Zymo_D6300.theoretical-distrib.species.csv", index_label="Taxon" )
genus_D6300.to_csv("Zymo_D6300.theoretical-distrib.genus.csv", index_label = "Taxon")

In [10]:
!pwd

/Users/tessa/dib-lab/2022-lr-tax/notebooks


### ATCC-MSA1003

info: https://www.atcc.org/products/msa-1003

![image.png](attachment:7c118a21-8806-47ce-9782-93534da55d14.png)

In [11]:
# need to account for some differences in taxonomy for the ATCC species
# here, Rhodobacter sphaeroides = Luteovulum sphaeroides = Cereibacter sphaeroides
# Bacteroides vulgatus = Phocaeicola vulgatus
# Bacillus pacificus = Bacillus cereus in NCBI (specifically ATCC 10987), use cereus only
atcc_species = ['Escherichia coli','Porphyromonas gingivalis',
                'Luteovulum sphaeroides','Staphylococcus epidermidis',
                'Streptococcus mutans','Bacillus cereus',
                'Clostridium beijerinckii','Pseudomonas aeruginosa',
                'Staphylococcus aureus','Streptococcus agalactiae',
                'Acinetobacter baumannii','Cutibacterium acnes',
                'Helicobacter pylori','Lactobacillus gasseri',
                'Neisseria meningitidis','Phocaeicola vulgatus', 
                'Bifidobacterium adolescentis', 'Deinococcus radiodurans', 
                'Enterococcus faecalis', 'Schaalia odontolytica']

atcc_genera = ['Staphylococcus', 'Streptococcus',
               'Escherichia', 'Porphyromonas', 
               'Luteovulum', 'Bacillus', 'Clostridium',
               'Pseudomonas', 'Acinetobacter', 
               'Cutibacterium', 'Helicobacter',
               'Lactobacillus', 'Neisseria',
               'Phocaeicola', 'Bifidobacterium',
               'Deinococcus', 'Enterococcus',
               'Schaalia']

# because there are multiple species in some of the genera,
# we need two theoretical distributions here, one for 
# species and one for genus

theoretical_dists_sp = [18, 18, 18, 18, 18,
                        1.8, 1.8, 1.8, 1.8, 1.8,
                        0.18, 0.18, 0.18, 0.18, 0.18,
                        0.02, 0.02, 0.02, 0.02, 0.02]#, 0]

theoretical_dists_gn = [19.8, 19.8, 
                        18, 18, 18, 
                        1.8, 1.8, 1.8,
                        0.18, 0.18, 0.18, 0.18, 0.18,
                        0.02, 0.02, 0.02, 0.02, 0.02]#, 0]


HiFi_ATCC_MSA1003_total_reads = 2419037
ILM125_ATCC_MSA1003_total_reads = 10038314
HiFi_ShortSimReads_ATCC_MSA1003_total_reads = 24182940

In [12]:
species_ATCC = pd.DataFrame(theoretical_dists_sp, index=atcc_species, columns=["Theoretical Distribution"])
species_ATCC['HiFi_ATCC_MSA1003_full_theoretical_reads'] = species_ATCC['Theoretical Distribution'] * HiFi_ATCC_MSA1003_total_reads
species_ATCC['Illumina_ATCC_MSA1003_theoretical_reads'] = species_ATCC['Theoretical Distribution'] * ILM125_ATCC_MSA1003_total_reads
species_ATCC['HiFi_ATCC_MSA1003_short_theoretical_reads'] = species_ATCC['Theoretical Distribution'] * HiFi_ShortSimReads_ATCC_MSA1003_total_reads

species_ATCC

Unnamed: 0,Theoretical Distribution,HiFi_ATCC_MSA1003_full_theoretical_reads,Illumina_ATCC_MSA1003_theoretical_reads,HiFi_ATCC_MSA1003_short_theoretical_reads
Escherichia coli,18.0,43542666.0,180689700.0,435292920.0
Porphyromonas gingivalis,18.0,43542666.0,180689700.0,435292920.0
Luteovulum sphaeroides,18.0,43542666.0,180689700.0,435292920.0
Staphylococcus epidermidis,18.0,43542666.0,180689700.0,435292920.0
Streptococcus mutans,18.0,43542666.0,180689700.0,435292920.0
Bacillus cereus,1.8,4354266.6,18068970.0,43529292.0
Clostridium beijerinckii,1.8,4354266.6,18068970.0,43529292.0
Pseudomonas aeruginosa,1.8,4354266.6,18068970.0,43529292.0
Staphylococcus aureus,1.8,4354266.6,18068970.0,43529292.0
Streptococcus agalactiae,1.8,4354266.6,18068970.0,43529292.0


In [13]:
genus_ATCC = pd.DataFrame(theoretical_dists_gn, index=atcc_genera, columns=["Theoretical Distribution"])
genus_ATCC['HiFi_ATCC_MSA1003_full_theoretical_reads'] = genus_ATCC['Theoretical Distribution'] * HiFi_ATCC_MSA1003_total_reads
genus_ATCC['Illumina_ATCC_MSA1003_theoretical_reads'] = genus_ATCC['Theoretical Distribution'] * ILM125_ATCC_MSA1003_total_reads
genus_ATCC['HiFi_ATCC_MSA1003_short_theoretical_reads'] = genus_ATCC['Theoretical Distribution'] * HiFi_ShortSimReads_ATCC_MSA1003_total_reads

genus_ATCC

Unnamed: 0,Theoretical Distribution,HiFi_ATCC_MSA1003_full_theoretical_reads,Illumina_ATCC_MSA1003_theoretical_reads,HiFi_ATCC_MSA1003_short_theoretical_reads
Staphylococcus,19.8,47896932.6,198758600.0,478822212.0
Streptococcus,19.8,47896932.6,198758600.0,478822212.0
Escherichia,18.0,43542666.0,180689700.0,435292920.0
Porphyromonas,18.0,43542666.0,180689700.0,435292920.0
Luteovulum,18.0,43542666.0,180689700.0,435292920.0
Bacillus,1.8,4354266.6,18068970.0,43529292.0
Clostridium,1.8,4354266.6,18068970.0,43529292.0
Pseudomonas,1.8,4354266.6,18068970.0,43529292.0
Acinetobacter,0.18,435426.66,1806897.0,4352929.2
Cutibacterium,0.18,435426.66,1806897.0,4352929.2


In [14]:
## write to theoretical dist files

species_ATCC.to_csv("ATCC_MSA1003.theoretical-distrib.species.csv", index_label="Taxon" )
genus_ATCC.to_csv("ATCC_MSA1003.theoretical-distrib.genus.csv", index_label = "Taxon")