In [21]:
import os
import sys
import glob
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm # progress bars :)
tqdm.pandas()

import mrcc
from mrcc.p_from_scaled_containment import compute_confidence_intervals, compute_confidence_interval_one_step

sns.set_context("paper")

In [22]:
pd.__version__

'0.24.2'

In [23]:
## Run the Test Data with mrcc.p_from_scaled_containment

In [24]:
%%bash
# run on command line
python -m mrcc.p_from_scaled_containment -L 100K -k 21 -c 0.95 --sccon 0.10605

Interval using two steps:
L	k	conf	Cks	CLow	CHigh	pLow	pHigh
100000	21	0.95	0.10605	0.10046	0.11191	0.09623	0.10655
Interval using a single step:
L	k	conf	Cks	pLow	pHigh	PtEst	Midpoint
100000	21	0.95	0.10605	0.09766	0.10500	0.10133	0.10133


In [25]:
# Reproduce the command-line two-step interval from within Python,
# using the same L, k, confidence, and observed containment values.
num_unique_kmers, ksize = 100000, 21
confidence, scaled = 0.95, 0.1
observed_containment = [0.10605]
compute_confidence_intervals(
    observed_containment,
    num_unique_kmers,
    ksize,
    confidence,
    scaled,
)

[[100000,
  21,
  0.95,
  0.10605,
  0.10046044759723283,
  0.11191185994396291,
  0.09623087565457937,
  0.10655865394035777]]

In [26]:
# One-step confidence interval for the small (L = 100000, k = 21) example.
observed_containment = [0.10605]
num_unique_kmers = 100000
ksize = 21
confidence = 0.95
scaled = 0.1
compute_confidence_interval_one_step(
    observed_containment, num_unique_kmers, ksize, confidence, scaled
)



[[100000,
  21,
  0.95,
  0.10605,
  0.09766031318520162,
  0.1050025284806568,
  0.1013393068788695,
  0.1013314208329292]]

# Import Preprocessed Simulated Read Data

In [27]:
%%bash
# Abort the cell on the first failing command so a failed download is not
# masked by the exit status of the final `ls`.
set -e
mkdir -p ../data
# -f: fail on HTTP errors instead of saving the server's error page as the
#     .csv.gz file; -L: follow the redirect issued by the download link.
curl -fL https://osf.io/f4a9k/download -o ../data/simreads-compare.dnainput.processed.csv.gz
ls ../data

simreads-compare.dnainput.csv.gz
simreads-compare.dnainput.processed.csv.gz
simreads-compare.dnainput.processed.mashD.csv.gz
simreads-compare.dnainput.processed.mrcc-twostep.csv.gz
simreads-compare.dnainput.processed.mrcc.csv.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   459  100   459    0     0    968      0 --:--:-- --:--:-- --:--:--   966
100 8139k  100 8139k    0     0  5039k      0  0:00:01  0:00:01 --:--:-- 17.9M


In [28]:
# Load the preprocessed simulated-read comparison table that the download
# cell wrote into ../data, then preview its first rows.
simDF = pd.read_csv("../data/simreads-compare.dnainput.processed.csv.gz")
simDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance
0,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1,0.192024,0.329152,0.329152,0.3155,4974666,5189923,1637423,dna-21,5082294.5,0.047855
1,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,100,0.193861,0.332344,0.332344,0.317521,49849,52176,16567,dna-21,5082294.5,0.047855
2,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1000,0.203267,0.343331,0.343331,0.332559,5001,5163,1717,dna-21,5082294.5,0.047855
3,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,2000,0.203029,0.344026,0.344026,0.331274,2494,2590,858,dna-21,5082294.5,0.047855
4,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,31,1,0.108379,0.199794,0.199794,0.191507,4974660,5189923,993907,dna-31,5082291.5,0.047855


In [29]:
# Two-step confidence interval for a single comparison with scaled = 1.0.
num_unique_kmers, ksize = 4980082, 21
confidence, scaled = 0.95, 1.0
observed_containment = [0.320984]

compute_confidence_intervals(
    observed_containment,
    num_unique_kmers,
    ksize,
    confidence,
    scaled,
)


[[4980082,
  21,
  0.95,
  0.320984,
  0.32098400000000005,
  0.32098400000000005,
  0.05243629673352139,
  0.052913080266991326]]

In [30]:
# One-step confidence interval for the scaled = 1.0 example
# (L = 4980082, k = 21, observed containment 0.320984).
observed_containment = [0.320984]
num_unique_kmers = 4980082
ksize = 21
confidence = 0.95
scaled = 1.0
compute_confidence_interval_one_step(
    observed_containment, num_unique_kmers, ksize, confidence, scaled
)

[[4980082,
  21,
  0.95,
  0.320984,
  0.052436296733521424,
  0.0529130802669913,
  0.05267454134109484,
  0.05267468850025636]]

In [31]:
# what gets returned from compute_confidence_interval_one_step
#values = [L,k,confidence,Cks,sol2,sol1,1.0-Cks**(1.0/k),(sol2+sol1)/2.0]
#L	k	conf	Cks	pLow	pHigh	PtEst	Midpoint

In [39]:
# function to run p_from_scaled_containment (one-step) on each row of the dataframe
def mrcc_maxcontain_one_step(row, confidence=0.95, return_ANI=False):
    """Add one-step point-mutation-rate estimates to one comparison row.

    Intended for row-wise use with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``"scaled"``, ``"max_containment"``,
        ``"num_unique_kmers"`` and ``"ksize"``.
    confidence : float, default 0.95
        Confidence level forwarded to ``compute_confidence_interval_one_step``.
    return_ANI : bool, default False
        If False, add ``mc_pLow``, ``mc_pHigh``, ``mc_pMid`` and ``mc_pEst``.
        If True, add the same quantities expressed as ``1 - p`` under
        ``mc_ANILow``, ``mc_ANIHigh``, ``mc_ANIMid`` and ``mc_ANIEst``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the four new entries appended; the input
        row is left unmodified.
    """
    # pandas advises against mutating the object handed to an applied
    # function, so build the result on a copy.
    result = row.copy()

    # The "scaled" column holds values such as 1, 100, 1000; the interval
    # routine is called with its reciprocal (e.g. 0.1 in the examples above).
    scaled_fraction = 1 / row["scaled"]
    interval = compute_confidence_interval_one_step(
        [row["max_containment"]],
        row["num_unique_kmers"],
        row["ksize"],
        confidence,
        scaled_fraction,
    )[0]
    # Result layout: [L, k, confidence, Cks, pLow, pHigh, PtEst, Midpoint]
    pLow, pHigh, ptEst, pMid = interval[4:]

    if return_ANI:
        # ANI = 1 - p reverses the ordering: the *upper* bound on p gives the
        # *lower* bound on ANI, and vice versa.
        result["mc_ANILow"] = 1 - pHigh
        result["mc_ANIHigh"] = 1 - pLow
        result["mc_ANIMid"] = 1 - pMid
        result["mc_ANIEst"] = 1 - ptEst
    else:
        result["mc_pLow"] = pLow
        result["mc_pHigh"] = pHigh
        result["mc_pMid"] = pMid
        result["mc_pEst"] = ptEst
    return result
#fullDF = simDF.apply(mrcc_maxcontain_one_step, axis=1)

In [40]:
# Run mrcc_maxcontain_one_step on every row of simDF (axis=1 passes rows).
# progress_apply is the tqdm-enabled counterpart of DataFrame.apply, made
# available by the tqdm.pandas() call in the imports cell.
# NOTE: this is slow; the recorded run took about 23 minutes for 273,600 rows.
fullDF = simDF.progress_apply(mrcc_maxcontain_one_step, axis=1) # with tqdm progress bar!
fullDF.head()

100%|██████████| 273600/273600 [22:47<00:00, 200.14it/s] 


Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance,mc_pLow,mc_pHigh,mc_pMid,mc_pEst
0,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1,0.192024,0.329152,0.329152,0.3155,4974666,5189923,1637423,dna-21,5082294.5,0.047855,0.051308,0.051773,0.05154,0.05154
1,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,100,0.193861,0.332344,0.332344,0.317521,49849,52176,16567,dna-21,5082294.5,0.047855,0.050506,0.051706,0.051106,0.051104
2,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1000,0.203267,0.343331,0.343331,0.332559,5001,5163,1717,dna-21,5082294.5,0.047855,0.047915,0.051384,0.049649,0.049634
3,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,2000,0.203029,0.344026,0.344026,0.331274,2494,2590,858,dna-21,5082294.5,0.047855,0.047135,0.052012,0.049573,0.049542
4,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,31,1,0.108379,0.199794,0.199794,0.191507,4974660,5189923,993907,dna-31,5082291.5,0.047855,0.050369,0.05088,0.050624,0.050624


In [41]:
fullDF.tail()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance,mc_pLow,mc_pHigh,mc_pMid,mc_pEst
273595,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,31,2000,0.0,0.0,0.0,0.0,1748,1587,0,dna-31,3321167.5,0.276761,1.0,1.0,1.0,1.0
273596,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,1,0.0,0.0,0.0,0.0,3361366,3280929,0,dna-51,3321147.5,0.276761,1.0,1.0,1.0,1.0
273597,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,100,0.0,0.0,0.0,0.0,33232,32597,0,dna-51,3321147.5,0.276761,1.0,1.0,1.0,1.0
273598,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,1000,0.0,0.0,0.0,0.0,3231,3246,0,dna-51,3321147.5,0.276761,1.0,1.0,1.0,1.0
273599,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,2000,0.0,0.0,0.0,0.0,1644,1585,0,dna-51,3321147.5,0.276761,1.0,1.0,1.0,1.0


In [42]:
# Persist the augmented table (input columns plus the mc_p* estimates) so the
# slow row-wise computation does not need to be repeated; index=False leaves
# the row index out of the file.
fullDF.to_csv("../data/simreads-compare.dnainput.processed.mrcc-onestep.csv.gz", index=False)

In [43]:
# print information from a sequence comparison that initially failed
simDF.iloc[9622]

comparison_name          data-d0.15-f1-nogam-seed188
sig1_name           data-d0.15-f1-nogam-seed188-seq1
sig2_name           data-d0.15-f1-nogam-seed188-seq2
alphabet                                         dna
ksize                                             51
scaled                                          1000
jaccard                                  0.000412159
max_containment                          0.000837696
sig1_containment                         0.000810701
sig2_containment                         0.000837696
sig1_hashes                                     4934
sig2_hashes                                     4775
num_common                                         4
alpha-ksize                                   dna-51
num_unique_kmers                         4.85339e+06
true p-distance                             0.133207
Name: 9622, dtype: object

In [44]:
fullDF.iloc[9622]

comparison_name          data-d0.15-f1-nogam-seed188
sig1_name           data-d0.15-f1-nogam-seed188-seq1
sig2_name           data-d0.15-f1-nogam-seed188-seq2
alphabet                                         dna
ksize                                             51
scaled                                          1000
jaccard                                  0.000412159
max_containment                          0.000837696
sig1_containment                         0.000810701
sig2_containment                         0.000837696
sig1_hashes                                     4934
sig2_hashes                                     4775
num_common                                         4
alpha-ksize                                   dna-51
num_unique_kmers                         4.85339e+06
true p-distance                             0.133207
mc_pLow                                     0.113458
mc_pHigh                                    0.145634
mc_pMid                                     0.