In [1]:
import os
import sys
import glob
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm # progress bars :)
# Hook tqdm into pandas; the `progress_apply` call later in this notebook relies on this.
tqdm.pandas()

import mrcc
from mrcc.p_from_scaled_containment import compute_confidence_intervals

# Select seaborn's "paper" plotting context for figures.
sns.set_context("paper")

In [2]:
# Show the pandas version this notebook was run with (for reproducibility).
pd.__version__

'0.24.2'

## Run the Test Data with mrcc.p_from_scaled_containment

In [3]:
%%bash
# Run the mrcc module from the command line on one test value.
# Per the printed header, the flags map to the output columns as:
#   -L -> L, -k -> k, -c -> conf, --sccon -> Cks
# and the tool reports CLow/CHigh and pLow/pHigh for that input.
python -m mrcc.p_from_scaled_containment -L 100K -k 21 -c 0.95 --sccon 0.10605

L	k	conf	Cks	CLow	CHigh	pLow	pHigh
100000	21	0.95	0.10605	0.10046	0.11191	0.09623	0.10655


In [4]:
# run the same within python:
# Arguments are passed positionally as
# (containments, num_unique_kmers, ksize, confidence, scaled); the result is a
# list with one 8-element row per containment value.

num_unique_kmers = 100000
ksize = 21
confidence=0.95
observed_containment=[0.10605]
# NOTE(review): the command-line run did not pass a scaled value yet printed the
# same numbers, so 0.1 is presumably the CLI default — confirm.
scaled=0.1
compute_confidence_intervals(observed_containment, num_unique_kmers, ksize, confidence, scaled)

[[100000,
  21,
  0.95,
  0.10605,
  0.10046044759723283,
  0.11191185994396291,
  0.09623087565457937,
  0.10655865394035777]]

# Import Preprocessed Simulated Read Data

In [5]:
%%bash
# Download the preprocessed simulated-read comparison table from OSF into ../data.
# Stop at the first failing command so that a failed download is not hidden by
# the trailing `ls` (the cell's exit status is otherwise that of `ls`).
set -e
mkdir -p ../data
# --fail: exit non-zero on an HTTP error instead of writing the server's error
# page to the .csv.gz path; -L: follow redirects.
curl --fail -L https://osf.io/f4a9k/download -o ../data/simreads-compare.dnainput.processed.csv.gz
ls ../data

simreads-compare.dnainput.csv.gz
simreads-compare.dnainput.mrcc.csv.gz
simreads-compare.dnainput.processed.csv.gz
simreads-compare.dnainput.processed.mashD.csv.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   459  100   459    0     0   1195      0 --:--:-- --:--:-- --:--:--  1195
100 8139k  100 8139k    0     0  5200k      0  0:00:01  0:00:01 --:--:-- 7215k


In [6]:
# Load the table downloaded in the previous cell and preview the first rows.
# Columns read by later cells: "scaled", "max_containment", "num_unique_kmers", "ksize".
simDF = pd.read_csv("../data/simreads-compare.dnainput.processed.csv.gz")
simDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance
0,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1,0.192024,0.329152,0.329152,0.3155,4974666,5189923,1637423,dna-21,5082294.5,0.047855
1,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,100,0.193861,0.332344,0.332344,0.317521,49849,52176,16567,dna-21,5082294.5,0.047855
2,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1000,0.203267,0.343331,0.343331,0.332559,5001,5163,1717,dna-21,5082294.5,0.047855
3,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,2000,0.203029,0.344026,0.344026,0.331274,2494,2590,858,dna-21,5082294.5,0.047855
4,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,31,1,0.108379,0.199794,0.199794,0.191507,4974660,5189923,993907,dna-31,5082291.5,0.047855


### Try mrcc to estimate p from scaled containment

In [7]:
# works with containment <=0.15, not above?
# NOTE(review): the next cell runs containment 0.320984 with the same settings and
# its recorded output shows a result, so the question above looks resolved — confirm
# and drop the question if so.
num_unique_kmers=4980082
ksize=21
confidence=0.95
scaled=1.0
observed_containment=[0.15]

compute_confidence_intervals(observed_containment, num_unique_kmers, ksize, confidence, scaled)

[[4980082,
  21,
  0.95,
  0.15,
  0.1499999999999999,
  0.1499999999999999,
  0.08602929375779907,
  0.08672776348578598]]

In [8]:
## First, test some values directly
# Same settings as the previous cell (scaled=1.0, k=21, 95% confidence), but with
# a higher observed containment value.
num_unique_kmers=4980082
ksize=21
confidence=0.95
scaled=1.0
observed_containment=[0.320984]

compute_confidence_intervals(observed_containment, num_unique_kmers, ksize, confidence, scaled)

[[4980082,
  21,
  0.95,
  0.320984,
  0.32098400000000005,
  0.32098400000000005,
  0.05243629673352139,
  0.052913080266991326]]

### Use the following function to get values for all comparisons...

In [9]:
# function to run p_from_scaled_containment on each row of the full dataframe
def mrcc_maxcontain(row, confidence=0.95, return_ANI=False):
    """Add mrcc confidence bounds derived from max containment to one row.

    Intended for row-wise use, e.g. ``df.apply(mrcc_maxcontain, axis=1)``.
    The row's max containment is passed to ``compute_confidence_intervals``
    and the last two values of its result (pLow, pHigh) are attached to a
    copy of the row.

    Parameters
    ----------
    row : pandas.Series or dict-like
        Must provide ``"scaled"``, ``"max_containment"``,
        ``"num_unique_kmers"`` and ``"ksize"``. It is not modified.
    confidence : float, default 0.95
        Confidence level forwarded to ``compute_confidence_intervals``.
    return_ANI : bool, default False
        If True, store the bounds as ``1 - p`` under ``"mc_ANILow"`` /
        ``"mc_ANIHigh"`` instead of ``"mc_pLow"`` / ``"mc_pHigh"``.

    Returns
    -------
    A copy of ``row`` with the two bound entries added.
    """
    # compute_confidence_intervals is given the reciprocal of the row's
    # "scaled" value (e.g. scaled=1000 -> 0.001).
    scaled_perc = 1 / row["scaled"]
    mc_results = compute_confidence_intervals(
        [row["max_containment"]],
        row["num_unique_kmers"],
        row["ksize"],
        confidence,
        scaled_perc,
    )[0]
    # Result layout: L, k, conf, Cks, CLow, CHigh, pLow, pHigh.
    pLow, pHigh = mc_results[6:]

    # Work on a copy: pandas does not support functions that mutate the
    # object passed in by DataFrame.apply.
    row = row.copy()
    if return_ANI:
        # 1 - p reverses the ordering, so the lower ANI bound comes from pHigh
        # and the upper ANI bound from pLow.
        row["mc_ANILow"] = 1 - pHigh
        row["mc_ANIHigh"] = 1 - pLow
    else:
        row["mc_pLow"] = pLow
        row["mc_pHigh"] = pHigh
    return row
#fullDF = simDF.apply(mrcc_maxcontain, axis=1)

In [10]:
# to run p_from_scaled_containment on full dataframe
# Row-wise apply of mrcc_maxcontain with its defaults (confidence=0.95,
# return_ANI=False), which adds the mc_pLow / mc_pHigh columns.
# The recorded run took about 12 minutes for 273600 rows.
fullDF = simDF.progress_apply(mrcc_maxcontain, axis=1) # with tqdm progress bar!
fullDF.head()

100%|██████████| 273600/273600 [11:42<00:00, 389.22it/s]


Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance,mc_pLow,mc_pHigh
0,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1,0.192024,0.329152,0.329152,0.3155,4974666,5189923,1637423,dna-21,5082294.5,0.047855,0.051308,0.051773
1,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,100,0.193861,0.332344,0.332344,0.317521,49849,52176,16567,dna-21,5082294.5,0.047855,0.050323,0.051893
2,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1000,0.203267,0.343331,0.343331,0.332559,5001,5163,1717,dna-21,5082294.5,0.047855,0.047708,0.051601
3,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,2000,0.203029,0.344026,0.344026,0.331274,2494,2590,858,dna-21,5082294.5,0.047855,0.046926,0.052235
4,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,31,1,0.108379,0.199794,0.199794,0.191507,4974660,5189923,993907,dna-31,5082291.5,0.047855,0.050369,0.05088


In [11]:
# Inspect the last rows of the annotated table.
fullDF.tail()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance,mc_pLow,mc_pHigh
273595,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,31,2000,0.0,0.0,0.0,0.0,1748,1587,0,dna-31,3321167.5,0.276761,1.0,1.0
273596,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,1,0.0,0.0,0.0,0.0,3361366,3280929,0,dna-51,3321147.5,0.276761,1.0,1.0
273597,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,100,0.0,0.0,0.0,0.0,33232,32597,0,dna-51,3321147.5,0.276761,1.0,1.0
273598,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,1000,0.0,0.0,0.0,0.0,3231,3246,0,dna-51,3321147.5,0.276761,1.0,1.0
273599,data-d0.95-f3-gamma-seed482,data-d0.95-f3-gamma-seed482-seq1,data-d0.95-f3-gamma-seed482-seq2,dna,51,2000,0.0,0.0,0.0,0.0,1644,1585,0,dna-51,3321147.5,0.276761,1.0,1.0


In [14]:
# Save the annotated table next to the input data; index=False leaves the row
# index out of the written file.
fullDF.to_csv("../data/simreads-compare.dnainput.processed.mrcc.csv.gz", index=False)

In [15]:
# print information from a sequence comparison that initially failed
# (selected by position, row 9622 of the input table)
simDF.iloc[9622]

comparison_name          data-d0.15-f1-nogam-seed188
sig1_name           data-d0.15-f1-nogam-seed188-seq1
sig2_name           data-d0.15-f1-nogam-seed188-seq2
alphabet                                         dna
ksize                                             51
scaled                                          1000
jaccard                                  0.000412159
max_containment                          0.000837696
sig1_containment                         0.000810701
sig2_containment                         0.000837696
sig1_hashes                                     4934
sig2_hashes                                     4775
num_common                                         4
alpha-ksize                                   dna-51
num_unique_kmers                         4.85339e+06
true p-distance                             0.133207
Name: 9622, dtype: object