In [1]:
import os
import sys
import glob
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import mrcc
from mrcc.p_from_scaled_containment import compute_confidence_intervals

sns.set_context("paper")

In [2]:
pd.__version__

'0.24.2'

## Run the Test Data with mrcc.p_from_scaled_containment

In [3]:
%%bash
# run on command line
python -m mrcc.p_from_scaled_containment -L 100K -k 21 -c 0.95 --sccon 0.10605

L	k	conf	Cks	CLow	CHigh	pLow	pHigh
100000	21	0.95	0.10605	0.10046	0.11191	0.09623	0.10655


In [4]:
# run the same within python:

num_unique_kmers = 100000
ksize = 21
confidence=0.95
observed_containment=[0.10605]
scaled=0.1
compute_confidence_intervals(observed_containment, num_unique_kmers, ksize, confidence, scaled)

[[100000,
  21,
  0.95,
  0.10605,
  mpf('0.10046044759723280359624752115346686087576683874616503'),
  mpf('0.11191185994396290217198335874293883856388369710875699'),
  0.09623087565457938,
  0.10655865394035784]]

# Import Preprocessed Simulated Read Data

In [5]:
%%bash
mkdir -p ../data
curl -L https://osf.io/f4a9k/download -o ../data/simreads-compare.dnainput.processed.csv.gz
ls ../data

simreads-compare.dnainput.csv.gz
simreads-compare.dnainput.processed.csv.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   459  100   459    0     0    692      0 --:--:-- --:--:-- --:--:--   692
100 7421k  100 7421k    0     0  2289k      0  0:00:03  0:00:03 --:--:-- 3100k


In [6]:
simDF = pd.read_csv("../data/simreads-compare.dnainput.processed.csv.gz")
simDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance
0,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1,0.190577,0.320984,0.319303,1.0,4993153,4967010,1594331,dna-21,4980081.5,0.047995
1,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,100,0.192184,0.322739,0.322739,1.0,49616,49718,16013,dna-21,4980081.5,0.047995
2,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1000,0.197087,0.332343,0.326269,1.0,5143,5049,1678,dna-21,4980081.5,0.047995
3,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,2000,0.20125,0.338455,0.331748,1.0,2523,2473,837,dna-21,4980081.5,0.047995
4,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,31,1,0.107025,0.193865,0.19285,1.0,4993150,4967004,962929,dna-31,4980077.0,0.047995


### Try mrcc to estimate p from scaled containment

In [7]:
# works with containment <=0.15, not above?
num_unique_kmers=4980082
ksize=21
confidence=0.95
scaled=1.0
observed_containment=[0.15]

compute_confidence_intervals(observed_containment, num_unique_kmers, ksize, confidence, scaled)

[[4980082,
  21,
  0.95,
  0.15,
  mpf('0.14999999999999999444888487686254876590750012202087263'),
  mpf('0.14999999999999999444888487686254876590750012202087263'),
  0.08602929375779904,
  0.08672776348578592]]

In [8]:
## First, test some values directly
num_unique_kmers=4980082
ksize=21
confidence=0.95
scaled=1.0
observed_containment=[0.320984]

compute_confidence_intervals(observed_containment, num_unique_kmers, ksize, confidence, scaled)

[[4980082,
  21,
  0.95,
  0.320984,
  mpf('0.32098399999999999154809415814113018102829718478688624'),
  mpf('0.32098399999999999154809415814113018102829718478688624'),
  0.05243629673352131,
  0.052913080266991375]]

### Use the following function to get values for all comparisons...

In [9]:
# function run p_from_scaled_containment on full dataframe
def mrcc_maxcontain(row, confidence=0.95, return_ANI=False):
    scaled_perc = 1/row["scaled"]
    mc = row["max_containment"]
    num_kmers = row["num_unique_kmers"]
    ksize = row["ksize"]
    mc_results = compute_confidence_intervals([mc], num_kmers, ksize, confidence, scaled_perc)[0]
    pLow,pHigh = mc_results[6:]
    if return_ANI:
        row["mc_ANILow"] = 1 - pLow
        row["mc_ANIHigh"] = 1- pHigh   
    else:
        row["mc_pLow"] = pLow
        row["mc_pHigh"] = pHigh
    return row

In [10]:
# to run p_from_scaled_containment on full dataframe
fullDF = simDF.apply(mrcc_maxcontain, axis=1)
fullDF.head()

TypeError: ("cannot create mpf from mpc(real='-0.00348803037430721399437575587078819790990038705147875', imag='-0.0016112491358384129670216132400565549155284115430919072')", 'occurred at index 9622')

In [15]:
simDF.iloc[9622]

comparison_name          data-d0.15-f1-nogam-seed107
sig1_name           data-d0.15-f1-nogam-seed107-seq1
sig2_name           data-d0.15-f1-nogam-seed107-seq2
alphabet                                         dna
ksize                                             51
scaled                                          1000
jaccard                                  0.000103734
max_containment                          0.000212044
sig1_containment                         0.000212044
sig2_containment                                   1
sig1_hashes                                     4716
sig2_hashes                                     4925
num_common                                         1
alpha-ksize                                   dna-51
num_unique_kmers                         4.85846e+06
true p-distance                             0.134357
Name: 9622, dtype: object

In [None]:
fullDF.to_csv("../data/simreads-compare.dnainput.mrcc.csv.gz")