In [1]:
import os
import sys
import glob
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import mrcc
from mrcc.p_from_scaled_containment import compute_confidence_intervals

sns.set_context("paper")

In [2]:
pd.__version__

'0.24.2'

## Import Simulated Read Data

In [3]:
%%bash
mkdir -p ../data
curl -L https://osf.io/xn7vt/download -o ../data/simreads-compare.dnainput.csv.gz
ls ../data

simreads-compare.dnainput.csv.gz
simreads-compare.dnainput.processed.csv.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   459  100   459    0     0   1243      0 --:--:-- --:--:-- --:--:--  1243
100 7119k  100 7119k    0     0  2951k      0  0:00:02  0:00:02 --:--:-- 8495k


In [4]:
infoDF = pd.read_csv("../data/simreads-compare.dnainput.csv.gz")
infoDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize
0,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1,0.190577,0.320984,0.319303,1.0,4993153,4967010,1594331,dna-21
1,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,100,0.192184,0.322739,0.322739,1.0,49616,49718,16013,dna-21
2,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1000,0.197087,0.332343,0.326269,1.0,5143,5049,1678,dna-21
3,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,2000,0.20125,0.338455,0.331748,1.0,2523,2473,837,dna-21
4,data-d0.05-f1-nogam-seed54,data-d0.05-f1-nogam-seed54-seq1,data-d0.05-f1-nogam-seed54-seq2,dna,21,1,0.19241,0.323544,0.323544,1.0,4991507,5016847,1614971,dna-21


### Use `scaled=1` sketches to get an average number of unique k-mers for distance estimation.

We do not yet have the most accurate quantification of the total number of unique hashes in each sketch. 
To get around this for now, I sketched at scaled=1,100,1000,2000 and we can use the number of unique hashes/k-mers from the `scaled=1` sketches for all estimations. 
Here I create a `num_unique_kmers` column, which is the average number of k-mers/hashes in the `scaled=1` sketches for each comparison. 

In [5]:
sc1 = infoDF.loc[infoDF["scaled"]==1]
sc1.tail()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize
273580,data-d0.95-f3-gamma-seed412,data-d0.95-f3-gamma-seed412-seq1,data-d0.95-f3-gamma-seed412-seq2,dna,51,1,0.0,0.0,0.0,1.0,3505310,2765595,0,dna-51
273584,data-d0.95-f3-gamma-seed473,data-d0.95-f3-gamma-seed473-seq1,data-d0.95-f3-gamma-seed473-seq2,dna,51,1,0.0,0.0,0.0,1.0,3671446,3503606,0,dna-51
273588,data-d0.95-f3-gamma-seed522,data-d0.95-f3-gamma-seed522-seq1,data-d0.95-f3-gamma-seed522-seq2,dna,51,1,0.0,0.0,0.0,1.0,3404874,3321307,0,dna-51
273592,data-d0.95-f3-gamma-seed552,data-d0.95-f3-gamma-seed552-seq1,data-d0.95-f3-gamma-seed552-seq2,dna,51,1,0.0,0.0,0.0,1.0,3404821,3248995,0,dna-51
273596,data-d0.95-f3-gamma-seed562,data-d0.95-f3-gamma-seed562-seq1,data-d0.95-f3-gamma-seed562-seq2,dna,51,1,0.0,0.0,0.0,1.0,3365815,3869882,0,dna-51


In [6]:
# get a standardized num_unique k-mers per comparison (average between both sketches at scaled=1)
sc1["num_unique_kmers"] = (sc1["sig1_hashes"] + sc1["sig2_hashes"])/2 # do I need to round this??
sc1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers
0,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1,0.190577,0.320984,0.319303,1.0,4993153,4967010,1594331,dna-21,4980081.5
4,data-d0.05-f1-nogam-seed54,data-d0.05-f1-nogam-seed54-seq1,data-d0.05-f1-nogam-seed54-seq2,dna,21,1,0.19241,0.323544,0.323544,1.0,4991507,5016847,1614971,dna-21,5004177.0
8,data-d0.05-f1-nogam-seed79,data-d0.05-f1-nogam-seed79-seq1,data-d0.05-f1-nogam-seed79-seq2,dna,21,1,0.191926,0.323909,0.320199,1.0,4968639,4911727,1590955,dna-21,4940183.0
12,data-d0.05-f1-nogam-seed43,data-d0.05-f1-nogam-seed43-seq1,data-d0.05-f1-nogam-seed43-seq2,dna,21,1,0.18982,0.321989,0.321989,1.0,4981405,5072457,1603958,dna-21,5026931.0
16,data-d0.05-f1-nogam-seed20,data-d0.05-f1-nogam-seed20-seq1,data-d0.05-f1-nogam-seed20-seq2,dna,21,1,0.185178,0.317006,0.317006,1.0,4972356,5116083,1576268,dna-21,5044219.5


In [7]:
# merge in the scaled=1 num_unique k-mers into the full dataframe
infoDF = pd.merge(infoDF, sc1[["comparison_name", "alpha-ksize", "num_unique_kmers"]], on=["comparison_name", "alpha-ksize"])
infoDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers
0,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1,0.190577,0.320984,0.319303,1.0,4993153,4967010,1594331,dna-21,4980081.5
1,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,100,0.192184,0.322739,0.322739,1.0,49616,49718,16013,dna-21,4980081.5
2,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1000,0.197087,0.332343,0.326269,1.0,5143,5049,1678,dna-21,4980081.5
3,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,2000,0.20125,0.338455,0.331748,1.0,2523,2473,837,dna-21,4980081.5
4,data-d0.05-f1-nogam-seed54,data-d0.05-f1-nogam-seed54-seq1,data-d0.05-f1-nogam-seed54-seq2,dna,21,1,0.19241,0.323544,0.323544,1.0,4991507,5016847,1614971,dna-21,5004177.0


In [8]:
# load fasta simulation information 
siminfo = pd.read_csv("../simreads-info.csv.gz")
# rename some useful columns
siminfo.rename(columns={"name": "comparison_name", "p-distance":"true p-distance"}, inplace=True)
siminfo.head()

Unnamed: 0,comparison_name,seed,freq(T),freq(C),freq(A),freq(G),rate(C-T),rate(A-T),rate(G-T),rate(A-C),rate(C-G),alpha,lgt1,lgt2,sites,core,true p-distance
0,data-d0.05-f1-gamma-seed1,1,0.25,0.25,0.25,0.25,1.57881,0.188961,0.184296,0.277635,0.571672,0.239,4997288,5120205,5532162,4585331,0.041891
1,data-d0.05-f1-gamma-seed2,2,0.25,0.25,0.25,0.25,1.30318,0.337478,0.282495,0.389976,0.85799,0.313,4978497,5080470,5532500,4526467,0.043604
2,data-d0.05-f1-gamma-seed3,3,0.25,0.25,0.25,0.25,1.6758,0.370299,0.254104,0.507523,0.28185,0.29,4973221,4744010,5261801,4455430,0.042999
3,data-d0.05-f1-gamma-seed4,4,0.25,0.25,0.25,0.25,1.96182,0.237414,0.177498,0.47509,0.293021,0.322,4979167,5068807,5394562,4653412,0.042863
4,data-d0.05-f1-gamma-seed5,5,0.25,0.25,0.25,0.25,1.60369,0.260495,0.183868,0.338577,0.185786,0.291,4987735,5023978,5329783,4681930,0.042532


In [9]:
# merge true p-distance info estimated distances dataframe
infoDF = pd.merge(infoDF, siminfo[["comparison_name", "true p-distance"]], on=["comparison_name"])
infoDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance
0,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1,0.190577,0.320984,0.319303,1.0,4993153,4967010,1594331,dna-21,4980081.5,0.047995
1,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,100,0.192184,0.322739,0.322739,1.0,49616,49718,16013,dna-21,4980081.5,0.047995
2,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1000,0.197087,0.332343,0.326269,1.0,5143,5049,1678,dna-21,4980081.5,0.047995
3,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,2000,0.20125,0.338455,0.331748,1.0,2523,2473,837,dna-21,4980081.5,0.047995
4,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,31,1,0.107025,0.193865,0.19285,1.0,4993150,4967004,962929,dna-31,4980077.0,0.047995


In [10]:
# replace any zeroes in jaccard, containment with np.nan to avoid errors
cols = ["jaccard", "max_containment", "sig1_containment", "sig2_containment"]
infoDF[cols].replace(['0', 0], np.nan, inplace=True)
infoDF.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance
0,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1,0.190577,0.320984,0.319303,1.0,4993153,4967010,1594331,dna-21,4980081.5,0.047995
1,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,100,0.192184,0.322739,0.322739,1.0,49616,49718,16013,dna-21,4980081.5,0.047995
2,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,1000,0.197087,0.332343,0.326269,1.0,5143,5049,1678,dna-21,4980081.5,0.047995
3,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,21,2000,0.20125,0.338455,0.331748,1.0,2523,2473,837,dna-21,4980081.5,0.047995
4,data-d0.05-f1-nogam-seed36,data-d0.05-f1-nogam-seed36-seq1,data-d0.05-f1-nogam-seed36-seq2,dna,31,1,0.107025,0.193865,0.19285,1.0,4993150,4967004,962929,dna-31,4980077.0,0.047995


In [12]:
# save this csv.gz for use elsewhere
infoDF.to_csv("../data/simreads-compare.dnainput.processed.csv.gz", index=False)