In [13]:
import os
import sys
import glob
import pandas as pd
import seaborn as sns
import numpy as  np
import matplotlib.pyplot as plt

from tqdm import tqdm # progress bars :)
tqdm.pandas()

In [2]:
pd.__version__

'0.24.2'

# Import Preprocessed Simulated Read Data

In [3]:
%%bash
mkdir -p ../data
curl -L https://osf.io/f4a9k/download -o ../data/simreads-compare.dnainput.processed.csv.gz
ls ../data

simreads-compare.dnainput.csv.gz
simreads-compare.dnainput.mrcc.csv.gz
simreads-compare.dnainput.processed.csv.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   459  100   459    0     0   1289      0 --:--:-- --:--:-- --:--:--  1285
100 8139k  100 8139k    0     0  2311k      0  0:00:03  0:00:03 --:--:-- 4163k


In [8]:
simDF = pd.read_csv("../data/simreads-compare.dnainput.processed.csv.gz")
simDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance
0,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1,0.192024,0.329152,0.329152,0.3155,4974666,5189923,1637423,dna-21,5082294.5,0.047855
1,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,100,0.193861,0.332344,0.332344,0.317521,49849,52176,16567,dna-21,5082294.5,0.047855
2,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1000,0.203267,0.343331,0.343331,0.332559,5001,5163,1717,dna-21,5082294.5,0.047855
3,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,2000,0.203029,0.344026,0.344026,0.331274,2494,2590,858,dna-21,5082294.5,0.047855
4,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,31,1,0.108379,0.199794,0.199794,0.191507,4974660,5189923,993907,dna-31,5082291.5,0.047855


In [9]:
# replace any zeroes in jaccard, containment with np.nan to avoid errors
cols = ["jaccard", "max_containment", "sig1_containment", "sig2_containment"]
simDF[cols] = simDF[cols].replace(['0', 0], np.nan)#, inplace=True)
simDF.head()

Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance
0,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1,0.192024,0.329152,0.329152,0.3155,4974666,5189923,1637423,dna-21,5082294.5,0.047855
1,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,100,0.193861,0.332344,0.332344,0.317521,49849,52176,16567,dna-21,5082294.5,0.047855
2,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1000,0.203267,0.343331,0.343331,0.332559,5001,5163,1717,dna-21,5082294.5,0.047855
3,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,2000,0.203029,0.344026,0.344026,0.331274,2494,2590,858,dna-21,5082294.5,0.047855
4,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,31,1,0.108379,0.199794,0.199794,0.191507,4974660,5189923,993907,dna-31,5082291.5,0.047855


In [10]:
def mash_evoldist(row):
    pdist = 1- ((2*row["jaccard"]/(row["jaccard"]+1))** (1/row["ksize"]))
    row["jaccard_pdist"] = -np.log(1 - pdist) #default log base is e
    c_pdist = 1- ((2*row["max_containment"]/(row["max_containment"]+1))** (1/row["ksize"]))
    row["contain_pdist"] = -np.log(1 - c_pdist) #default log base is e
    return row

In [14]:
mDistDF = simDF.progress_apply(mash_evoldist, axis=1)
mDistDF.head()

100%|██████████| 273600/273600 [09:06<00:00, 500.19it/s]


Unnamed: 0,comparison_name,sig1_name,sig2_name,alphabet,ksize,scaled,jaccard,max_containment,sig1_containment,sig2_containment,sig1_hashes,sig2_hashes,num_common,alpha-ksize,num_unique_kmers,true p-distance,jaccard_pdist,contain_pdist
0,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1,0.192024,0.329152,0.329152,0.3155,4974666,5189923,1637423,dna-21,5082294.5,0.047855,0.053935,0.033459
1,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,100,0.193861,0.332344,0.332344,0.317521,49849,52176,16567,dna-21,5082294.5,0.047855,0.053555,0.033113
2,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,1000,0.203267,0.343331,0.343331,0.332559,5001,5163,1717,dna-21,5082294.5,0.047855,0.051673,0.031955
3,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,21,2000,0.203029,0.344026,0.344026,0.331274,2494,2590,858,dna-21,5082294.5,0.047855,0.051719,0.031884
4,data-d0.05-f1-nogam-seed11,data-d0.05-f1-nogam-seed11-seq1,data-d0.05-f1-nogam-seed11-seq2,dna,31,1,0.108379,0.199794,0.199794,0.191507,4974660,5189923,993907,dna-31,5082291.5,0.047855,0.052641,0.035467


In [15]:
mDistDF.to_csv("../data/simreads-compare.dnainput.processed.mashD.csv.gz", index=False)