# Running BioDendro Pipeline

In [1]:
# listed are all BioDendro pipeline arguments

!BioDendro -h

usage: BioDendro [-h] [-n] [-c CUTOFF] [-b BIN_THRESHOLD]
                 [-d {jaccard,braycurtis}] [-p PROCESSED] [-o OUT_HTML]
                 [-r RESULTS_DIR] [-x WIDTH_PX] [-y HEIGHT_PX] [-q]
                 mgf components

Run the BioDendro pipeline.

positional arguments:
  mgf                   MGF input file.
  components            Listed components file.

optional arguments:
  -h, --help            show this help message and exit
  -n, --neutral         Apply neutral loss.
  -c CUTOFF, --cutoff CUTOFF
                        Distance threshold for selecting clusters from tree.
  -b BIN_THRESHOLD, --bin-threshold BIN_THRESHOLD
                        Threshold for binning m/z values prior to clustering.
  -d {jaccard,braycurtis}, --cluster-method {jaccard,braycurtis}
                        The distance metric used during tree construction.
  -p PROCESSED, --processed PROCESSED
                        Path to write preprocessed output to.
  -o OUT_HTML

In [2]:
# Run the complete pipeline

! BioDendro ./MSMS.mgf ./component_list.txt

Running BioDendro v0.0.1

- input mgf file = ./MSMS.mgf
- input components file = ./component_list.txt
- neutral = False
- cutoff = 0.6
- bin_threshold = 0.0008
- clustering_method = jaccard
- output processed file = processed.xlsx
- output results directory = results_20180604130113
- output html dendrogram = simple_dendrogram.html
- dendrogram figure width = 900
- dendrogram figure height = 400


Processing inputs
Binning and clustering
This may take some time...
Writing per-cluster summaries
Writing output html dendrogram
Finished


## Running the pipeline interactively.

In [3]:
# Load required modules

import os
import plotly
from BioDendro import preprocess
from BioDendro import cluster

In [4]:
# Load the MSMS records 

with open("./MSMS.mgf") as handle:
    mgf = preprocess.MGF.parse(handle)

for record in mgf.records:
    record.title = preprocess.split_msms_title(record.title)

mgf.records[:5]

[MGFRecord(title='QE_2017_001814', retention=141.0, pepmass=Ion(mz=81.52061, intensity=1678597.25), charge='1+', ions=[Ion(mz=53.00264, intensity=1252.15), Ion(mz=60.7764, intensity=1324.65)]),
 MGFRecord(title='QE_2017_001814', retention=877.0, pepmass=Ion(mz=81.52061, intensity=19463956.0), charge='2+', ions=[Ion(mz=59.32066, intensity=14996.5), Ion(mz=131.81848, intensity=18682.3), Ion(mz=176.21725, intensity=18992.1)]),
 MGFRecord(title='QE_2017_001816', retention=447.0, pepmass=Ion(mz=81.52061, intensity=2814306.5), charge='1+', ions=[Ion(mz=53.0393, intensity=1853.67)]),
 MGFRecord(title='QE_2017_001814', retention=436.0, pepmass=Ion(mz=81.52064, intensity=2750625.5), charge='1+', ions=[Ion(mz=63.59929, intensity=2020.82)]),
 MGFRecord(title='QE_2017_001815', retention=887.0, pepmass=Ion(mz=81.52064, intensity=15775805.0), charge='2+', ions=[Ion(mz=124.5074, intensity=13727.9), Ion(mz=184.32289, intensity=14260.0)])]

In [5]:
# Load the list of components to compare to the MSMS file

with open("./component_list.txt") as handle:
    components = preprocess.SampleRecord.parse(handle)

components[:5]

[<BioDendro.preprocess.SampleRecord at 0x7fd97c94eba8>,
 <BioDendro.preprocess.SampleRecord at 0x7fd97c94eb70>,
 <BioDendro.preprocess.SampleRecord at 0x7fd97c94e860>,
 <BioDendro.preprocess.SampleRecord at 0x7fd97c94ea90>,
 <BioDendro.preprocess.SampleRecord at 0x7fd97c94eac8>]

In [6]:
# Remove redundant records with mass and retention time tolerance 

df = preprocess.remove_redundancy(components, mgf, mz_tol=0.002, retention_tol=5, neutral=False)
df.head()

Unnamed: 0,sample,mz
0,QE_2017_001814_353.26889_609.0,50.06792
1,QE_2017_001816_177.05449_217.0,50.44534
2,QE_2017_001815_235.14398_226.0,50.4863
3,QE_2017_001816_211.16916_405.0,50.49868
4,QE_2017_001815_613.4826_734.0,50.57634


In [7]:
# Using the non-redundant dataframe, bin analytes on threshold=8e-4 and return a data matrix
# Cluster the data matrix using clustering_method="jaccard"
# Set threshold to color dendrogram and output clusters at cutoff=0.4

tree = cluster.Tree(threshold=8e-4, clustering_method="jaccard", cutoff=0.4)
tree.fit(df)

In [9]:
# One hot encoded matrix output from mz binning.
tree.onehot_df.head()

bins,100.0246_100.0246_100.0246,100.0394_100.0394_100.0394,100.0759_100.0759_100.0759,100.1122_100.1122_100.1122,100.1311_100.1311_100.1311,100.5562_100.5562_100.5562,100.6662_100.6662_100.6662,101.0235_101.0235_101.0235,101.0598_101.0596_101.0599,101.0711_101.0711_101.0712,...,98.5418_98.5418_98.5418,98.5942_98.5942_98.5942,98.8081_98.8081_98.8081,98.9756_98.9755_98.9757,98.9844_98.9844_98.9845,99.0034_99.0034_99.0034,99.0080_99.0080_99.0080,99.0443_99.0441_99.0445,99.0807_99.0804_99.0809,99.1172_99.1172_99.1172
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
QE_2017_001814_129.12737_348.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_171.14908_419.0,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_177.05418_236.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_191.14291_389.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_194.04424_237.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# Clusters derived from the tree using cutoff=0.4
tree.clusters

array([ 87,  88, 143,  69, 133, 134, 135, 144, 144,  85,  72,  81,  82,
        70,  70,  71, 117, 139,  78, 131, 131,  40,  38, 128, 150, 100,
       102,  75, 120, 118, 110, 116,   7,  19,  20,  75, 112,  76,  89,
       105, 103, 104,  97,  95,   9,   6,   6,   6,  14,  36,  53,  18,
         8,  62,  60,  57,  60,  61,  91,  12,  41,  31,  29,   4,  24,
        34,  22,  47,  26,   1, 153, 143,  80, 146,  67, 137, 138, 140,
       137, 136, 144, 145, 144,  65,  79,  81,  72,  81, 121, 141, 138,
       124, 122,  83, 109, 108, 108,  70,  98,  73,  39,  38, 129, 114,
       115, 151, 101,  15, 125, 120, 127, 152,   9, 154,  55, 111, 111,
        77, 112,  43, 106,  89,  96,  93,   6,   7,   9,  42,  54,   8,
        59,  63, 147,  64,  48,  11,  51,  35,  13,  10,  32,  30,  33,
       149,   2,  27, 148,   5, 155, 130,  84, 143,  80,  68, 145, 145,
       145, 145,  66,  86,  74,  72,  81, 142, 123,  78, 132, 128,  99,
       126, 127, 119,  56,  21, 113,  45,  44, 107,  92,  95,  9

In [11]:
# To pick new clusters without computing a new tree...
print("Cutoff:", tree.cutoff, "n clusters:", len(set(tree.clusters)))

tree.cut_tree(cutoff=0.6)
print("Cutoff:", tree.cutoff, "n clusters:", len(set(tree.clusters)))

# Note that the object's cutoff value is changed.

Cutoff: 0.4 n clusters: 155
Cutoff: 0.6 n clusters: 110


In [12]:
# Make sure you take a copy to avoid editing the data.
oh = tree.onehot_df.copy()
oh["cluster"] = tree.clusters

# reorder columns
oh = oh[["cluster"] + [col for col in oh.columns if col != "cluster"]]

# Sort values by cluster
oh.sort_values(by="cluster", inplace=True)

oh.head()

bins,cluster,100.0246_100.0246_100.0246,100.0394_100.0394_100.0394,100.0759_100.0759_100.0759,100.1122_100.1122_100.1122,100.1311_100.1311_100.1311,100.5562_100.5562_100.5562,100.6662_100.6662_100.6662,101.0235_101.0235_101.0235,101.0598_101.0596_101.0599,...,98.5418_98.5418_98.5418,98.5942_98.5942_98.5942,98.8081_98.8081_98.8081,98.9756_98.9755_98.9757,98.9844_98.9844_98.9845,99.0034_99.0034_99.0034,99.0080_99.0080_99.0080,99.0443_99.0441_99.0445,99.0807_99.0804_99.0809,99.1172_99.1172_99.1172
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
QE_2017_001815_613.4826_734.0,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_792.56207_723.0,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001816_613.48218_838.0,2,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_596.30963_602.0,3,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001815_792.5625_866.0,4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Linkage tree from scipy.
tree.tree[:6]

array([[8.10000000e+01, 1.57000000e+02, 1.81818182e-01, 2.00000000e+00],
       [1.10000000e+01, 8.70000000e+01, 1.92307692e-01, 2.00000000e+00],
       [8.50000000e+01, 2.00000000e+02, 2.11538462e-01, 3.00000000e+00],
       [5.10000000e+01, 1.87000000e+02, 2.37623762e-01, 2.00000000e+00],
       [4.60000000e+01, 1.81000000e+02, 2.60000000e-01, 2.00000000e+00],
       [3.60000000e+01, 1.18000000e+02, 2.60869565e-01, 2.00000000e+00]])

In [14]:
# Generate the plots of clusters.
os.makedirs("results", exist_ok=True)

tree.write_summaries(path="results")

In [15]:
! ls -l results | head

total 13760
-rw-rw-r--. 1 darcyabjones darcyabjones    4493 Jun  4 13:02 cluster_100_9.csv
-rw-rw-r--. 1 darcyabjones darcyabjones  140867 Jun  4 13:02 cluster_100_9.png
-rw-rw-r--. 1 darcyabjones darcyabjones     877 Jun  4 13:02 cluster_101_1.csv
-rw-rw-r--. 1 darcyabjones darcyabjones   74004 Jun  4 13:02 cluster_101_1.png
-rw-rw-r--. 1 darcyabjones darcyabjones     614 Jun  4 13:02 cluster_102_1.csv
-rw-rw-r--. 1 darcyabjones darcyabjones   57197 Jun  4 13:02 cluster_102_1.png
-rw-rw-r--. 1 darcyabjones darcyabjones     759 Jun  4 13:02 cluster_103_1.csv
-rw-rw-r--. 1 darcyabjones darcyabjones   72163 Jun  4 13:02 cluster_103_1.png
-rw-rw-r--. 1 darcyabjones darcyabjones    3491 Jun  4 13:02 cluster_10_3.csv
ls: write error: Broken pipe


In [16]:
# To write out the tree.
#iplot = tree.iplot(filename="simple_dendrogram.html")

iplot = tree.iplot(width=900, height=500)

In [17]:
plotly.offline.init_notebook_mode(connected=True) # for visualising plot inline
plotly.offline.iplot(iplot)