In [1]:
!BioDendro -h

usage: BioDendro [-h] [-n] [-c CUTOFF] [-b BIN_THRESHOLD]
                 [-d {jaccard,braycurtis}] [-m MATRIX] [-p PROCESSED]
                 [-o OUT_HTML] [-r RESULTS_DIR] [-x WIDTH_PX] [-y HEIGHT_PX]
                 mgf components

Process MGF file into appropriate input for biodendro.

positional arguments:
  mgf                   MGF input file (file1.mgf)
  components            Listed components file (file2.txt)

optional arguments:
  -h, --help            show this help message and exit
  -n, --neutral         Apply neutral loss.
  -c CUTOFF, --cutoff CUTOFF
  -b BIN_THRESHOLD, --bin-threshold BIN_THRESHOLD
  -d {jaccard,braycurtis}, --cluster-method {jaccard,braycurtis}
  -m MATRIX, --matrix MATRIX
  -p PROCESSED, --processed PROCESSED
                        Path to write output to.
  -o OUT_HTML, --output OUT_HTML
                        Path to write output to.
  -r RESULTS_DIR, --results-dir RESULTS_DIR
  -x WIDTH_PX, --width WIDTH_PX
  -y HEIGHT

In [2]:
%%bash
rm -rf -- ./results
BioDendro --processed processed.xlsx ./MSMS.mgf ./component_list.txt

Please be patient..It may take a while to compute..


In [112]:
from importlib import reload
import plotly
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster


import BioDendro.plot

reload(BioDendro.plot)

from BioDendro.plot import Dendrogram


In [18]:
# Import the preprocessing module and force a re-import so that edits made to
# the package on disk are picked up.
# NOTE(review): `reload` comes from `importlib`, which is only imported in the
# In [112] cell, so on a fresh kernel this cell fails unless that cell ran first.
from BioDendro import preprocess
reload(preprocess)

# Parse the MS/MS records out of the MGF file.
# NOTE(review): `x` is iterated (and passed on) after the file handle has been
# closed, so `MGFRecord.parse` presumably reads everything eagerly - confirm.
with open("./MSMS.mgf") as handle:
    x = preprocess.MGFRecord.parse(handle)

# Replace every record's raw title with the result of `split_msms_title`.
for record in x:
    record.title = preprocess.split_msms_title(record.title)

# Build `y` from the component list file and the parsed records.
with open("./component_list.txt") as handle:
    y = preprocess.get_csv_record(handle, x)

# Reduce `y` with `remove_redundancy`; the output below shows the result is a
# table with `sample` and `mz` columns.
df = preprocess.remove_redundancy(y)
# Disabled experiments, kept for reference.
# NOTE(review): `str.rstrip(".0")` strips any run of trailing "." and "0"
# characters, not just a literal ".0" suffix (e.g. "100.0" -> "1").
#df["sample"] = [s.rstrip(".0") for s in df["sample"]]
#df.to_excel("test.xlsx", index=False)
# Renumber the rows 0..n-1 in place, discarding the previous index.
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,sample,mz
0,QE_2017_001815_132.0444_282.0,50.01563
1,QE_2017_001814_353.26889_609.0,50.06792
2,QE_2017_001816_177.05449_217.0,50.44534
3,QE_2017_001815_235.14398_226.0,50.4863
4,QE_2017_001816_211.16916_405.0,50.49868


In [6]:
#data = Dendrogram.from_xlsx("./processed.xlsx")

df = pd.read_excel("../old_biodendro/out.xlsx")

In [26]:
x = [1, 2, 3]
x.insert(0, 0)
x

[0, 1, 2, 3]

In [24]:
df["mz"].diff() >= 0.002

0       False
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9       False
10      False
11      False
12      False
13       True
14       True
15       True
16       True
17       True
18       True
19      False
20       True
21      False
22      False
23       True
24       True
25       True
26       True
27       True
28       True
29       True
        ...  
7381     True
7382     True
7383    False
7384    False
7385    False
7386     True
7387    False
7388     True
7389     True
7390    False
7391    False
7392     True
7393    False
7394     True
7395     True
7396     True
7397     True
7398    False
7399     True
7400    False
7401    False
7402    False
7403     True
7404    False
7405    False
7406     True
7407     True
7408    False
7409     True
7410     True
Name: mz, Length: 7411, dtype: bool

In [23]:
df[df["mz"].diff() >= 0.002].index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,   13,   14,
            ...
            7394, 7395, 7396, 7397, 7399, 7403, 7406, 7407, 7409, 7410],
           dtype='int64', length=1760)

In [32]:
np.where(df["mz"].diff() >= 0.002)[0]

array([   1,    2,    3, ..., 7407, 7409, 7410])

In [219]:
#data = Dendrogram.from_xlsx("./processed.xlsx")

df = pd.read_excel("../old_biodendro/out.xlsx")

In [220]:


def clusterise(df, bin_threshold=8e-4):
    """Group consecutive ``mz`` values into bins separated by large gaps.

    Rows are compared in the order given: a new bin starts at the first row
    and at every row whose ``mz`` exceeds the previous row's ``mz`` by at
    least ``bin_threshold``. The table is not sorted here.
    NOTE(review): presumably the caller passes rows already sorted by ``mz``
    - confirm against the preprocessing step.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``mz`` column. It is not modified.
    bin_threshold : float, optional
        Smallest difference between neighbouring ``mz`` values that opens a
        new bin.

    Returns
    -------
    col_names : list
        One entry per bin, produced by ``get_col_name`` (defined outside this
        cell) from the ``mz`` values in that bin.
    clusters : numpy.ndarray
        Position (0-based row number) of the first row of every bin.
    labels : numpy.ndarray
        Integer bin number, starting at 1, for every row of ``df``.
    """
    colname = "mz"  # Column whose values are binned.
    mz = df[colname]
    n_rows = len(mz)

    # Nothing to bin: return empty results instead of failing further down.
    if n_rows == 0:
        return [], np.zeros(0, dtype=np.intp), np.zeros(0, dtype=int)

    # Work on a plain, writable boolean array so the first row can be flagged
    # by position; assigning through the Series would go by index *label* and
    # silently append a row when the index does not start at 0.
    starts_bin = np.array(mz.diff() >= bin_threshold, dtype=bool)
    starts_bin[0] = True  # The first row always opens the first bin.
    clusters = np.where(starts_bin)[0]

    # Pair every bin start with the start of the next bin (or the end of the
    # table), so the final bin needs no special case after the loop.
    bounds = list(clusters) + [n_rows]

    # Plain ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
    labels = np.zeros(n_rows, dtype=int)
    col_names = []
    for number, (start, end) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
        labels[start:end] = number
        col_names.append(get_col_name(mz.iloc[start:end]))

    return col_names, clusters, labels

In [236]:
col_names[slice(1, 4)]

['50.0679_50.0679_50.0679',
 '50.4453_50.4453_50.4453',
 '50.4863_50.4863_50.4863']

In [234]:
slice(1, 4)

slice(1, 4, None)

In [222]:
col_names, clusters, labels = clusterise(df)

In [238]:
np.arange(5)[slice(1, 4)]

array([1, 2, 3])

In [227]:
def generate_linkage(df, labels, cutoff=0.6, clustering_method="jaccard"):
    """Build a sample-by-bin presence matrix and cluster the samples.

    Every row of ``df`` marks one (sample, bin) pair as present. The rows are
    pivoted into a table with one row per ``sample`` and one column per
    distinct value of ``labels``; cells without a matching row are ``False``.
    The samples are then clustered with complete-linkage hierarchical
    clustering and the tree is cut at distance ``cutoff``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with a ``sample`` column. It is not modified.
    labels : array-like
        Bin label for every row of ``df``, passed to ``pivot_table`` as the
        column grouper.
    cutoff : float, optional
        Distance threshold handed to ``fcluster`` (``criterion="distance"``).
    clustering_method : str, optional
        Distance metric handed to ``linkage`` (e.g. ``"jaccard"``).

    Returns
    -------
    presence : pandas.DataFrame
        Boolean sample-by-bin table.
    tree : numpy.ndarray
        Linkage matrix returned by ``scipy.cluster.hierarchy.linkage``.
    mycluster : numpy.ndarray
        Flat cluster number for every row of ``presence``.
    """
    presence = df.copy()
    presence["present"] = True
    presence = presence.pivot_table(
        index="sample", columns=labels, values="present", fill_value=False
    )
    # pivot_table aggregates the flags with its default mean, so depending on
    # the pandas version the cells can come back as floats or mixed objects.
    # Cast to strict booleans so the table and the distance computation do not
    # depend on that detail.
    presence = presence.astype(bool)

    tree = linkage(presence.values, method="complete", metric=clustering_method)
    mycluster = fcluster(tree, cutoff, criterion="distance")
    return presence, tree, mycluster

# NOTE(review): this rebinds two existing names. `df` (the long table read
# from Excel) becomes the pivoted sample-by-bin table, and `clusters` (the bin
# start positions returned by `clusterise`) becomes the flat cluster numbers
# from `fcluster`. Re-running this cell without reloading `df` therefore
# operates on the already-pivoted table.
df, tree, clusters = generate_linkage(df, labels)

In [228]:
df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
QE_2017_001814_129.12737_348,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_171.14908_419,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_177.05418_236,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_177.05450_306,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_181.12213_767,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_182.08116_51,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_191.14291_389,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_194.04424_237,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_194.04468_396,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_194.11740_217,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [134]:
df.shape

(197, 1862)

In [133]:
clusters

array([ 53,  54,  90,  90,   7,  89,   3,  82,  86,  82,  83,  91,  91,
        91,  91,  91,   6,   5,  55,   9,   9,  12,  87,  85, 100,  12,
         1,  11,  11,  11,  63,  95,  86,   8,  80,  80,  35,  35,  35,
        78,  78,  93, 102,  79,  56,   5,  97,  95,   1,  94,  77,  17,
        17, 107,  18,  18,   5,  10,  92,  92,  10,  38,  67,  47,  66,
        65,  65,  47,  62,  61,  60,  17,  17,  17,  17,  50,  34,  36,
        44,  44,  18,  17,  17,  73,  72,  70,  72,  72, 104,  70,  52,
        16,  37,  42,  17,  45,  46,  27,  26,  29,  24, 106,  19,  32,
        28,  30,  41,  33,  23,  23, 105,  13,  57, 108,  89,   3,   3,
        85,  85,  85,  84,  91,   2,  12,   9,   9,  12,  98, 101,  12,
        99,   1,   1,  10,  35,  93, 103,  79,  51,  76,  97,  76,  69,
        96,  92,  92,  92,  58,  17,  39,  18,  71,  74,  20,  75,  45,
        25,  21,  17,  26,  31,  13,  15, 109,   4,  90,   7,  91,  91,
        91,   2,  12,  88,  11,   8,  81,  78,  64,  77,  69,  2

In [173]:
%matplotlib inline

In [217]:
import os
from os.path import join as pjoin

from matplotlib import pyplot as plt
os.makedirs("results", exist_ok=True)
df.columns = col_names

def generate_out(df, clusters, path="results"):
    """Write per-cluster tables, bin-frequency plots and an overall summary.

    For every distinct value in ``clusters`` the matching rows of ``df`` are
    reduced to the columns that have at least one truthy entry, written to
    ``cluster_<id>_<n>.csv`` (tab-separated) and plotted with
    ``plot_bin_freqs`` into ``cluster_<id>_<n>.png``, where ``<n>`` is the
    number of rows in the cluster. Finally the whole table, with a leading
    ``cluster`` column, is written to ``clusters.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per clustered item. It is not modified.
    clusters : array-like
        Cluster identifier for every row of ``df``.
    path : str, optional
        Output directory. It is created (including parents) if missing.

    Returns
    -------
    None
    """
    # Create the target directory here so the function also works for a
    # ``path`` that nobody prepared beforehand.
    os.makedirs(path, exist_ok=True)

    df = df.copy()

    for cluster, subtab in df.groupby(clusters):
        nmembers = subtab.shape[0]
        stem = "cluster_{}_{}".format(cluster, nmembers)

        # Keep only the columns that occur in at least one member.
        subtab = subtab.loc[:, subtab.any(axis=0)]
        subtab.to_csv(pjoin(path, stem + ".csv"), sep="\t")

        fig, _ = plot_bin_freqs(subtab)
        fig.suptitle("Cluster {} with {} members".format(cluster, nmembers))
        fig.savefig(pjoin(path, stem + ".png"))

        # Close exactly the figure created above, so it is neither shown in
        # interactive mode nor kept alive across iterations.
        plt.close(fig)

    # Summary table: cluster assignment first, then the original columns.
    df["cluster"] = clusters
    df = df[["cluster"] + [c for c in df.columns if c != "cluster"]]
    df.to_csv(pjoin(path, "clusters.csv"), sep="\t")
    return

generate_out(df, clusters)

In [216]:
def plot_bin_freqs(df, height=4.5, width_base=1, width_multiplier=0.2):
    """Draw a bar chart with one bar per column of ``df``.

    The height of each bar is the mean of that column. The figure width grows
    with the number of columns so the rotated tick labels stay readable.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column means are plotted; column labels become tick labels.
    height : float, optional
        Figure height passed to ``plt.subplots``.
    width_base : float, optional
        Fixed part of the figure width.
    width_multiplier : float, optional
        Extra figure width added per column of ``df``.

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair created by ``plt.subplots``.
    """

    def column_mean(column):
        return np.mean(column)

    n_bins = df.shape[1]
    bin_names = df.columns.values
    bin_frequencies = df.apply(column_mean, axis=0)
    positions = np.arange(bin_frequencies.shape[0])

    figure_size = (width_base + width_multiplier * n_bins, height)
    fig, ax = plt.subplots(figsize=figure_size)

    ax.bar(positions, bin_frequencies)
    ax.set_xticks(positions)
    ax.set_xticklabels(bin_names, rotation=90)
    ax.set_xlabel("m/z bin")
    ax.set_ylabel("Frequency")

    # Without this the long, rotated tick labels get clipped in the output.
    fig.tight_layout()

    return fig, ax

In [229]:
col_names

['50.0156_50.0156_50.0156',
 '50.0679_50.0679_50.0679',
 '50.4453_50.4453_50.4453',
 '50.4863_50.4863_50.4863',
 '50.4987_50.4987_50.4987',
 '50.5763_50.5763_50.5763',
 '50.7138_50.7138_50.7138',
 '50.9775_50.9775_50.9775',
 '51.0235_51.0231_51.0239',
 '51.1057_51.1057_51.1057',
 '51.2115_51.2115_51.2115',
 '51.3509_51.3509_51.3509',
 '51.5851_51.5851_51.5851',
 '52.0176_52.0176_52.0176',
 '52.0221_52.0221_52.0221',
 '52.0313_52.0312_52.0314',
 '52.1516_52.1516_52.1516',
 '52.1912_52.1912_52.1912',
 '52.2788_52.2788_52.2788',
 '52.8019_52.8019_52.8019',
 '52.8789_52.8789_52.8789',
 '52.9684_52.9684_52.9684',
 '52.9725_52.9725_52.9725',
 '53.0028_53.0027_53.0028',
 '53.0391_53.0389_53.0392',
 '53.1455_53.1455_53.1455',
 '53.1689_53.1689_53.1689',
 '53.5102_53.5102_53.5102',
 '53.8549_53.8549_53.8549',
 '54.0078_54.0078_54.0078',
 '54.0344_54.0344_54.0344',
 '54.0655_54.0655_54.0655',
 '54.1024_54.1024_54.1024',
 '54.2154_54.2154_54.2154',
 '54.2450_54.2450_54.2450',
 '54.4914_54.4914_54

In [117]:
clusters.shape

(197,)

In [135]:
df.iloc[clusters]

labels,1,2,3,4,5,6,7,8,9,10,...,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
QE_2017_001814_265.01541_33.0,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_278.24756_683.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_483.27118_651.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_483.27118_651.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_194.04424_237.0,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_466.40994_731.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_177.0545_306.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_353.26889_609.0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_421.35211_732.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
QE_2017_001814_353.26889_609.0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [132]:
clusters

array([ 53,  54,  90,  90,   7,  89,   3,  82,  86,  82,  83,  91,  91,
        91,  91,  91,   6,   5,  55,   9,   9,  12,  87,  85, 100,  12,
         1,  11,  11,  11,  63,  95,  86,   8,  80,  80,  35,  35,  35,
        78,  78,  93, 102,  79,  56,   5,  97,  95,   1,  94,  77,  17,
        17, 107,  18,  18,   5,  10,  92,  92,  10,  38,  67,  47,  66,
        65,  65,  47,  62,  61,  60,  17,  17,  17,  17,  50,  34,  36,
        44,  44,  18,  17,  17,  73,  72,  70,  72,  72, 104,  70,  52,
        16,  37,  42,  17,  45,  46,  27,  26,  29,  24, 106,  19,  32,
        28,  30,  41,  33,  23,  23, 105,  13,  57, 108,  89,   3,   3,
        85,  85,  85,  84,  91,   2,  12,   9,   9,  12,  98, 101,  12,
        99,   1,   1,  10,  35,  93, 103,  79,  51,  76,  97,  76,  69,
        96,  92,  92,  92,  58,  17,  39,  18,  71,  74,  20,  75,  45,
        25,  21,  17,  26,  31,  13,  15, 109,   4,  90,   7,  91,  91,
        91,   2,  12,  88,  11,   8,  81,  78,  64,  77,  69,  2

In [11]:
data = Dendrogram(df)

In [14]:
data.clusterize()

In [16]:
data.generate_linkage()

Please be patient..It may take a while to compute..


ValueError: Shape of passed values is (1862, 197), indices imply (1861, 197)

In [5]:
data.generate_out()

In [6]:
data.mycluster

array([117,  28, 119,  92,  33,  38,  28, 125,  20,  81,  85,  57,  16,
        25,  74,  22,  24, 101,  85,  86,  75, 104,  31,  19,  15, 113,
        34,  25,  23,  24,   1,  15,  21,  28, 119, 115,  33,   8,   9,
         8,  23,   8, 116,  17,  28,  23, 112, 102, 120,  55,  91, 120,
       118,  43,  53, 105,  96,  27,  96,  33,  26,  18,  23,  18,  26,
        21,  64,  63,  16,  22,  29, 109,  44,  13,  28,  28,  36, 107,
        31,  28,  33, 107, 114,  28,  47,  35,  28,  28, 110,  28,  62,
        33, 111,  31,  30,  28,  16,  44,  58,   5,  79, 120,  87, 119,
       120, 120, 120, 120, 120, 120, 122, 123, 121, 122, 123, 124, 106,
        91,  68,  73, 121, 118,  83,  59,  60,  59,  57,  11,  69,  67,
        92,  32,  67, 108,  70,  37,  94,  50,  97,  67,  99,  54, 103,
        98, 105, 104,  82,  84, 100,  95,  41, 126,   2,  45,  91,  91,
        91,  10,   3,   4,   6,   7,  89,  66,   7,  93,  88,  86,  90,
        77,  80,  42,  39,  13,  40,  49,  14,  72,  13,  71,  7

In [7]:
k = data.visualize(cutoff=0.5)

In [11]:
plotly.offline.init_notebook_mode(connected=True) # for visualising plot inline
plotly.offline.iplot(k, filename='simple_dendrogram')

In [16]:
# NOTE(review): this cell repeats the first part of the In [18] cell (parse the
# MGF file, split the titles, build `y`) without the `remove_redundancy` step.
# It also rebinds `x` and `y`, which other scratch cells reuse for unrelated
# values.
from BioDendro import preprocess
reload(preprocess)

# Parse the MS/MS records out of the MGF file.
# NOTE(review): `x` is used after the file handle has been closed, so
# `MGFRecord.parse` presumably reads everything eagerly - confirm.
with open("./MSMS.mgf") as handle:
    x = preprocess.MGFRecord.parse(handle)

# Replace every record's raw title with the result of `split_msms_title`.
for record in x:
    record.title = preprocess.split_msms_title(record.title)

# Build `y` from the component list file and the parsed records.
with open("./component_list.txt") as handle:
    y = preprocess.get_csv_record(handle, x)

In [230]:
x = [1.234, 2.345678, 3.45678910]
np.mean(x)

2.345489033333333

In [231]:
np.min(x)

1.234

In [232]:
np.max(x)

3.4567891

In [248]:
x = np.zeros(10, dtype=object)

In [249]:
x

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=object)

In [252]:
x[:2] = "testing"
x[2:] = "this"

In [261]:
x.astype("|S{}".format(7))

array([b'testing', b'testing', b'this', b'this', b'this', b'this',
       b'this', b'this', b'this', b'this'], dtype='|S7')

In [259]:
x[slice(1, None)]

array(['testing', 'this', 'this', 'this', 'this', 'this', 'this', 'this',
       'this'], dtype=object)

In [263]:
np.ones(10, dtype=np.bool)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [265]:
x = pd.DataFrame({"sample": [1, 2, 23, 4]})

In [266]:
list(x.iterrows())

[(0, sample    1
  Name: 0, dtype: int64), (1, sample    2
  Name: 1, dtype: int64), (2, sample    23
  Name: 2, dtype: int64), (3, sample    4
  Name: 3, dtype: int64)]