# tidy chemical similarity

Load required packages (ChemmineR needs ChemmineOB to use `openbabel`)

In [1]:
library(purrr)
library(ChemmineR)
library(tidyverse)
library(tidygraph)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0       ✔ readr   1.3.1  
✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.2       ✔ stringr 1.4.0  
✔ ggplot2 3.1.0       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::groups() masks ChemmineR::groups()
✖ dplyr::lag()    masks stats::lag()
✖ tibble::view()  masks ChemmineR::view()

Attaching package: ‘tidygraph’

The following object is masked from ‘package:ChemmineR’:

    groups

The following object is masked from ‘package:stats’:

    filter



# Tanimoto similarity calculation

Let's define a function that calculates the Tanimoto similarity between two molecules provided as SMILES strings.

In [2]:
# Tanimoto similarity between two molecules given as SMILES strings.
#   mol1, mol2: SMILES strings; both are converted together into a single
#               SDF set whose entries are named cmp1 and cmp2.
# Pipeline: SMILES -> SDF set -> atom-pair descriptors -> fingerprints,
# then the first fingerprint is compared against the second.
# Returns the value produced by fpSim() with method = "Tanimoto".
smiles2tanimoto <- function(mol1, mol2) {
  smiles_pair <- c(cmp1 = mol1, cmp2 = mol2)
  atom_pairs <- sdf2ap(smiles2sdf(smiles_pair))
  # NOTE(review): descnames = 512 presumably sets the fingerprint size;
  # confirm against the desc2fp() documentation.
  fingerprints <- desc2fp(x = atom_pairs, descnames = 512, type = "FPset")
  fpSim(fingerprints[1], fingerprints[2], method = "Tanimoto")
}

Let's test it on a tiny data frame

In [3]:
# Small fixture of compound pairs used to exercise smiles2tanimoto():
#   row 1: glucose vs. glucose-6-P
#   row 2: glucose vs. NADH
compare_mol <- tibble(
  mol1 = rep("glucose", 2),
  mol2 = c("glucose-6-P", "NADH"),
  smi1 = rep("C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O", 2),
  smi2 = c(
    "C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O",
    "C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O"
  )
)

# Score each row's SMILES pair and show only the compound names and the score.
compare_mol %>%
  mutate(tanim_sim = map2_dbl(smi1, smi2, smiles2tanimoto)) %>%
  select(mol1, mol2, tanim_sim)

mol1,mol2,tanim_sim
<chr>,<chr>,<dbl>
glucose,glucose-6-P,0.9090909
glucose,NADH,0.4761905


# fmcs trial

In [4]:
library(fmcsR)

In [5]:
# Run fmcs() on two molecules supplied as SMILES strings.
#   mol1, mol2: SMILES strings; both are converted together into a single
#               SDF set whose entries are named cmp1 and cmp2.
# Returns whatever fmcs() yields with fast = TRUE when the first entry is
# compared against the second.
smiles2fmcs <- function(mol1, mol2) {
  mol_pair <- smiles2sdf(c(cmp1 = mol1, cmp2 = mol2))
  fmcs(mol_pair[1], mol_pair[2], fast = TRUE)
}

In [6]:
# Attach the smiles2fmcs() result of every compound pair as the list-column
# fmcs_info, then display the table.
fmcs_test <- mutate(
  compare_mol,
  fmcs_info = map2(smi1, smi2, smiles2fmcs)
)

fmcs_test

mol1,mol2,smi1,smi2,fmcs_info
<chr>,<chr>,<chr>,<chr>,<list>
glucose,glucose-6-P,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O,"12.0000000, 16.0000000, 10.0000000, 0.5555556, 0.8333333"
glucose,NADH,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O,"12.0000000, 44.0000000, 8.0000000, 0.1666667, 0.6666667"


In [7]:
# Wrap the second fmcs_info entry (`[2]` keeps it as a one-element list) in a
# tibble; with no column name given, the column is labelled with the
# expression text itself.
tibble(fmcs_test$fmcs_info[2])

fmcs_test$fmcs_info[2]
<list>
"12.0000000, 44.0000000, 8.0000000, 0.1666667, 0.6666667"


In [8]:
# Same comparison with the molecule order swapped: smi2 is handed to
# smiles2fmcs() as mol1 and smi1 as mol2.
fmcs_test_rev <- mutate(
  compare_mol,
  fmcs_info = map2(smi2, smi1, smiles2fmcs)
)

fmcs_test_rev

mol1,mol2,smi1,smi2,fmcs_info
<chr>,<chr>,<chr>,<chr>,<list>
glucose,glucose-6-P,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O,"16.0000000, 12.0000000, 10.0000000, 0.5555556, 0.8333333"
glucose,NADH,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O,"44.0000000, 12.0000000, 8.0000000, 0.1666667, 0.6666667"


In [9]:
# Compute the smiles2fmcs() result for every compound pair as a list-column.
fmcs_test_all <- mutate(
  compare_mol,
  fmcs_info = map2(
    smi1, smi2,
    function(first, second) smiles2fmcs(first, second)
  )
)

# Flatten the list-column so every element gets its own row; `.id = "name"`
# adds a column "name" identifying the source row of each value.
# NOTE(review): `.id` is deprecated in tidyr >= 1.0.0; this call matches the
# tidyr 0.8.2 attached at the top of the notebook.
unnest(fmcs_test_all, fmcs_info, .id = "name")

mol1,mol2,smi1,smi2,fmcs_info,name
<chr>,<chr>,<chr>,<chr>,<dbl>,<int>
glucose,glucose-6-P,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O,12.0,1
glucose,glucose-6-P,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O,16.0,1
glucose,glucose-6-P,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O,10.0,1
glucose,glucose-6-P,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O,0.5555556,1
glucose,glucose-6-P,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)(O)O,0.8333333,1
glucose,NADH,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O,12.0,2
glucose,NADH,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O,44.0,2
glucose,NADH,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O,8.0,2
glucose,NADH,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O,0.1666667,2
glucose,NADH,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O,C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O,0.6666667,2


# glucose-6-p data

Below is the data frame containing the compound pairs for the `glucose + ATP -> glucose-6-P + ADP` reaction, which was successfully mapped with the reaction mapper (rdt).

In [10]:
# SMILES of the four compounds in glucose + ATP -> glucose-6-P + ADP.
glucose <- "C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)O"
glucose6p <- "C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)OP(=O)(O)O"
atp <- "c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N"
adp <- "c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)O)O)O)N"

# One row per (reactant, product) pair of the reaction. Each compound is
# labelled with a single spelling ("glucose", "glucose-6-p", "atp", "adp")
# so rows can be grouped, filtered or joined by compound name.
glu_rxn <- tibble(
  mol1 = c("glucose", "glucose", "atp", "atp"),
  mol2 = c("adp", "glucose-6-p", "adp", "glucose-6-p"),
  mol1_smi = c(glucose, glucose, atp, atp),
  mol2_smi = c(adp, glucose6p, adp, glucose6p)
)

In [11]:
# Display the compound-pair table for the glucose phosphorylation reaction.
glu_rxn

mol1,mol2,mol1_smi,mol2_smi
<chr>,<chr>,<chr>,<chr>
gluose,adp,C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)O,c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)O)O)O)N
glucose,gluose-6-p,C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)O,C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)OP(=O)(O)O
atp,adp,c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N,c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)O)O)O)N
atp,glucose-6-p,c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N,C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)OP(=O)(O)O


Below is the output of rdt
```
# rdt output
C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)O.c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N>>C([CH]1[CH]([CH]([CH](C(O1)O)O)O)O)OP(=O)(O)O.c1nc(c2c(n1)n(cn2)[CH]3[CH]([CH]([CH](O3)COP(=O)(O)OP(=O)(O)O)O)O)N

[O:1]=[P:2]([OH:3])([OH:4])[O:5][P:6](=[O:7])([OH:8])[O:9][P:10](=[O:11])([OH:12])[O:13][CH2:14][CH:15]1[O:16][CH:17]([n:18]2[cH:19][n:20][c:21]3[c:22]([n:23][cH:24][n:25][c:26]23)[NH2:27])[CH:28]([OH:29])[CH:30]1[OH:31]
.
[OH:32][CH2:33][CH:34]1[O:35][CH:36]([OH:37])[CH:38]([OH:39])[CH:40]([OH:41])[CH:42]1[OH:43]
>>
[OH:5][P:6](=[O:7])([OH:8])[O:9][P:10](=[O:11])([OH:12])[O:13][CH2:14][CH:15]1[O:16][CH:17]([n:18]2[cH:19][n:20][c:21]3[c:22]([n:23][cH:24][n:25][c:26]23)[NH2:27])[CH:28]([OH:29])[CH:30]1[OH:31]
.
[O:1]=[P:2]([OH:3])([OH:4])[O:32][CH2:33][CH:34]1[O:35][CH:36]([OH:37])[CH:38]([OH:39])[CH:40]([OH:41])[CH:42]1[OH:43]
```