In [3]:
%%capture
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
import pandas as pd
from rdkit.Chem import BRICS
from collections import Counter

In [5]:
# Load the sample spreadsheet; the 'SMILES' column holds the structures to parse.
naprore = pd.read_excel('naprore_sample.xlsx')

smiles_list = naprore['SMILES']

# Convert the SMILES strings to RDKit molecules.
# A blank spreadsheet cell is read by pandas as NaN (a float), and
# Chem.MolFromSmiles raises on non-string input. Such entries are mapped to
# None, the same value MolFromSmiles returns for a SMILES it cannot parse,
# so every later `if mol is not None` check handles both cases.
mol_list = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in smiles_list
]

# Keep the parsed molecules alongside the original rows.
naprore['mols'] = mol_list

In [6]:
# Split the table into one subset per value of the 'NPC_PATHWAY_1' column.
# Rows whose pathway is missing (NaN) are collected in `no_label`.
terpenoids = naprore.loc[naprore['NPC_PATHWAY_1'].eq('Terpenoids')]
shikimates = naprore.loc[naprore['NPC_PATHWAY_1'].eq('Shikimates and Phenylpropanoids')]
polyketides = naprore.loc[naprore['NPC_PATHWAY_1'].eq('Polyketides')]
no_label = naprore.loc[naprore['NPC_PATHWAY_1'].isna()]
fatty_acids = naprore.loc[naprore['NPC_PATHWAY_1'].eq('Fatty acids')]
peptides = naprore.loc[naprore['NPC_PATHWAY_1'].eq('Amino acids and Peptides')]
alkaloids = naprore.loc[naprore['NPC_PATHWAY_1'].eq('Alkaloids')]

In [7]:
def fragment_molecule(mol) -> list:
    """Fragment a molecule with BRICS decomposition.

    Parameters
    ----------
    mol
        The molecule to decompose, passed straight to ``BRICS.BRICSDecompose``.
        ``None`` (e.g. a SMILES that failed to parse) is accepted.

    Returns
    -------
    list
        The fragments produced by ``BRICS.BRICSDecompose``, sorted so the order
        is the same on every run. Empty when ``mol`` is ``None``.
    """
    # Nothing to decompose; callers already skip None, this keeps the helper safe on its own.
    if mol is None:
        return []
    # BRICSDecompose hands back an unordered collection, so a plain list() would
    # vary in order between kernel restarts and change how Counter.most_common()
    # orders fragments with equal counts. Sorting makes the result reproducible.
    fragments = sorted(BRICS.BRICSDecompose(mol))
    return fragments

## **Most common fragments in Terpenoids**

In [8]:
# Flat list of the fragments of every terpenoid molecule that parsed
# (entries that are None are skipped)
all_fragments = [
    piece
    for mol in terpenoids['mols']
    if mol is not None
    for piece in fragment_molecule(mol)
]

# Tally how often each fragment appears
fragment_counts = Counter(all_fragments)

# Print every fragment, most frequent first
print("Most common fragments:")
for smiles, n_hits in fragment_counts.most_common():
    print(f"{smiles}: {n_hits} occurrences")

Most common fragments:
[6*]C(=O)O: 20 occurrences
[3*]O[3*]: 18 occurrences
[7*]CC: 8 occurrences
[1*]C(=O)C([7*])C: 7 occurrences
[3*]OC: 6 occurrences
[1*]C(C)=O: 6 occurrences
[1*]C(=O)C[7*]: 6 occurrences
[16*]c1ccoc1: 5 occurrences
[7*]CCCC([7*])C: 5 occurrences
[16*]c1ccccc1: 5 occurrences
[7*]C[8*]: 5 occurrences
[15*][C@@]1(C)[C@H](C)CC[C@@]2(C)C(C(=O)O)=CCC[C@H]12: 4 occurrences
[7*]C(C)C: 4 occurrences
[8*]CCC(=O)O: 3 occurrences
[1*]C([6*])=O: 3 occurrences
[8*]C(C)C: 3 occurrences
[7*]CC(=O)O: 3 occurrences
[3*]OO: 3 occurrences
[8*]CC[8*]: 2 occurrences
[15*][C@H]1CC[C@H]2C(C(=O)O)=CCC[C@@H]2C1: 2 occurrences
[4*]CC[8*]: 2 occurrences
[15*][C@H]1C(=C)[C@@H]2CC[C@]3(O)[C@@]1(CC[C@@H]1[C@@]([15*])(C)CCC[C@]13C)C2: 2 occurrences
[1*]C(=O)C1=CO[C@@H]([13*])[C@H](C=C)[C@@H]1[15*]: 2 occurrences
[7*]CC([7*])C: 2 occurrences
[7*]CC[7*]: 2 occurrences
[7*]CC[8*]: 2 occurrences
[15*][C@H]1C[C@H]2[C@@](C)(CCC[C@]2([15*])C)[C@@H]2CC[C@@H]3C[C@]12[C@@H](O)C3=C: 2 occurrences
[8*]CC(=O

## **Most common fragments in Shikimates and Phenylpropanoids**

In [9]:
# Flat list of the fragments of every shikimate/phenylpropanoid molecule that parsed
# (entries that are None are skipped)
all_fragments = [
    piece
    for mol in shikimates['mols']
    if mol is not None
    for piece in fragment_molecule(mol)
]

# Tally how often each fragment appears
fragment_counts = Counter(all_fragments)

# Print every fragment, most frequent first
print("Most common fragments:")
for smiles, n_hits in fragment_counts.most_common():
    print(f"{smiles}: {n_hits} occurrences")

Most common fragments:
[3*]OC: 56 occurrences
[7*]C[8*]: 43 occurrences
[3*]O[3*]: 39 occurrences
[16*]c1ccc(O)c(O)c1: 33 occurrences
[16*]c1ccc(O)c([16*])c1: 29 occurrences
[16*]c1ccc(O)cc1: 21 occurrences
[16*]c1cc(O)c(O)c(O)c1: 20 occurrences
[6*]C(=O)O: 19 occurrences
[1*]C([6*])=O: 18 occurrences
[1*]C(=O)C[7*]: 12 occurrences
[8*]CO: 10 occurrences
[16*]c1ccccc1: 9 occurrences
[13*]C1OC([13*])C(O)C(O)C1O: 9 occurrences
[16*]c1cc(O)c([16*])c(O)c1: 9 occurrences
[4*]C[8*]: 8 occurrences
[7*]CC(=O)O: 7 occurrences
[14*]c1oc2cc(O)cc(O)c2c(=O)c1[16*]: 6 occurrences
[7*]CC(=O)C[7*]: 6 occurrences
[13*][C@H]1Oc2cc(O)cc(O)c2C[C@H]1[15*]: 6 occurrences
[16*]c1ccc([16*])c(O)c1: 5 occurrences
[8*]C(C)(C)O: 5 occurrences
[16*]c1cc([16*])c(O)c([16*])c1: 4 occurrences
[14*]c1ccc2c([16*])c(O)c(O)cc2[o+]1: 4 occurrences
[14*]c1[o+]c2cc(O)cc(O)c2cc1[16*]: 4 occurrences
[7*]C(C)C: 4 occurrences
[6*]C(=O)CC[8*]: 3 occurrences
[13*]C1OC(C)C(O)C(O)C1O: 3 occurrences
[13*]C1OCC(O)C(O)C1O: 3 occurrence

## **Most common fragments in Polyketides**

In [10]:
# Flat list of the fragments of every polyketide molecule that parsed
# (entries that are None are skipped)
all_fragments = [
    piece
    for mol in polyketides['mols']
    if mol is not None
    for piece in fragment_molecule(mol)
]

# Tally how often each fragment appears
fragment_counts = Counter(all_fragments)

# Print every fragment, most frequent first
print("Most common fragments:")
for smiles, n_hits in fragment_counts.most_common():
    print(f"{smiles}: {n_hits} occurrences")

Most common fragments:
[3*]O[3*]: 16 occurrences
[3*]OC: 12 occurrences
[5*]N[5*]: 10 occurrences
[1*]C([1*])=O: 7 occurrences
[13*][C@H]1C[C@@H]([15*])[C@@H](O)[C@H](C)O1: 7 occurrences
[1*]C(C)=O: 6 occurrences
[13*][C@@H]1C[C@@H](O)[C@@H]([15*])[C@H](C)O1: 6 occurrences
[13*][C@H]1C[C@@H](O)[C@@H]([15*])[C@H](C)O1: 6 occurrences
[13*][C@H]1C[C@](C)([N+](=O)[O-])[C@@H]([15*])[C@@H](C)O1: 5 occurrences
[8*]CO: 5 occurrences
[4*]C[8*]: 4 occurrences
[15*][C@]1(C)C/C=C\C=C(\C(=O)O)C(=C)CC[C@H]1C: 3 occurrences
[6*]C(=O)O: 3 occurrences
[15*][C@@H]1[C@H]2C=C[C@H]3/C(C)=C\C[C@H]([15*])/C(C)=C\[C@@H]4C=C(CO)[C@H](C)C[C@]45OC(=O)C(=C(O)[C@@]3(C)[C@@H]2[C@@H](C)C[C@@H]1C)C5=O: 3 occurrences
[13*][C@]12Oc3ccc([16*])c(O)c3C(O)=C1C(=O)C[C@@H](C)[C@H]2[15*]: 3 occurrences
[15*][C@@H]1[C@H](C)CC(=O)C2=C(O)c3c(ccc([16*])c3O)O[C@@]21[15*]: 3 occurrences
[15*][C@]1(C)CC=CC=C(C(=O)O)C(=C)CC[C@H]1C: 3 occurrences
[16*]c1ccc(O)c([16*])c1: 2 occurrences
[1*]C(=O)C[7*]: 2 occurrences
[4*]CC(C)C: 2 occurr

## **Most common fragments in unlabeled compounds**

In [11]:
# Flat list of the fragments of every molecule without a pathway label that parsed
# (entries that are None are skipped)
all_fragments = [
    piece
    for mol in no_label['mols']
    if mol is not None
    for piece in fragment_molecule(mol)
]

# Tally how often each fragment appears
fragment_counts = Counter(all_fragments)

# Print every fragment, most frequent first
print("Most common fragments:")
for smiles, n_hits in fragment_counts.most_common():
    print(f"{smiles}: {n_hits} occurrences")

Most common fragments:
[16*]c1ccc(O)c([16*])c1: 6 occurrences
[3*]OC: 6 occurrences
[5*]N[5*]: 4 occurrences
[3*]O[3*]: 3 occurrences
[1*]C(=O)C[7*]: 3 occurrences
[7*]C[8*]: 3 occurrences
[4*]CC(C)C: 3 occurrences
[16*]c1ccc2c(c1)OCO2: 3 occurrences
[1*]C(C)=O: 2 occurrences
[7*]CC[7*]: 2 occurrences
[7*]CCCCCCCC[7*]: 2 occurrences
[8*]CCC(=O)C[C@@H](O)CCCCC: 1 occurrences
[8*]CCC(=O)C[C@@H](O)CCCCCCC: 1 occurrences
[8*]CC[C@@H](O)C[C@@H](O)CCCCC: 1 occurrences
[7*]CC(=O)CC[8*]: 1 occurrences
[7*]CCCCCC: 1 occurrences
[4*]C(CC[8*])CC([4*])CCC: 1 occurrences
[4*][C@H](CC[8*])C[C@@H]([4*])CCCCC: 1 occurrences
[7*]CCCCCC[7*]: 1 occurrences
[8*]C(C)(C)O: 1 occurrences
[13*][C@H]1Cc2cc([16*])ccc2O1: 1 occurrences
[6*]C(=O)O: 1 occurrences
[16*]c1ccccc1: 1 occurrences
[1*]C(=O)C([4*])C[8*]: 1 occurrences
[1*]C([6*])=O: 1 occurrences
[4*]CC([4*])C[8*]: 1 occurrences


## **Most common fragments in Fatty acids**

In [12]:
# Flat list of the fragments of every fatty-acid molecule that parsed
# (entries that are None are skipped)
all_fragments = [
    piece
    for mol in fatty_acids['mols']
    if mol is not None
    for piece in fragment_molecule(mol)
]

# Tally how often each fragment appears
fragment_counts = Counter(all_fragments)

# Print every fragment, most frequent first
print("Most common fragments:")
for smiles, n_hits in fragment_counts.most_common():
    print(f"{smiles}: {n_hits} occurrences")

Most common fragments:
[1*]C(=O)C[7*]: 4 occurrences
[5*]N[5*]: 4 occurrences
[4*]CC(C)C: 4 occurrences
[7*]CC[7*]: 4 occurrences
[7*]CCCCCC: 3 occurrences
[7*]CCCCC: 1 occurrences
[7*]CCCCCCCCC[7*]: 1 occurrences
[7*]CCCCCCCCCCCCCC: 1 occurrences
[7*]CCCCCCCCCC[7*]: 1 occurrences
[7*]CC: 1 occurrences
[7*]C(CO)C(=O)O: 1 occurrences
CCCCCCCCCCCCCCCC(=O)O: 1 occurrences
CCCCCCCC(=O)O: 1 occurrences
CCCCCCCCCC(=O)O: 1 occurrences
CCCCCCCCCCCC(=O)O: 1 occurrences
CCCCCCCCCCCCCC(=O)O: 1 occurrences
[7*]CCC[7*]: 1 occurrences
[7*]CCCCCCCCC(=O)O: 1 occurrences
CCCC/C=C\O: 1 occurrences


## **Most common fragments in Amino acids and Peptides**

In [13]:
# Flat list of the fragments of every amino-acid/peptide molecule that parsed
# (entries that are None are skipped)
all_fragments = [
    piece
    for mol in peptides['mols']
    if mol is not None
    for piece in fragment_molecule(mol)
]

# Tally how often each fragment appears
fragment_counts = Counter(all_fragments)

# Print every fragment, most frequent first
print("Most common fragments:")
for smiles, n_hits in fragment_counts.most_common():
    print(f"{smiles}: {n_hits} occurrences")

Most common fragments:
[5*]N[5*]: 11 occurrences
[1*]C([6*])=O: 10 occurrences
[16*]c1ccccc1: 8 occurrences
[3*]OC: 5 occurrences
[1*]C(C)=O: 4 occurrences
[1*]C(=O)[C@@H](C)[C@H](O)[C@@H]([4*])C[8*]: 4 occurrences
[1*]C(=O)C[4*]: 4 occurrences
[3*]O[3*]: 3 occurrences
[13*][C@H]1OC(=O)[C@H]([13*])N(C)C(=O)[C@@H]([13*])OC(=O)[C@H]([13*])N(C)C(=O)[C@@H]([13*])OC(=O)[C@H]([13*])N(C)C1=O: 3 occurrences
[8*]C(C)C: 3 occurrences
[4*]CCC(=O)O: 3 occurrences
[1*]C(=O)[C@@H]([4*])CO: 3 occurrences
[10*]N1CC[C@H]([15*])C1=O: 3 occurrences
[4*][C@@H](C(=O)N1NCCC[C@H]1[13*])[C@H](O)C(C)C: 3 occurrences
[1*]C(=O)C([4*])C[8*]: 2 occurrences
[4*]C[C@@H]([4*])C[8*]: 2 occurrences
[8*]C[8*]: 2 occurrences
[8*]CCC(=O)O: 2 occurrences
[13*][C@H]1NC(=O)[C@@H]2CCCN2C1=O: 2 occurrences
[1*]C(N)=O: 2 occurrences
[1*]C(=O)C[7*]: 1 occurrences
[7*]C[8*]: 1 occurrences
[16*]c1ccc(O)cc1: 1 occurrences
[4*]CC[8*]: 1 occurrences
[16*]c1ccc([16*])c(O)c1: 1 occurrences
[8*][C@@H](C)CC: 1 occurrences
[15*][C@H]1[C@@

## **Most common fragments in Alkaloids**

In [14]:

# Flat list of the fragments of every alkaloid molecule that parsed
# (entries that are None are skipped)
all_fragments = [
    piece
    for mol in alkaloids['mols']
    if mol is not None
    for piece in fragment_molecule(mol)
]

# Tally how often each fragment appears
fragment_counts = Counter(all_fragments)

# Print every fragment, most frequent first
print("Most common fragments:")
for smiles, n_hits in fragment_counts.most_common():
    print(f"{smiles}: {n_hits} occurrences")


Most common fragments:
[3*]OC: 18 occurrences
[1*]C(=O)C[7*]: 17 occurrences
[7*]C[8*]: 14 occurrences
[3*]O[3*]: 13 occurrences
[16*]c1ccc2c(c1)OCO2: 11 occurrences
[7*]CC[7*]: 9 occurrences
[5*]N1CCCCC1: 9 occurrences
[8*]CO: 7 occurrences
[1*]C(=O)C1=CO[C@@H]([13*])[C@H](C=C)[C@@H]1[15*]: 7 occurrences
[13*][C@@H]1O[C@H]([13*])[C@@H](O)[C@H](O)[C@H]1O: 6 occurrences
[8*]C[8*]: 6 occurrences
[14*]c1nccc2c1[nH]c1ccccc12: 5 occurrences
[5*]N1CCCC1: 4 occurrences
[1*]C(C)=O: 4 occurrences
[4*]C[8*]: 4 occurrences
[7*]CCCCCCCC[7*]: 3 occurrences
[16*]c1cc([16*])c([16*])c([16*])c1: 2 occurrences
[7*]CCC[8*]: 2 occurrences
[7*]CCCC[7*]: 2 occurrences
[7*]CCCCCC[7*]: 2 occurrences
[14*]c1ccc[nH]1: 2 occurrences
[1*]C([6*])=O: 2 occurrences
[7*]CCCCC: 2 occurrences
[7*]CCCCCCCCC[7*]: 2 occurrences
[8*]CC: 2 occurrences
[13*][C@@H]1O[C@H]([13*])[C@@H]([15*])[C@H]([15*])[C@H]1[15*]: 2 occurrences
[10*]N1CCC=CC1=O: 1 occurrences
[1*]C(=O)CCCCCCC[7*]: 1 occurrences
[4*]CC1=C[C@@H]([13*])OC1: 1 o