In [1]:
def assess_two_letter_elements(df):
    """
    Find the two letter elements in dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe which requires preprocessing.

    Returns
    -------
    two_letter_elements : list
        List with found two letter elements
    """

    # Search for unique characters in SMILES strings
    unique_chars = set(df.canonical_smiles.apply(list).sum())
    # Get upper and lower case letters only
    upper_chars = []
    lower_chars = []
    for entry in unique_chars:
        if entry.isalpha():
            if entry.isupper():
                upper_chars.append(entry)
            elif entry.islower():
                lower_chars.append(entry)
    print(f"Upper letter characters {sorted(upper_chars)}")
    print(f"Lower letter characters {sorted(lower_chars)}")

    # List of all possible periodic elements
    periodic_elements = [
        "Ac",
        "Al",
        "Am",
        "Sb",
        "Ar",
        "As",
        "At",
        "Ba",
        "Bk",
        "Be",
        "Bi",
        "Bh",
        "B",
        "Br",
        "Cd",
        "Ca",
        "Cf",
        "C",
        "Ce",
        "Cs",
        "Cl",
        "Cr",
        "Co",
        "Cn",
        "Cu",
        "Cm",
        "Ds",
        "Db",
        "Dy",
        "Es",
        "Er",
        "Eu",
        "Fm",
        "Fl",
        "F",
        "Fr",
        "Gd",
        "Ga",
        "Ge",
        "Au",
        "Hf",
        "Hs",
        "He",
        "Ho",
        "H",
        "In",
        "I",
        "Ir",
        "Fe",
        "Kr",
        "La",
        "Lr",
        "Pb",
        "Li",
        "Lv",
        "Lu",
        "Mg",
        "Mn",
        "Mt",
        "Md",
        "Hg",
        "Mo",
        "Mc",
        "Nd",
        "Ne",
        "Np",
        "Ni",
        "Nh",
        "Nb",
        "N",
        "No",
        "Og",
        "Os",
        "O",
        "Pd",
        "P",
        "Pt",
        "Pu",
        "Po",
        "K",
        "Pr",
        "Pm",
        "Pa",
        "Ra",
        "Rn",
        "Re",
        "Rh",
        "Rg",
        "Rb",
        "Ru",
        "Rf",
        "Sm",
        "Sc",
        "Sg",
        "Se",
        "Si",
        "Ag",
        "Na",
        "Sr",
        "S",
        "Ta",
        "Tc",
        "Te",
        "Ts",
        "Tb",
        "Tl",
        "Th",
        "Tm",
        "Sn",
        "Ti",
        "W",
        "U",
        "V",
        "Xe",
        "Yb",
        "Y",
        "Zn",
        "Zr",
    ]

    # The two_char_elements list contains all two letter elements
    # which can be generated by all possible combination of upper x lower characters
    # and are valid periodic elements.
    two_char_elements = []
    for upper in upper_chars:
        for lower in lower_chars:
            ch = upper + lower
            if ch in periodic_elements:
                two_char_elements.append(ch)

    # This list is then reduced to the subset of two-letter elements
    # that actually appear in the SMILES strings, specific to our data set.
    two_char_elements_smiles = set()
    for char in two_char_elements:
        if df.canonical_smiles.str.contains(char).any():
            two_char_elements_smiles.add(char)

    return two_char_elements_smiles

In [2]:
import pandas as pd
from rdkit import Chem

with open("chebi_smiles_com_classes.txt", "r") as f:
    smiles = [i.strip().split(' ') for i in f.readlines()]
    f.close()

molecules_by_classes = []
for s in smiles:
    molecules_by_classes.append({"canonical_smiles":s[0],"classes":s[1].split("@")})

df = pd.DataFrame.from_dict(molecules_by_classes)

smiles = [i["canonical_smiles"] for i in molecules_by_classes]

In [3]:
# Validity test
# import time

# for smile in smiles:
#     response = Chem.MolFromSmiles(smile)
#     print(response)
#     time.sleep(5)

In [4]:
# Index of the longest SMILES string
longest_smiles = max(df["canonical_smiles"], key=len)
longest_smiles_index = df.canonical_smiles[df.canonical_smiles == longest_smiles].index.tolist()
print(f"Longest SMILES: {longest_smiles}")
print(f"Contains {len(longest_smiles)} characters, index in dataframe: {longest_smiles_index[0]}.")
smiles_maxlen = len(longest_smiles)
# NBVAL_CHECK_OUTPUT

Longest SMILES: Nc1ccn([C@@H]2O[C@H](COP(O)(=O)O[C@@H]3[C@@H](COP(O)(=O)O[C@@H]4[C@@H](COP(O)(=O)O[C@@H]5[C@@H](COP(O)(=O)O[C@@H]6[C@@H](COP(O)(=O)O[C@@H]7[C@@H](COP(O)(=O)O[C@@H]8[C@@H](COP(O)(=O)O[C@@H]9[C@@H](COP(O)(=O)O[C@@H]%10[C@@H](COP(O)(=O)O[C@@H]%11[C@@H](COP(O)(=O)O[C@@H]%12[C@@H](COP(O)(=O)O[C@@H]%13[C@@H](COP(O)(=O)O[C@@H]%14[C@@H](COP(O)(=O)O[C@@H]%15[C@@H](COP(O)(=O)O[C@@H]%16[C@@H](COP(O)(=O)O[C@@H]%17[C@@H](COP(O)(=O)O[C@@H]%18[C@@H](COP(O)(=O)O[C@@H]%19[C@@H](COP(O)(=O)O[C@@H]%20[C@@H](COP(O)(=O)O[C@@H]%21[C@@H](COP(O)(=O)O[C@@H]%22[C@@H](COP(O)(=O)O[C@@H]%23[C@@H](COP(O)(=O)O[C@@H]%24[C@@H](COP(O)(=O)O[C@@H]%25[C@@H](CO)O[C@H]([C@@H]%25O)n%25cnc%26c%25nc(N)[nH]c%26=O)O[C@H]([C@@H]%24O)n%24ccc(=O)[nH]c%24=O)O[C@H]([C@@H]%23O)n%23ccc(=O)[nH]c%23=O)O[C@H]([C@@H]%22O)n%22ccc(=O)[nH]c%22=O)O[C@H]([C@@H]%21O)n%21cnc%22c%21nc(N)[nH]c%22=O)O[C@H]([C@@H]%20O)n%20cnc%21c%20nc(N)[nH]c%21=O)O[C@H]([C@@H]%19O)n%19ccc(N)nc%19=O)O[C@H]([C@@H]%18O)n%18cnc%19c%18nc(N)[nH]c%19=O)O[C@H

In [5]:
# Index of the shortest SMILES string
shortest_smiles = min(df["canonical_smiles"], key=len)
shortest_smiles_index = df.canonical_smiles[df.canonical_smiles == shortest_smiles].index.tolist()
print(f"Shortest SMILES: {shortest_smiles}")
print(
    f"Contains {len(shortest_smiles)} characters, index in dataframe: {shortest_smiles_index[0]}."
)
# NBVAL_CHECK_OUTPUT

Shortest SMILES: NO
Contains 2 characters, index in dataframe: 7251.


In [6]:
classes = df['classes'].explode()

set_classes = list(set(classes))
print(f"There are {len(set_classes)} different classes from {len(classes)} set in the data.")

There are 347 different classes from 65849 set in the data.


In [7]:
new_data = []
for row in df['classes']:
    new_row = [1 if elem in row else 0 for elem in set_classes]
    new_data.append(new_row)

one_hot_classes = pd.DataFrame(new_data, columns=set_classes)

In [8]:
one_hot_classes

Unnamed: 0,CHEBI_58945,CHEBI_35406,CHEBI_35715,CHEBI_36043,CHEBI_2580,CHEBI_17478,CHEBI_36047,CHEBI_24432,CHEBI_26004,CHEBI_38101,...,CHEBI_61355,CHEBI_61296,CHEBI_22695,CHEBI_26561,CHEBI_25608,CHEBI_78298,CHEBI_51689,CHEBI_35284,CHEBI_68495,CHEBI_37143
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36883,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36884,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36885,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
from collections import Counter

contagem = Counter(classes)

item_mais_comum = contagem.most_common(1)[0]
print(f'O item mais comum é: {item_mais_comum[0]}, ocorrendo {item_mais_comum[1]} vezes. Correspondendo a {100*item_mais_comum[1]/len(classes)}% do dataset.')

item_menos_comum = contagem.most_common()[:-2:-1][0]
print(f'O item menos comum é: {item_menos_comum[0]}, ocorrendo {item_menos_comum[1]} vez.')

O item mais comum é: CHEBI_16670, ocorrendo 6176 vezes. Correspondendo a 9.379033850172364% do dataset.
O item menos comum é: CHEBI_26469, ocorrendo 1 vez.


Encontrando classes amostragem baixa

In [44]:
threshold = 1.5

less_than_threshold = [element for element, count in contagem.items() if (count / len(classes)) * 100 < threshold]
print(f'There are {len(less_than_threshold)} elements corresponding to less than {threshold}% of the dataset: {less_than_threshold}')

There are 340 elements corresponding to less than 1.5% of the dataset: ['CHEBI_46867', 'CHEBI_26188', 'CHEBI_35618', 'CHEBI_35620', 'CHEBI_35674', 'CHEBI_67079', 'CHEBI_33860', 'CHEBI_25384', 'CHEBI_33281', 'CHEBI_36047', 'CHEBI_25248', 'CHEBI_38163', 'CHEBI_38958', 'CHEBI_3992', 'CHEBI_76224', 'CHEBI_22587', 'CHEBI_36709', 'CHEBI_46848', 'CHEBI_35441', 'CHEBI_35703', 'CHEBI_78298', 'CHEBI_83925', 'CHEBI_33859', 'CHEBI_35748', 'CHEBI_35366', 'CHEBI_18946', 'CHEBI_26195', 'CHEBI_68495', 'CHEBI_35718', 'CHEBI_26878', 'CHEBI_35681', 'CHEBI_51689', 'CHEBI_76971', 'CHEBI_15705', 'CHEBI_23665', 'CHEBI_50996', 'CHEBI_46874', 'CHEBI_75767', 'CHEBI_22315', 'CHEBI_61778', 'CHEBI_46761', 'CHEBI_33558', 'CHEBI_26979', 'CHEBI_37581', 'CHEBI_33572', 'CHEBI_76946', 'CHEBI_78840', 'CHEBI_35475', 'CHEBI_35472', 'CHEBI_48975', 'CHEBI_50995', 'CHEBI_35358', 'CHEBI_22586', 'CHEBI_50267', 'CHEBI_23990', 'CHEBI_29347', 'CHEBI_32955', 'CHEBI_149553', 'CHEBI_33575', 'CHEBI_22723', 'CHEBI_37143', 'CHEBI_39447'

In [45]:
import statistics

contagens_classes = list(contagem.values())

contagens_classes.sort()

mediana_contagens = statistics.median(contagens_classes)

print(f"A mediana da quantidade de repetições dos itens é: {mediana_contagens}")

A mediana da quantidade de repetições dos itens é: 103


In [46]:
print(f"The max number of classes for smile is: {max(one_hot_classes.sum(axis=1))}")

The max number of classes for smile is: 14


In [47]:
df_selected = df.copy()
df_selected['classes'] = df_selected['classes'].apply(lambda x: [elem for elem in x if elem not in less_than_threshold])
df_selected = df_selected[df_selected['classes'].astype(bool)]
df_selected.reset_index(drop=True, inplace=True)
df_selected

Unnamed: 0,canonical_smiles,classes
0,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,[CHEBI_76924]
1,COC1=C2C=COC2=CC2=C1C(=O)C=C(C)O2,[CHEBI_76924]
2,OC[C@@H](O)[C@@H](O)C(O)[C@H](O)[C@H](O)CO,[CHEBI_25212]
3,COc1cc(O)c2c(c1)oc(=O)c1c3cc(O)c(O)cc3oc21,"[CHEBI_25212, CHEBI_35610]"
4,CC(C)=CCc1c(O)cc2occ(-c3ccc(O)cc3)c(=O)c2c1O,[CHEBI_76924]
...,...,...
19962,N\C(C(O)=O)=C(\C=C/C=O)C(O)=O,[CHEBI_75771]
19963,CC1=CC(=O)[C@@H]2C[C@H]1C2(C)C,[CHEBI_76924]
19964,COc1ccc([C@H]2COc3cc(O)ccc3C2)c(O)c1,[CHEBI_76924]
19965,[H][C@]12N3CCC[C@@]1(CC)C[C@](O)(C(=O)OC)n1c2c...,[CHEBI_25212]


In [48]:
import numpy as np

df_selected['string_size'] = df_selected['canonical_smiles'].apply(len)

median_size = np.median(df_selected['string_size'])

Q1 = df_selected['string_size'].quantile(0.25)
Q3 = df_selected['string_size'].quantile(0.75)
IQR = Q3 - Q1

outlier_threshold = 1.5 * IQR

df_filtered = df_selected[(df_selected['string_size'] >= Q1 - outlier_threshold) & (df_selected['string_size'] <= Q3 + outlier_threshold)]

In [49]:
almost_final_dict = []

for i, row in df_filtered.iterrows():
    for classe in row['classes']:
        almost_final_dict.append({'smile':row['canonical_smiles'], 'classe':classe})

almost_final_df = pd.DataFrame(almost_final_dict)
almost_final_df

Unnamed: 0,smile,classe
0,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,CHEBI_76924
1,COC1=C2C=COC2=CC2=C1C(=O)C=C(C)O2,CHEBI_76924
2,OC[C@@H](O)[C@@H](O)C(O)[C@H](O)[C@H](O)CO,CHEBI_25212
3,COc1cc(O)c2c(c1)oc(=O)c1c3cc(O)c(O)cc3oc21,CHEBI_25212
4,COc1cc(O)c2c(c1)oc(=O)c1c3cc(O)c(O)cc3oc21,CHEBI_35610
...,...,...
19714,N\C(C(O)=O)=C(\C=C/C=O)C(O)=O,CHEBI_75771
19715,CC1=CC(=O)[C@@H]2C[C@H]1C2(C)C,CHEBI_76924
19716,COc1ccc([C@H]2COc3cc(O)ccc3C2)c(O)c1,CHEBI_76924
19717,[H][C@]12N3CCC[C@@]1(CC)C[C@](O)(C(=O)OC)n1c2c...,CHEBI_25212


In [50]:
min_class_count = almost_final_df['classe'].value_counts().min()

grouped_df = almost_final_df.groupby('classe', group_keys=False)

balanced_df = grouped_df.apply(lambda x: x.sample(min_class_count))

balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)

balanced_df

Unnamed: 0,smile,classe
0,CC(C)C(=O)c1c(O)c2c(cc(=O)oc2c2cc(oc12)C(C)(C)...,CHEBI_35610
1,[H][C@]12C[C@H](OC(=O)c3ccccc3)[C@]3(C)[C@@H](...,CHEBI_35610
2,Oc1ccc2c(c1)oc(-c1cc(O)c(O)c(O)c1)c(O)c2=O,CHEBI_76924
3,O=C(N[C@@H](CCCCN)C(=O)N[C@@H](CO)C(O)=O)[C@@H...,CHEBI_16670
4,SC[C@H](NC(=O)[C@@H](N)C(C)C)C(=O)N[C@@H](C)C(...,CHEBI_16670
...,...,...
9158,COc1ccc(cc1)C([O-])=O,CHEBI_76924
9159,O=C(N[C@@H](CCC(O)=O)C(O)=O)[C@@H](NC(=O)[C@@H...,CHEBI_16670
9160,O=C(N[C@@H]([C@H](O)C)C(O)=O)[C@@H](NC(=O)[C@@...,CHEBI_16670
9161,COC(=O)C(C)C,CHEBI_25212


In [51]:
len(balanced_df['classe'].unique())

7

In [17]:
with open('chebi_selected_smiles.txt', 'w') as f:
    for i, row in balanced_df.iterrows():
        f.write(f"{row['smile']} {row['classe']}\n")
f.close()