In [105]:
import pandas as pd
from rdkit import Chem

with open("chebi_smiles_com_classes.txt", "r") as f:
    smiles = [i.strip().split(' ') for i in f.readlines()]
    f.close()

molecules_by_classes = []
for s in smiles:
    molecules_by_classes.append({"canonical_smiles":s[0],"classes":s[1].split("@")})

df = pd.DataFrame.from_dict(molecules_by_classes)

smiles = [i["canonical_smiles"] for i in molecules_by_classes]

In [106]:
# Validity test
# import time

# for smile in smiles:
#     response = Chem.MolFromSmiles(smile)
#     print(response)
#     time.sleep(5)

In [107]:
# Index of the longest SMILES string
longest_smiles = max(df["canonical_smiles"], key=len)
longest_smiles_index = df.canonical_smiles[df.canonical_smiles == longest_smiles].index.tolist()
print(f"Longest SMILES: {longest_smiles}")
print(f"Contains {len(longest_smiles)} characters, index in dataframe: {longest_smiles_index[0]}.")
smiles_maxlen = len(longest_smiles)
# NBVAL_CHECK_OUTPUT

Longest SMILES: Nc1ccn([C@@H]2O[C@H](COP(O)(=O)O[C@@H]3[C@@H](COP(O)(=O)O[C@@H]4[C@@H](COP(O)(=O)O[C@@H]5[C@@H](COP(O)(=O)O[C@@H]6[C@@H](COP(O)(=O)O[C@@H]7[C@@H](COP(O)(=O)O[C@@H]8[C@@H](COP(O)(=O)O[C@@H]9[C@@H](COP(O)(=O)O[C@@H]%10[C@@H](COP(O)(=O)O[C@@H]%11[C@@H](COP(O)(=O)O[C@@H]%12[C@@H](COP(O)(=O)O[C@@H]%13[C@@H](COP(O)(=O)O[C@@H]%14[C@@H](COP(O)(=O)O[C@@H]%15[C@@H](COP(O)(=O)O[C@@H]%16[C@@H](COP(O)(=O)O[C@@H]%17[C@@H](COP(O)(=O)O[C@@H]%18[C@@H](COP(O)(=O)O[C@@H]%19[C@@H](COP(O)(=O)O[C@@H]%20[C@@H](COP(O)(=O)O[C@@H]%21[C@@H](COP(O)(=O)O[C@@H]%22[C@@H](COP(O)(=O)O[C@@H]%23[C@@H](COP(O)(=O)O[C@@H]%24[C@@H](COP(O)(=O)O[C@@H]%25[C@@H](CO)O[C@H]([C@@H]%25O)n%25cnc%26c%25nc(N)[nH]c%26=O)O[C@H]([C@@H]%24O)n%24ccc(=O)[nH]c%24=O)O[C@H]([C@@H]%23O)n%23ccc(=O)[nH]c%23=O)O[C@H]([C@@H]%22O)n%22ccc(=O)[nH]c%22=O)O[C@H]([C@@H]%21O)n%21cnc%22c%21nc(N)[nH]c%22=O)O[C@H]([C@@H]%20O)n%20cnc%21c%20nc(N)[nH]c%21=O)O[C@H]([C@@H]%19O)n%19ccc(N)nc%19=O)O[C@H]([C@@H]%18O)n%18cnc%19c%18nc(N)[nH]c%19=O)O[C@H

In [108]:
# Index of the shortest SMILES string
shortest_smiles = min(df["canonical_smiles"], key=len)
shortest_smiles_index = df.canonical_smiles[df.canonical_smiles == shortest_smiles].index.tolist()
print(f"Shortest SMILES: {shortest_smiles}")
print(
    f"Contains {len(shortest_smiles)} characters, index in dataframe: {shortest_smiles_index[0]}."
)
# NBVAL_CHECK_OUTPUT

Shortest SMILES: NO
Contains 2 characters, index in dataframe: 7251.


In [109]:
classes = df['classes'].explode()

set_classes = list(set(classes))
print(f"There are {len(set_classes)} different classes from {len(classes)} set in the data.")

There are 347 different classes from 65849 set in the data.


In [110]:
new_data = []
for row in df['classes']:
    new_row = [1 if elem in row else 0 for elem in set_classes]
    new_data.append(new_row)

one_hot_classes = pd.DataFrame(new_data, columns=set_classes)

In [111]:
one_hot_classes

Unnamed: 0,CHEBI_35475,CHEBI_50860,CHEBI_59132,CHEBI_51689,CHEBI_22315,CHEBI_33710,CHEBI_24651,CHEBI_23824,CHEBI_36785,CHEBI_61379,...,CHEBI_15734,CHEBI_35674,CHEBI_61355,CHEBI_62732,CHEBI_24043,CHEBI_35757,CHEBI_35222,CHEBI_35618,CHEBI_36087,CHEBI_23849
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36883,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36884,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36885,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
from collections import Counter

contagem = Counter(classes)

item_mais_comum = contagem.most_common(1)[0]
print(f'O item mais comum é: {item_mais_comum[0]}, ocorrendo {item_mais_comum[1]} vezes. Correspondendo a {100*item_mais_comum[1]/len(classes)}% do dataset.')

item_menos_comum = contagem.most_common()[:-2:-1][0]
print(f'O item menos comum é: {item_menos_comum[0]}, ocorrendo {item_menos_comum[1]} vez.')

O item mais comum é: CHEBI_16670, ocorrendo 6176 vezes. Correspondendo a 9.379033850172364% do dataset.
O item menos comum é: CHEBI_26469, ocorrendo 1 vez.


Encontrando classes amostragem baixa

In [113]:
threshold = 1.5

less_than_threshold = [element for element, count in contagem.items() if (count / len(classes)) * 100 < threshold]
print(f'There are {len(less_than_threshold)} elements corresponding to less than {threshold}% of the dataset: {less_than_threshold}')

There are 340 elements corresponding to less than 1.5% of the dataset: ['CHEBI_46867', 'CHEBI_26188', 'CHEBI_35618', 'CHEBI_35620', 'CHEBI_35674', 'CHEBI_67079', 'CHEBI_33860', 'CHEBI_25384', 'CHEBI_33281', 'CHEBI_36047', 'CHEBI_25248', 'CHEBI_38163', 'CHEBI_38958', 'CHEBI_3992', 'CHEBI_76224', 'CHEBI_22587', 'CHEBI_36709', 'CHEBI_46848', 'CHEBI_35441', 'CHEBI_35703', 'CHEBI_78298', 'CHEBI_83925', 'CHEBI_33859', 'CHEBI_35748', 'CHEBI_35366', 'CHEBI_18946', 'CHEBI_26195', 'CHEBI_68495', 'CHEBI_35718', 'CHEBI_26878', 'CHEBI_35681', 'CHEBI_51689', 'CHEBI_76971', 'CHEBI_15705', 'CHEBI_23665', 'CHEBI_50996', 'CHEBI_46874', 'CHEBI_75767', 'CHEBI_22315', 'CHEBI_61778', 'CHEBI_46761', 'CHEBI_33558', 'CHEBI_26979', 'CHEBI_37581', 'CHEBI_33572', 'CHEBI_76946', 'CHEBI_78840', 'CHEBI_35475', 'CHEBI_35472', 'CHEBI_48975', 'CHEBI_50995', 'CHEBI_35358', 'CHEBI_22586', 'CHEBI_50267', 'CHEBI_23990', 'CHEBI_29347', 'CHEBI_32955', 'CHEBI_149553', 'CHEBI_33575', 'CHEBI_22723', 'CHEBI_37143', 'CHEBI_39447'

In [114]:
import statistics

contagens_classes = list(contagem.values())

contagens_classes.sort()

mediana_contagens = statistics.median(contagens_classes)

print(f"A mediana da quantidade de repetições dos itens é: {mediana_contagens}")

A mediana da quantidade de repetições dos itens é: 103


In [115]:
print(f"The max number of classes for smile is: {max(one_hot_classes.sum(axis=1))}")

The max number of classes for smile is: 14


In [116]:
df_selected = df.copy()
df_selected['classes'] = df_selected['classes'].apply(lambda x: [elem for elem in x if elem not in less_than_threshold])
df_selected = df_selected[df_selected['classes'].astype(bool)]
df_selected.reset_index(drop=True, inplace=True)
df_selected

Unnamed: 0,canonical_smiles,classes
0,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,[CHEBI_76924]
1,COC1=C2C=COC2=CC2=C1C(=O)C=C(C)O2,[CHEBI_76924]
2,OC[C@@H](O)[C@@H](O)C(O)[C@H](O)[C@H](O)CO,[CHEBI_25212]
3,COc1cc(O)c2c(c1)oc(=O)c1c3cc(O)c(O)cc3oc21,"[CHEBI_25212, CHEBI_35610]"
4,CC(C)=CCc1c(O)cc2occ(-c3ccc(O)cc3)c(=O)c2c1O,[CHEBI_76924]
...,...,...
19962,N\C(C(O)=O)=C(\C=C/C=O)C(O)=O,[CHEBI_75771]
19963,CC1=CC(=O)[C@@H]2C[C@H]1C2(C)C,[CHEBI_76924]
19964,COc1ccc([C@H]2COc3cc(O)ccc3C2)c(O)c1,[CHEBI_76924]
19965,[H][C@]12N3CCC[C@@]1(CC)C[C@](O)(C(=O)OC)n1c2c...,[CHEBI_25212]


In [117]:
import numpy as np

df_selected['string_size'] = df_selected['canonical_smiles'].apply(len)

median_size = np.median(df_selected['string_size'])

Q1 = df_selected['string_size'].quantile(0.25)
Q3 = df_selected['string_size'].quantile(0.75)
IQR = Q3 - Q1

outlier_threshold = 1.5 * IQR

df_filtered = df_selected[(df_selected['string_size'] >= Q1 - outlier_threshold) & (df_selected['string_size'] <= Q3 + outlier_threshold)]

In [118]:
almost_final_dict = []

for i, row in df_filtered.iterrows():
    for classe in row['classes']:
        almost_final_dict.append({'smile':row['canonical_smiles'], 'classe':classe})

almost_final_df = pd.DataFrame(almost_final_dict)
almost_final_df

Unnamed: 0,smile,classe
0,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,CHEBI_76924
1,COC1=C2C=COC2=CC2=C1C(=O)C=C(C)O2,CHEBI_76924
2,OC[C@@H](O)[C@@H](O)C(O)[C@H](O)[C@H](O)CO,CHEBI_25212
3,COc1cc(O)c2c(c1)oc(=O)c1c3cc(O)c(O)cc3oc21,CHEBI_25212
4,COc1cc(O)c2c(c1)oc(=O)c1c3cc(O)c(O)cc3oc21,CHEBI_35610
...,...,...
19714,N\C(C(O)=O)=C(\C=C/C=O)C(O)=O,CHEBI_75771
19715,CC1=CC(=O)[C@@H]2C[C@H]1C2(C)C,CHEBI_76924
19716,COc1ccc([C@H]2COc3cc(O)ccc3C2)c(O)c1,CHEBI_76924
19717,[H][C@]12N3CCC[C@@]1(CC)C[C@](O)(C(=O)OC)n1c2c...,CHEBI_25212


In [119]:
min_class_count = almost_final_df['classe'].value_counts().min()

grouped_df = almost_final_df.groupby('classe', group_keys=False)

full_df = grouped_df.apply(lambda x: x.sample(min_class_count))

full_df = full_df.sample(frac=1)

full_df.reset_index(drop=True, inplace=True)

full_df

Unnamed: 0,smile,classe
0,COc1cccc2C(=O)c3c(O)c4C[C@](O)(C[C@H](O[C@H]5C...,CHEBI_35610
1,CCCCCCCC=O,CHEBI_76924
2,O[C@@H]1[C@@H](O)[C@@H](O[C@@H]([C@H]1O)C(O)=O...,CHEBI_25212
3,[C@@]123[C@@H](NC4=C1C=CC=C4)[C@](C[C@]5([C@@H...,CHEBI_76924
4,[H][C@]12CCN(C(=O)[C@@H](NC(=O)[C@H](C)[N+](C)...,CHEBI_25212
...,...,...
9158,CNc1cc[nH]c(=O)n1,CHEBI_25212
9159,CC1(C)Oc2cc3oc(=O)ccc3cc2C[C@@H]1O,CHEBI_35610
9160,O=C(N[C@@H](CC=1NC=NC1)C(O)=O)[C@@H](NC(=O)[C@...,CHEBI_25676
9161,O=C(N[C@@H](CC1=CC=CC=C1)C(O)=O)[C@@H](NC(=O)[...,CHEBI_16670


In [120]:
from sklearn.model_selection import train_test_split

In [121]:
seed = 42

train_df, test_val_df = train_test_split(full_df, test_size=0.3, random_state=seed, stratify=full_df['classe'])
test_df, val_df = train_test_split(test_val_df, test_size=0.5, random_state=seed, stratify=test_val_df['classe'])

In [122]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
val_df.to_csv('val.csv', index=False)