In [27]:
!pip install rdkit



In [36]:
import pandas as pd
import rdkit
import rdkit.Chem as Chem

# Single place to repoint the notebook at a different data checkout.
DATA_DIR = "/Users/charlie/projects/vibes/molexplorer/data"

fragment_library = pd.read_csv(f"{DATA_DIR}/fragment_library.tsv", sep="\t")

# The SMILES dump has no header row, so the column names are supplied here.
# pandas' default NA sentinels include literal strings such as "NA", which would
# silently turn a legitimate component id (e.g. "NA") into NaN and later into
# the string "nan" when ids are joined. Disable the defaults so only genuinely
# empty fields are treated as missing, and read everything as text so ids that
# look numeric are kept verbatim.
smiles_ccd = pd.read_csv(
    f"{DATA_DIR}/Components-smiles-oe.smi",
    sep="\t",
    header=None,
    names=["smiles", "ccd", "name"],
    dtype=str,
    keep_default_na=False,
    na_values=[""],
)

def normalise_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None`` if unusable.

    Parameters
    ----------
    smiles : str
        SMILES string to canonicalise. Any non-string value (``None``, a float
        NaN coming from a missing DataFrame cell, ...) is treated as missing.

    Returns
    -------
    str or None
        The output of ``Chem.MolToSmiles`` for the parsed molecule, or ``None``
        when the input is not a string or RDKit cannot parse it.
    """
    # Missing cells arrive as None / float NaN; there is nothing to parse.
    if not isinstance(smiles, str):
        return None

    # MolFromSmiles signals a parse/sanitisation failure by returning None
    # (and logging to stderr) rather than raising, so test for it explicitly
    # instead of relying on a catch-all ``except`` that would also swallow
    # KeyboardInterrupt during a long ``.apply``.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol)

# Canonicalise both sides so that exact string equality means "same molecule
# according to RDKit's canonical SMILES". Unparseable entries become None.
smiles_ccd["smiles"] = smiles_ccd["smiles"].apply(normalise_smiles)
fragment_library["query"] = fragment_library["query"].apply(normalise_smiles)

# Build a lookup: canonical SMILES -> comma-joined ids of every component
# sharing that SMILES (row order within a group is preserved). groupby drops
# missing SMILES keys, so unparseable components can never match.
# This replaces a per-row scan of the whole component table with one grouped
# pass plus a hash lookup.
ccd_by_smiles = (
    smiles_ccd
    .groupby("smiles", sort=False)["ccd"]
    .agg(lambda ids: ",".join(ids.astype(str)))
)

# Fragments whose query is missing or has no exact match get NaN in "ccd".
fragment_library["ccd"] = fragment_library["query"].map(ccd_by_smiles)

# Add in_ccd column to fragment_library
fragment_library["in_ccd"] = fragment_library["ccd"].notna()

fragment_library

[21:45:44] Explicit valence for atom # 0 Be, 4, is greater than permitted
[21:45:44] Explicit valence for atom # 0 B, 4, is greater than permitted
[21:45:44] Explicit valence for atom # 16 N, 4, is greater than permitted
[21:45:44] Explicit valence for atom # 1 C, 5, is greater than permitted
[21:45:44] Explicit valence for atom # 3 N, 4, is greater than permitted
[21:45:44] Explicit valence for atom # 0 B, 6, is greater than permitted
[21:45:44] Explicit valence for atom # 0 B, 6, is greater than permitted
[21:45:44] Explicit valence for atom # 0 B, 5, is greater than permitted
[21:45:44] Explicit valence for atom # 3 N, 4, is greater than permitted
[21:45:44] Explicit valence for atom # 3 C, 5, is greater than permitted
[21:45:44] Explicit valence for atom # 24 N, 4, is greater than permitted
[21:45:44] Explicit valence for atom # 0 B, 5, is greater than permitted
[21:45:44] Explicit valence for atom # 0 B, 5, is greater than permitted
[21:45:44] Explicit valence for atom # 0 B, 4, i

Unnamed: 0,name,kind,query,description,comment,url,source,ccd,in_ccd
0,acetylurea,SMILES,O=C1CNC(=O)N1,,unchecked,,PDBe,HYN,True
1,acridine,SMILES,c1ccc2nc3ccccc3cc2c1,,unchecked,,PDBe,,False
2,acridone,SMILES,OC1c2ccccc2Nc2ccccc21,,unchecked,,PDBe,,False
3,actinophenoxazine,SMILES,Nc1cc2c(cc1O)Oc1ccccc1N2,,unchecked,,PDBe,,False
4,adenine,SMILES,Nc1ncnc2nc[nH]c12,Adenine is a nucleobase (a purine derivative).,unchecked,https://en.wikipedia.org/wiki/Adenine,PDBe,,False
...,...,...,...,...,...,...,...,...,...
2152,Z19731563,SMILES,Cc1ccc(OCC(=O)Nc2cc(C)on2)cc1,,DSI library,,DSI,,False
2153,Z2856434834,SMILES,Cc1cccc(NC(=O)CN2CCOCC2)c1C,,DSI library,,DSI,NT7,True
2154,Z57282999,SMILES,O=c1[nH]cc(N2CCOCC2)c(=O)[nH]1,,DSI library,,DSI,,False
2155,Z31480458,SMILES,CC(C)COc1ccc(C(=O)N2CCCC2)cc1,,DSI library,,DSI,,False


In [33]:
# How many component SMILES failed to canonicalise (True = missing)?
smiles_ccd["smiles"].isna().value_counts()

smiles
False    47478
True       264
Name: count, dtype: int64

In [38]:
# Persist the annotated fragment library (with the "ccd" and "in_ccd" columns)
# as a tab-separated file next to the input data.
fragment_library.to_csv(
    "/Users/charlie/projects/vibes/molexplorer/data/fragment_library_ccd.tsv",
    sep="\t",
)

In [37]:
# Tally of fragments with / without an exact SMILES match among the components.
fragment_library["in_ccd"].value_counts()

in_ccd
False    1655
True      502
Name: count, dtype: int64