# Solubility

In [181]:
import torch
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from rdkit import Chem
from rdkit.Chem import AllChem

import torchgraphs as tg

In [182]:
# Load the Delaney solubility table. Each row describes one compound: its
# SMILES string, the measured log solubility (mol/L) and a few descriptors.
df = pd.read_csv('../data/delaney-processed.csv')
df.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1


In [183]:
# Parse the first compound's SMILES into an RDKit molecule; the cells below
# use it as a worked example for the atom and bond featurisation.
molecule = Chem.MolFromSmiles(df.smiles[0])

## Atom features

In [184]:
# Element vocabulary for the one-hot encoding of atom symbols.
# The final 'Unknown' entry is the catch-all slot of the vocabulary.
symbols = CategoricalDtype(
    categories=[
        'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na',
        'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb',
        'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn',
        'H',  # H? (open question left by the original author)
        'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr',
        'Cr', 'Pt', 'Hg', 'Pb',
        'Unknown',
    ],
    ordered=True,
)

In [185]:
# Collect one record per atom of the example molecule, in RDKit index order.
atom_records = []
for i, atom in enumerate(molecule.GetAtoms()):
    record = {
        'index': i,
        'symbol': atom.GetSymbol(),
        'degree': atom.GetDegree(),
        'hydrogens': atom.GetTotalNumHs(),
        'impl_valence': atom.GetImplicitValence(),
    }
    atom_records.append(record)

atoms_df = pd.DataFrame.from_records(atom_records, index='index')
# NOTE: degree / hydrogens / impl_valence stay as raw integers; only the
# element symbol is turned into a categorical (and later one-hot encoded).
atoms_df['symbol'] = atoms_df['symbol'].astype(symbols)
atoms_df.head()

Unnamed: 0_level_0,degree,hydrogens,impl_valence,symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,1,1,O
1,2,2,2,C
2,3,1,1,C
3,2,0,0,O
4,3,1,1,C


In [186]:
# Node feature matrix: the three integer descriptors followed by one
# indicator column per category of `symbols`.
# The explicit float32 cast keeps the array numeric: recent pandas versions
# emit boolean indicator columns from get_dummies, and mixing those with the
# integer columns would otherwise yield an object array torch cannot convert.
node_features = torch.tensor(
    pd.get_dummies(atoms_df, columns=['symbol']).values.astype('float32'),
    dtype=torch.float
)

## Bond features

In [187]:
# Bond-type vocabulary for the one-hot encoding of edges.
bonds = CategoricalDtype(
    categories=['SINGLE', 'DOUBLE', 'TRIPLE', 'AROMATIC'],
    ordered=True,
)

In [188]:
# Every chemical bond contributes two directed edges (begin -> end and
# end -> begin) that carry identical features.
bond_records = []
for bond in molecule.GetBonds():
    begin_idx, end_idx = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
    bond_type = bond.GetBondType().name
    is_conjugated = bond.GetIsConjugated()
    in_ring = bond.IsInRing()
    for src, dst in ((begin_idx, end_idx), (end_idx, begin_idx)):
        bond_records.append({
            'sender': src,
            'receiver': dst,
            'type': bond_type,
            'conj': is_conjugated,
            'ring': in_ring,
        })

bonds_df = pd.DataFrame.from_records(bond_records, index=['sender', 'receiver'])
# Map the boolean flags False/True onto -1.0/+1.0.
for flag in ('conj', 'ring'):
    bonds_df[flag] = bonds_df[flag] * 2. - 1
bonds_df['type'] = bonds_df['type'].astype(bonds)
bonds_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,conj,ring,type
sender,receiver,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,-1.0,-1.0,SINGLE
1,0,-1.0,-1.0,SINGLE
1,2,-1.0,-1.0,SINGLE
2,1,-1.0,-1.0,SINGLE
2,3,-1.0,1.0,SINGLE


In [189]:
# Edge feature matrix: [conj, ring] followed by one indicator column per
# category of `bonds`.
# The explicit float32 cast keeps the array numeric: recent pandas versions
# emit boolean indicator columns from get_dummies, and mixing those with the
# float columns would otherwise yield an object array torch cannot convert.
edge_features = torch.tensor(
    pd.get_dummies(bonds_df, columns=['type']).values.astype('float32'),
    dtype=torch.float
)
# Endpoints of every directed edge, read back from the (sender, receiver) index.
senders = torch.tensor(bonds_df.index.get_level_values('sender'))
receivers = torch.tensor(bonds_df.index.get_level_values('receiver'))

In [191]:
def smiles_to_graph(smiles: str) -> tg.Graph:
    """Convert a SMILES string into a directed molecular graph.

    Nodes are atoms. Each node is featurised as
    ``[degree, hydrogens, impl_valence]`` followed by a one-hot encoding of
    its element over the categories of the notebook-level ``symbols`` dtype.

    Every chemical bond yields two directed edges, one per direction, with
    identical features: ``[conj, ring]`` (each encoded as -1.0 / +1.0)
    followed by a one-hot encoding of the bond type over the categories of
    the notebook-level ``bonds`` dtype.

    Args:
        smiles: SMILES representation of the molecule to convert.

    Returns:
        A ``tg.Graph`` with one node per atom and two edges per bond.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    # Parse the caller's SMILES, not the notebook-level example molecule.
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # RDKit reports a parse failure by returning None instead of raising.
        raise ValueError('Could not parse SMILES string: {!r}'.format(smiles))

    # --- Atom (node) features -------------------------------------------
    known_symbols = set(symbols.categories)
    atom_records = []
    for atom in molecule.GetAtoms():
        symbol = atom.GetSymbol()
        atom_records.append({
            'degree': atom.GetDegree(),
            'hydrogens': atom.GetTotalNumHs(),
            'impl_valence': atom.GetImplicitValence(),
            # Elements outside the vocabulary go to the explicit 'Unknown'
            # slot; otherwise the categorical cast would turn them into NaN
            # and their one-hot row would be all zeros.
            'symbol': symbol if symbol in known_symbols else 'Unknown',
        })
    # Explicit columns keep the schema (and column order) stable even when
    # there are no records.
    atoms_df = pd.DataFrame(
        atom_records, columns=['degree', 'hydrogens', 'impl_valence', 'symbol'])
    atoms_df['symbol'] = atoms_df['symbol'].astype(symbols)

    # The float32 cast keeps the array numeric even when get_dummies emits
    # boolean indicator columns, which torch cannot read from an object array.
    node_features = torch.tensor(
        pd.get_dummies(atoms_df, columns=['symbol']).values.astype('float32'),
        dtype=torch.float
    )

    # --- Bond (edge) features -------------------------------------------
    sender_idx, receiver_idx, bond_records = [], [], []
    for bond in molecule.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        features = {
            'conj': 1. if bond.GetIsConjugated() else -1.,
            'ring': 1. if bond.IsInRing() else -1.,
            'type': bond.GetBondType().name,
        }
        # One directed edge per direction, both sharing the bond's features.
        for src, dst in ((begin, end), (end, begin)):
            sender_idx.append(src)
            receiver_idx.append(dst)
            bond_records.append(features)
    # A single-atom molecule has no bonds; explicit columns still give an
    # empty frame with the expected schema in that case.
    bonds_df = pd.DataFrame(bond_records, columns=['conj', 'ring', 'type'])
    bonds_df['type'] = bonds_df['type'].astype(bonds)

    edge_features = torch.tensor(
        pd.get_dummies(bonds_df, columns=['type']).values.astype('float32'),
        dtype=torch.float
    )

    return tg.Graph(
        num_nodes=molecule.GetNumAtoms(),
        num_edges=molecule.GetNumBonds() * 2,
        node_features=node_features,
        edge_features=edge_features,
        senders=torch.tensor(sender_idx, dtype=torch.long),
        receivers=torch.tensor(receiver_idx, dtype=torch.long)
    )

# Smoke test: convert the first compound of the table and show the graph summary.
smiles_to_graph(df.smiles[0])

Graph(n=32, e=68, n_shape=torch.Size([47]), e_shape=torch.Size([6]), g_shape=None)

In [219]:
class SolubilityDataset(torch.utils.data.Dataset):
    """Dataset of (molecular graph, measured log solubility) pairs.

    The CSV at ``path`` is read once in the constructor and every SMILES
    string is converted to a graph up front; the result is kept in the
    ``molecules`` column of ``self.df``.
    """

    def __init__(self, path):
        """Read the CSV at ``path`` and convert its ``smiles`` column to graphs."""
        table = pd.read_csv(path)
        table['molecules'] = table.smiles.apply(smiles_to_graph)
        self.df = table

    def __len__(self):
        """Return the number of rows (compounds) in the table."""
        return self.df.shape[0]

    def __getitem__(self, item):
        """Return ``(graph, target)`` for the row at position ``item``."""
        graphs = self.df['molecules']
        targets = self.df['measured log solubility in mols per litre']
        return graphs.iloc[item], targets.iloc[item]
    
# Build the dataset eagerly: every SMILES in the CSV is converted to a graph here.
sd = SolubilityDataset('../data/delaney-processed.csv')