In [5]:
import deepchem as dc
from openpom.feat.graph_featurizer import GraphFeaturizer, GraphConvConstants
from openpom.utils.data_utils import get_class_imbalance_ratio, IterativeStratifiedSplitter
from openpom.models.mpnn_pom import MPNNPOMModel
from datetime import datetime
from tqdm import tqdm
import torch
import numpy as np
from sklearn.metrics import roc_auc_score

print(torch.__version__)

2.0.0+cu117


In [6]:
TASKS = [
'alcoholic', 'aldehydic', 'alliaceous', 'almond', 'amber', 'animal',
'anisic', 'apple', 'apricot', 'aromatic', 'balsamic', 'banana', 'beefy',
'bergamot', 'berry', 'bitter', 'black currant', 'brandy', 'burnt',
'buttery', 'cabbage', 'camphoreous', 'caramellic', 'cedar', 'celery',
'chamomile', 'cheesy', 'cherry', 'chocolate', 'cinnamon', 'citrus', 'clean',
'clove', 'cocoa', 'coconut', 'coffee', 'cognac', 'cooked', 'cooling',
'cortex', 'coumarinic', 'creamy', 'cucumber', 'dairy', 'dry', 'earthy',
'ethereal', 'fatty', 'fermented', 'fishy', 'floral', 'fresh', 'fruit skin',
'fruity', 'garlic', 'gassy', 'geranium', 'grape', 'grapefruit', 'grassy',
'green', 'hawthorn', 'hay', 'hazelnut', 'herbal', 'honey', 'hyacinth',
'jasmin', 'juicy', 'ketonic', 'lactonic', 'lavender', 'leafy', 'leathery',
'lemon', 'lily', 'malty', 'meaty', 'medicinal', 'melon', 'metallic',
'milky', 'mint', 'muguet', 'mushroom', 'musk', 'musty', 'natural', 'nutty',
'odorless', 'oily', 'onion', 'orange', 'orangeflower', 'orris', 'ozone',
'peach', 'pear', 'phenolic', 'pine', 'pineapple', 'plum', 'popcorn',
'potato', 'powdery', 'pungent', 'radish', 'raspberry', 'ripe', 'roasted',
'rose', 'rummy', 'sandalwood', 'savory', 'sharp', 'smoky', 'soapy',
'solvent', 'sour', 'spicy', 'strawberry', 'sulfurous', 'sweaty', 'sweet',
'tea', 'terpenic', 'tobacco', 'tomato', 'tropical', 'vanilla', 'vegetable',
'vetiver', 'violet', 'warm', 'waxy', 'weedy', 'winey', 'woody'
]

print("No of tasks: ", len(TASKS))
n_tasks = len(TASKS)

No of tasks:  138


In [7]:
# # uncomment and run if no splits saved yet

# # download curated dataset
# !wget https://raw.githubusercontent.com/ARY2260/openpom/main/openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv

# # The curated dataset can also found at `openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv` in the repo.

input_file = './openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv' # or new downloaded file path

# # get dataset

featurizer = GraphFeaturizer()
smiles_field = 'nonStereoSMILES'
loader = dc.data.CSVLoader(tasks=TASKS,
                    feature_field=smiles_field,
                    featurizer=featurizer)
dataset = loader.create_dataset(inputs=[input_file])
n_tasks = len(dataset.tasks)

# get train valid test splits
splitter = IterativeStratifiedSplitter(order=2)
train_dataset, test_dataset = splitter.train_test_split(dataset, frac_train=0.8, train_dir='./splits/train_data', test_dir='./splits/test_data')

print("train_dataset: ", len(train_dataset))
print("test_dataset: ", len(test_dataset))

train_dataset:  3989
test_dataset:  994


In [8]:
train_dataset = dc.data.DiskDataset('./splits/train_data')
test_dataset = dc.data.DiskDataset('./splits/test_data')
print("train_dataset: ", len(train_dataset))
print("test_dataset: ", len(test_dataset))

train_dataset:  3989
test_dataset:  994


In [9]:
train_ratios = get_class_imbalance_ratio(train_dataset)
assert len(train_ratios) == n_tasks

# learning_rate = 0.001
learning_rate = dc.models.optimizers.ExponentialDecay(initial_rate=0.001, decay_rate=0.5, decay_steps=32*20, staircase=True)

metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

In [10]:
import os
# no of models in the ensemble
n_models = 10

# no of epochs each model is trained for
nb_epoch = 62

#os.environ['CUDA_MODULE_LOADING'] = 'LAZY'

#print(os.environ['CUDA_MODULE_LOADING'])

print(torch.__version__)

2.0.0+cu117


In [11]:
for i in tqdm(range(n_models)):
    model = MPNNPOMModel(n_tasks = n_tasks,
                            batch_size=128,
                            learning_rate=learning_rate,
                            class_imbalance_ratio = train_ratios,
                            loss_aggr_type = 'sum',
                            node_out_feats = 100,
                            edge_hidden_feats = 75,
                            edge_out_feats = 100,
                            num_step_message_passing = 5,
                            mpnn_residual = True,
                            message_aggregator_type = 'sum',
                            mode = 'classification',
                            number_atom_features = GraphConvConstants.ATOM_FDIM,
                            number_bond_features = GraphConvConstants.BOND_FDIM,
                            n_classes = 1,
                            readout_type = 'set2set',
                            num_step_set2set = 3,
                            num_layer_set2set = 2,
                            ffn_hidden_list= [392, 392],
                            ffn_embeddings = 256,
                            ffn_activation = 'relu',
                            ffn_dropout_p = 0.12,
                            ffn_dropout_at_input_no_act = False,
                            weight_decay = 1e-5,
                            self_loop = False,
                            optimizer_name = 'adam',
                            log_frequency = 32,
                            model_dir = f'./ensemble_models/experiments_{i+1}',
                            device_name='cuda')

    start_time = datetime.now()
    
    # fit model
    loss = model.fit(
          train_dataset,
          nb_epoch=nb_epoch,
          max_checkpoints_to_keep=1,
          deterministic=False,
          restore=False)
    end_time = datetime.now()
    
    train_scores = model.evaluate(train_dataset, [metric])['roc_auc_score']
    test_scores = model.evaluate(test_dataset, [metric])['roc_auc_score']
    print(f"loss = {loss}; train_scores = {train_scores}; test_scores = {test_scores}; time_taken = {str(end_time-start_time)}")
    model.save_checkpoint() # saves final checkpoint => `checkpoint2.pt`
    del model
    torch.cuda.empty_cache()

 10%|████████████▊                                                                                                                   | 1/10 [03:12<28:48, 192.09s/it]

loss = 1.6020852327346802; train_scores = 0.9543161812751279; test_scores = 0.9228994188170052; time_taken = 0:03:08.041317


 20%|█████████████████████████▌                                                                                                      | 2/10 [06:25<25:40, 192.60s/it]

loss = 1.6156800985336304; train_scores = 0.9551961489734778; test_scores = 0.9208305701745249; time_taken = 0:03:09.727845


 30%|██████████████████████████████████████▍                                                                                         | 3/10 [09:37<22:27, 192.55s/it]

loss = 1.7234430313110352; train_scores = 0.949978000698836; test_scores = 0.9216680504023683; time_taken = 0:03:09.136671


 40%|███████████████████████████████████████████████████▏                                                                            | 4/10 [12:50<19:17, 192.87s/it]

loss = 1.6565388441085815; train_scores = 0.9531796749677158; test_scores = 0.923116306521264; time_taken = 0:03:09.988811


 50%|████████████████████████████████████████████████████████████████                                                                | 5/10 [16:03<16:03, 192.64s/it]

loss = 1.7520861625671387; train_scores = 0.9482968735282024; test_scores = 0.9184018649615192; time_taken = 0:03:08.806996


 60%|████████████████████████████████████████████████████████████████████████████▊                                                   | 6/10 [19:16<12:51, 192.99s/it]

loss = 1.6842081546783447; train_scores = 0.952913259694973; test_scores = 0.9234091371904676; time_taken = 0:03:10.352347


 70%|█████████████████████████████████████████████████████████████████████████████████████████▌                                      | 7/10 [22:28<09:37, 192.43s/it]

loss = 1.7800602912902832; train_scores = 0.9480311750235398; test_scores = 0.9171209958444607; time_taken = 0:03:07.941896


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 8/10 [25:40<06:24, 192.40s/it]

loss = 1.6380679607391357; train_scores = 0.9545555886319219; test_scores = 0.9205052182602322; time_taken = 0:03:08.996396


 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 9/10 [28:54<03:13, 193.07s/it]

loss = 1.5197309255599976; train_scores = 0.9583405699672309; test_scores = 0.9233782136238741; time_taken = 0:03:11.178886


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [32:13<00:00, 193.31s/it]

loss = 1.6620908975601196; train_scores = 0.9526541283349462; test_scores = 0.9203709562654294; time_taken = 0:03:14.813140





In [12]:
list_preds = []
for i in range(n_models):
    model = MPNNPOMModel(n_tasks = n_tasks,
                            batch_size=128,
                            learning_rate=learning_rate,
                            class_imbalance_ratio = train_ratios,
                            loss_aggr_type = 'sum',
                            node_out_feats = 100,
                            edge_hidden_feats = 75,
                            edge_out_feats = 100,
                            num_step_message_passing = 5,
                            mpnn_residual = True,
                            message_aggregator_type = 'sum',
                            mode = 'classification',
                            number_atom_features = GraphConvConstants.ATOM_FDIM,
                            number_bond_features = GraphConvConstants.BOND_FDIM,
                            n_classes = 1,
                            readout_type = 'set2set',
                            num_step_set2set = 3,
                            num_layer_set2set = 2,
                            ffn_hidden_list= [392, 392],
                            ffn_embeddings = 256,
                            ffn_activation = 'relu',
                            ffn_dropout_p = 0.12,
                            ffn_dropout_at_input_no_act = False,
                            weight_decay = 1e-5,
                            self_loop = False,
                            optimizer_name = 'adam',
                            log_frequency = 32,
                            model_dir = f'./ensemble_models/experiments_{i+1}',
                            device_name='cuda')
    model.restore(f"./ensemble_models/experiments_{i+1}/checkpoint2.pt")
    # test_scores = model.evaluate(test_dataset, [metric])['roc_auc_score']
    # print("test_score: ", test_scores)
    preds = model.predict(test_dataset)
    list_preds.append(preds)

preds_arr = np.asarray(list_preds)
ensemble_preds = np.mean(preds_arr, axis=0)
print("average ensemble score: ", roc_auc_score(test_dataset.y, ensemble_preds, average="macro"))

average ensemble score:  0.9322736346078848


In [13]:
df = test_dataset.to_dataframe()
#for X, y, w, id in test_dataset.itersamples():
#    print(X, w, id)
df
test_file = './data/fishy.csv'
dataset = loader.create_dataset(inputs=[test_file])
dataset

FileNotFoundError: [Errno 2] No such file or directory: './data/fishy.csv'

# Predict odours given a chemical structure

In [14]:
import deepchem as dc
from openpom.feat.graph_featurizer import GraphFeaturizer, GraphConvConstants
from openpom.utils.data_utils import get_class_imbalance_ratio, IterativeStratifiedSplitter
from openpom.models.mpnn_pom import MPNNPOMModel
import torch
import numpy as np



In [15]:
TASKS = [
'alcoholic', 'aldehydic', 'alliaceous', 'almond', 'amber', 'animal',
'anisic', 'apple', 'apricot', 'aromatic', 'balsamic', 'banana', 'beefy',
'bergamot', 'berry', 'bitter', 'black currant', 'brandy', 'burnt',
'buttery', 'cabbage', 'camphoreous', 'caramellic', 'cedar', 'celery',
'chamomile', 'cheesy', 'cherry', 'chocolate', 'cinnamon', 'citrus', 'clean',
'clove', 'cocoa', 'coconut', 'coffee', 'cognac', 'cooked', 'cooling',
'cortex', 'coumarinic', 'creamy', 'cucumber', 'dairy', 'dry', 'earthy',
'ethereal', 'fatty', 'fermented', 'fishy', 'floral', 'fresh', 'fruit skin',
'fruity', 'garlic', 'gassy', 'geranium', 'grape', 'grapefruit', 'grassy',
'green', 'hawthorn', 'hay', 'hazelnut', 'herbal', 'honey', 'hyacinth',
'jasmin', 'juicy', 'ketonic', 'lactonic', 'lavender', 'leafy', 'leathery',
'lemon', 'lily', 'malty', 'meaty', 'medicinal', 'melon', 'metallic',
'milky', 'mint', 'muguet', 'mushroom', 'musk', 'musty', 'natural', 'nutty',
'odorless', 'oily', 'onion', 'orange', 'orangeflower', 'orris', 'ozone',
'peach', 'pear', 'phenolic', 'pine', 'pineapple', 'plum', 'popcorn',
'potato', 'powdery', 'pungent', 'radish', 'raspberry', 'ripe', 'roasted',
'rose', 'rummy', 'sandalwood', 'savory', 'sharp', 'smoky', 'soapy',
'solvent', 'sour', 'spicy', 'strawberry', 'sulfurous', 'sweaty', 'sweet',
'tea', 'terpenic', 'tobacco', 'tomato', 'tropical', 'vanilla', 'vegetable',
'vetiver', 'violet', 'warm', 'waxy', 'weedy', 'winey', 'woody'
]

print("No of tasks: ", len(TASKS))
n_tasks = len(TASKS)


No of tasks:  138


In [16]:

odour_model = MPNNPOMModel(n_tasks = n_tasks,
                        batch_size=128,
                        learning_rate=0.001, #learning_rate,
                        class_imbalance_ratio = list(np.random.rand(n_tasks)), #train_ratios,
                        loss_aggr_type = 'sum',
                        node_out_feats = 100,
                        edge_hidden_feats = 75,
                        edge_out_feats = 100,
                        num_step_message_passing = 5,
                        mpnn_residual = True,
                        message_aggregator_type = 'sum',
                        mode = 'classification',
                        number_atom_features = GraphConvConstants.ATOM_FDIM,
                        number_bond_features = GraphConvConstants.BOND_FDIM,
                        n_classes = 1,
                        readout_type = 'set2set',
                        num_step_set2set = 3,
                        num_layer_set2set = 2,
                        ffn_hidden_list= [392, 392],
                        ffn_embeddings = 256,
                        ffn_activation = 'relu',
                        ffn_dropout_p = 0.12,
                        ffn_dropout_at_input_no_act = False,
                        weight_decay = 1e-5,
                        self_loop = False,
                        optimizer_name = 'adam',
                        log_frequency = 32,
                        model_dir = f'./ensemble_models/experiments_9',
                        device_name='cuda')
odour_model.restore(f"./ensemble_models/experiments_9/checkpoint2.pt")


In [17]:
def predict_odours(molecule):
    x = featurizer.featurize([molecule])
    molecule_data = dc.data.NumpyDataset(X=x, ids=[molecule])
    preds = odour_model.predict(molecule_data)
    #print(preds[0])

    odours = np.argwhere(preds[0] > 0.5)
    #print(odours)
    return [TASKS[s] for s in odours.flatten()]

molecule = 'C=CC(C)(O)CCC=C(C)C'
odours = predict_odours(molecule)
print(odours)


['bergamot', 'citrus', 'floral', 'fresh', 'green', 'herbal', 'sweet', 'woody']
