In [None]:
# The following four lines are for reproducible results (they make the variable initializers deterministic)
#from numpy.random import seed
#seed(1)
#from tensorflow import set_random_seed
#set_random_seed(2)    
import os
import random
import numpy as np
import tensorflow as tf
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from tqdm import tqdm, trange
from collections import defaultdict
from highway_layer import Highway
#匯入深度學習的框架函式庫：keras
import keras
from keras import backend as K
from keras.initializers import Constant
from keras.utils import plot_model
#keras用以建立模型架構的函數
from keras.models import Sequential, load_model, Model

#keras中建立深度學習layer的函數

from keras.layers import Dense, Dropout, BatchNormalization, Activation, Multiply, Add, Lambda, Input

#keras訓練演算法函數
from keras import regularizers
from keras.optimizers import Adam

#keras提早判停的函數
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Results proved hard to reproduce exactly, so all seeding below is left disabled
#os.environ['PYTHONHASHSEED'] = '0'
#np.random.seed(0)
#tf.set_random_seed(0)
#random.seed(0)

#to solve problem:Blas GEMM launch failed
from keras.backend.tensorflow_backend import set_session
# TF1-style session configuration; the resulting session is installed as the
# Keras backend session by set_session() at the end of this block.
config = tf.ConfigProto()
#config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allocator_type = 'BFC' #A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
# Upper bound on the fraction of GPU memory this process may claim.
config.gpu_options.per_process_gpu_memory_fraction = 0.95
# Grow the GPU allocation on demand rather than reserving it all up front.
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config)) 


def fps_to_arr(fps):
    """Turn a sequence of bit-vector fingerprints into a dense 0/1 ndarray.

    Each fingerprint must provide ``GetOnBits()`` (indices of the set bits)
    and ``GetNumBits()`` (total length). The result stacks one float row per
    fingerprint, with 1 at every set bit and 0 elsewhere.
    """
    def _densify(bitvect):
        # Only the set bits are written, which avoids a per-bit Python loop.
        set_bits = list(bitvect.GetOnBits())
        row = np.zeros(bitvect.GetNumBits())
        row[set_bits] = 1
        return row

    return np.array([_densify(fp) for fp in fps])




def fingerprint_mols(mols, fp_dim):
    """Compute Morgan bit-vector fingerprints for a sequence of SMILES strings.

    Args:
        mols: iterable of SMILES strings.
        fp_dim: fingerprint length in bits; coerced with ``int()``.

    Returns:
        A list with one fingerprint per input string, in input order.

    Raises:
        ValueError: if a SMILES string cannot be parsed. ``Chem.MolFromSmiles``
            signals failure by returning ``None``; passing that on to the
            fingerprint call would otherwise fail with an opaque argument
            error that does not say which input was bad.
    """
    n_bits = int(fp_dim)
    fps = []
    for smiles in mols:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError('Could not parse SMILES: %r' % (smiles,))

        # "When comparing the ECFP/FCFP fingerprints and
        # the Morgan fingerprints generated by the RDKit,
        # remember that the 4 in ECFP4 corresponds to the
        # diameter of the atom environments considered,
        # while the Morgan fingerprints take a radius parameter.
        # So the examples above, with radius=2, are roughly
        # equivalent to ECFP4 and FCFP4."
        # <http://www.rdkit.org/docs/GettingStartedInPython.html>
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits, useChirality=True)
        fps.append(fp)
    return fps
# NOTE(review): the triple-quoted block below is disabled code, not documentation.
# It holds count-based variants of fps_to_arr / fingerprint_mols: each set bit is
# filled with len(bitInfo[bit]) instead of 1, and fingerprint_mols returns
# (fps, infos). As a bare string expression it defines nothing at runtime.
'''
def fps_to_arr(fps):
    """Faster conversion to ndarray"""
    arrs = []
    for fp, info in zip(fps[0],fps[1]):
        onbits = list(fp.GetOnBits())
        arr = np.zeros(fp.GetNumBits())
        for onbit in onbits:
            arr[onbit] = len(info[onbit])
        arrs.append(arr)
    arrs = np.array(arrs)
    return arrs




def fingerprint_mols(mols, fp_dim):
    fps = []
    infos = []
    for mol in mols:
        mol = Chem.MolFromSmiles(mol)
        info={}
        # Necessary for fingerprinting
        # Chem.GetSymmSSSR(mol)

        # "When comparing the ECFP/FCFP fingerprints and
        # the Morgan fingerprints generated by the RDKit,
        # remember that the 4 in ECFP4 corresponds to the
        # diameter of the atom environments considered,
        # while the Morgan fingerprints take a radius parameter.
        # So the examples above, with radius=2, are roughly
        # equivalent to ECFP4 and FCFP4."
        # <http://www.rdkit.org/docs/GettingStartedInPython.html>
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=int(fp_dim), useChirality=1, bitInfo=info)
        # fold_factor = fp.GetNumBits()//fp_dim
        # fp = DataStructs.FoldFingerprint(fp, fold_factor)
        fps.append(fp)
        infos.append(info)
    return fps, infos
'''
def preprocess(X,fp_dim):
    """Convert SMILES strings into log-scaled fingerprint feature rows.

    The strings are fingerprinted with ``fingerprint_mols``, densified with
    ``fps_to_arr``, and every entry ``v`` is mapped to ``log(v + 1)``.
    """
    bit_vectors = fingerprint_mols(X, fp_dim)
    dense = fps_to_arr(bit_vectors)
    return np.log(dense + 1)

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that fingerprints SMILES strings one batch at a time.

    Only the raw inputs and labels are kept in memory; the dense fingerprint
    matrix for a batch is built on demand by the module-level ``preprocess``.

    Args:
        X: indexable collection of inputs, passed batch-wise to ``preprocess``.
        y: indexable collection of labels, aligned with ``X``.
        batch_size: number of samples per batch.
        shuffle: if truthy, the sample order is re-shuffled after every epoch.
        fp_dim: fingerprint length forwarded to ``preprocess``.
    """

    def __init__(self, X, y, batch_size=1, shuffle=True, fp_dim=8192 ):
        self.batch_size = batch_size
        self.X = X
        self.y = y
        # Positions into X / y; this array (not the data) is what gets shuffled.
        self.indexes = np.arange(len(self.X))
        self.shuffle = shuffle
        self.fp_dim = fp_dim

    def __len__(self):
        """Return the number of batches per epoch.

        Rounds up so that the trailing, smaller batch is included. Rounding
        down silently dropped up to ``batch_size - 1`` samples every epoch and
        produced zero batches for a dataset smaller than one batch.
        """
        return int(np.ceil(len(self.X) / int(self.batch_size)))

    def __getitem__(self, index):
        """Build batch number ``index`` as ``(features, labels)`` arrays."""
        # Slicing past the end is safe: the last batch is simply shorter.
        batch_indexs = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        batch_datasX = [self.X[k] for k in batch_indexs]
        batch_datasy = [self.y[k] for k in batch_indexs]
        X = preprocess(batch_datasX, self.fp_dim)
        y = np.array(batch_datasy)
        return X, y

    def on_epoch_end(self):
        """Re-shuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)


print('Loading data...')
# product -> set of rules recorded for it; each input line is "rule<TAB>product".
prod_to_rules = defaultdict(set)
with open('data/templates_rollout.dat', 'r') as f:
    for line in tqdm(f, desc='products'):
        rule, prod = line.strip().split('\t')
        prod_to_rules[prod].add(rule)

# rule -> class index, taken from the rule's (0-based) line number in the file.
rollout_rules = {}
with open('data/rollout_rollout.dat', 'r') as f:
    for i, line in tqdm(enumerate(f), desc='rollout'):
        rule = line.strip()
        rollout_rules[rule] = i
# Rollout training
print('Rollout training...')
X, y = [], []
for prod, rules in tqdm(prod_to_rules.items(), desc='data prep'):
    # Keep only rules that have a class index.
    rules = [r for r in rules if r in rollout_rules]
    if not rules: continue
    rules.sort()
    # Ideally trained as multilabel,
    # but multiclass, single label is easier atm
    for r in rules:
        # Named rule_id rather than `id`: this is module scope, so `id` would
        # shadow the builtin here and in anything that star-imports this module.
        rule_id = rollout_rules[r]
        y.append(rule_id)
        X.append(prod)

# Count every (product, rule) pair, including rules without a class index.
totrec = 0
for prod, rules in tqdm(prod_to_rules.items(), desc='total reactions'):
    totrec += len(rules)

print('total products:', len(prod_to_rules))
print('total reactions:', totrec)
print('Training size:', len(X))

# Training hyper-parameters and the location where trained models are stored
batch_size = 256
epochs = 100
seed = 0
# Fraction of the samples held out for validation
spilt = 0.1
fp_dim = 8192
n_rules = len(rollout_rules)
model_name = 'trained_model_rollout_{}'.format(seed)
save_dir = os.path.join(os.getcwd(), 'saved_models')


# Seeded shuffle followed by a train/validation split.
# Sorting first presumably makes the pre-shuffle order canonical, so that a
# fixed seed always yields the same split.
xy = sorted(zip(X, y))
random.seed(seed)
random.shuffle(xy)
X, y = zip(*xy)
data_spilt = round(len(X) * (1 - spilt))
x_train, x_test = X[:data_spilt], X[data_spilt:]
y_train, y_test = y[:data_spilt], y[data_spilt:]
print('shuffle is over...')


#將訓練資料轉成ndarray
#x_train=preprocess(X,fp_dim)
#y_train=np.array(y)
#print('preprocess is over...')

# Policy network: fingerprint -> 512 ELU units with dropout -> softmax over the rules
visible = Input(shape=(fp_dim,))
hidden = Dense(512, activation='elu')(visible)
hidden = Dropout(0.4)(hidden)

# only for expansion rule policynet
#for _ in range(5):
#    hidden = Highway()(hidden)
#    hidden = Dropout(0.4)(hidden)

output = Dense(n_rules, activation='softmax')(hidden)

model = Model(inputs=visible, outputs=output)
# summarize layers
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()
# plot graph
#plot_model(model, to_file='rolloutpolicynet_graph.png')
# Initialise the Adam optimizer
opt = keras.optimizers.Adam(lr=0.0001)

# Metrics. The function names double as the metric names that the callbacks
# monitor ('val_acc_top10'), so they must not be renamed.
def acc_top50(y_true, y_pred):
    """Top-50 accuracy for integer (sparse) class labels."""
    return keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=50)

def acc_top10(y_true, y_pred):
    """Top-10 accuracy for integer (sparse) class labels."""
    return keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=10)

# Configure training: loss, optimizer and reported metrics
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['sparse_categorical_accuracy',acc_top10 ,acc_top50])


# Where the checkpointed model is stored
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, model_name)

# Keep only the best model as judged by validation top-10 accuracy
checkpoint = ModelCheckpoint(
    model_path,
    monitor='val_acc_top10',
    verbose=1,
    save_best_only=True,
)

# Early stopping: give up after 6 epochs without improvement in the same metric
earlystop = EarlyStopping(
    monitor='val_acc_top10',
    verbose=1,
    patience=6,
)

#continue training
#del model  # 删掉存在的模型

#返回一个编译好的模型
#与删掉的模型相同
#model = load_model(model_path, custom_objects={'acc_top10': acc_top10,'acc_top50': acc_top50})
##model.compile(loss='sparse_categorical_crossentropy',
##              optimizer=opt,
##              metrics=['sparse_categorical_accuracy',acc_top10,acc_top50])

# 開始訓練
# Batch generators; validation uses a 16x larger batch than training.
training_generator = DataGenerator(
    X=x_train,
    y=y_train,
    batch_size=batch_size,
    shuffle=True,
    fp_dim=fp_dim,
)
validation_gen = DataGenerator(
    X=x_test,
    y=y_test,
    batch_size=batch_size*16,
    shuffle=True,
    fp_dim=fp_dim,
)
if __name__ == '__main__':
    # Runs only when this file is executed directly, not when it is imported.
    # workers / use_multiprocessing / max_queue_size stay at the Keras defaults.
    model_history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_gen,
        epochs=epochs,
        initial_epoch=0,
        callbacks=[earlystop, checkpoint],
        verbose=1,
    )



Loading data...


products: 1153338it [00:03, 288673.97it/s]

In [1]:

from rollout_policy import*

if __name__ == '__main__':
    # Train with validation, early stopping and best-model checkpointing;
    # batches are produced by 3 workers with multiprocessing enabled.
    model_history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_gen,
        epochs=epochs,
        initial_epoch=0,
        workers=3,
        use_multiprocessing=True,
        callbacks=[earlystop, checkpoint],
        verbose=2,
    )

#    print(y_train[0:2])
#    print(y_test [0:2])
#    print(x_train[0:2])
#    print(x_test [0:2])    

Using TensorFlow backend.


Loading data...


products: 3184835it [00:08, 358440.79it/s]
rollout: 19728it [00:00, 706439.25it/s]


Rollout training...


data prep: 100%|██████████| 1108830/1108830 [00:00<00:00, 1372590.83it/s]
total reactions: 100%|██████████| 1108830/1108830 [00:00<00:00, 2646987.40it/s]


total products: 1108830
total reactions: 1174933
Training size: 844509
shuffle is over...
Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               4194816   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 19728)             10120464  
Total params: 14,315,280
Trainable params: 14,315,280
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
 - 258s - loss: 7.2637 - sparse_categorical_accuracy: 0.1040 - acc_t

In [1]:
from rollout_policy import*

if __name__ == '__main__':
    # Same run as the default configuration, but with a queue of up to 12
    # prepared batches fed by 3 workers with multiprocessing enabled.
    model_history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_gen,
        epochs=epochs,
        initial_epoch=0,
        workers=3,
        use_multiprocessing=True,
        max_queue_size=12,
        callbacks=[earlystop, checkpoint],
        verbose=2,
    )

Using TensorFlow backend.


Loading data...


products: 3184835it [00:08, 359774.07it/s]
rollout: 19728it [00:00, 682102.97it/s]


Rollout training...


data prep: 100%|██████████| 1108830/1108830 [00:00<00:00, 1307958.30it/s]
total reactions: 100%|██████████| 1108830/1108830 [00:00<00:00, 2772568.46it/s]


total products: 1108830
total reactions: 1174933
Training size: 844509
shuffle is over...
Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               4194816   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 19728)             10120464  
Total params: 14,315,280
Trainable params: 14,315,280
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
 - 260s - loss: 7.2689 - sparse_categorical_accuracy: 0.1034 - acc

In [1]:
from rollout_policy import*

if __name__ == '__main__':
    # This run has no validation data, so 'val_acc_top10' never shows up in the
    # epoch logs. The shared `checkpoint` callback (save_best_only=True on that
    # metric) would therefore skip every save and the trained model would be
    # lost. Checkpoint on the training-set top-10 accuracy instead.
    train_checkpoint = ModelCheckpoint(
        model_path,
        monitor='acc_top10',
        save_best_only=True,
        verbose=1,
    )
    # Fixed-length run of 34 epochs, without early stopping.
    model_history = model.fit_generator(
        generator=training_generator,
        epochs=34,
        verbose=2,
        initial_epoch=0,
        workers=3,
        use_multiprocessing=True,
        max_queue_size=12,
        callbacks=[train_checkpoint],
    )

Using TensorFlow backend.


Loading data...


products: 3184835it [00:08, 360708.43it/s]
rollout: 19728it [00:00, 707115.40it/s]


Rollout training...


data prep: 100%|██████████| 1108830/1108830 [00:00<00:00, 1298790.51it/s]
total reactions: 100%|██████████| 1108830/1108830 [00:00<00:00, 2772750.29it/s]


total products: 1108830
total reactions: 1174933
Training size: 844509
shuffle is over...
Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               4194816   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 19728)             10120464  
Total params: 14,315,280
Trainable params: 14,315,280
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/34
 - 258s - loss: 7.1321 - sparse_categorical_accuracy: 0.1137 - acc_