In [3]:
import rdkit
print(rdkit.__version__)
from rdkit import Chem
from rdkit.Chem import AllChem
import csv
import pickle
from map4 import MAP4Calculator
import numpy as np

2022.03.5


# Load products

## ECREACT

In [3]:
def get_ecreact_prod():
    """Build the list of unique canonical product SMILES from the ECREACT CSV.

    Reads the ``rxn_smiles`` column of ``../data/ECREACT/ecreact-1.0.csv``,
    takes the text after ``>>`` of every reaction as its product, round-trips
    it through RDKit (``MolFromSmiles`` -> ``MolToSmiles``) and de-duplicates
    the resulting strings with a ``set``.

    Returns:
        list: the unique product SMILES, in ``set`` iteration order.
    """
    with open('../data/ECREACT/ecreact-1.0.csv') as f:
        bio_products = [row['rxn_smiles'].split(">>")[1] for row in csv.DictReader(f)]
    print('bio_products',len(bio_products))
    bio_products_moltosmi = [Chem.MolToSmiles(Chem.MolFromSmiles(m)) for m in bio_products]
    bio_products_uni = list(set(bio_products_moltosmi))
    print('bio_products_uni',len(bio_products_uni))
    return bio_products_uni


# Use the cached product list when it can be read; otherwise rebuild it from
# the raw CSV and write the cache.
try:
    with open("./data/bio_products_uni.pkl","rb") as f:
        bio_products_uni = pickle.load(f)
    print('Loading bio_products_uni')
    print('bio_products_uni',len(bio_products_uni))
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than triggering a rebuild that then
    # overwrites the cache file.
    bio_products_uni = get_ecreact_prod()
    with open("./data/bio_products_uni.pkl", "wb") as f:
        pickle.dump(bio_products_uni, f)


Loading bio_products_uni
bio_products_uni 37939


## USPTO

In [4]:
def get_uspto_prod():
    """Build the list of unique canonical product SMILES from the USPTO480k targets.

    For each of the train, test and val target files (in that order), every
    line has its spaces removed and is split on ``.``; each resulting piece is
    treated as one product. All pieces are round-tripped through RDKit
    (``MolFromSmiles`` -> ``MolToSmiles``) and de-duplicated with a ``set``.

    Returns:
        list: the unique product SMILES, in ``set`` iteration order.
    """
    chem_products = []
    # Same concatenation order as the original three stanzas: train + test + val.
    for split in ('train', 'test', 'val'):
        with open(f'../data/USPTO480k/tgt-{split}.txt', 'r') as f:
            chem_products.extend(s for line in f for s in line.replace(' ','').split('.'))
    print('chem_products',len(chem_products))
    chem_products_moltosmi = [Chem.MolToSmiles(Chem.MolFromSmiles(m)) for m in chem_products]
    chem_products_uni = list(set(chem_products_moltosmi))
    print('chem_products_uni',len(chem_products_uni))
    return chem_products_uni


# Use the cached product list when it can be read; otherwise rebuild it from
# the raw target files and write the cache.
try:
    with open("./data/chem_products_uni.pkl","rb") as f:
        chem_products_uni = pickle.load(f)
    print('Loading chem_products_uni')
    print('chem_products_uni',len(chem_products_uni))
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than triggering a rebuild that then
    # overwrites the cache file.
    chem_products_uni = get_uspto_prod()
    with open("./data/chem_products_uni.pkl", "wb") as f:
        pickle.dump(chem_products_uni, f)

Loading chem_products_uni
chem_products_uni 437821


# MAP4 Encoding

In [10]:
def map4_many_encoding(bio_products_smi,dim):
    """Compute folded MAP4 fingerprints for a whole list of SMILES in one call.

    Args:
        bio_products_smi: iterable of SMILES strings; each one is parsed with
            ``Chem.MolFromSmiles`` before encoding.
        dim: value passed to ``MAP4Calculator`` as ``dimensions``.

    Returns:
        tuple: ``(bio_products_smi, fingerprints)`` -- the input object itself,
        unmodified, and the result of ``MAP4Calculator.calculate_many`` on the
        parsed molecules.
    """
    calculator = MAP4Calculator(dimensions=dim, is_folded=True)
    molecules = list(map(Chem.MolFromSmiles, bio_products_smi))
    return bio_products_smi, calculator.calculate_many(molecules)

def map4_chem_encoding(chem_products_smi,dim):
    """Compute folded MAP4 fingerprints one molecule at a time, skipping failures.

    Each SMILES is parsed with ``Chem.MolFromSmiles`` and encoded with
    ``MAP4Calculator.calculate``. Any ordinary exception raised while doing so
    marks that molecule as failed; it is left out of the first two returned
    lists and recorded in the third. KeyboardInterrupt and SystemExit are NOT
    treated as molecule failures and propagate to the caller, so a long run
    can still be interrupted.

    Args:
        chem_products_smi: sized collection of SMILES strings (``len`` is used
            for the progress message).
        dim: value passed to ``MAP4Calculator`` as ``dimensions``.

    Returns:
        tuple: ``(encoded_smiles, fingerprints, err_mol)`` where
        ``encoded_smiles[k]`` is the SMILES that produced ``fingerprints[k]``
        and ``err_mol`` lists the SMILES that could not be encoded.
    """
    MAP4 = MAP4Calculator(dimensions=dim,is_folded=True)
    chem_products_map4_smi = []
    chem_products_map4_fps = []
    err_mol = []
    for smi in chem_products_smi:
        try:
            fp = MAP4.calculate(Chem.MolFromSmiles(smi))
        except Exception:
            # Only ordinary errors count as "this molecule cannot be encoded";
            # a bare `except:` here would also swallow Ctrl-C.
            err_mol.append(smi)
            print("Progress: {:0.2f}. Error: {}".format(len(chem_products_map4_smi)/len(chem_products_smi),len(err_mol)), end='\r')
        else:
            chem_products_map4_fps.append(fp)
            chem_products_map4_smi.append(smi)
            # The progress fraction counts successfully encoded molecules only.
            if len(chem_products_map4_smi)%10000 == 0:
                print("Progress: {:0.2f}. Error: {}".format(len(chem_products_map4_smi)/len(chem_products_smi),len(err_mol)), end='\r')
    return chem_products_map4_smi, chem_products_map4_fps, err_mol

## MAP4 1024 (removes molecules whose MAP4 fingerprint cannot be computed)

Approximate runtimes: MAP4 encoding of the bio molecules takes ~10 min; MAP4 encoding of the chem molecules takes ~40 min.

In [4]:
# Load the cached bio SMILES list and its MAP4-1024 fingerprints; if either
# cache file cannot be read, recompute both from bio_products_uni and cache them.
try:
    with open("./data/bio_products_smi_cor.pkl","rb") as f:
        bio_products_smi_cor = pickle.load(f)
    with open("./data/bio_products_map4_1024_fps.pkl","rb") as f:
        bio_products_map4_1024_fps = pickle.load(f)
    print('Loading bio_products_smi_cor and bio_products_map4_1024_fps')
    print(f'Loading bio_products_smi_cor: {len(bio_products_smi_cor)}')
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache files.
    print('Processing bio_products_smi_cor and bio_products_map4_1024_fps')
    bio_products_smi_cor, bio_products_map4_1024_fps = map4_many_encoding(bio_products_uni,dim=1024)
    with open("./data/bio_products_smi_cor.pkl", "wb") as f:
        pickle.dump(bio_products_smi_cor, f)
    with open("./data/bio_products_map4_1024_fps.pkl", "wb") as f:
        pickle.dump(bio_products_map4_1024_fps, f)


Loading bio_products_smi_cor and bio_products_map4_1024_fps
Loading bio_products_smi_cor: 37939


In [5]:
### MAP4 1024 encoding Chem
# Load the cached chem SMILES list and its MAP4-1024 fingerprints; if either
# cache file cannot be read, recompute them from chem_products_uni and cache
# the results together with the list of molecules that failed to encode.
try:
    with open("./data/chem_products_smi_cor.pkl","rb") as f:
        chem_products_smi_cor = pickle.load(f)
    with open("./data/chem_products_map4_1024_fps.pkl","rb") as f:
        chem_products_map4_1024_fps = pickle.load(f)
    print('Loading chem_products_smi_cor and chem_products_map4_1024_fps')
    print(f'Loading chem_products_map4_1024_fps: {len(chem_products_map4_1024_fps)}')
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache files.
    print('Processing chem_products_smi_cor and chem_products_map4_1024_fps')
    chem_products_smi_cor, chem_products_map4_1024_fps, chem_err_mol_map4_1024 = map4_chem_encoding(chem_products_uni,dim=1024)
    with open("./data/chem_products_smi_cor.pkl", "wb") as f:
        pickle.dump(chem_products_smi_cor, f)
    with open("./data/chem_products_map4_1024_fps.pkl", "wb") as f:
        pickle.dump(chem_products_map4_1024_fps, f)
    # NOTE: chem_err_mol_map4_1024 is only defined on this recompute path; the
    # load path above does not read it back.
    with open("./data/chem_err_mol_map4_1024.pkl", "wb") as f:
        pickle.dump(chem_err_mol_map4_1024, f)


Loading chem_products_smi_cor and chem_products_map4_1024_fps
Loading chem_products_map4_1024_fps: 437781


## MAP4 2048

In [13]:
# Load the cached MAP4-2048 fingerprints of the bio molecules, or compute them
# from bio_products_smi_cor and cache them.
try:
    with open("./data/bio_products_map4_2048_fps.pkl","rb") as f:
        bio_products_map4_2048_fps = pickle.load(f)
    print('Loading bio_products_map4_2048_fps')
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache file.
    print('Processing bio_products_map4_2048_fps')
    # NOTE: bio_products_map4_2048_smi is only bound on this recompute path.
    bio_products_map4_2048_smi, bio_products_map4_2048_fps = map4_many_encoding(bio_products_smi_cor,dim=2048)
    with open("./data/bio_products_map4_2048_fps.pkl", "wb") as f:
        pickle.dump(bio_products_map4_2048_fps, f)

Processing bio_products_map4_2048_smi and bio_products_map4_2048_fps


In [14]:
# Load the cached MAP4-2048 fingerprints of the chem molecules, or compute them
# from chem_products_smi_cor and cache them.
try:
    with open("./data/chem_products_map4_2048_fps.pkl","rb") as f:
        chem_products_map4_2048_fps = pickle.load(f)
    print('Loading chem_products_map4_2048_fps')
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache file.
    print('Processing chem_products_map4_2048_fps')
    # NOTE: chem_products_map4_2048_smi is only bound on this recompute path.
    chem_products_map4_2048_smi, chem_products_map4_2048_fps = map4_many_encoding(chem_products_smi_cor,dim=2048)
    print("Done, saving the result")
    with open("./data/chem_products_map4_2048_fps.pkl", "wb") as f:
        pickle.dump(chem_products_map4_2048_fps, f)

Processing chem_products_map4_2048_fps
Done, saving the result


## MAP4 4096

In [15]:
# Load the cached MAP4-4096 fingerprints of the bio molecules, or compute them
# from bio_products_smi_cor and cache them.
try:
    with open("./data/bio_products_map4_4096_fps.pkl","rb") as f:
        bio_products_map4_4096_fps = pickle.load(f)
    print('Loading bio_products_map4_4096_fps')
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache file.
    print('Processing bio_products_map4_4096_fps')
    # NOTE: bio_products_map4_4096_smi is only bound on this recompute path.
    bio_products_map4_4096_smi, bio_products_map4_4096_fps = map4_many_encoding(bio_products_smi_cor,dim=4096)
    with open("./data/bio_products_map4_4096_fps.pkl", "wb") as f:
        pickle.dump(bio_products_map4_4096_fps, f)

Processing bio_products_map4_4096_fps


In [16]:
# Load the cached MAP4-4096 fingerprints of the chem molecules, or compute them
# from chem_products_smi_cor and cache them.
try:
    with open("./data/chem_products_map4_4096_fps.pkl","rb") as f:
        chem_products_map4_4096_fps = pickle.load(f)
    # NOTE: the message mentions chem_products_map4_4096_smi, but only the
    # fingerprints are loaded here; that name is bound on the recompute path only.
    print('Loading chem_products_map4_4096_smi and chem_products_map4_4096_fps')
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache file.
    print('Processing chem_products_map4_4096_smi and chem_products_map4_4096_fps')
    chem_products_map4_4096_smi, chem_products_map4_4096_fps = map4_many_encoding(chem_products_smi_cor,dim=4096)
    print("Done, saving the result")
    with open("./data/chem_products_map4_4096_fps.pkl", "wb") as f:
        pickle.dump(chem_products_map4_4096_fps, f)

Processing chem_products_map4_4096_smi and chem_products_map4_4096_fps
Done, saving the result


# ECFP4 Encoding

In [17]:
def ecfp4_encoding(bio_products_smi,chem_products_smi,dim):
    """Compute Morgan (radius 2) bit-vector fingerprints for two SMILES collections.

    Args:
        bio_products_smi: iterable of SMILES strings for the first collection.
        chem_products_smi: iterable of SMILES strings for the second collection.
        dim: value passed to ``GetMorganFingerprintAsBitVect`` as ``nBits``.

    Returns:
        tuple: ``(bio_fps, chem_fps)``, two lists holding one ``np.array`` per
        input SMILES, in input order.
    """
    def _encode(smiles_list):
        # Parse everything first, then fingerprint, as two separate passes.
        parsed = [Chem.MolFromSmiles(smi) for smi in smiles_list]
        return [
            np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dim))
            for mol in parsed
        ]

    return _encode(bio_products_smi), _encode(chem_products_smi)

Approximate runtimes of the three ECFP4 encoding cells below: ~10, ~15, and ~25 min.

## ECFP4 1024

In [18]:
# Load the cached ECFP4-1024 fingerprints of both datasets; if either cache
# file cannot be read, recompute both and cache them.
try:
    with open("./data/bio_products_ecfp4_1024_fps.pkl","rb") as f:
        bio_products_ecfp4_1024_fps = pickle.load(f)
    with open("./data/chem_products_ecfp4_1024_fps.pkl","rb") as f:
        chem_products_ecfp4_1024_fps = pickle.load(f)
    print('Loading bio_products_ecfp4_1024_fps and chem_products_ecfp4_1024_fps')

except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache files.
    print('Processing ecfp4_1024')
    dim = 1024
    bio_products_ecfp4_1024_fps, chem_products_ecfp4_1024_fps = ecfp4_encoding(bio_products_smi_cor,chem_products_smi_cor,dim)
    with open("./data/bio_products_ecfp4_1024_fps.pkl", "wb") as f:
        pickle.dump(bio_products_ecfp4_1024_fps, f)
    with open("./data/chem_products_ecfp4_1024_fps.pkl", "wb") as f:
        pickle.dump(chem_products_ecfp4_1024_fps, f)

Processing ecfp4_1024


## ECFP4 2048

In [19]:
# Load the cached ECFP4-2048 fingerprints of both datasets; if either cache
# file cannot be read, recompute both and cache them.
try:
    with open("./data/bio_products_ecfp4_2048_fps.pkl","rb") as f:
        bio_products_ecfp4_2048_fps = pickle.load(f)
    with open("./data/chem_products_ecfp4_2048_fps.pkl","rb") as f:
        chem_products_ecfp4_2048_fps = pickle.load(f)
    print('Loading bio_products_ecfp4_2048_fps and chem_products_ecfp4_2048_fps')

except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache files.
    print('Processing ecfp4_2048')
    dim=2048
    bio_products_ecfp4_2048_fps, chem_products_ecfp4_2048_fps = ecfp4_encoding(bio_products_smi_cor,chem_products_smi_cor,dim)
    with open("./data/bio_products_ecfp4_2048_fps.pkl", "wb") as f:
        pickle.dump(bio_products_ecfp4_2048_fps, f)
    with open("./data/chem_products_ecfp4_2048_fps.pkl", "wb") as f:
        pickle.dump(chem_products_ecfp4_2048_fps, f)

Processing ecfp4_2048


## ECFP4 4096

In [20]:
# Load the cached ECFP4-4096 fingerprints of both datasets; if either cache
# file cannot be read, recompute both and cache them.
try:
    with open("./data/bio_products_ecfp4_4096_fps.pkl","rb") as f:
        bio_products_ecfp4_4096_fps = pickle.load(f)
    with open("./data/chem_products_ecfp4_4096_fps.pkl","rb") as f:
        chem_products_ecfp4_4096_fps = pickle.load(f)
    print('Loading bio_products_ecfp4_4096_fps and chem_products_ecfp4_4096_fps')

except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate rather than starting the recomputation and
    # overwriting the cache files.
    print('Processing ecfp4_4096')
    dim=4096
    bio_products_ecfp4_4096_fps, chem_products_ecfp4_4096_fps = ecfp4_encoding(bio_products_smi_cor,chem_products_smi_cor,dim)
    with open("./data/bio_products_ecfp4_4096_fps.pkl", "wb") as f:
        pickle.dump(bio_products_ecfp4_4096_fps, f)
    with open("./data/chem_products_ecfp4_4096_fps.pkl", "wb") as f:
        pickle.dump(chem_products_ecfp4_4096_fps, f)

Processing ecfp4_4096
