In [1]:
# %pip install rdkit

In [2]:
import pandas as pd
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem import Descriptors, AllChem, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import warnings
warnings.filterwarnings("ignore")

## Объединяем данные и удаляем дубликаты реакций

In [3]:
path1 = './1976_Sep2016_USPTOgrants_smiles_yield_ok_cropped_data.csv'
path2 = './2001_Sep2016_USPTOapplications_smiles_yield_ok_cropped_data.csv'

df_1 = pd.read_csv(path1, sep = '\t')
df_2 = pd.read_csv(path2, sep = '\t')

df = pd.concat([df_1, df_2], axis = 0)
del df_1, df_2
df.head()

Unnamed: 0.1,Unnamed: 0,myID,ReactionSmiles,Yield
0,27,ID00000027,[CH2:1]([S:3][C:4]1[CH:26]=[CH:25][C:7]2[N:8](...,82.0
1,28,ID00000028,[CH2:1]([S:3][C:4]1[CH:25]=[CH:24][C:7]2[N:8](...,78.1
2,31,ID00000031,[H-].[Al+3].[Li+].[H-].[H-].[H-].[CH2:7]([S:9]...,82.1
3,36,ID00000036,[NH2:1][C:2]1[CH:3]=[CH:4][C:5]2[N:9]=[C:8]3[C...,67.9
4,41,ID00000041,[C:1]([NH:5][NH:6][C:7]([C:12]#[N:13])([CH:9]1...,81.9


In [4]:
df.shape

(1989679, 4)

In [5]:
df.duplicated(subset = ['myID']).sum(), df.duplicated(subset = ['ReactionSmiles']).sum()

(508368, 1210572)

Удаляем по реакциям, чтобы не было одинаковых

In [6]:
df.drop_duplicates(subset = ['ReactionSmiles'], inplace=True)

In [7]:
df.shape

(779107, 4)

Удаляем ненужные поля и добавляем новый ID

In [8]:
df = df[['ReactionSmiles', 'Yield']]
df = df.reset_index().rename({'index': 'ID'}, axis = 1)
df

Unnamed: 0,ID,ReactionSmiles,Yield
0,0,[CH2:1]([S:3][C:4]1[CH:26]=[CH:25][C:7]2[N:8](...,82.0
1,1,[CH2:1]([S:3][C:4]1[CH:25]=[CH:24][C:7]2[N:8](...,78.1
2,2,[H-].[Al+3].[Li+].[H-].[H-].[H-].[CH2:7]([S:9]...,82.1
3,3,[NH2:1][C:2]1[CH:3]=[CH:4][C:5]2[N:9]=[C:8]3[C...,67.9
4,4,[C:1]([NH:5][NH:6][C:7]([C:12]#[N:13])([CH:9]1...,81.9
...,...,...,...
779102,1065628,[Br:1][C:2]1[S:3][C:4]([Br:16])=[CH:5][C:6]=1[...,88.0
779103,1065629,[Br:1][C:2]1[S:3][C:4]([Br:16])=[CH:5][C:6]=1[...,85.0
779104,1065630,[OH:1][C:2]1[CH:7]=[CH:6][C:5](/[CH:8]=[CH:9]/...,92.3
779105,1065631,[Br:1][C:2]1[CH:14]=[CH:13][C:12]2[C:11]3[C:6]...,50.4


## Распарс реакций на молекулы

In [9]:
df.ReactionSmiles[2]

'[H-].[Al+3].[Li+].[H-].[H-].[H-].[CH2:7]([S:9][C:10]1[CH:31]=[CH:30][C:13]2[N:14]([CH3:29])[C:15]3[CH:28]=[CH:27][CH:26]=[CH:25][C:16]=3[CH:17]=[C:18]([CH2:19][C:20](=O)[N:21]([CH3:23])[CH3:22])[C:12]=2[CH:11]=1)[CH3:8].C(OCC)(=O)C.[OH-].[Na+]>O1CCCC1>[CH2:7]([S:9][C:10]1[CH:31]=[CH:30][C:13]2[N:14]([CH3:29])[C:15]3[CH:28]=[CH:27][CH:26]=[CH:25][C:16]=3[CH:17]=[C:18]([CH2:19][CH2:20][N:21]([CH3:23])[CH3:22])[C:12]=2[CH:11]=1)[CH3:8] |f:0.1.2.3.4.5,8.9|'

In [27]:
class Reaction_preparation:
    def __init__(self, reaction):
        self.reaction = list(reaction)
        self.reactions_rd = []
        
        for i in tqdm(reaction):
            try:
                self.reactions_rd.append(rdChemReactions.ReactionFromSmarts(i))
            except:
                self.reaction.remove(i)
                continue

    @staticmethod
    def starting_materials(reactions_rd):
        interm_list = []
        header = ['interm_' + str(i) for i in range(10)]
        for i in tqdm(range(len(reactions_rd))):
            ds = list(map(Chem.MolToSmiles, reactions_rd[i].GetReactants()))
            ds = ds[:10]
            while len(ds) < 10:
                ds.append(None)
            interm_list.append(ds)
        df_interm = pd.DataFrame(interm_list, columns=header)
        
        return df_interm
        # self.df_interm.insert(loc=0, column='reactions', value = self.reaction)
    
    @staticmethod
    def agents(reactions_rd):
        agents_list = []
        header = ['agents_' + str(i) for i in range(3)]
        for i in tqdm(range(len(reactions_rd))):
            ds = list(map(Chem.MolToSmiles, reactions_rd[i].GetAgents()))
            ds = ds[:3]
            while len(ds) < 3:
                ds.append(None)
            agents_list.append(ds)
        df_agents = pd.DataFrame(agents_list,columns=header)
        
        return df_agents
        #df_agents.insert(loc=0, column='reactions', self.reaction)
    
    @staticmethod
    def product(reactions_rd):
        product_list = []
        header = ['product_' + str(i) for i in range(2)]
        for i in tqdm(range(len(reactions_rd))):
            ds = list(map(Chem.MolToSmiles, reactions_rd[i].GetProducts()))
            ds = ds[:2]
            while len(ds) < 2:
                ds.append(None)
            product_list.append(ds)
        df_product = pd.DataFrame(product_list,columns=header)
        
        return df_product
        #df_product.insert(loc=0, column='reactions', self.reaction)
        
    def done_datasets(self):
        df_interm = self.starting_materials(self.reactions_rd)
        df_agents = self.agents(self.reactions_rd)
        df_product = self.product(self.reactions_rd)
        
        res = pd.concat([df_interm, df_agents, df_product], axis = 1)
        res['ReactionSmiles'] = self.reaction
        
        return res

In [28]:
rections_prepares = Reaction_preparation(df.ReactionSmiles)
df_preparation = rections_prepares.done_datasets()
df_preparation.head()

  7%|▋         | 50844/779107 [00:09<01:59, 6105.78it/s][11:04:02] 

****
Invariant Violation
could not find probe element
Violation occurred on line 71 in file /project/build/temp.linux-x86_64-cpython-310/rdkit/Code/RDGeneral/utils.h
Failed Expression: foundIt
----------
Stacktrace:
----------
****

 41%|████▏     | 322893/779107 [01:02<01:33, 4884.75it/s][11:04:55] 

****
Invariant Violation
could not find probe element
Violation occurred on line 71 in file /project/build/temp.linux-x86_64-cpython-310/rdkit/Code/RDGeneral/utils.h
Failed Expression: foundIt
----------
Stacktrace:
----------
****

 47%|████▋     | 363696/779107 [01:11<01:27, 4771.33it/s][11:05:04] 

****
Invariant Violation
could not find probe element
Violation occurred on line 71 in file /project/build/temp.linux-x86_64-cpython-310/rdkit/Code/RDGeneral/utils.h
Failed Expression: foundIt
----------
Stacktrace:
----------
****

 72%|███████▏  | 560908/779107 [01:55<00:47, 4587.82it/s][11:05:49] 

****
Invariant Violati

Unnamed: 0,interm_0,interm_1,interm_2,interm_3,interm_4,interm_5,interm_6,interm_7,interm_8,interm_9,agents_0,agents_1,agents_2,product_0,product_1,ReactionSmiles
0,O[C:12]1([CH2:13][C:14](=[O:15])[O:16][CH2:17]...,Cl,,,,,,,,,CCO,,,[CH2:1]([CH3:2])[S:3][C:4]1=[CH:5][C:6]2=[C:7]...,,[CH2:1]([S:3][C:4]1[CH:26]=[CH:25][C:7]2[N:8](...
1,CC[O:16][C:14]([CH:13]=[C:12]1[C:6]2=[C:7]([N:...,[OH-],[K+],Cl,,,,,,,CCO,,,[CH2:1]([CH3:2])[S:3][C:4]1=[CH:5][C:6]2=[C:7]...,,[CH2:1]([S:3][C:4]1[CH:25]=[CH:24][C:7]2[N:8](...
2,[H-],[Al+3],[Li+],[H-],[H-],[H-],O=[C:20]([CH2:19][C:18]1=[CH:17][C:16]2=[C:15]...,CCOC(C)=O,[OH-],[Na+],C1CCOC1,,,[CH2:7]([CH3:8])[S:9][C:10]1=[CH:11][C:12]2=[C...,,[H-].[Al+3].[Li+].[H-].[H-].[H-].[CH2:7]([S:9]...
3,[NH2:1][C:2]1=[CH:13][C:6]2=[C:5]([CH:4]=[CH:3...,CC(=O)O[C:14]([CH3:15])=[O:16],,,,,,,,,C1=CC=CC=C1,,,[NH:1]([C:2]1=[CH:13][C:6]2=[C:5]([CH:4]=[CH:3...,,[NH2:1][C:2]1[CH:3]=[CH:4][C:5]2[N:9]=[C:8]3[C...
4,[C:1]([CH3:2])([CH3:3])([CH3:4])[NH:5][NH:6][C...,ClCCl,BrBr,,,,,,,,O,,,[C:1]([CH3:2])([CH3:3])([CH3:4])[N:5]=[N:6][C:...,,[C:1]([NH:5][NH:6][C:7]([C:12]#[N:13])([CH:9]1...


In [30]:
df_preparation.tail()

Unnamed: 0,interm_0,interm_1,interm_2,interm_3,interm_4,interm_5,interm_6,interm_7,interm_8,interm_9,agents_0,agents_1,agents_2,product_0,product_1,ReactionSmiles
779098,Br[CH2:14][CH2:13][CH2:12][CH2:11][CH2:10][CH2...,[OH:17][C:18]1=[CH:19][CH:20]=[C:21](/[CH:24]=...,O=C([O-])[O-],[K+],[K+],,,,,,CN(C)C=O,Cl,,[Br:1][C:2]1=[C:6]([CH2:7][CH2:8][CH2:9][CH2:1...,,[Br:1][C:2]1[S:3][C:4]([Br:16])=[CH:5][C:6]=1[...
779099,BrCC[CH2:12][CH2:11][CH2:10][CH2:9][CH2:8][CH2...,[F:17][C:18]1=[CH:19][CH:20]=[C:21](/[CH:24]=[...,O=C([O-])[O-],[K+],[K+],,,,,,CN(C)C=O,Cl,,[Br:1][C:2]1=[C:6]([CH2:7][CH2:8][CH2:9][CH2:1...,,[Br:1][C:2]1[S:3][C:4]([Br:16])=[CH:5][C:6]=1[...
779100,[OH:1][C:2]1=[CH:3][CH:4]=[C:5](/[CH:8]=[CH:9]...,Br[CH2:21][CH2:20][CH2:19][CH2:18][CH2:17][CH2...,O=C([O-])[O-],[K+],[K+],,,,,,CN(C)C=O,Cl,,[O:1]([C:2]1=[CH:7][CH:6]=[C:5](/[CH:8]=[CH:9]...,,[OH:1][C:2]1[CH:7]=[CH:6][C:5](/[CH:8]=[CH:9]/...
779101,[Br:1][C:2]1=[CH:3][C:4]2=[C:12]([C:11]3=[CH:1...,[H-],[Na+],Br[CH2:19][CH2:20][CH2:21][CH2:22][CH2:23][CH2...,,,,,,,CN(C)C=O,Cl,,[Br:1][C:2]1=[CH:3][C:4]2=[C:12]([C:11]3=[CH:1...,,[Br:1][C:2]1[CH:14]=[CH:13][C:12]2[C:11]3[C:6]...
779102,O=C(OCC1=CC=CC=C1)[NH:9][CH:6]1[C:5]([CH3:20])...,,,,,,,,,,CCO,[Pd],,[CH3:1][N:2]([CH:3]1[CH2:4][C:5]([CH3:20])([CH...,,[CH3:1][N:2]([CH3:22])[CH:3]1[CH2:8][CH2:7][CH...


In [32]:
df_preparation.ReactionSmiles.iloc[779102]

'[CH3:1][N:2]([CH3:22])[CH:3]1[CH2:8][CH2:7][CH:6]([NH:9]C(=O)OCC2C=CC=CC=2)[C:5]([CH3:21])([CH3:20])[CH2:4]1>C(O)C.[Pd]>[CH3:1][N:2]([CH3:22])[CH:3]1[CH2:8][CH2:7][CH:6]([NH2:9])[C:5]([CH3:20])([CH3:21])[CH2:4]1'

In [37]:
df_preparation.interm_0.iloc[779102]

'O=C(OCC1=CC=CC=C1)[NH:9][CH:6]1[C:5]([CH3:20])([CH3:21])[CH2:4][CH:3]([N:2]([CH3:1])[CH3:22])[CH2:8][CH2:7]1'

Оставим нужные поля

In [40]:
df_preparation = df_preparation.iloc[:, :-2]

In [None]:
df_preparation.drop_duplicates(inplace = True)

## Посмотрим на заполняемость полей

In [59]:
df_preparation.isna().sum()/df_preparation.shape[0]

interm_0     0.000000
interm_1     0.043404
interm_2     0.262760
interm_3     0.471914
interm_4     0.665402
interm_5     0.837203
interm_6     0.932032
interm_7     0.972198
interm_8     0.986053
interm_9     0.991996
agents_0     0.132511
agents_1     0.631868
agents_2     0.850707
product_0    0.000000
dtype: float64

Сохраняем выборку для дальнейших расчётов

In [6]:
df_preparation.to_csv('/home/jupyter/datasphere/project/reaction_preparation.tsv', sep = '\t', index = 0)