In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, AllChem, MACCSkeys

import pandas as pd
import numpy as np
from scipy.sparse import lil_matrix
from scipy.spatial.distance import cosine
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
class DescriptionCalc:
    """Parse a collection of SMILES strings into RDKit molecule objects.

    The original (Russian) docstring described this class as computing
    physico-chemical descriptors of molecules; the code visible here only
    performs the SMILES parsing step that such calculations start from.

    Attributes:
        mols: One entry per input, in input order. Each entry is whatever
            ``Chem.MolFromSmiles`` returned for that input, or ``None`` when
            the call raised an exception.
        smiles: The ``smiles`` argument, stored exactly as it was passed in.
    """

    def __init__(self, smiles):
        """Parse every entry of ``smiles`` and keep the results in ``self.mols``.

        Args:
            smiles: Iterable of SMILES strings (list of compounds). It is
                iterated once here and also stored as ``self.smiles``.
        """
        self.mols = []
        for smi in tqdm(smiles):
            try:
                mol = Chem.MolFromSmiles(smi)
            except Exception:
                # Best effort: an entry that makes the parser raise is recorded
                # as None so positions in `mols` still line up with `smiles`.
                # `Exception` (not a bare `except`) lets KeyboardInterrupt and
                # SystemExit propagate, so a long run can still be stopped.
                mol = None
            self.mols.append(mol)

        self.smiles = smiles

class Similarity(DescriptionCalc):
    """Build Morgan bit-fingerprint tables for the molecules parsed by the base class."""

    def __init__(self, smiles):
        """Parse ``smiles`` via ``DescriptionCalc.__init__``.

        Args:
            smiles: Iterable of SMILES strings.
        """
        super().__init__(smiles)

    def mol2fp(self, mol, radius = 3, nBits = 512):
        """Return the Morgan bit-vector fingerprint of a single molecule.

        Args:
            mol: RDKit molecule (must not be ``None``).
            radius: Morgan radius passed to RDKit.
            nBits: Length of the bit vector.

        Returns:
            The object returned by ``AllChem.GetMorganFingerprintAsBitVect``.
        """
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius = radius, nBits = nBits)

    def compute_ECFP6(self, name, nBits = 512):
        """Build a table with one fingerprint row per input SMILES.

        Args:
            name: Header of the first column, which holds the input SMILES.
            nBits: Fingerprint length; the table gets columns ``bit0`` ..
                ``bit{nBits-1}``.

        Returns:
            pandas.DataFrame with ``name`` as the first column followed by
            ``nBits`` sparse int8 bit columns. Rows whose molecule is ``None``
            (SMILES that could not be parsed) are left all-zero.
        """
        bit_headers = ['bit' + str(i) for i in range(nBits)]
        arr = lil_matrix((len(self.mols), nBits), dtype=np.int8)

        for idx, mol in enumerate(tqdm(self.mols)):
            if mol is None:
                continue
            # Forward nBits so the fingerprint length always matches the
            # matrix width; relying on mol2fp's default of 512 broke every
            # call with a different nBits.
            fp = self.mol2fp(mol, nBits = nBits)
            for bit in fp.GetOnBits():
                arr[idx, bit] = 1

        df_ecfp6 = pd.DataFrame.sparse.from_spmatrix(arr, columns=bit_headers)
        # Rows are positional (row i <-> self.mols[i]). Passing a plain list
        # keeps the SMILES column positional too; a pandas Series would be
        # aligned on its index labels instead, misplacing values when the
        # Series does not carry a default 0..n-1 index.
        df_ecfp6.insert(loc=0, column=name, value=list(self.smiles))
        return df_ecfp6
    
def main():
    """Smoke test: fingerprint cyclohexane and benzene and print the resulting table."""
    demo_smiles = ['C1CCCCC1', 'c1ccccc1']
    fingerprinter = Similarity(demo_smiles)
    print(fingerprinter.compute_ECFP6('exsample'))


if __name__ == '__main__':
    main()

100%|██████████| 2/2 [00:00<00:00, 1351.91it/s]
100%|██████████| 2/2 [00:00<00:00, 498.73it/s]

   exsample  bit0  bit1  bit2  bit3  ...  bit507  bit508  bit509  bit510  bit511
0  C1CCCCC1     0     0     1     0  ...       0       0       0       0       0
1  c1ccccc1     0     0     0     0  ...       0       0       0       0       0

[2 rows x 513 columns]





In [3]:
# Three small example molecules used to try out the fingerprint table.
exsample_smiles = [
    'C1CCCCC1',
    'C1=CC=C(C=C1)CO',
    'NC1=CC=CC=C1',
]
ecfp6 = Similarity(smiles=exsample_smiles)
res_ecfp6 = ecfp6.compute_ECFP6(name='smiles')

100%|██████████| 3/3 [00:00<00:00, 6384.02it/s]
100%|██████████| 3/3 [00:00<00:00, 3887.21it/s]


In [4]:
def tanimoto_similarity(list1, list2):
    """Tanimoto (Jaccard) coefficient of two fingerprint vectors.

    Each position counts as "set" when its value is truthy (non-zero), so the
    result is |A and B| / |A or B| over positions.

    Args:
        list1: First vector (any iterable of numbers/booleans).
        list2: Second vector. Positions are paired with ``zip``, so if the
            lengths differ the extra tail of the longer one is ignored.

    Returns:
        float in [0, 1]; ``0.0`` when no position is set in either vector.
    """
    intersection = 0
    union = 0
    # Count positions with plain Python ints instead of summing the element
    # values: summing gives wrong results for non-binary values and can wrap
    # around when the elements are narrow NumPy integers such as int8.
    for x, y in zip(list1, list2):
        if x or y:
            union += 1
            if x and y:
                intersection += 1

    return intersection / union if union else 0.0

In [5]:
# Similarity between the 2nd and 3rd example molecules, using the bit columns only.
tanimoto_similarity(
    res_ecfp6.drop(columns='smiles').iloc[1],
    res_ecfp6.drop(columns='smiles').iloc[2],
)

0.30434782608695654

## Считаем векторы для первых 10 тыс. реакций

In [6]:
# Load only the first 10,000 reactions. `nrows` stops parsing at that point
# instead of reading the whole file and discarding everything after the slice.
reaction = pd.read_table(
    '/home/jupyter/datasphere/project/reaction_preparation.tsv',
    nrows=10000,
)

In [7]:
# Fingerprint the first product of every loaded reaction.
similarity = Similarity(reaction['product_0'])
vec_prod = similarity.compute_ECFP6(name='smiles')

  0%|          | 0/10000 [00:00<?, ?it/s][05:22:35] Explicit valence for atom # 0 Cl, 7, is greater than permitted
[05:22:35] Explicit valence for atom # 0 Cl, 7, is greater than permitted
[05:22:35] Explicit valence for atom # 0 Cl, 7, is greater than permitted
100%|██████████| 10000/10000 [00:00<00:00, 13509.05it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3771.07it/s]


In [8]:
# Keep only the bit columns. `errors='ignore'` makes the cell safe to re-run:
# on a second run `vec_prod` no longer has a 'smiles' column, and a plain
# drop would raise KeyError.
vec_prod = vec_prod.drop(columns='smiles', errors='ignore')

In [9]:
# Write the fingerprint table as tab-separated values, without the row index.
vec_prod.to_csv('vec_prod_10k.tsv', sep='\t', index=False)