Для расчёта эмбеддингов будем использовать RoBERTa над InChI-представлением молекул

In [4]:
# %pip install sentence_transformers

In [5]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit import RDLogger

import pandas as pd
import numpy as np
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings("ignore")
RDLogger.DisableLog('rdApp.warning')

In [6]:
# Pretrained sentence-transformers checkpoint that encodes the InChI strings.
ROBERTA_CHECKPOINT = 'all-roberta-large-v1'
model = SentenceTransformer(ROBERTA_CHECKPOINT)

## Реализуем функционал расчёта эмбеддингов для молекул

In [11]:
def get_embeding(smiles, device=None, batch_size=32):
    """Embed molecules by encoding their InChI strings with the sentence model.

    Every SMILES string is parsed with RDKit and converted to an InChI
    string; the InChI strings are then encoded by the module-level ``model``.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to embed. The iterable is materialised into a list.
    device : str or None, optional
        Device forwarded to ``model.encode``. With ``None`` (default) the
        model runs on the device it was loaded on, so the function no longer
        requires CUDA to be present. Pass ``'cuda'`` or ``'cpu'`` to force one.
    batch_size : int, optional
        Number of InChI strings encoded per batch (forwarded to
        ``model.encode``).

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, in input order, and one integer-labelled
        column (``0 .. dim-1``) per embedding dimension. Empty input yields
        a frame with zero rows.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings. Failing loudly keeps
        the output rows aligned one-to-one with the input.
    """
    smiles = list(smiles)

    def to_inchi(position, mol_smiles):
        mol = Chem.MolFromSmiles(mol_smiles)
        # MolFromSmiles signals a parse failure by returning None; passing
        # that on to MolToInchi would fail with an unrelated-looking error.
        if mol is None:
            raise ValueError(
                f'Cannot parse SMILES at position {position}: {mol_smiles!r}'
            )
        return Chem.MolToInchi(mol)

    inchi_list = [
        to_inchi(position, mol_smiles)
        for position, mol_smiles in enumerate(tqdm(smiles))
    ]

    if not inchi_list:
        # Nothing to encode: build the empty frame directly instead of
        # indexing the shape of an empty encoder result.
        width = model.get_sentence_embedding_dimension() or 0
        return pd.DataFrame(np.empty((0, width)), columns=list(range(width)))

    embed = model.encode(
        inchi_list,
        device=device,
        batch_size=batch_size,
        show_progress_bar=True,
    )

    return pd.DataFrame(embed, columns=list(range(embed.shape[1])))


def main():
    """Run get_embeding on two small example SMILES and print the result."""
    demo_smiles = ['C1CCCCC1', 'c1ccccc1']
    embeddings = get_embeding(demo_smiles)

    print(embeddings)


# Run the demo only when this code executes as the top-level module
# (the recorded cell output shows that it does run inside the notebook).
if __name__ == '__main__':
    main()  

100%|██████████| 2/2 [00:00<00:00, 941.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.11it/s]

       0         1         2     ...      1021      1022      1023
0 -0.008900 -0.024388 -0.008768  ... -0.011442  0.001490 -0.055094
1 -0.009939 -0.021375 -0.012251  ... -0.011722 -0.001078 -0.052177

[2 rows x 1024 columns]





## Загружаем молекулы для расчёта

In [8]:
# Tab-separated molecule table; the recorded preview shows `mol` and `prepare_mol` columns.
molecules_path = '/home/jupyter/datasphere/project/Расчёт дискрипторов/mapper_moleculares.tsv'
df = pd.read_table(molecules_path)
df.head()

Unnamed: 0,mol,prepare_mol
0,O=[N+:22]([O-])[C:25]1=[C:26]([Cl:34])[C:27]([...,O=[N+]([O-])[C]1=[C]([Cl])[C]([Cl])=N[C]([Cl])...
1,C[Si-](C)(C)(F)F,C[Si-](C)(C)(F)F
2,OC(=O)C1=CN2C=C(C(F)(F)F)[CH:19]=[C:14](Cl)[C:...,OC(=O)C1=CN2C=C(C(F)(F)F)[CH]=[C](Cl)[C]2=N1
3,CC1(C)[N:19]([C:24]([CH:25]([F:26])[F:27])=[O:...,CC1(C)[N]([C]([CH]([F])[F])=[O])[C@H]([CH2][F]...
4,[CH2:1]([CH2:2][CH2:3][CH2:4][CH2:5][CH3:6])[C...,[CH2]([CH2][CH2][CH2][CH2][CH3])[C]1=[C]([CH]=...


## Считаем

In [12]:
# Embed each distinct prepared SMILES once, then store the source string next to its vector.
smileses = df['prepare_mol'].unique()
res = get_embeding(smileses)
res['smiles'] = list(smileses)
res.head()

100%|██████████| 621478/621478 [05:23<00:00, 1922.71it/s]
Batches: 100%|██████████| 19422/19422 [53:35<00:00,  6.04it/s] 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,smiles
0,0.00083,-0.022231,-0.014814,-0.021208,0.054127,0.000143,0.017714,0.007352,0.055362,-0.003457,-0.014105,-0.007529,0.084835,0.047886,-0.014485,0.010789,0.012475,0.006269,-0.055975,0.026062,0.016384,0.074725,-0.016964,-0.011703,-0.029217,-0.074874,-0.002014,-0.02853,-0.028099,0.014565,-0.022698,-0.013829,-0.006004,-0.02645,0.025502,-0.006307,0.00061,0.010822,0.02466,-0.013848,...,-0.027973,-0.010546,0.00783,-0.01759,0.003498,-0.005604,0.002346,-0.002782,-0.004835,0.019709,0.014921,0.013785,-0.000341,-0.049607,-0.020649,-0.011435,0.046706,0.010263,0.019276,-0.028234,0.018882,0.024324,0.012119,0.076224,0.008438,-0.034124,-0.00423,0.017901,0.010364,0.021587,-0.013612,-0.044475,0.030995,0.015843,0.040374,-0.021908,-0.051873,-0.004955,-0.019245,O=[N+]([O-])[C]1=[C]([Cl])[C]([Cl])=N[C]([Cl])...
1,0.011283,-0.027236,-0.010673,-0.044174,0.037168,-0.028587,0.000185,0.012431,0.043843,0.030361,-0.007227,0.006757,0.087664,0.046651,-0.017152,-0.006533,0.005105,0.009875,-0.036042,-0.002087,0.016269,0.05673,0.024326,-0.014787,-0.025295,-0.050193,0.016977,0.006829,-0.034807,0.009554,-0.020566,-0.006136,-0.00653,-0.0242,-0.007499,0.014449,0.013729,-0.043415,0.001654,0.00821,...,-0.006662,0.023678,0.009909,-0.013675,0.025572,0.001373,0.020957,0.004109,-0.027418,-0.004084,0.014951,0.034013,-0.009975,-0.039443,-0.010783,0.00044,0.049945,-0.015347,0.016079,-0.024228,0.02882,0.038116,0.038836,0.070117,0.005487,-0.030585,0.037269,0.013075,0.025799,0.002993,-0.001774,-0.030105,0.022137,0.006854,0.014031,-0.008417,-0.025216,-0.01158,-0.032804,C[Si-](C)(C)(F)F
2,0.004982,-0.007456,-0.019163,-0.021443,0.043865,-0.013654,-0.010377,0.019212,0.03888,0.01411,-0.00677,0.001473,0.091695,0.048148,-0.026189,0.006743,0.016069,-0.004932,-0.03268,0.018372,0.013068,0.059514,0.011447,0.004844,-0.031715,-0.0631,0.008436,-0.007312,-0.013168,0.025609,-0.027945,-0.00607,-0.002372,-0.025018,0.02493,0.006298,0.030962,-0.00836,0.005818,7.3e-05,...,-0.022077,0.001974,-0.004375,-0.001545,0.006537,-0.007456,0.045499,-0.00247,-0.006795,0.014271,0.017306,0.010619,0.00586,-0.048609,-0.022187,0.006256,0.050127,0.013388,0.018679,-0.021636,0.023981,0.010117,0.028693,0.075482,0.016505,-0.025912,0.023793,0.009741,0.018643,0.015224,-0.003971,-0.037737,0.006037,0.005717,0.039125,-0.024685,-0.041995,-0.005179,-0.030667,OC(=O)C1=CN2C=C(C(F)(F)F)[CH]=[C](Cl)[C]2=N1
3,-0.002662,-0.008301,-0.001349,-0.016349,0.04436,0.003421,-0.005828,-0.014556,0.050168,0.020292,-0.012364,0.003894,0.082868,0.05442,-0.011844,0.035817,0.004843,0.013292,-0.037411,0.017794,0.031755,0.05575,-0.004968,-0.000706,-0.018828,-0.075079,0.010407,-0.003063,-0.010914,0.009933,-0.014725,0.002611,0.009824,-0.037102,-0.005075,0.022277,0.01892,-0.02268,-0.008103,0.004063,...,-0.004653,0.002398,-0.000822,0.002357,-0.00493,-0.024491,0.044664,0.011267,-0.006779,0.006002,5.9e-05,0.033513,0.005141,-0.058333,-0.013892,-0.011423,0.040407,-0.000365,0.006814,-0.010861,-0.000645,0.00837,0.020043,0.045144,0.002737,-0.035098,0.019031,0.017687,0.000456,0.034709,-0.017821,-0.014431,0.002915,0.02532,0.013674,-0.019032,-0.002028,0.007968,-0.022967,CC1(C)[N]([C]([CH]([F])[F])=[O])[C@H]([CH2][F]...
4,-0.005217,-0.027598,0.004003,-0.000901,0.042657,-0.007951,0.010401,-0.026921,0.065144,0.01163,-0.023318,-0.008292,0.045905,0.036776,-0.014471,0.038019,0.005425,0.028797,-0.048621,0.005736,0.026031,0.057847,-0.009876,-0.00478,-0.032598,-0.058199,0.024168,-0.004034,0.006825,0.013998,-0.003881,-0.011475,0.000169,-0.049029,0.002564,0.018919,0.022542,-0.036202,-0.009154,0.001693,...,-0.011177,0.012925,0.034042,-0.019301,-0.010352,-0.016014,0.030175,0.012985,-0.028652,0.015371,0.006851,0.027224,-0.005561,-0.057879,-0.007734,-0.022643,0.037632,0.006899,0.024209,0.001937,0.006966,0.031374,0.020624,0.049866,0.003669,-0.045,0.007323,0.007925,-0.02179,0.035445,-0.01803,-0.027442,0.023059,0.006231,0.013929,-0.003362,-0.002362,0.019527,-0.027961,[CH2]([CH2][CH2][CH2][CH2][CH3])[C]1=[C]([CH]=...


In [13]:
# Persist the embedding table to a pickle file in the working directory.
features_pickle = 'roBERTa_feture.pickle'
res.to_pickle(features_pickle)