In [2]:
# %pip install karateclub

In [3]:
# %pip install rdkit-pypi

In [4]:
from rdkit import Chem
from rdkit import RDLogger
import networkx as nx
from karateclub import Graph2Vec
import joblib

import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from math import factorial

from sklearn.model_selection import train_test_split

from tqdm import tqdm

import warnings
# Ignore DeprecationWarning specifically.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Blanket filter: with no category given this ignores every Warning subclass,
# so it also covers the DeprecationWarning filter above.
# NOTE(review): this hides all warnings for the whole notebook — confirm that is intended.
warnings.filterwarnings("ignore")
# Disable the 'rdApp.warning' RDKit log channel (presumably to silence
# SMILES-parsing warnings during bulk conversion — confirm).
RDLogger.DisableLog('rdApp.warning')

In [5]:
def mol_to_nx(mol):
    """Build an undirected NetworkX graph from a molecule object.

    Every atom yielded by ``mol.GetAtoms()`` becomes a node keyed by its atom
    index and annotated with ``atomic_num``, ``is_aromatic`` and
    ``atom_symbol``.  Every bond yielded by ``mol.GetBonds()`` becomes an edge
    between its begin and end atom indices, annotated with ``bond_type``.

    Returns the populated ``nx.Graph``.
    """
    graph = nx.Graph()

    # Nodes are supplied as (node, attribute-dict) pairs.
    graph.add_nodes_from(
        (
            atom.GetIdx(),
            {
                'atomic_num': atom.GetAtomicNum(),
                'is_aromatic': atom.GetIsAromatic(),
                'atom_symbol': atom.GetSymbol(),
            },
        )
        for atom in mol.GetAtoms()
    )

    # Edges are supplied as (u, v, attribute-dict) triples.
    graph.add_edges_from(
        (
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx(),
            {'bond_type': bond.GetBondType()},
        )
        for bond in mol.GetBonds()
    )

    return graph

In [6]:
def graph_to_vec_fit(smiles, **graph2vec_kwargs):
    """Fit a karateclub ``Graph2Vec`` model on molecules given as SMILES.

    Each SMILES string is parsed with ``Chem.MolFromSmiles``, converted to a
    NetworkX graph with ``mol_to_nx``, and the resulting graphs are passed to
    ``Graph2Vec.fit``.  A tqdm progress bar is shown for both the parsing and
    the graph-conversion pass.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings; materialised into a list before processing.
    **graph2vec_kwargs
        Optional keyword arguments forwarded unchanged to the ``Graph2Vec``
        constructor (e.g. ``wl_iterations``, ``epochs``, ``learning_rate``).
        When none are given the model is created as ``Graph2Vec()``.

    Returns
    -------
    Graph2Vec
        The fitted model.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the SMILES strings.
    """
    smiles = list(smiles)
    mols = [Chem.MolFromSmiles(s) for s in tqdm(smiles)]

    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending strings instead of an opaque AttributeError in mol_to_nx.
    unparsed = [s for s, m in zip(smiles, mols) if m is None]
    if unparsed:
        raise ValueError(
            f'{len(unparsed)} SMILES could not be parsed by RDKit, '
            f'e.g. {unparsed[:5]!r}'
        )

    graphs = [mol_to_nx(m) for m in tqdm(mols)]

    # Hyperparameter grids considered earlier (now settable via graph2vec_kwargs):
    # wl_iterations_list = [1, 2, 3, 5, 10]
    # epochs_list = [5, 10, 20, 30, 40, 50]
    # learning_rate_list = [0.003, 0.03, 0.3]
    #
    # NOTE(review): with default settings Graph2Vec presumably ignores the
    # atom/bond attributes stored by mol_to_nx and uses graph structure only —
    # confirm against the karateclub documentation (`attributed` option).
    model = Graph2Vec(**graph2vec_kwargs)
    model.fit(graphs)

    return model
    
                
                # train_vec = model.infer(train)
                # test_vec = model.infer(test)
                
#                 def cosine_distance_mean(array):
                    
#                     index = np.random.choice(array.shape[0], 1000, replace=True) 
#                     array_boost = array[index]
#                     sum_ = 0
#                     for i in tqdm(range(array_boost.shape[0]-1)):
#                         for j in range(i+1, array_boost.shape[0]):
                            
#                             sum_ += cosine(array_boost[i], array_boost[j])
                    
#                     return sum_/(999000/2)
                            
                
#                 train_dist = cosine_distance_mean(train_vec)
#                 test_dist = cosine_distance_mean(test_vec)
            
                
#                 if list(best_model.keys())[0] < test_dist:
#                     best_model = {}
#                     best_model[test_dist] = {
#                         'wl_iterations': wl_iterations,
#                         'epochs': epochs,
#                         'learning_rate': learning_rate,
#                         'train_dist': train_dist,
#                         'test_dist': test_dist
#                     }
                    
#                     print(f'best_model_new: {best_model}')
                                                           
#     print(f'best_model: {best_model}')

def main():
    """Demo: fit Graph2Vec on ten cyclohexane/benzene SMILES and print the model."""
    cyclohexane, benzene = 'C1CCCCC1', 'c1ccccc1'
    demo_smiles = [
        cyclohexane, benzene, benzene, cyclohexane, benzene,
        cyclohexane, benzene, benzene, cyclohexane, benzene,
    ]

    fitted_model = graph_to_vec_fit(demo_smiles)
    print(fitted_model)


# Run the demo when this code is executed as the main module.
if __name__ == '__main__':
    main()

100%|██████████| 10/10 [00:00<00:00, 2415.93it/s]
100%|██████████| 10/10 [00:00<00:00, 3477.00it/s]

<karateclub.graph_embedding.graph2vec.Graph2Vec object at 0x7fb3f39a5d00>





In [7]:
def smiles_to_vec(model, smiles):
    """Embed SMILES strings with an already fitted model.

    Each SMILES string is parsed with ``Chem.MolFromSmiles``, converted to a
    NetworkX graph with ``mol_to_nx``, and the graphs are passed to
    ``model.infer``.

    Parameters
    ----------
    model
        Fitted embedding model exposing ``infer(graphs)`` (for example the
        object returned by ``graph_to_vec_fit``).
    smiles : iterable of str
        SMILES strings; materialised into a list before processing.

    Returns
    -------
    pandas.DataFrame
        ``model.infer`` output wrapped in a DataFrame.  Rows are expected to
        follow the order of ``smiles`` — callers in this notebook attach the
        input strings positionally, so no input is ever skipped here.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the SMILES strings.
    """
    smiles = list(smiles)
    mols = [Chem.MolFromSmiles(s) for s in smiles]

    # MolFromSmiles returns None on a parse failure.  Raise instead of
    # skipping so output rows can never silently misalign with the input.
    unparsed = [s for s, m in zip(smiles, mols) if m is None]
    if unparsed:
        raise ValueError(
            f'{len(unparsed)} SMILES could not be parsed by RDKit, '
            f'e.g. {unparsed[:5]!r}'
        )

    graphs = [mol_to_nx(m) for m in mols]
    embeddings = model.infer(graphs)

    return pd.DataFrame(embeddings)


def main():
    """Demo: fit on two SMILES, embed the same two, and print the resulting table."""
    demo_smiles = ['C1CCCCC1', 'c1ccccc1']

    fitted_model = graph_to_vec_fit(demo_smiles)
    embeddings = smiles_to_vec(fitted_model, demo_smiles)
    print(embeddings)

# Run the demo when this code is executed as the main module.
if __name__ == '__main__':
    main()

100%|██████████| 2/2 [00:00<00:00, 6177.18it/s]
100%|██████████| 2/2 [00:00<00:00, 2840.71it/s]

        0         1         2    ...       125       126       127
0  0.003347  0.001797 -0.001014  ...  0.002622 -0.000166  0.003349
1  0.003362  0.001827 -0.001007  ...  0.002625 -0.000153  0.003358

[2 rows x 128 columns]





## Загружаем данные

In [8]:
# Read the tab-separated molecule mapping table and preview its first rows.
df = pd.read_csv(
    '/home/jupyter/datasphere/project/Расчёт дискрипторов/mapper_moleculares.tsv',
    sep='\t',
)
df.head(5)

Unnamed: 0,mol,prepare_mol
0,O=[N+:22]([O-])[C:25]1=[C:26]([Cl:34])[C:27]([...,O=[N+]([O-])[C]1=[C]([Cl])[C]([Cl])=N[C]([Cl])...
1,C[Si-](C)(C)(F)F,C[Si-](C)(C)(F)F
2,OC(=O)C1=CN2C=C(C(F)(F)F)[CH:19]=[C:14](Cl)[C:...,OC(=O)C1=CN2C=C(C(F)(F)F)[CH]=[C](Cl)[C]2=N1
3,CC1(C)[N:19]([C:24]([CH:25]([F:26])[F:27])=[O:...,CC1(C)[N]([C]([CH]([F])[F])=[O])[C@H]([CH2][F]...
4,[CH2:1]([CH2:2][CH2:3][CH2:4][CH2:5][CH3:6])[C...,[CH2]([CH2][CH2][CH2][CH2][CH3])[C]1=[C]([CH]=...


In [9]:
# Deduplicate the prepared SMILES column, then fit Graph2Vec on the unique values.
smileses = df['prepare_mol'].unique()
model = graph_to_vec_fit(smiles=smileses)

100%|██████████| 621478/621478 [01:00<00:00, 10312.08it/s]
100%|██████████| 621478/621478 [02:04<00:00, 4989.53it/s]


In [10]:
# Persist the fitted model to disk; the cell displays joblib's return value.
joblib.dump(value=model, filename='graph2vec_model.pkl')

['graph2vec_model.pkl']

In [11]:
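# Optional: reload a previously saved model instead of refitting.
# NOTE(review): 'test_graph2vec_model.pkl' is not the file written by the
# joblib.dump cell ('graph2vec_model.pkl') — confirm which path is intended.
# loaded_model = joblib.load('test_graph2vec_model.pkl')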

In [12]:
# Embed every unique SMILES and append the source string as the last column.
res = smiles_to_vec(model, smileses)
res.insert(loc=res.shape[1], column='smiles', value=list(smileses))
res.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,smiles
0,-0.051354,0.087967,-0.014356,0.042275,0.019352,-0.058394,-0.195565,-0.038767,-0.015204,-0.18739,0.024085,0.068903,-0.080952,-0.144423,-0.13016,0.095696,0.009216,-0.000113,-0.102194,0.009146,-0.142887,0.207979,-0.088695,0.003537,-0.031175,0.091554,0.157357,0.005718,-0.088015,0.056507,-0.196151,0.013571,-0.028703,-0.133712,0.137617,-0.042723,-0.201354,0.072024,0.043761,-0.064597,...,0.148149,0.194007,-0.006328,-0.036994,0.254839,-0.051186,0.068366,0.072579,-0.037975,0.068684,-0.081756,-0.003509,-0.080616,-0.021532,-0.016634,0.025133,-0.129788,0.064323,-0.167829,0.220812,0.117286,-0.005139,0.048469,-0.103965,-0.024427,0.196919,0.206104,-0.009217,0.119561,0.061096,0.119755,-0.021841,0.045805,-0.01073,0.064546,-0.121143,-0.109512,-0.061976,0.135195,O=[N+]([O-])[C]1=[C]([Cl])[C]([Cl])=N[C]([Cl])...
1,-0.063962,0.091596,-0.102841,0.136503,-0.007848,-0.182838,-0.105488,0.004577,-0.156534,-0.185211,-0.198046,0.077401,-0.028526,-0.049447,-0.026235,0.220464,-0.025576,-0.118466,-0.025257,-0.092993,-0.187789,0.0137,-0.020973,0.005096,-0.043338,0.01081,0.060291,-0.020494,-0.138981,0.002004,-0.16187,0.009421,-0.114317,0.008351,0.108487,0.026159,0.104457,0.012586,0.00328,-0.008,...,0.129851,0.075571,-0.052153,-0.0654,0.094688,0.021383,0.136783,-0.056514,0.051527,-0.115138,0.031939,-0.100396,-0.057302,-0.0543,0.047787,0.003584,-0.170284,0.044155,-0.123479,0.10612,0.132672,0.013262,0.095784,0.047135,-0.00879,0.058691,0.165508,0.059485,0.043461,0.209093,0.174494,0.029101,-0.031604,-0.072893,0.03615,0.093157,0.006804,0.005446,0.222209,C[Si-](C)(C)(F)F
2,0.073721,-0.136999,-0.203671,0.122562,-0.069818,0.120619,0.145807,0.017731,-0.048861,-0.123859,0.041417,0.072402,-0.101873,-0.236411,-0.105362,0.181673,0.053959,0.074752,0.042098,-0.010327,-0.065377,0.06003,-0.144542,-0.042889,-0.045844,0.196582,0.096077,-0.102533,0.017662,0.165817,-0.012666,-0.025026,-0.047578,-0.026657,-0.028139,-0.110228,-0.050254,-0.055175,0.139057,-0.153624,...,0.104221,0.014544,-0.171854,-0.222727,0.211735,0.131595,-0.100201,0.07376,0.011904,0.067598,-0.040464,0.106306,-0.233973,-0.03999,0.071391,0.037719,-0.133822,0.104651,-0.116105,0.052961,0.118885,0.02263,-0.006502,0.060281,0.026569,0.111909,-0.028039,-0.04195,0.025885,0.055683,0.00695,0.060321,0.111247,0.06603,0.004755,-0.177796,0.002926,0.186277,0.181839,OC(=O)C1=CN2C=C(C(F)(F)F)[CH]=[C](Cl)[C]2=N1
3,-0.004625,0.099429,-0.316118,-0.06079,0.058256,-0.010957,-0.121072,-0.246901,-0.001789,-0.250887,-0.08689,0.061304,-0.070438,-0.111998,0.062894,0.075152,-0.00026,0.008763,-0.236205,0.077276,-0.110265,0.019443,0.07957,-0.156092,0.069324,0.110415,0.077089,0.176627,-0.331726,0.069958,0.032932,0.065343,-0.094072,-0.002356,0.159846,-0.117539,-0.016419,0.039041,-0.151527,0.039874,...,-0.062931,0.140809,0.093579,0.082826,0.210755,-0.23952,0.002186,-0.013756,-0.045378,-0.12295,-0.095223,0.024025,-0.060832,0.065938,-0.041841,0.076955,-0.042509,-0.276208,-0.000298,0.05092,-0.196295,0.17851,-0.149646,-0.098439,0.130848,-0.175896,0.151111,-0.147389,0.140158,-0.156671,-0.134324,0.074397,0.052796,-0.106928,-0.148881,0.074799,0.135884,-0.010466,0.168583,CC1(C)[N]([C]([CH]([F])[F])=[O])[C@H]([CH2][F]...
4,0.032063,0.051389,-0.223581,0.087632,-0.159126,0.012613,0.01007,-0.083747,0.01401,-0.019439,-0.183505,0.174292,0.01893,-0.019923,-0.166069,0.028461,-0.07696,-0.160472,-0.062855,0.064193,0.083812,-0.072193,0.034404,-0.034539,-0.177064,-0.044088,0.070211,-0.028846,-0.02074,0.08019,-0.014988,-0.003308,0.201038,-0.105911,0.081521,-0.06375,-0.004004,-0.082742,0.002936,0.085619,...,-0.127781,0.008117,0.045056,0.011232,0.107526,0.050912,0.158153,0.036984,-0.037197,0.086999,0.092447,0.115532,-0.033151,0.052867,-0.0842,-0.122699,-0.040901,-0.077373,0.062531,0.058032,-0.118581,-0.105024,0.071786,-0.043961,0.05345,0.081712,0.053142,-0.14748,0.003671,0.047475,-0.083971,-0.096637,-0.218056,0.031456,-0.124518,0.085458,-0.033032,0.068185,0.144796,[CH2]([CH2][CH2][CH2][CH2][CH3])[C]1=[C]([CH]=...


In [13]:
# Save the embedding table (vectors plus the 'smiles' column) as a pickle file.
res.to_pickle(path='graph2vec_feture.pickle')