## Import necessary libraries

In [1]:
from collections import defaultdict
import os
import pickle
import sys
import pandas as pd

import numpy as np

from rdkit import Chem
from rdkit.Chem import rdDepictor, Descriptors
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole, rdMolDraw2D
from rdkit.Chem import MACCSkeys
from IPython.display import SVG

## Helper functions

In [2]:
# dictionary of atoms where a new element gets a new index
def create_atoms(mol):
    atoms = [atom_dict[a.GetSymbol()] for a in mol.GetAtoms()]
    return np.array(atoms)

# format from_atomIDx : [to_atomIDx, bondDict]
def create_ijbonddict(mol):
    i_jbond_dict = defaultdict(lambda: [])
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        bond = bond_dict[str(b.GetBondType())]
        i_jbond_dict[i].append((j, bond))
        i_jbond_dict[j].append((i, bond))
    return i_jbond_dict


def create_fingerprints(atoms, i_jbond_dict, radius):
    """Extract the r-radius subgraphs (i.e., fingerprints)
    from a molecular graph using WeisfeilerLehman-like algorithm."""

    if (len(atoms) == 1) or (radius == 0):
        fingerprints = [fingerprint_dict[a] for a in atoms]

    else:
        vertices = atoms
        for _ in range(radius):
            fingerprints = []
            for i, j_bond in i_jbond_dict.items():
                neighbors = [(vertices[j], bond) for j, bond in j_bond]
                fingerprint = (vertices[i], tuple(sorted(neighbors)))
                fingerprints.append(fingerprint_dict[fingerprint])
            vertices = fingerprints

    return np.array(fingerprints)


def create_adjacency(mol):
    adjacency  = Chem.GetAdjacencyMatrix(mol)
    n          = adjacency.shape[0]
    adjacency  = adjacency + np.eye(n)
    degree     = sum(adjacency)
    d_half     = np.sqrt(np.diag(degree))
    d_half_inv = np.linalg.inv(d_half)
    adjacency  = np.matmul(d_half_inv,np.matmul(adjacency,d_half_inv))
    return np.array(adjacency)


def dump_dictionary(dictionary, file_name):
    with open(file_name, 'wb') as f:
        pickle.dump(dict(dictionary), f)

## Dataset creation for GCNs

In [3]:
radius = 2

with open('smiles_property.txt', 'r') as f:
    data_list = f.read().strip().split('\n')
    
"""Exclude the data contains "." in the smiles, which correspond to non-bonds"""
data_list = list(filter(lambda x: '.' not in x.strip().split()[0], data_list))
N = len(data_list)

print('Total number of molecules : %d' %(N))

atom_dict = defaultdict(lambda: len(atom_dict))
bond_dict = defaultdict(lambda: len(bond_dict))
fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))

Molecules, Adjacencies, Properties, MACCS_list = [], [], [], []

max_MolMR, min_MolMR     = -1000, 1000
max_MolLogP, min_MolLogP = -1000, 1000
max_MolWt, min_MolWt     = -1000, 1000
max_NumRotatableBonds, min_NumRotatableBonds = -1000, 1000
max_NumAliphaticRings, min_NumAliphaticRings = -1000, 1000
max_NumAromaticRings, min_NumAromaticRings   = -1000, 1000
max_NumSaturatedRings, min_NumSaturatedRings = -1000, 1000

for no, data in enumerate(data_list):
    
    print('/'.join(map(str, [no+1, N])))
    
    smiles, property = data.strip().split()
    
    
    mol = Chem.MolFromSmiles(smiles)
    atoms = create_atoms(mol)
    i_jbond_dict = create_ijbonddict(mol)

    fingerprints = create_fingerprints(atoms, i_jbond_dict, radius)
    Molecules.append(fingerprints)
    
    adjacency = create_adjacency(mol)
    Adjacencies.append(adjacency)
    
    property = np.array([int(property)])
    Properties.append(property)
    
    MACCS         = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles))
    MACCS_ids     = np.zeros((20,))
    MACCS_ids[0]  = Descriptors.MolMR(mol)
    MACCS_ids[1]  = Descriptors.MolLogP(mol)
    MACCS_ids[2]  = Descriptors.MolWt(mol)
    MACCS_ids[3]  = Descriptors.NumRotatableBonds(mol)
    MACCS_ids[4]  = Descriptors.NumAliphaticRings(mol)
    MACCS_ids[5]  = MACCS[108]
    MACCS_ids[6]  = Descriptors.NumAromaticRings(mol)
    MACCS_ids[7]  = MACCS[98]
    MACCS_ids[8]  = Descriptors.NumSaturatedRings(mol)
    MACCS_ids[9]  = MACCS[137]
    MACCS_ids[10] = MACCS[136]
    MACCS_ids[11] = MACCS[145]
    MACCS_ids[12] = MACCS[116]
    MACCS_ids[13] = MACCS[141]
    MACCS_ids[14] = MACCS[89]
    MACCS_ids[15] = MACCS[50]
    MACCS_ids[16] = MACCS[160]
    MACCS_ids[17] = MACCS[121]
    MACCS_ids[18] = MACCS[149]
    MACCS_ids[19] = MACCS[161]
    
    if max_MolMR < MACCS_ids[0]:
        max_MolMR = MACCS_ids[0]
    if min_MolMR > MACCS_ids[0]:
        min_MolMR = MACCS_ids[0]
        
    if max_MolLogP < MACCS_ids[1]:
        max_MolLogP = MACCS_ids[1]
    if min_MolLogP > MACCS_ids[1]:
        min_MolLogP = MACCS_ids[1]
        
    if max_MolWt < MACCS_ids[2]:
        max_MolWt = MACCS_ids[2]
    if min_MolWt > MACCS_ids[2]:
        min_MolWt = MACCS_ids[2]
        
    if max_NumRotatableBonds < MACCS_ids[3]:
        max_NumRotatableBonds = MACCS_ids[3]
    if min_NumRotatableBonds > MACCS_ids[3]:
        min_NumRotatableBonds = MACCS_ids[3]
        
    if max_NumAliphaticRings < MACCS_ids[4]:
        max_NumAliphaticRings = MACCS_ids[4]
    if min_NumAliphaticRings > MACCS_ids[4]:
        min_NumAliphaticRings = MACCS_ids[4]
        
    if max_NumAromaticRings < MACCS_ids[6]:
        max_NumAromaticRings = MACCS_ids[6]
    if min_NumAromaticRings > MACCS_ids[6]:
        min_NumAromaticRings = MACCS_ids[6]
    
    if max_NumSaturatedRings < MACCS_ids[8]:
        max_NumSaturatedRings = MACCS_ids[8]
    if min_NumSaturatedRings > MACCS_ids[8]:
        min_NumSaturatedRings = MACCS_ids[8]
    
    MACCS_list.append(MACCS_ids)

dir_input = ('inputgcn'+str(radius)+'/')
os.makedirs(dir_input, exist_ok=True)

for n in range(N):
    for b in range(20):
        if b==0:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolMR)/(max_MolMR-min_MolMR)
        elif b==1:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolLogP)/(max_MolMR-min_MolLogP)
        elif b==2:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolWt)/(max_MolMR-min_MolWt)
        elif b==3:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumRotatableBonds)/(max_MolMR-min_NumRotatableBonds)
        elif b==4:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumAliphaticRings)/(max_MolMR-min_NumAliphaticRings)
        elif b==6:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumAromaticRings)/(max_MolMR-min_NumAromaticRings)
        elif b==8:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumSaturatedRings)/(max_NumSaturatedRings-min_NumSaturatedRings)

np.save(dir_input + 'molecules', Molecules)
np.save(dir_input + 'adjacencies', Adjacencies)
np.save(dir_input + 'properties', Properties)
np.save(dir_input + 'maccs', np.asarray(MACCS_list))

dump_dictionary(fingerprint_dict, dir_input + 'fingerprint_dict.pickle')

print('The preprocess has finished!')

Total number of molecules : 4539
1/4539
2/4539
3/4539
4/4539
5/4539
6/4539
7/4539
8/4539
9/4539
10/4539
11/4539
12/4539
13/4539
14/4539
15/4539
16/4539
17/4539
18/4539
19/4539
20/4539
21/4539
22/4539
23/4539
24/4539
25/4539
26/4539
27/4539
28/4539
29/4539
30/4539
31/4539
32/4539
33/4539
34/4539
35/4539
36/4539
37/4539
38/4539
39/4539
40/4539
41/4539
42/4539
43/4539
44/4539
45/4539
46/4539
47/4539
48/4539
49/4539
50/4539
51/4539
52/4539
53/4539
54/4539
55/4539
56/4539
57/4539
58/4539
59/4539
60/4539
61/4539
62/4539
63/4539
64/4539
65/4539
66/4539
67/4539
68/4539
69/4539
70/4539
71/4539
72/4539
73/4539
74/4539
75/4539
76/4539
77/4539
78/4539
79/4539
80/4539
81/4539
82/4539
83/4539
84/4539
85/4539
86/4539
87/4539
88/4539
89/4539
90/4539
91/4539
92/4539
93/4539
94/4539
95/4539
96/4539
97/4539
98/4539
99/4539
100/4539
101/4539
102/4539
103/4539
104/4539
105/4539
106/4539
107/4539
108/4539
109/4539
110/4539
111/4539
112/4539
113/4539
114/4539
115/4539
116/4539
117/4539
118/4539
119/4539
120/



277/4539
278/4539
279/4539
280/4539
281/4539
282/4539
283/4539
284/4539
285/4539
286/4539
287/4539
288/4539
289/4539
290/4539
291/4539
292/4539
293/4539
294/4539
295/4539
296/4539
297/4539
298/4539
299/4539
300/4539
301/4539
302/4539
303/4539
304/4539
305/4539
306/4539
307/4539
308/4539
309/4539
310/4539
311/4539
312/4539
313/4539
314/4539
315/4539
316/4539
317/4539
318/4539
319/4539
320/4539
321/4539
322/4539
323/4539
324/4539
325/4539
326/4539
327/4539
328/4539
329/4539
330/4539
331/4539
332/4539
333/4539
334/4539
335/4539
336/4539
337/4539
338/4539
339/4539
340/4539
341/4539
342/4539
343/4539
344/4539
345/4539
346/4539
347/4539
348/4539
349/4539
350/4539
351/4539
352/4539
353/4539
354/4539
355/4539
356/4539
357/4539
358/4539
359/4539
360/4539
361/4539
362/4539
363/4539
364/4539
365/4539
366/4539
367/4539
368/4539
369/4539
370/4539
371/4539
372/4539
373/4539
374/4539
375/4539
376/4539
377/4539
378/4539
379/4539
380/4539
381/4539
382/4539
383/4539
384/4539
385/4539
386/4539
387/4539
3



411/4539
412/4539
413/4539
414/4539
415/4539
416/4539
417/4539
418/4539
419/4539
420/4539
421/4539
422/4539
423/4539
424/4539
425/4539
426/4539
427/4539
428/4539
429/4539
430/4539
431/4539
432/4539
433/4539
434/4539
435/4539
436/4539
437/4539
438/4539
439/4539
440/4539
441/4539
442/4539
443/4539
444/4539
445/4539
446/4539
447/4539
448/4539
449/4539
450/4539
451/4539
452/4539
453/4539
454/4539
455/4539
456/4539
457/4539
458/4539
459/4539
460/4539
461/4539
462/4539
463/4539
464/4539
465/4539
466/4539
467/4539
468/4539
469/4539
470/4539
471/4539
472/4539
473/4539
474/4539
475/4539
476/4539
477/4539
478/4539
479/4539
480/4539
481/4539
482/4539
483/4539
484/4539
485/4539
486/4539
487/4539
488/4539
489/4539
490/4539
491/4539
492/4539
493/4539
494/4539
495/4539
496/4539
497/4539
498/4539
499/4539
500/4539
501/4539
502/4539
503/4539
504/4539
505/4539
506/4539
507/4539
508/4539
509/4539
510/4539
511/4539
512/4539
513/4539
514/4539
515/4539
516/4539
517/4539
518/4539
519/4539
520/4539
521/4539
5

1346/4539
1347/4539
1348/4539
1349/4539
1350/4539
1351/4539
1352/4539
1353/4539
1354/4539
1355/4539
1356/4539
1357/4539
1358/4539
1359/4539
1360/4539
1361/4539
1362/4539
1363/4539
1364/4539
1365/4539
1366/4539
1367/4539
1368/4539
1369/4539
1370/4539
1371/4539
1372/4539
1373/4539
1374/4539
1375/4539
1376/4539
1377/4539
1378/4539
1379/4539
1380/4539
1381/4539
1382/4539
1383/4539
1384/4539
1385/4539
1386/4539
1387/4539
1388/4539
1389/4539
1390/4539
1391/4539
1392/4539
1393/4539
1394/4539
1395/4539
1396/4539
1397/4539
1398/4539
1399/4539
1400/4539
1401/4539
1402/4539
1403/4539
1404/4539
1405/4539
1406/4539
1407/4539
1408/4539
1409/4539
1410/4539
1411/4539
1412/4539
1413/4539
1414/4539
1415/4539
1416/4539
1417/4539
1418/4539
1419/4539
1420/4539
1421/4539
1422/4539
1423/4539
1424/4539
1425/4539
1426/4539
1427/4539
1428/4539
1429/4539
1430/4539
1431/4539
1432/4539
1433/4539
1434/4539
1435/4539
1436/4539
1437/4539
1438/4539
1439/4539
1440/4539
1441/4539
1442/4539
1443/4539
1444/4539
1445/4539


2194/4539
2195/4539
2196/4539
2197/4539
2198/4539
2199/4539
2200/4539
2201/4539
2202/4539
2203/4539
2204/4539
2205/4539
2206/4539
2207/4539
2208/4539
2209/4539
2210/4539
2211/4539
2212/4539
2213/4539
2214/4539
2215/4539
2216/4539
2217/4539
2218/4539
2219/4539
2220/4539
2221/4539
2222/4539
2223/4539
2224/4539
2225/4539
2226/4539
2227/4539
2228/4539
2229/4539
2230/4539
2231/4539
2232/4539
2233/4539
2234/4539
2235/4539
2236/4539
2237/4539
2238/4539
2239/4539
2240/4539
2241/4539
2242/4539
2243/4539
2244/4539
2245/4539
2246/4539
2247/4539
2248/4539
2249/4539
2250/4539
2251/4539
2252/4539
2253/4539
2254/4539
2255/4539
2256/4539
2257/4539
2258/4539
2259/4539
2260/4539
2261/4539
2262/4539
2263/4539
2264/4539
2265/4539
2266/4539
2267/4539
2268/4539
2269/4539
2270/4539
2271/4539
2272/4539
2273/4539
2274/4539
2275/4539
2276/4539
2277/4539
2278/4539
2279/4539
2280/4539
2281/4539
2282/4539
2283/4539
2284/4539
2285/4539
2286/4539
2287/4539
2288/4539
2289/4539
2290/4539
2291/4539
2292/4539
2293/4539


3019/4539
3020/4539
3021/4539
3022/4539
3023/4539
3024/4539
3025/4539
3026/4539
3027/4539
3028/4539
3029/4539
3030/4539
3031/4539
3032/4539
3033/4539
3034/4539
3035/4539
3036/4539
3037/4539
3038/4539
3039/4539
3040/4539
3041/4539
3042/4539
3043/4539
3044/4539
3045/4539
3046/4539
3047/4539
3048/4539
3049/4539
3050/4539
3051/4539
3052/4539
3053/4539
3054/4539
3055/4539
3056/4539
3057/4539
3058/4539
3059/4539
3060/4539
3061/4539
3062/4539
3063/4539
3064/4539
3065/4539
3066/4539
3067/4539
3068/4539
3069/4539
3070/4539
3071/4539
3072/4539
3073/4539
3074/4539
3075/4539
3076/4539
3077/4539
3078/4539
3079/4539
3080/4539
3081/4539
3082/4539
3083/4539
3084/4539
3085/4539
3086/4539
3087/4539
3088/4539
3089/4539
3090/4539
3091/4539
3092/4539
3093/4539
3094/4539
3095/4539
3096/4539
3097/4539
3098/4539
3099/4539
3100/4539
3101/4539
3102/4539
3103/4539
3104/4539
3105/4539
3106/4539
3107/4539
3108/4539
3109/4539
3110/4539
3111/4539
3112/4539
3113/4539
3114/4539
3115/4539
3116/4539
3117/4539
3118/4539


3942/4539
3943/4539
3944/4539
3945/4539
3946/4539
3947/4539
3948/4539
3949/4539
3950/4539
3951/4539
3952/4539
3953/4539
3954/4539
3955/4539
3956/4539
3957/4539
3958/4539
3959/4539
3960/4539
3961/4539
3962/4539
3963/4539
3964/4539
3965/4539
3966/4539
3967/4539
3968/4539
3969/4539
3970/4539
3971/4539
3972/4539
3973/4539
3974/4539
3975/4539
3976/4539
3977/4539
3978/4539
3979/4539
3980/4539
3981/4539
3982/4539
3983/4539
3984/4539
3985/4539
3986/4539
3987/4539
3988/4539
3989/4539
3990/4539
3991/4539
3992/4539
3993/4539
3994/4539
3995/4539
3996/4539
3997/4539
3998/4539
3999/4539
4000/4539
4001/4539
4002/4539
4003/4539
4004/4539
4005/4539
4006/4539
4007/4539
4008/4539
4009/4539
4010/4539
4011/4539
4012/4539
4013/4539
4014/4539
4015/4539
4016/4539
4017/4539
4018/4539
4019/4539
4020/4539
4021/4539
4022/4539
4023/4539
4024/4539
4025/4539
4026/4539
4027/4539
4028/4539
4029/4539
4030/4539
4031/4539
4032/4539
4033/4539
4034/4539
4035/4539
4036/4539
4037/4539
4038/4539
4039/4539
4040/4539
4041/4539


  return array(a, dtype, copy=False, order=order, subok=True)


## Dataset creation for RFs

In [4]:
name_list = []
name_list.append('MolMR')
name_list.append('MolLogP')
name_list.append('MolWt')
name_list.append('NumRotatableBonds')
name_list.append('NumAliphaticBonds')
name_list.append('maccs'+str(108))
name_list.append('NumAromaticBonds')
name_list.append('maccs'+str(98))
name_list.append('NumSaturatedRings')

for j in range(166):
    name_list.append('maccs'+str(j+1))

In [5]:
radius = 2

with open('smiles_property.txt', 'r') as f:
    data_list = f.read().strip().split('\n')
    
"""Exclude the data contains "." in the smiles, which correspond to non-bonds"""
data_list = list(filter(lambda x: '.' not in x.strip().split()[0], data_list))
N = len(data_list)

print('Total number of molecules : %d' %(N))

atom_dict = defaultdict(lambda: len(atom_dict))
bond_dict = defaultdict(lambda: len(bond_dict))
fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))

Molecules, Adjacencies, Properties, MACCS_list = [], [], [], []

max_MolMR, min_MolMR     = -1000, 1000
max_MolLogP, min_MolLogP = -1000, 1000
max_MolWt, min_MolWt     = -1000, 1000
max_NumRotatableBonds, min_NumRotatableBonds = -1000, 1000
max_NumAliphaticRings, min_NumAliphaticRings = -1000, 1000
max_NumAromaticRings, min_NumAromaticRings   = -1000, 1000
max_NumSaturatedRings, min_NumSaturatedRings = -1000, 1000

for no, data in enumerate(data_list):
    
    print('/'.join(map(str, [no+1, N])))
    
    smiles, property = data.strip().split()
    
    
    mol = Chem.MolFromSmiles(smiles)
    atoms = create_atoms(mol)
    i_jbond_dict = create_ijbonddict(mol)

    fingerprints = create_fingerprints(atoms, i_jbond_dict, radius)
    Molecules.append(fingerprints)
    
    adjacency = create_adjacency(mol)
    Adjacencies.append(adjacency)
    
    property = np.array([int(property)])
    Properties.append(property)
    
    MACCS         = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles))
    MACCS_ids     = np.zeros((175,))
    MACCS_ids[0]  = Descriptors.MolMR(mol)
    MACCS_ids[1]  = Descriptors.MolLogP(mol)
    MACCS_ids[2]  = Descriptors.MolWt(mol)
    MACCS_ids[3]  = Descriptors.NumRotatableBonds(mol)
    MACCS_ids[4]  = Descriptors.NumAliphaticRings(mol)
    MACCS_ids[5]  = MACCS[108]
    MACCS_ids[6]  = Descriptors.NumAromaticRings(mol)
    MACCS_ids[7]  = MACCS[98]
    MACCS_ids[8]  = Descriptors.NumSaturatedRings(mol)
    
    for j in range(9,9+166):
        MACCS_ids[j]  = MACCS[j-8]
    
    if max_MolMR < MACCS_ids[0]:
        max_MolMR = MACCS_ids[0]
    if min_MolMR > MACCS_ids[0]:
        min_MolMR = MACCS_ids[0]
        
    if max_MolLogP < MACCS_ids[1]:
        max_MolLogP = MACCS_ids[1]
    if min_MolLogP > MACCS_ids[1]:
        min_MolLogP = MACCS_ids[1]
        
    if max_MolWt < MACCS_ids[2]:
        max_MolWt = MACCS_ids[2]
    if min_MolWt > MACCS_ids[2]:
        min_MolWt = MACCS_ids[2]
        
    if max_NumRotatableBonds < MACCS_ids[3]:
        max_NumRotatableBonds = MACCS_ids[3]
    if min_NumRotatableBonds > MACCS_ids[3]:
        min_NumRotatableBonds = MACCS_ids[3]
        
    if max_NumAliphaticRings < MACCS_ids[4]:
        max_NumAliphaticRings = MACCS_ids[4]
    if min_NumAliphaticRings > MACCS_ids[4]:
        min_NumAliphaticRings = MACCS_ids[4]
        
    if max_NumAromaticRings < MACCS_ids[6]:
        max_NumAromaticRings = MACCS_ids[6]
    if min_NumAromaticRings > MACCS_ids[6]:
        min_NumAromaticRings = MACCS_ids[6]
    
    if max_NumSaturatedRings < MACCS_ids[8]:
        max_NumSaturatedRings = MACCS_ids[8]
    if min_NumSaturatedRings > MACCS_ids[8]:
        min_NumSaturatedRings = MACCS_ids[8]
    
    MACCS_list.append(MACCS_ids)

dir_input = ('input'+str(radius)+'/')
os.makedirs(dir_input, exist_ok=True)

for n in range(N):
    for b in range(20):
        if b==0:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolMR)/(max_MolMR-min_MolMR)
        elif b==1:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolLogP)/(max_MolMR-min_MolLogP)
        elif b==2:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_MolWt)/(max_MolMR-min_MolWt)
        elif b==3:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumRotatableBonds)/(max_MolMR-min_NumRotatableBonds)
        elif b==4:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumAliphaticRings)/(max_MolMR-min_NumAliphaticRings)
        elif b==6:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumAromaticRings)/(max_MolMR-min_NumAromaticRings)
        elif b==8:
            MACCS_list[n][b] = (MACCS_list[n][b]-min_NumSaturatedRings)/(max_NumSaturatedRings-min_NumSaturatedRings)

np.save(dir_input + 'molecules', Molecules)
np.save(dir_input + 'adjacencies', Adjacencies)
np.save(dir_input + 'properties', Properties)
np.save(dir_input + 'maccs', np.asarray(MACCS_list))

dump_dictionary(fingerprint_dict, dir_input + 'fingerprint_dict.pickle')

print('The preprocess has finished!')

Total number of molecules : 4539
1/4539
2/4539
3/4539
4/4539
5/4539
6/4539
7/4539
8/4539
9/4539
10/4539
11/4539
12/4539
13/4539
14/4539
15/4539
16/4539
17/4539
18/4539
19/4539
20/4539
21/4539
22/4539
23/4539
24/4539
25/4539
26/4539
27/4539
28/4539
29/4539
30/4539
31/4539
32/4539
33/4539
34/4539
35/4539
36/4539
37/4539
38/4539
39/4539
40/4539
41/4539
42/4539
43/4539
44/4539
45/4539
46/4539
47/4539
48/4539
49/4539
50/4539
51/4539
52/4539
53/4539
54/4539
55/4539
56/4539
57/4539
58/4539
59/4539
60/4539
61/4539
62/4539
63/4539
64/4539
65/4539
66/4539
67/4539
68/4539
69/4539
70/4539
71/4539
72/4539
73/4539
74/4539
75/4539
76/4539
77/4539
78/4539
79/4539
80/4539
81/4539
82/4539
83/4539
84/4539
85/4539
86/4539
87/4539
88/4539
89/4539
90/4539
91/4539
92/4539
93/4539
94/4539
95/4539
96/4539
97/4539
98/4539
99/4539
100/4539
101/4539
102/4539
103/4539
104/4539
105/4539
106/4539
107/4539
108/4539
109/4539
110/4539
111/4539
112/4539
113/4539
114/4539
115/4539
116/4539
117/4539
118/4539
119/4539
120/



253/4539
254/4539
255/4539
256/4539
257/4539
258/4539
259/4539
260/4539
261/4539
262/4539
263/4539
264/4539
265/4539
266/4539
267/4539
268/4539
269/4539
270/4539
271/4539
272/4539
273/4539
274/4539
275/4539
276/4539
277/4539
278/4539
279/4539
280/4539
281/4539
282/4539
283/4539
284/4539
285/4539
286/4539
287/4539
288/4539
289/4539
290/4539
291/4539
292/4539
293/4539
294/4539
295/4539
296/4539
297/4539
298/4539
299/4539
300/4539
301/4539
302/4539
303/4539
304/4539
305/4539
306/4539
307/4539
308/4539
309/4539
310/4539
311/4539
312/4539
313/4539
314/4539
315/4539
316/4539
317/4539
318/4539
319/4539
320/4539
321/4539
322/4539
323/4539
324/4539
325/4539
326/4539
327/4539
328/4539
329/4539
330/4539
331/4539
332/4539
333/4539
334/4539
335/4539
336/4539
337/4539
338/4539
339/4539
340/4539
341/4539
342/4539
343/4539
344/4539
345/4539
346/4539
347/4539
348/4539
349/4539
350/4539
351/4539
352/4539
353/4539
354/4539
355/4539
356/4539
357/4539
358/4539
359/4539
360/4539
361/4539
362/4539
363/4539
3



393/4539
394/4539
395/4539
396/4539
397/4539
398/4539
399/4539
400/4539
401/4539
402/4539
403/4539
404/4539
405/4539
406/4539
407/4539
408/4539
409/4539
410/4539
411/4539
412/4539
413/4539
414/4539
415/4539
416/4539
417/4539
418/4539
419/4539
420/4539
421/4539
422/4539
423/4539
424/4539
425/4539
426/4539
427/4539
428/4539
429/4539
430/4539
431/4539
432/4539
433/4539
434/4539
435/4539
436/4539
437/4539
438/4539
439/4539
440/4539
441/4539
442/4539
443/4539
444/4539
445/4539
446/4539
447/4539
448/4539
449/4539
450/4539
451/4539
452/4539
453/4539
454/4539
455/4539
456/4539
457/4539
458/4539
459/4539
460/4539
461/4539
462/4539
463/4539
464/4539
465/4539
466/4539
467/4539
468/4539
469/4539
470/4539
471/4539
472/4539
473/4539
474/4539
475/4539
476/4539
477/4539
478/4539
479/4539
480/4539
481/4539
482/4539
483/4539
484/4539
485/4539
486/4539
487/4539
488/4539
489/4539
490/4539
491/4539
492/4539
493/4539
494/4539
495/4539
496/4539
497/4539
498/4539
499/4539
500/4539
501/4539
502/4539
503/4539
5

1346/4539
1347/4539
1348/4539
1349/4539
1350/4539
1351/4539
1352/4539
1353/4539
1354/4539
1355/4539
1356/4539
1357/4539
1358/4539
1359/4539
1360/4539
1361/4539
1362/4539
1363/4539
1364/4539
1365/4539
1366/4539
1367/4539
1368/4539
1369/4539
1370/4539
1371/4539
1372/4539
1373/4539
1374/4539
1375/4539
1376/4539
1377/4539
1378/4539
1379/4539
1380/4539
1381/4539
1382/4539
1383/4539
1384/4539
1385/4539
1386/4539
1387/4539
1388/4539
1389/4539
1390/4539
1391/4539
1392/4539
1393/4539
1394/4539
1395/4539
1396/4539
1397/4539
1398/4539
1399/4539
1400/4539
1401/4539
1402/4539
1403/4539
1404/4539
1405/4539
1406/4539
1407/4539
1408/4539
1409/4539
1410/4539
1411/4539
1412/4539
1413/4539
1414/4539
1415/4539
1416/4539
1417/4539
1418/4539
1419/4539
1420/4539
1421/4539
1422/4539
1423/4539
1424/4539
1425/4539
1426/4539
1427/4539
1428/4539
1429/4539
1430/4539
1431/4539
1432/4539
1433/4539
1434/4539
1435/4539
1436/4539
1437/4539
1438/4539
1439/4539
1440/4539
1441/4539
1442/4539
1443/4539
1444/4539
1445/4539


2201/4539
2202/4539
2203/4539
2204/4539
2205/4539
2206/4539
2207/4539
2208/4539
2209/4539
2210/4539
2211/4539
2212/4539
2213/4539
2214/4539
2215/4539
2216/4539
2217/4539
2218/4539
2219/4539
2220/4539
2221/4539
2222/4539
2223/4539
2224/4539
2225/4539
2226/4539
2227/4539
2228/4539
2229/4539
2230/4539
2231/4539
2232/4539
2233/4539
2234/4539
2235/4539
2236/4539
2237/4539
2238/4539
2239/4539
2240/4539
2241/4539
2242/4539
2243/4539
2244/4539
2245/4539
2246/4539
2247/4539
2248/4539
2249/4539
2250/4539
2251/4539
2252/4539
2253/4539
2254/4539
2255/4539
2256/4539
2257/4539
2258/4539
2259/4539
2260/4539
2261/4539
2262/4539
2263/4539
2264/4539
2265/4539
2266/4539
2267/4539
2268/4539
2269/4539
2270/4539
2271/4539
2272/4539
2273/4539
2274/4539
2275/4539
2276/4539
2277/4539
2278/4539
2279/4539
2280/4539
2281/4539
2282/4539
2283/4539
2284/4539
2285/4539
2286/4539
2287/4539
2288/4539
2289/4539
2290/4539
2291/4539
2292/4539
2293/4539
2294/4539
2295/4539
2296/4539
2297/4539
2298/4539
2299/4539
2300/4539


3040/4539
3041/4539
3042/4539
3043/4539
3044/4539
3045/4539
3046/4539
3047/4539
3048/4539
3049/4539
3050/4539
3051/4539
3052/4539
3053/4539
3054/4539
3055/4539
3056/4539
3057/4539
3058/4539
3059/4539
3060/4539
3061/4539
3062/4539
3063/4539
3064/4539
3065/4539
3066/4539
3067/4539
3068/4539
3069/4539
3070/4539
3071/4539
3072/4539
3073/4539
3074/4539
3075/4539
3076/4539
3077/4539
3078/4539
3079/4539
3080/4539
3081/4539
3082/4539
3083/4539
3084/4539
3085/4539
3086/4539
3087/4539
3088/4539
3089/4539
3090/4539
3091/4539
3092/4539
3093/4539
3094/4539
3095/4539
3096/4539
3097/4539
3098/4539
3099/4539
3100/4539
3101/4539
3102/4539
3103/4539
3104/4539
3105/4539
3106/4539
3107/4539
3108/4539
3109/4539
3110/4539
3111/4539
3112/4539
3113/4539
3114/4539
3115/4539
3116/4539
3117/4539
3118/4539
3119/4539
3120/4539
3121/4539
3122/4539
3123/4539
3124/4539
3125/4539
3126/4539
3127/4539
3128/4539
3129/4539
3130/4539
3131/4539
3132/4539
3133/4539
3134/4539
3135/4539
3136/4539
3137/4539
3138/4539
3139/4539


3942/4539
3943/4539
3944/4539
3945/4539
3946/4539
3947/4539
3948/4539
3949/4539
3950/4539
3951/4539
3952/4539
3953/4539
3954/4539
3955/4539
3956/4539
3957/4539
3958/4539
3959/4539
3960/4539
3961/4539
3962/4539
3963/4539
3964/4539
3965/4539
3966/4539
3967/4539
3968/4539
3969/4539
3970/4539
3971/4539
3972/4539
3973/4539
3974/4539
3975/4539
3976/4539
3977/4539
3978/4539
3979/4539
3980/4539
3981/4539
3982/4539
3983/4539
3984/4539
3985/4539
3986/4539
3987/4539
3988/4539
3989/4539
3990/4539
3991/4539
3992/4539
3993/4539
3994/4539
3995/4539
3996/4539
3997/4539
3998/4539
3999/4539
4000/4539
4001/4539
4002/4539
4003/4539
4004/4539
4005/4539
4006/4539
4007/4539
4008/4539
4009/4539
4010/4539
4011/4539
4012/4539
4013/4539
4014/4539
4015/4539
4016/4539
4017/4539
4018/4539
4019/4539
4020/4539
4021/4539
4022/4539
4023/4539
4024/4539
4025/4539
4026/4539
4027/4539
4028/4539
4029/4539
4030/4539
4031/4539
4032/4539
4033/4539
4034/4539
4035/4539
4036/4539
4037/4539
4038/4539
4039/4539
4040/4539
4041/4539


### Creating dataframe object

In [6]:
data_maccs = np.zeros((len(MACCS_list),MACCS_list[0].shape[0]))

for j in range(len(MACCS_list)):
    data_maccs[j,:] = MACCS_list[j]

indices = np.linspace(0,len(MACCS_list)-1,len(MACCS_list))
indices.astype(int)

df = pd.DataFrame(data=data_maccs, index=indices, columns=name_list)

In [7]:
target = np.zeros((len(MACCS_list),))

for j in range(len(MACCS_list)):
    target[j] = int(Properties[j][0])

In [8]:
np.save(dir_input + 'df', df)
np.save(dir_input + 'target', target)