-
Notifications
You must be signed in to change notification settings - Fork 2
/
v_u_n_check.py
83 lines (67 loc) · 2.44 KB
/
v_u_n_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
# @Time : 2022/3/31 21:28
# @Author : TkiChus
# @Email : XU.Chao.TkiChus@gmail.com
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
"""
# Implement a procedure to assess the valence of generated molecules here!
# You can adapt and use the Molecule class in utility_classes.py,
# but the current code is tailored towards the QM9 dataset. In fact,
# the OpenBabel algorithm to kekulize bond orders is not very reliable
# and we implemented some heuristics in the Molecule class to fix these
# flaws for structures made of C, N, O, and F atoms. However, when using
# more complex structures with a more diverse set of atom types, we think
# that the reliability of bond assignment in OpenBabel might further
# degrade, and we therefore do not recommend using valence checks for
# analysis unless it is very important for your use case.
"""
def check_vality(smiles):
    """Report how many SMILES strings in a file can be parsed by RDKit.

    Every non-blank line of the file is treated as one SMILES string. The
    canonical SMILES of each parsable molecule is printed, followed by the
    number of valid molecules and the valid ratio.

    Args:
        smiles: Path to a text file containing one SMILES string per line.

    Returns:
        The fraction of non-blank lines that were parsed successfully, or
        0.0 when the file contains no SMILES at all.
    """
    with open(smiles, "r") as f:
        # Strip line endings and skip blank lines so that an empty line is
        # neither parsed as an (empty) molecule nor counted in the ratio.
        candidates = [line.strip() for line in f if line.strip()]

    total = 0
    for candidate in candidates:
        # MolFromSmiles reports a parsing/sanitization failure by returning
        # None rather than raising, so test for that explicitly instead of
        # relying on a later call blowing up inside a blanket except.
        mol = Chem.MolFromSmiles(candidate)
        if mol is None:
            continue
        vai_smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
        print(vai_smiles)
        total += 1

    print("all valid molecules %d" % (total))
    # Guard against an empty input file (previously a ZeroDivisionError).
    val_ratio = total / len(candidates) if candidates else 0.0
    print("valid ratio %.2f" % (val_ratio))
    return val_ratio
# unique molecules (duplicates removed) / all valid generated molecules
def check_uniqueness(smiles):
    """Deduplicate the SMILES in a file and report the unique ratio.

    Every non-blank line of the input is treated as one SMILES string. The
    distinct strings are appended, one per line and in order of first
    appearance, to ``final_un.smi`` in the current working directory. The
    number of distinct strings and the unique ratio are printed.

    Args:
        smiles: Path to a text file containing one SMILES string per line.

    Returns:
        The number of distinct SMILES divided by the number of non-blank
        input lines, or 0.0 when the file contains no SMILES at all.
    """
    with open(smiles, "r") as f:
        # Strip line endings so a final line without a trailing newline is
        # recognised as a duplicate of the same SMILES elsewhere in the file.
        lines = [line.strip() for line in f if line.strip()]

    # dict.fromkeys drops duplicates while keeping first-occurrence order,
    # which makes the output file deterministic (a set is not).
    smiles_fg = list(dict.fromkeys(lines))
    print(len(smiles_fg))

    # Open the output once instead of once per molecule. Append mode is kept
    # so existing content is never truncated; note that running this twice
    # on the same input will therefore add the entries a second time.
    with open("final_un.smi", "a", encoding="utf8") as out:
        out.writelines(entry + "\n" for entry in smiles_fg)

    # Guard against an empty input file (previously a ZeroDivisionError).
    un_ratio = len(smiles_fg) / len(lines) if lines else 0.0
    print("unique ratio %.2f" % (un_ratio))
    return un_ratio
# novel molecules / all molecules in the dataset
def check_novelty(train_smi, genUn_smi):
    """Count generated SMILES that do not occur in the training set.

    Both files are read as one SMILES string per non-blank line and compared
    by exact string match (after stripping surrounding whitespace). The
    number of novel molecules and the novelty ratio are printed.

    Args:
        train_smi: Path to the training-set SMILES file.
        genUn_smi: Path to the file of generated (deduplicated) SMILES.

    Returns:
        The number of generated SMILES absent from the training set divided
        by the number of training-set lines, or 0.0 when the training file
        contains no SMILES.
    """
    with open(train_smi, "r") as tr:
        train = [line.strip() for line in tr if line.strip()]
    with open(genUn_smi, "r") as gr:
        gen = [line.strip() for line in gr if line.strip()]

    # Set membership is O(1); testing against the list was O(len(train))
    # for every generated molecule.
    known = set(train)
    count = sum(1 for entry in gen if entry not in known)
    print(count)

    # The numerator is the novel count. The previous code called
    # gr.readlines() a second time on an already consumed/closed handle,
    # which can never yield the intended number.
    # NOTE(review): the denominator is the training-set size, as in the
    # original code and comment; the conventional novelty metric divides by
    # the number of generated molecules instead - confirm which is wanted.
    novelty_ratio = count / len(train) if train else 0.0
    print("novelty ratio %.2f" % (novelty_ratio))
    return novelty_ratio