In [1]:
import pandas as pd
from rdkit import Chem
import numpy as np 
import shutil

In [2]:
def valid_mol(x):
    """Return a sanitised, single-fragment copy of an RDKit molecule, or None.

    The molecule is round-tripped through its isomeric SMILES
    (``MolToSmiles`` -> ``MolFromSmiles``). The re-parsed molecule is
    returned only if the round trip succeeds and its SMILES contains no
    ``'.'``, i.e. it is not made of several disconnected fragments.

    Args:
        x: an RDKit ``Mol`` or ``None`` (e.g. the result of a failed parse).

    Returns:
        The re-parsed ``Mol``, or ``None`` when ``x`` is ``None``, the round
        trip fails, or the molecule has more than one fragment.
    """
    # A failed upstream parse is passed through untouched.
    if x is None:
        return None

    roundtrip = Chem.MolFromSmiles(Chem.MolToSmiles(x, isomericSmiles=True))
    if roundtrip is None:
        return None

    # '.' separates disconnected components in SMILES.
    if '.' in Chem.MolToSmiles(roundtrip, isomericSmiles=True):
        return None
    return roundtrip

def check_novelty(gen_smiles, train_smiles, n_generated_mols):
    """Compute how many generated SMILES are absent from a reference set.

    Membership is an exact string comparison, so both inputs should be
    written in the same SMILES convention for the result to be meaningful.

    Args:
        gen_smiles: sequence of generated SMILES strings (typically the
            unique, valid ones).
        train_smiles: iterable of reference SMILES strings. A ``set`` or
            ``frozenset`` is used as-is; anything else is converted to a set
            once so each lookup is O(1).
        n_generated_mols: total number of molecules that were generated
            (valid or not); denominator of the absolute novelty.

    Returns:
        ``(novel_ratio, abs_novel_ratio)``, both in percent.
        ``novel_ratio`` is relative to ``len(gen_smiles)``,
        ``abs_novel_ratio`` is relative to ``n_generated_mols``.
        Both are ``0.`` when ``gen_smiles`` is empty; ``abs_novel_ratio`` is
        ``0.`` when ``n_generated_mols`` is zero.
    """
    if len(gen_smiles) == 0:
        # Previously abs_novel_ratio was left unassigned on this path, which
        # raised UnboundLocalError at the print below.
        novel_ratio = 0.
        abs_novel_ratio = 0.
    else:
        if isinstance(train_smiles, (set, frozenset)):
            train_set = train_smiles
        else:
            train_set = set(train_smiles)
        novel = sum(1 for smi in gen_smiles if smi not in train_set)
        novel_ratio = novel*100./len(gen_smiles)
        abs_novel_ratio = novel*100./n_generated_mols if n_generated_mols else 0.
    print("novelty: {:.3f}%, abs novelty: {:.3f}%".format(novel_ratio, abs_novel_ratio))
    return novel_ratio, abs_novel_ratio

In [3]:
def process_csv(input_filename, output_filename):
    """Normalise a raw generation file into a two-column ``no,Generation`` CSV.

    The first line of the input is treated as a header and replaced by
    ``no,Generation``. Every other non-blank line is split at its first
    ``'.'`` or ``','`` (whichever comes first) into an index and a SMILES
    string; surrounding whitespace and quote characters are stripped from
    both parts. The result is written to ``output_filename`` and then moved
    over ``input_filename``, so the input is rewritten in place.

    Splitting at the earliest separator keeps rows such as ``3,C.C`` intact,
    which makes running the function again on an already normalised file
    harmless.

    Args:
        input_filename: path of the CSV to normalise (overwritten).
        output_filename: path of the temporary file used while writing.

    Raises:
        ValueError: if a non-blank line contains neither ``'.'`` nor ``','``.
    """
    with open(input_filename, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()

    # The first line is assumed to be a header; an empty file still gets one.
    if lines:
        lines[0] = "no,Generation"
    else:
        lines = ["no,Generation"]

    # Parse everything before touching the output so a malformed row does not
    # leave a half-written temporary file behind.
    rows = []
    for line_no, line in enumerate(lines, start=1):
        if not line.strip():
            continue

        positions = [pos for pos in (line.find('.'), line.find(',')) if pos != -1]
        if not positions:
            raise ValueError(
                "{}: line {} has no '.' or ',' separator: {!r}".format(
                    input_filename, line_no, line))
        cut = min(positions)

        nos = line[:cut].strip().strip('"').strip("'").strip('"')
        smiles = line[cut + 1:].strip().strip("'").strip('"').strip()
        rows.append(','.join([nos, smiles]) + '\n')

    with open(output_filename, 'w', encoding='utf-8') as outfile:
        outfile.writelines(rows)

    shutil.move(output_filename, input_filename)

In [4]:
def value(path, n_runs=5, train_csv='output.csv'):
    """Score generated SMILES for validity, uniqueness and novelty.

    For each run ``i`` in ``1..n_runs`` the file ``{path}/{i}.csv`` is read
    and its ``Generation`` column is evaluated:

    * valid: fraction of entries that parse with RDKit and pass
      ``valid_mol`` (single fragment, survives a SMILES round trip);
    * unique: fraction of valid molecules with a distinct non-isomeric
      SMILES;
    * novelty: percentage of those unique SMILES that do not appear in the
      ``smiles`` column of ``train_csv`` (computed by ``check_novelty``).

    Per-run lines and the averages over all runs are written to
    ``{path}/result.txt`` (overwritten). The number of generated entries per
    run is printed, as is the novelty line emitted by ``check_novelty``.

    Args:
        path: directory holding ``1.csv`` .. ``{n_runs}.csv``.
        n_runs: number of run files to evaluate (default 5).
        train_csv: CSV with a ``smiles`` column used as the novelty
            reference (default ``'output.csv'``).

    Raises:
        ValueError: if ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # The reference set is the same for every run, so read it once. A set is
    # enough because check_novelty only tests membership.
    # NOTE(review): the reference strings are compared verbatim against
    # RDKit-written non-isomeric SMILES; presumably train_csv is already in
    # that form - confirm, otherwise novelty is overestimated.
    train_smiles = set(pd.read_csv(train_csv)['smiles'].tolist())

    valid_list = []
    unique_list = []
    novelty_list = []
    with open(f'{path}/result.txt', 'w') as f:
        for i in range(1, n_runs + 1):
            list_of_generated_smiles = pd.read_csv(f"{path}/{i}.csv")['Generation'].tolist()
            n_mols = len(list_of_generated_smiles)
            print(n_mols)

            # Empty cells are read by pandas as NaN (a float), which RDKit
            # refuses to parse; count them as invalid instead of crashing.
            mol_list = [
                Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
                for smiles in list_of_generated_smiles
            ]
            valid = [mol for mol in map(valid_mol, mol_list) if mol is not None]

            unique_smiles = list({Chem.MolToSmiles(mol, isomericSmiles=False) for mol in valid})

            # Ratios fall back to 0 when their denominator is empty.
            valid_ratio = len(valid)/n_mols if n_mols else 0.
            unique_ratio = len(unique_smiles)/len(valid) if valid else 0.
            abs_unique_ratio = len(unique_smiles)/n_mols if n_mols else 0.

            f.write("valid: {:.3f}%, unique: {:.3f}%, abs unique: {:.3f}%\n".format(valid_ratio * 100, unique_ratio * 100, abs_unique_ratio * 100))
            valid_list.append(valid_ratio * 100)
            unique_list.append(unique_ratio * 100)

            novel_r, _abs_novel_r = check_novelty(unique_smiles, train_smiles, n_mols)
            # Kept as a one-element list so the report line stays unchanged;
            # the standard deviation of a single value is always 0.
            novel_ratio = [novel_r]
            f.write("novelty: mean={:.2f}%, sd={:.2f}%, vals={}\n\n".format(np.mean(novel_ratio), np.std(novel_ratio), novel_ratio))
            novelty_list.append(np.mean(novel_ratio))

        f.write("average:\n")
        for label, vals in (("valid", valid_list), ("unique", unique_list), ("novelty", novelty_list)):
            terms = " + ".join(f"{v:.3f}" for v in vals)
            f.write(f"{label} = ({terms}) / {n_runs} = {(sum(vals)/n_runs):.3f}\n")
            

In [15]:
# Directory that holds the run files 1.csv .. 5.csv evaluated by value().
path = "Datasets_GLM4/GLM_QM9/3"
# One-off preprocessing, left disabled: each call rewrites the given CSV in
# place into the "no,Generation" layout that value() reads, using temp.csv
# as scratch space. Uncomment only for run files that are still in raw form.
# process_csv(path+"/1.csv", "temp.csv")
# process_csv(path+"/2.csv", "temp.csv")
# process_csv(path+"/3.csv", "temp.csv")
# process_csv(path+"/4.csv", "temp.csv")
# process_csv(path+"/5.csv", "temp.csv")

In [16]:
# Writes {path}/result.txt; for each run it prints the number of generated
# entries and the novelty line from check_novelty.
value(path)

100


[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CN=C3C=CC12'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'
[18:09:44] SMILES Parse Error: unclosed ring for input: 'C1=CC(=O)C2=CC=CC2'


novelty: 96.875%, abs novelty: 31.000%
100


[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=O)C(C)C1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=O)C(C)O1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=O)C(C)N1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=S)C(C)C1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=S)C(C)O1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=S)C(C)N1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=O)C(C)(C)C1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=O)C(C)(C)O1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=O)C(C)(C)N1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=S)C(C)(C)C1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=S)C(C)(C)O1'
[18:09:45] SMILES Parse Error: unclosed ring for input: 'C1=CC=CN1C(=S)C(C)(C)N1'
[18:09:45] SMILES Parse Error: unc

novelty: 100.000%, abs novelty: 72.000%
100


[18:09:47] Explicit valence for atom # 6 O, 3, is greater than permitted
[18:09:47] Explicit valence for atom # 6 O, 4, is greater than permitted
[18:09:47] Explicit valence for atom # 8 O, 3, is greater than permitted
[18:09:47] Explicit valence for atom # 6 O, 4, is greater than permitted
[18:09:47] Explicit valence for atom # 8 O, 3, is greater than permitted
[18:09:47] Explicit valence for atom # 9 N, 4, is greater than permitted
[18:09:47] Explicit valence for atom # 9 O, 3, is greater than permitted
[18:09:47] Explicit valence for atom # 6 O, 4, is greater than permitted
[18:09:47] Explicit valence for atom # 8 O, 3, is greater than permitted
[18:09:47] Explicit valence for atom # 9 N, 4, is greater than permitted
[18:09:47] Explicit valence for atom # 9 O, 3, is greater than permitted
[18:09:47] Explicit valence for atom # 6 O, 4, is greater than permitted
[18:09:47] Explicit valence for atom # 8 O, 3, is greater than permitted
[18:09:47] Explicit valence for atom # 9 N, 4, is g

novelty: 98.387%, abs novelty: 61.000%
100


[18:09:48] SMILES Parse Error: unclosed ring for input: 'CNC(C)C1=NC(C)=O'
[18:09:48] SMILES Parse Error: syntax error while parsing: CCC(NH3+)C1OC1C(=O)O
[18:09:48] SMILES Parse Error: Failed parsing SMILES 'CCC(NH3+)C1OC1C(=O)O' for input: 'CCC(NH3+)C1OC1C(=O)O'
[18:09:48] SMILES Parse Error: unclosed ring for input: 'CCOCC1=NC(C)=O'
[18:09:48] Explicit valence for atom # 7 C, 5, is greater than permitted
[18:09:48] SMILES Parse Error: syntax error while parsing: CCC(NH3+)C1OC1=CO
[18:09:48] SMILES Parse Error: Failed parsing SMILES 'CCC(NH3+)C1OC1=CO' for input: 'CCC(NH3+)C1OC1=CO'
[18:09:48] SMILES Parse Error: unclosed ring for input: 'C1CC#CC2CC=CC2'
[18:09:48] SMILES Parse Error: syntax error while parsing: CCC(NH3+)C1OC1C(=O)O
[18:09:48] SMILES Parse Error: Failed parsing SMILES 'CCC(NH3+)C1OC1C(=O)O' for input: 'CCC(NH3+)C1OC1C(=O)O'
[18:09:48] SMILES Parse Error: unclosed ring for input: 'CCOCC1=NC(C)=O'
[18:09:48] Explicit valence for atom # 7 C, 5, is greater than permitted

novelty: 100.000%, abs novelty: 25.000%
100


[18:09:49] Explicit valence for atom # 5 N, 4, is greater than permitted
[18:09:49] Explicit valence for atom # 5 N, 5, is greater than permitted
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2F3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2Cl3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2Br3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2I3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2CF3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2CCl3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2CBr3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2CI3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2OCF3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)C2OCCl3'
[18:09:49] SMILES Parse Error: unclosed ring for input: 'C1CC2C(C1)N(C)

novelty: 100.000%, abs novelty: 36.000%
