In [15]:
import ast
import json
import random
import re
from collections import OrderedDict

from rdkit import Chem
from ogb.lsc import PCQM4Mv2Dataset
import gemmi
import py3Dmol
import ase
from openbabel import openbabel
import numpy as np

np.set_printoptions(legacy='1.25')

In [2]:
# Both inputs live under the same directory: the training-set SDF file and
# the PCQM4Mv2 dataset root.
DATA_ROOT = '../../../'
TRAIN_SDF_PATH = DATA_ROOT + 'pcqm4m-v2-train.sdf'

# Molecule supplier over the SDF file; iterated in the processing loop.
suppl = Chem.SDMolSupplier(TRAIN_SDF_PATH)

# SMILES-only view of the dataset; items are indexed later as
# sample[0] (SMILES) and sample[1] (label).
dataset = PCQM4Mv2Dataset(root=DATA_ROOT, only_smiles=True)

In [3]:
# Read the whole SDF file as text and cut it at every '$$$$' marker.
# The final list element is whatever text follows the last marker.
with open('../../../pcqm4m-v2-train.sdf', 'r') as sdf_file:
    molecule_blocks = sdf_file.read().split('$$$$')

In [14]:
# Matches one atom token of a SMILES string: either a bracket atom such as
# "[nH]", "[Si]" or "[13CH3]", or an unbracketed organic-subset atom.
# "Cl" and "Br" are listed before the single letters so they are not read as
# carbon / boron followed by a stray letter.
_SMILES_ATOM_RE = re.compile(
    r"\[(?P<isotope>\d*)(?P<bracket_symbol>[A-Za-z][a-z]*|\*)(?P<bracket_rest>[^\]]*)\]"
    r"|(?P<organic_symbol>Cl|Br|[BCNOPSFI]|[bcnops]|\*)"
)


def get_atom_coord(mol, sample):
    """Annotate every atom of a molecule's canonical SMILES with its 3D position.

    The canonical, non-isomeric SMILES of ``mol`` is generated, and the atom
    output order recorded by that call (the ``_smilesAtomOutputOrder``
    property) is used to pair each atom token in the string with a row of the
    conformer's position array.

    Args:
        mol: Molecule object passed to ``Chem.MolToSmiles``; it must carry a
            conformer, since ``GetConformer()`` is called on it.
        sample: Indexable pair; ``sample[0]`` is stored as the original
            SMILES and ``sample[1]`` as the label.

    Returns:
        dict with keys ``canonical_smiles``, ``pcqm4v2_smiles``,
        ``pcqm4v2_label`` and ``conformers``. ``conformers`` holds
        ``embedded_smiles`` (the canonical SMILES with ``<x,y,z>`` inserted
        directly after each element symbol) and ``atom_coords`` (an
        ``OrderedDict`` mapping ``"<atom index>_<symbol>"`` to ``[x, y, z]``,
        in SMILES order).

    Raises:
        ValueError: If the number of atom tokens found in the SMILES differs
            from the number of indices in the atom output order. Pairing them
            anyway would attach coordinates to the wrong atoms.
    """
    original_smiles = sample[0]
    canonical_smiles = Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False)
    atom_order = ast.literal_eval(mol.GetProp('_smilesAtomOutputOrder'))
    coordinates = mol.GetConformer().GetPositions()

    atom_tokens = list(_SMILES_ATOM_RE.finditer(canonical_smiles))
    if len(atom_tokens) != len(atom_order):
        raise ValueError(
            f"found {len(atom_tokens)} atom tokens in {canonical_smiles!r} "
            f"but the atom output order lists {len(atom_order)} atoms"
        )

    pieces = []
    atom_coords = OrderedDict()
    cursor = 0
    for token, i in zip(atom_tokens, atom_order):
        # Copy bonds, ring-closure digits, parentheses, etc. unchanged.
        pieces.append(canonical_smiles[cursor:token.start()])
        cursor = token.end()

        x, y, z = coordinates[i]
        coord_tag = f'<{x},{y},{z}>'

        symbol = token.group('organic_symbol')
        if symbol is not None:
            pieces.append(symbol + coord_tag)
        else:
            # Bracket atom: keep isotope, H-count, charge, ... and place the
            # coordinates right after the element symbol, e.g. "[n<x,y,z>H]".
            symbol = token.group('bracket_symbol')
            pieces.append(
                '[' + token.group('isotope') + symbol + coord_tag
                + token.group('bracket_rest') + ']'
            )

        atom_coords[f'{i}_{symbol}'] = coordinates[i].tolist()

    pieces.append(canonical_smiles[cursor:])
    embedded = ''.join(pieces)

    return {"canonical_smiles": canonical_smiles,
            "pcqm4v2_smiles": original_smiles,
            "pcqm4v2_label": sample[1],
            "conformers": {"embedded_smiles": embedded, "atom_coords": atom_coords}}
    

In [22]:
# Probability that a successfully processed, labelled molecule is routed to
# the validation list instead of the main data list.
VALID_FRACTION = 0.0003
# Fixed seed so the data/valid split is the same on every run.
SPLIT_SEED = 0

split_rng = random.Random(SPLIT_SEED)

data = []    # JSON strings of processed molecules (main split)
failed = []  # [index, dataset sample] for molecules that could not be processed
valid = []   # JSON strings of the randomly held-out molecules

for idx, mol in enumerate(suppl):
    sample = dataset[idx]
    try:
        json_string = json.dumps(get_atom_coord(mol, sample))
    except Exception:
        # Best effort: record the failure and move on. `Exception` (rather
        # than a bare `except`) lets KeyboardInterrupt stop the loop.
        failed.append([idx, sample])
        continue

    label = sample[1]
    # A missing label is a float NaN, which never equals the string 'nan';
    # NaN is the only value that differs from itself, hence `label == label`.
    # The string comparison is kept in case labels arrive as text.
    has_label = label != 'nan' and label == label

    if split_rng.random() < VALID_FRACTION and has_label:
        valid.append(json_string)
    else:
        data.append(json_string)

[15:47:28] Both bonds on one end of an atropisomer are on the same side - atoms are: 0 1
[15:47:29] The 2 defining bonds for an atropisomer are co-planar - atoms are: 6 5
[15:47:30] Both bonds on one end of an atropisomer are on the same side - atoms are: 6 2
[15:47:35] Both bonds on one end of an atropisomer are on the same side - atoms are: 6 10
[15:47:36] The 2 defining bonds for an atropisomer are co-planar - atoms are: 14 9
[15:47:36] The 2 defining bonds for an atropisomer are co-planar - atoms are: 0 9
[15:47:37] Both bonds on one end of an atropisomer are on the same side - atoms are: 1 3
[15:47:37] The 2 defining bonds for an atropisomer are co-planar - atoms are: 10 9
[15:47:37] Both bonds on one end of an atropisomer are on the same side - atoms are: 9 5
[15:47:37] The 2 defining bonds for an atropisomer are co-planar - atoms are: 10 9
[15:47:37] Both bonds on one end of an atropisomer are on the same side - atoms are: 9 5
[15:47:37] Both bonds on one end of an atropisomer a

In [39]:
# Write the records from index 3,000,000 onward as JSON Lines: one already
# serialized JSON string per line. The `with` block closes the file, so no
# explicit close() is needed afterwards.
with open("pcqm_embedded_4.jsonl", "w") as out_file:
    out_file.writelines(record + "\n" for record in data[3000000:])

In [41]:
# Read the JSON Lines file back, decoding one record per line. The `with`
# block closes the file, so no explicit close() is needed afterwards.
with open("pcqm_embedded_4.jsonl", "r") as in_file:
    pcqm_4 = [json.loads(line) for line in in_file]

In [43]:
# Number of records read back from the JSON Lines file.
len(pcqm_4)

377578