# Dataset migration from v3 to Mnova ablation studies

Going to remove 1d data and replace it with Mnova simulated data

`v4`: Same data, just with Mnova 1d data replacing the existing train/val/test

`v4_large`: Train on whole retrieval set with Mnova simulations

### Step 1: Cleaning Dataset for Mnova Simulation

There are many repeated SMILES strings, so we simulate each unique SMILES only once. We also drop anything with a huge molecular weight: the original cutoff was >1800 Da, but the cells below use a 1000 Da cutoff (`smiles_1000.txt`).

Statistics:
- 216,586 entries in train/val/test
- 190,164 unique SMILES across train/val/test
- 189,691 unique SMILES kept for simulation (<=1800 Da)
- 182,637 unique SMILES kept for simulation (<=1000 Da)

In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def check_invalid_mol(smiles: str, max_mw: float = 1000.0) -> bool:
    """
    Returns True if the molecule described by `smiles` should be excluded:
    the string is empty, RDKit cannot parse it, or its molecular weight is
    greater than `max_mw` Da. Returns False otherwise.

    `max_mw` defaults to 1000 Da, the cutoff this notebook applies when it
    writes the SMILES lists. Pass `max_mw=1800.0` for the looser cutoff
    listed in the statistics above.
    """
    # Empty string / None: nothing to parse, treat as invalid.
    if not smiles:
        return True

    # MolFromSmiles returns None when the SMILES cannot be parsed.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return True

    return Descriptors.MolWt(mol) > max_mw

In [3]:
import pickle
import os
from tqdm import tqdm

# Source dataset (v3) and scratch directory used while preparing v4.
DATASET_ROOT = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3'
OUTPUT_ROOT = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv4/WorkingDir'
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Load the train/val/test index. The context manager closes the file handle
# instead of leaving it open until garbage collection.
# NOTE(review): pickle can execute arbitrary code; only load trusted files.
with open(os.path.join(DATASET_ROOT, 'index.pkl'), 'rb') as f:
    index: dict[int, dict] = pickle.load(f)

# Deduplicate SMILES so each molecule is simulated once, then drop the
# entries that check_invalid_mol rejects.
all_smiles: list[str] = list({e['smiles'] for e in index.values()})
valid_smiles: list[str] = [s for s in tqdm(all_smiles) if not check_invalid_mol(s)]

# One SMILES per line, no trailing newline.
valid_smiles_path = os.path.join(OUTPUT_ROOT, 'smiles_1000.txt')
with open(valid_smiles_path, 'w') as f:
    f.write('\n'.join(valid_smiles))

# The denominator is the number of index entries (duplicates included),
# not the number of unique SMILES.
print(f'Wrote {len(valid_smiles)}/{len(index)} valid SMILES to {valid_smiles_path}')

100%|██████████| 190164/190164 [00:18<00:00, 10074.65it/s]

Wrote 182637/216586 valid SMILES to /data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv4/WorkingDir/smiles_1000.txt





In [4]:
# Remember the SMILES kept from train/val/test so the retrieval-set cell
# below can exclude them.
old_smiles = set(valid_smiles)

In [5]:
import pickle
import os
from tqdm import tqdm

# Source dataset (v3) and scratch directory used while preparing v4.
DATASET_ROOT = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3'
OUTPUT_ROOT = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv4/WorkingDir'
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Load the retrieval set. The context manager closes the file handle
# instead of leaving it open until garbage collection.
# NOTE(review): pickle can execute arbitrary code; only load trusted files.
with open(os.path.join(DATASET_ROOT, 'retrieval.pkl'), 'rb') as f:
    index: dict[int, dict] = pickle.load(f)

# Deduplicate, drop the entries that check_invalid_mol rejects, then remove
# everything already written for train/val/test (held in `old_smiles`).
all_smiles: list[str] = list({e['smiles'] for e in index.values()})
valid_smiles: list[str] = [s for s in tqdm(all_smiles) if not check_invalid_mol(s)]
valid_smiles = list(set(valid_smiles) - old_smiles)

# One SMILES per line, no trailing newline.
valid_smiles_path = os.path.join(OUTPUT_ROOT, 'smiles_retrieval.txt')
with open(valid_smiles_path, 'w') as f:
    f.write('\n'.join(valid_smiles))

print(f'Wrote {len(valid_smiles)}/{len(index)} valid SMILES to {valid_smiles_path}')

100%|██████████| 518901/518901 [00:56<00:00, 9137.09it/s]


Wrote 304294/518901 valid SMILES to /data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv4/WorkingDir/smiles_retrieval.txt


# JSONL Format for MoonshotDataset
```json
{
    "idx": 0,
    "smiles": "C=CC1CN2CCC1CC2C(O)c1ccnc2ccc(OC)cc12",
    "split": "train",
    "has_hsqc": true,
    "has_c_nmr": false,
    "has_h_nmr": false,
    "has_mass_spec": true,
    "has_iso_dist": true,
    "mw": 404.1099,
    "name": "quinine hydrobromide",
    "has_mw": true,
    "formula": "C20H25BrN2O2",
    "has_formula": true,
    "np_pathway": ["Alkaloids"],
    "np_superclass": ["Tryptophan alkaloids"],
    "np_class": [],
    "hsqc": [
        [54.89, 3.077, -1.0], 
        ...
    ],
    "mass_spec": [
        [93.06987762451172, 1.3903098106384277], 
        ...
    ],
    "c_nmr": [
        2.130000114440918, 2.25, ...
    ],
    "h_nmr": [
        2.130000114440918, 2.25, ...
    ],
    "fragidx": [
        1, 3, 4, 5, 6, 8, 9, 14, 15, 17, 19, 20, 29, 32, 35, 46, 50, 59, 60, 62, 69, 80, 81, 125, 184, 210, 240, 378, 382, 392, 472, 491, 698, 891, 933, 1439, 1486, 1639, 2185, 2792, 5479, 5610, 6182, 7318, 7903, 11697, 14346
    ]
}
```

# JSONL Format for Predictions

```json
{
    "idx":0,
    "smiles":"COc1ccc2cc1Oc1cc(ccc1O)C(O)C13SSSC4(C(=O)N1C)C(O)C1=COC=CC(OC2=O)C1N4C3=O",
    "status":"SUCCESS",
    "error":null,
    "atoms":[
        {"number":"1","name":"CH3"},
        {"number":"2","name":"O"},
        ...
    ],
    "predictions":{
        "hsqc":{
            "status":"SUCCESS",
            "H":[
                {
                    "atom":[{"index":1}],
                    "shift":{"value":3.9237684493895095,"error":0.1},
                    "js":[{"atom":[{"index":1}],"j":{"value":5.46,"error":1.72}}]
                },
                {
                    "atom":[{"index":4}],
                    "shift":{"value":7.14351619175921,"error":0.1},
                    "js":[{"atom":[{"index":5}],"j":{"value":8.65,"error":0.31}},{"atom":[{"index":7}],"j":{"value":0.1,"error":0.1}}]
                },
                {
                    "atom":[{"index":5}],
                    "shift":{"value":7.5815223902024345,"error":0.1},
                    "js":[{"atom":[{"index":4}],"j":{"value":8.65,"error":0.31}},{"atom":[{"index":7}],"j":{"value":2.03,"error":0.48}}]
                },
                ...
            ],
            "C":[
                {
                    "atom":[{"index":1}],
                    "shift":{"value":56.0195331969619,"error":3},
                    "js":[{"atom":[{"index":1}],"j":{"value":143.96,"error":3.91}},{"atom":[{"index":4}],"j":{"value":0.4,"error":1.18}}]
                },
                {
                    "atom":[{"index":3}],
                    "shift":{"value":154.3666380106334,"error":3},
                    "js":[{"atom":[{"index":1}],"j":{"value":3.97,"error":12.36}},{"atom":[{"index":4}],"j":{"value":2.52,"error":4.27}},{"atom":[{"index":5}],"j":{"value":7.17,"error":2.84}},{"atom":[{"index":7}],"j":{"value":6.43,"error":2.84}}]
                },
                ...
            ],
            "error":null
        }
    }
}
```

# Dataset Forms

We will reduce the dataset to the following forms:

Form 1

- **MARINABase1**:
    - MoonshotDatasetv3 without any molecules that errored during predictions, keeping only molecules <= 1000 Da
- **MARINADataset1**:
    - MARINABase1, with replacing all existing C/H NMRs
- **MARINADataset2**:
    - MARINABase1, put all C/H simulated NMR that exist
- **MARINADataset3**:
    - MARINABase1, replacing all existing C/H/HSQC NMRs
- **MARINADataset4**:
    - MARINABase1, put all C/H/HSQC simulated NMR that exist

Form 2

- **MARINABase2**:
    - MoonshotDatasetv3 without any molecules that errored during predictions, keeping only molecules within [100 Da, 1000 Da]
- **MARINAMedDataset1**:
    - MARINABase2, with replacing all existing C/H NMRs
- **MARINAMedDataset2**:
    - MARINABase2, put all C/H simulated NMR that exist
- **MARINAMedDataset3**:
    - MARINABase2, replacing all existing C/H/HSQC NMRs
- **MARINAMedDataset4**:
    - MARINABase2, put all C/H/HSQC simulated NMR that exist

Form 3

- **MARINABaseNoDup**:
    - MoonshotDatasetv3 without any molecules that errored during predictions, keeping only molecules <= 1000 Da, with duplicate SMILES removed


In [1]:
import zipfile
from pathlib import Path

def zip_directory(src_dir, output_zip_path):
    """
    Write every regular file under `src_dir` (recursively) into a
    DEFLATE-compressed zip archive at `output_zip_path`.

    Entries are stored with paths relative to `src_dir` and added in sorted
    order. If the archive itself lives inside `src_dir` it is skipped, so the
    zip never tries to include itself.
    """
    src_dir = Path(src_dir)
    output_zip_path = Path(output_zip_path)
    # Resolve once so the archive is recognised even when it is reached
    # through a relative path or a symlinked directory.
    archive_resolved = output_zip_path.resolve()

    with zipfile.ZipFile(output_zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        for file in sorted(src_dir.rglob("*")):
            if not file.is_file():
                continue
            # The archive already exists on disk at this point; do not add
            # the file we are currently writing.
            if file.resolve() == archive_resolved:
                continue
            # keep relative paths inside the zip
            zipf.write(file, arcname=file.relative_to(src_dir))

In [2]:
import glob
import os
import json
from tqdm import tqdm
# Directory holding the raw Mnova prediction dumps; every *.jsonl file in it
# is processed, in sorted filename order.
PRED_ROOT = '/data/nas-gpu/wang/atong/Datasets/MnovaPredictions/raw'
files = sorted(glob.glob(os.path.join(PRED_ROOT, '*.jsonl')))
def process_file(file):
    """
    Collect the usable predictions from one prediction JSONL file.

    A line is kept only when its `predictions.hsqc.status` is 'SUCCESS' and
    both the 'H' and 'C' lists are present and non-empty. Blank lines are
    skipped instead of crashing the JSON parser.

    Returns a dict mapping SMILES -> {'h_nmr': ..., 'c_nmr': ..., 'atoms': ...}
    with the raw prediction entries. If a SMILES occurs more than once in the
    file, the last occurrence is kept.
    """
    nmrs = {}
    with open(file, 'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            hsqc_pred = data['predictions']['hsqc']
            if hsqc_pred['status'] != 'SUCCESS':
                continue
            h_nmr = hsqc_pred['H']
            c_nmr = hsqc_pred['C']
            # None and [] are both treated as "no prediction".
            if not h_nmr or not c_nmr:
                continue
            nmrs[data['smiles']] = {
                'h_nmr': h_nmr,
                'c_nmr': c_nmr,
                'atoms': data['atoms']
            }
    return nmrs

# Merge the per-file results into one SMILES-keyed dict. Because dict.update
# overwrites, a SMILES that appears in several files keeps the entry from the
# last file processed.
all_nmrs = {}
for file in tqdm(files):
    nmrs = process_file(file)
    all_nmrs.update(nmrs)


100%|██████████| 183/183 [02:15<00:00,  1.35it/s]


In [3]:
from typing import Any, Dict, List, Tuple
# SMILES -> assembled spectra (output of assemble_nmr_data); filled in a later cell.
nmr_data = {}

def _atom_sign_from_name(atom_name: str) -> int:
    return -1 if "CH2" in atom_name else +1

def assemble_nmr_data(preds: Dict[str, Any]) -> Dict[str, List]:
    data = {
        "h_nmr": [],
        "c_nmr": [],
        "hsqc": [],
        "h_nmr_error": [],
        "c_nmr_error": [],
        "hsqc_error": [],
    }

    atom_name_by_idx: Dict[int, str] = {}
    for a in preds['atoms']:
        idx = int(a["number"])
        atom_name_by_idx[idx] = a['name']

    c_by_atom: Dict[int, Tuple[float, float]] = {}
    for c in preds['c_nmr']:
        for atom in c['atom']:
            atom_idx = atom['index']
            c_shift = float(c['shift']['value'])
            c_err = float(c['shift']['error'])
            c_by_atom[int(atom_idx)] = (c_shift, c_err)
            data["c_nmr"].append(c_shift)
            data["c_nmr_error"].append(c_err)

    for h in preds['h_nmr']:
        for atom in h['atom']:
            atom_idx = atom['index']
            h_shift = float(h['shift']['value'])
            h_err = float(h['shift']['error'])
            data["h_nmr"].append(h_shift)
            data["h_nmr_error"].append(h_err)

            if atom_idx not in c_by_atom:
                if atom_name_by_idx[atom_idx] in ('CH', 'CH2', 'CH3'):
                    raise ValueError()
                continue
            c_shift, c_err = c_by_atom[atom_idx]
            sign = _atom_sign_from_name(atom_name_by_idx[atom_idx])

            data["hsqc"].append([c_shift, h_shift, sign])
            data["hsqc_error"].append([c_err, h_err, 0.0])

    return data

In [4]:
# Convert every molecule's raw predictions into flat shift lists, keyed by SMILES.
for smiles, nmr in tqdm(all_nmrs.items()):
    nmr_data[smiles] = assemble_nmr_data(nmr)

100%|██████████| 182619/182619 [00:31<00:00, 5831.40it/s] 


#### Build MARINAControl1

Original MARINA training set, but filter out all molecules >1000 Da and any molecule not present in MARINABase1.

Then run
```
cd examples/scripts
python convert.py --to-lmdb ~/Datasets/MARINAControl1_jsonl ~/Datasets/MARINAControl1
```

In [5]:
# Input: JSONL export of MoonshotDatasetv3. Output: directory for MARINAControl1.
v3_BASE = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3_jsonl'
OUT_BASE = '/data/nas-gpu/wang/atong/Datasets/MARINAControl1_jsonl'

import pickle
import json
import os
from tqdm import tqdm
import shutil

def process_set(set_name: str, index: dict):
    """
    Copy one split of the v3 JSONL into OUT_BASE with the spectra untouched.

    Only the first record of each SMILES that has an entry in `nmr_data` is
    kept. Each kept record is renumbered with `idx = len(index)` and a
    metadata entry is added to `index` under that key. The `has_hsqc`,
    `has_c_nmr` and `has_h_nmr` index flags are True only when the record's
    flag is set and the corresponding list is non-empty.

    Returns the (mutated) `index` dict.
    """
    kept = []
    seen = set()
    with open(os.path.join(v3_BASE, f'{set_name}.jsonl'), 'r') as src:
        for raw in tqdm(src):
            rec = json.loads(raw)
            smi = rec['smiles']
            # Skip molecules without a prediction and repeats within this split.
            if smi not in nmr_data or smi in seen:
                continue
            seen.add(smi)

            new_idx = len(index)
            rec['idx'] = new_idx
            kept.append(rec)
            index[new_idx] = {
                'smiles': smi,
                'name': rec['name'],
                'formula': rec['formula'],
                'np_pathway': rec['np_pathway'],
                'np_superclass': rec['np_superclass'],
                'np_class': rec['np_class'],
                'has_hsqc': rec['has_hsqc'] and len(rec['hsqc']) > 0,
                'has_c_nmr': rec['has_c_nmr'] and len(rec['c_nmr']) > 0,
                'has_h_nmr': rec['has_h_nmr'] and len(rec['h_nmr']) > 0,
                'has_mass_spec': rec['has_mass_spec'],
                'has_mw': rec['has_mw'],
                'has_formula': rec['has_formula'],
                'mw': rec['mw'],
                'split': set_name,
            }

    with open(os.path.join(OUT_BASE, f'{set_name}.jsonl'), 'w') as dst:
        for rec in kept:
            dst.write(json.dumps(rec) + '\n')
    print(f'Wrote {len(kept)}/{len(seen)} molecules to {os.path.join(OUT_BASE, f"{set_name}.jsonl")}')
    return index

# Make sure the output directory exists before any split is written.
os.makedirs(os.path.join(OUT_BASE), exist_ok=True)

# One index shared by the three splits, so `idx` keeps counting up across them.
index = {}
index = process_set('train', index)
index = process_set('val', index)
index = process_set('test', index)

# Write the index with a context manager so the file is flushed and closed
# before anything downstream reads it.
with open(os.path.join(OUT_BASE, 'index.pkl'), 'wb') as f:
    pickle.dump(index, f)

# Auxiliary files are copied unchanged from v3.
shutil.copy(os.path.join(v3_BASE, 'metadata.json'), os.path.join(OUT_BASE, 'metadata.json'))
shutil.copy(os.path.join(v3_BASE, 'count_hashes_under_radius_6.pkl'), os.path.join(OUT_BASE, 'count_hashes_under_radius_6.pkl'))
shutil.copy(os.path.join(v3_BASE, 'retrieval.pkl'), os.path.join(OUT_BASE, 'retrieval.pkl'))
os.makedirs(os.path.join(OUT_BASE, 'RankingEntropy'), exist_ok=True)
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'), os.path.join(OUT_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'))
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'rankingset.pt'), os.path.join(OUT_BASE, 'RankingEntropy', 'rankingset.pt'))

173422it [00:04, 37967.60it/s]


Wrote 146375/146375 molecules to /data/nas-gpu/wang/atong/Datasets/MARINAControl1_jsonl/train.jsonl


21621it [00:00, 41416.41it/s]


Wrote 18228/18228 molecules to /data/nas-gpu/wang/atong/Datasets/MARINAControl1_jsonl/val.jsonl


21543it [00:00, 39576.01it/s]


Wrote 18245/18245 molecules to /data/nas-gpu/wang/atong/Datasets/MARINAControl1_jsonl/test.jsonl


'/data/nas-gpu/wang/atong/Datasets/MARINAControl1_jsonl/RankingEntropy/rankingset.pt'

#### Build MARINABase1

Replace all H NMR, all C NMR, preserve HSQC NMR available. If nmr data does not exist for SMILES, delete SMILES.

Then run
```
cd examples/scripts
python convert.py --to-lmdb ~/Datasets/MARINABase1_jsonl ~/Datasets/MARINABase1
```

In [6]:
# Input: JSONL export of MoonshotDatasetv3. Output: directory for MARINABase1.
v3_BASE = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3_jsonl'
OUT_BASE = '/data/nas-gpu/wang/atong/Datasets/MARINABase1_jsonl'

import pickle
import json
import os
from tqdm import tqdm
import shutil

def process_set(set_name: str, index: dict):
    """
    Rewrite one split of the v3 JSONL into OUT_BASE with simulated C/H NMR.

    Only the first record of each SMILES that has an entry in `nmr_data` is
    kept. For every kept record the H and C shift lists (and their error
    lists) are replaced by the simulated ones; HSQC is left as it was.
    The record is renumbered with `idx = len(index)` and a metadata entry is
    added to `index` under that key.

    Returns the (mutated) `index` dict.
    """
    all_data = []
    processed_smiles = set()
    with open(os.path.join(v3_BASE, f'{set_name}.jsonl'), 'r') as f:
        for line in tqdm(f):
            data = json.loads(line)
            smiles = data['smiles']
            simulated = nmr_data.get(smiles)
            # Skip molecules without a prediction and repeats within this split.
            if simulated is None or smiles in processed_smiles:
                continue
            processed_smiles.add(smiles)

            data['h_nmr'] = simulated['h_nmr']
            data['c_nmr'] = simulated['c_nmr']
            data['h_nmr_error'] = simulated['h_nmr_error']
            data['c_nmr_error'] = simulated['c_nmr_error']
            # Every kept record now carries C and H shifts, so keep the
            # record's own flags in step with the index entry below instead
            # of leaving the stale v3 values in the JSONL.
            data['has_h_nmr'] = True
            data['has_c_nmr'] = True

            data['idx'] = len(index)
            all_data.append(data)
            index[len(index)] = {
                'smiles': smiles,
                'name': data['name'],
                'formula': data['formula'],
                'np_pathway': data['np_pathway'],
                'np_superclass': data['np_superclass'],
                'np_class': data['np_class'],
                'has_hsqc': data['has_hsqc'],
                'has_c_nmr': True,
                'has_h_nmr': True,
                'has_mass_spec': data['has_mass_spec'],
                'has_mw': data['has_mw'],
                'has_formula': data['has_formula'],
                'mw': data['mw'],
                'split': set_name,
            }

    with open(os.path.join(OUT_BASE, f'{set_name}.jsonl'), 'w') as f:
        for data in all_data:
            f.write(json.dumps(data) + '\n')
    print(f'Wrote {len(all_data)}/{len(processed_smiles)} molecules to {os.path.join(OUT_BASE, f"{set_name}.jsonl")}')
    return index

# Make sure the output directory exists before any split is written.
os.makedirs(os.path.join(OUT_BASE), exist_ok=True)

# One index shared by the three splits, so `idx` keeps counting up across them.
index = {}
index = process_set('train', index)
index = process_set('val', index)
index = process_set('test', index)

# Write the index with a context manager so the file is flushed and closed
# before anything downstream reads it.
with open(os.path.join(OUT_BASE, 'index.pkl'), 'wb') as f:
    pickle.dump(index, f)

# Auxiliary files are copied unchanged from v3.
shutil.copy(os.path.join(v3_BASE, 'metadata.json'), os.path.join(OUT_BASE, 'metadata.json'))
shutil.copy(os.path.join(v3_BASE, 'count_hashes_under_radius_6.pkl'), os.path.join(OUT_BASE, 'count_hashes_under_radius_6.pkl'))
shutil.copy(os.path.join(v3_BASE, 'retrieval.pkl'), os.path.join(OUT_BASE, 'retrieval.pkl'))
os.makedirs(os.path.join(OUT_BASE, 'RankingEntropy'), exist_ok=True)
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'), os.path.join(OUT_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'))
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'rankingset.pt'), os.path.join(OUT_BASE, 'RankingEntropy', 'rankingset.pt'))

173422it [00:04, 36962.06it/s]


Wrote 146375/146375 molecules to /data/nas-gpu/wang/atong/Datasets/MARINABase1_jsonl/train.jsonl


21621it [00:00, 39711.02it/s]


Wrote 18228/18228 molecules to /data/nas-gpu/wang/atong/Datasets/MARINABase1_jsonl/val.jsonl


21543it [00:00, 38851.41it/s]


Wrote 18245/18245 molecules to /data/nas-gpu/wang/atong/Datasets/MARINABase1_jsonl/test.jsonl


'/data/nas-gpu/wang/atong/Datasets/MARINABase1_jsonl/RankingEntropy/rankingset.pt'

#### Build MARINADataset1

MARINABase1, but filter out all molecules >1000 Da and do not give simulated C NMR and H NMR where they did not previously exist

Then run
```
cd examples/scripts
python convert.py --to-lmdb ~/Datasets/MARINADataset1_jsonl ~/Datasets/MARINADataset1
```

In [7]:
# Input: JSONL export of MoonshotDatasetv3. Output: directory for MARINADataset1.
v3_BASE = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3_jsonl'
OUT_BASE = '/data/nas-gpu/wang/atong/Datasets/MARINADataset1_jsonl'

import pickle
import json
import os
from tqdm import tqdm
import shutil

def process_set(set_name: str, index: dict):
    """
    Rewrite one split of the v3 JSONL into OUT_BASE, swapping in simulated
    C/H NMR only where the record already had that spectrum.

    Only the first record of each SMILES that has an entry in `nmr_data` is
    kept. If the record's `has_h_nmr` flag is set, its H shifts and errors
    are replaced by the simulated ones; likewise for `has_c_nmr`. The flags
    themselves are copied through unchanged. Each kept record is renumbered
    with `idx = len(index)` and a metadata entry is added to `index`.

    Returns the (mutated) `index` dict.
    """
    kept = []
    seen = set()
    with open(os.path.join(v3_BASE, f'{set_name}.jsonl'), 'r') as src:
        for raw in tqdm(src):
            rec = json.loads(raw)
            smi = rec['smiles']
            # Skip molecules without a prediction and repeats within this split.
            if smi not in nmr_data or smi in seen:
                continue
            seen.add(smi)

            sim = nmr_data[smi]
            if rec['has_h_nmr']:
                rec['h_nmr'] = sim['h_nmr']
                rec['h_nmr_error'] = sim['h_nmr_error']
            if rec['has_c_nmr']:
                rec['c_nmr'] = sim['c_nmr']
                rec['c_nmr_error'] = sim['c_nmr_error']

            new_idx = len(index)
            rec['idx'] = new_idx
            kept.append(rec)
            index[new_idx] = {
                'smiles': smi,
                'name': rec['name'],
                'formula': rec['formula'],
                'np_pathway': rec['np_pathway'],
                'np_superclass': rec['np_superclass'],
                'np_class': rec['np_class'],
                'has_hsqc': rec['has_hsqc'],
                'has_c_nmr': rec['has_c_nmr'],
                'has_h_nmr': rec['has_h_nmr'],
                'has_mass_spec': rec['has_mass_spec'],
                'has_mw': rec['has_mw'],
                'has_formula': rec['has_formula'],
                'mw': rec['mw'],
                'split': set_name,
            }

    with open(os.path.join(OUT_BASE, f'{set_name}.jsonl'), 'w') as dst:
        for rec in kept:
            dst.write(json.dumps(rec) + '\n')
    print(f'Wrote {len(kept)}/{len(seen)} molecules to {os.path.join(OUT_BASE, f"{set_name}.jsonl")}')
    return index
    
# Make sure the output directory exists before any split is written.
os.makedirs(os.path.join(OUT_BASE), exist_ok=True)

# One index shared by the three splits, so `idx` keeps counting up across them.
index = {}
index = process_set('train', index)
index = process_set('val', index)
index = process_set('test', index)

# Write the index with a context manager so the file is flushed and closed
# before anything downstream reads it.
with open(os.path.join(OUT_BASE, 'index.pkl'), 'wb') as f:
    pickle.dump(index, f)

# Auxiliary files are copied unchanged from v3.
shutil.copy(os.path.join(v3_BASE, 'metadata.json'), os.path.join(OUT_BASE, 'metadata.json'))
shutil.copy(os.path.join(v3_BASE, 'count_hashes_under_radius_6.pkl'), os.path.join(OUT_BASE, 'count_hashes_under_radius_6.pkl'))
shutil.copy(os.path.join(v3_BASE, 'retrieval.pkl'), os.path.join(OUT_BASE, 'retrieval.pkl'))
os.makedirs(os.path.join(OUT_BASE, 'RankingEntropy'), exist_ok=True)
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'), os.path.join(OUT_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'))
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'rankingset.pt'), os.path.join(OUT_BASE, 'RankingEntropy', 'rankingset.pt'))

124044it [00:03, 43511.49it/s]

173422it [00:30, 5744.69it/s] 


Wrote 146375/146375 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset1_jsonl/train.jsonl


21621it [00:00, 39155.42it/s]


Wrote 18228/18228 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset1_jsonl/val.jsonl


21543it [00:00, 38061.39it/s]


Wrote 18245/18245 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset1_jsonl/test.jsonl


'/data/nas-gpu/wang/atong/Datasets/MARINADataset1_jsonl/RankingEntropy/rankingset.pt'

#### Build MARINADataset2

MARINABase1, but filter out all molecules >1000 Da and give simulated C and H NMR to all molecules

Then run
```
cd examples/scripts
python convert.py --to-lmdb ~/Datasets/MARINADataset2_jsonl ~/Datasets/MARINADataset2
```

In [8]:
# Input: JSONL export of MoonshotDatasetv3. Output: directory for MARINADataset2.
v3_BASE = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3_jsonl'
OUT_BASE = '/data/nas-gpu/wang/atong/Datasets/MARINADataset2_jsonl'

import pickle
import json
import os
from tqdm import tqdm
import shutil

def process_set(set_name: str, index: dict):
    """
    Rewrite one split of the v3 JSONL into OUT_BASE, giving every kept
    molecule simulated C/H NMR.

    Only the first record of each SMILES that has an entry in `nmr_data` is
    kept. The H and C shift lists (and their error lists) are always
    replaced by the simulated ones; HSQC is left as it was. Each kept record
    is renumbered with `idx = len(index)` and a metadata entry is added to
    `index` under that key.

    Returns the (mutated) `index` dict.
    """
    all_data = []
    processed_smiles = set()
    with open(os.path.join(v3_BASE, f'{set_name}.jsonl'), 'r') as f:
        for line in tqdm(f):
            data = json.loads(line)
            smiles = data['smiles']
            simulated = nmr_data.get(smiles)
            # Skip molecules without a prediction and repeats within this split.
            if simulated is None or smiles in processed_smiles:
                continue
            processed_smiles.add(smiles)

            data['h_nmr'] = simulated['h_nmr']
            data['c_nmr'] = simulated['c_nmr']
            data['h_nmr_error'] = simulated['h_nmr_error']
            data['c_nmr_error'] = simulated['c_nmr_error']
            # Every kept record now carries C and H shifts, so keep the
            # record's own flags in step with the index entry below instead
            # of leaving the stale v3 values in the JSONL.
            data['has_h_nmr'] = True
            data['has_c_nmr'] = True

            data['idx'] = len(index)
            all_data.append(data)
            index[len(index)] = {
                'smiles': smiles,
                'name': data['name'],
                'formula': data['formula'],
                'np_pathway': data['np_pathway'],
                'np_superclass': data['np_superclass'],
                'np_class': data['np_class'],
                'has_hsqc': data['has_hsqc'],
                'has_c_nmr': True,
                'has_h_nmr': True,
                'has_mass_spec': data['has_mass_spec'],
                'has_mw': data['has_mw'],
                'has_formula': data['has_formula'],
                'mw': data['mw'],
                'split': set_name,
            }

    with open(os.path.join(OUT_BASE, f'{set_name}.jsonl'), 'w') as f:
        for data in all_data:
            f.write(json.dumps(data) + '\n')
    print(f'Wrote {len(all_data)}/{len(processed_smiles)} molecules to {os.path.join(OUT_BASE, f"{set_name}.jsonl")}')
    return index

# Make sure the output directory exists before any split is written.
os.makedirs(os.path.join(OUT_BASE), exist_ok=True)

# One index shared by the three splits, so `idx` keeps counting up across them.
index = {}
index = process_set('train', index)
index = process_set('val', index)
index = process_set('test', index)

# Write the index with a context manager so the file is flushed and closed
# before anything downstream reads it.
with open(os.path.join(OUT_BASE, 'index.pkl'), 'wb') as f:
    pickle.dump(index, f)

# Auxiliary files are copied unchanged from v3.
shutil.copy(os.path.join(v3_BASE, 'metadata.json'), os.path.join(OUT_BASE, 'metadata.json'))
shutil.copy(os.path.join(v3_BASE, 'count_hashes_under_radius_6.pkl'), os.path.join(OUT_BASE, 'count_hashes_under_radius_6.pkl'))
shutil.copy(os.path.join(v3_BASE, 'retrieval.pkl'), os.path.join(OUT_BASE, 'retrieval.pkl'))
os.makedirs(os.path.join(OUT_BASE, 'RankingEntropy'), exist_ok=True)
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'), os.path.join(OUT_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'))
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'rankingset.pt'), os.path.join(OUT_BASE, 'RankingEntropy', 'rankingset.pt'))

173422it [00:04, 37139.10it/s]


Wrote 146375/146375 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset2_jsonl/train.jsonl


21621it [00:00, 39011.54it/s]


Wrote 18228/18228 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset2_jsonl/val.jsonl


21543it [00:00, 37912.55it/s]


Wrote 18245/18245 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset2_jsonl/test.jsonl


'/data/nas-gpu/wang/atong/Datasets/MARINADataset2_jsonl/RankingEntropy/rankingset.pt'

#### Build MARINADataset3

MARINABase1, but filter out all molecules >1000 Da and replace C, H, and HSQC NMR with simulated data only where they already exist

Then run
```
cd examples/scripts
python convert.py --to-lmdb ~/Datasets/MARINADataset3_jsonl ~/Datasets/MARINADataset3
```

In [9]:
# Input: JSONL export of MoonshotDatasetv3. Output: directory for MARINADataset3.
v3_BASE = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3_jsonl'
OUT_BASE = '/data/nas-gpu/wang/atong/Datasets/MARINADataset3_jsonl'

import pickle
import json
import os
from tqdm import tqdm
import shutil

def process_set(set_name: str, index: dict):
    """
    Rewrite one split of the v3 JSONL into OUT_BASE, swapping in simulated
    C/H/HSQC NMR only where the record already had that spectrum.

    Only the first record of each SMILES that has an entry in `nmr_data` is
    kept. For each of H, C and HSQC, if the record's `has_*` flag is set the
    data (and error list) are replaced by the simulated ones. If the
    simulated HSQC turns out to be empty, `has_hsqc` is cleared so the flag
    never advertises an empty spectrum. Each kept record is renumbered with
    `idx = len(index)` and a metadata entry is added to `index`.

    Returns the (mutated) `index` dict.
    """
    all_data = []
    processed_smiles = set()
    with open(os.path.join(v3_BASE, f'{set_name}.jsonl'), 'r') as f:
        for line in tqdm(f):
            data = json.loads(line)
            smiles = data['smiles']
            simulated = nmr_data.get(smiles)
            # Skip molecules without a prediction and repeats within this split.
            if simulated is None or smiles in processed_smiles:
                continue
            processed_smiles.add(smiles)

            if data['has_h_nmr']:
                data['h_nmr'] = simulated['h_nmr']
                data['h_nmr_error'] = simulated['h_nmr_error']
            if data['has_c_nmr']:
                data['c_nmr'] = simulated['c_nmr']
                data['c_nmr_error'] = simulated['c_nmr_error']
            if data['has_hsqc']:
                data['hsqc'] = simulated['hsqc']
                data['hsqc_error'] = simulated['hsqc_error']
                # The simulated HSQC can be empty (the MARINADataset4 cell
                # guards against the same case); do not keep has_hsqc=True
                # for a record whose HSQC list is now empty.
                data['has_hsqc'] = len(simulated['hsqc']) > 0

            data['idx'] = len(index)
            all_data.append(data)
            index[len(index)] = {
                'smiles': smiles,
                'name': data['name'],
                'formula': data['formula'],
                'np_pathway': data['np_pathway'],
                'np_superclass': data['np_superclass'],
                'np_class': data['np_class'],
                'has_hsqc': data['has_hsqc'],
                'has_c_nmr': data['has_c_nmr'],
                'has_h_nmr': data['has_h_nmr'],
                'has_mass_spec': data['has_mass_spec'],
                'has_mw': data['has_mw'],
                'has_formula': data['has_formula'],
                'mw': data['mw'],
                'split': set_name,
            }

    with open(os.path.join(OUT_BASE, f'{set_name}.jsonl'), 'w') as f:
        for data in all_data:
            f.write(json.dumps(data) + '\n')
    print(f'Wrote {len(all_data)}/{len(processed_smiles)} molecules to {os.path.join(OUT_BASE, f"{set_name}.jsonl")}')
    return index
    
# Make sure the output directory exists before any split is written.
os.makedirs(os.path.join(OUT_BASE), exist_ok=True)

# One index shared by the three splits, so `idx` keeps counting up across them.
index = {}
index = process_set('train', index)
index = process_set('val', index)
index = process_set('test', index)

# Write the index with a context manager so the file is flushed and closed
# before anything downstream reads it.
with open(os.path.join(OUT_BASE, 'index.pkl'), 'wb') as f:
    pickle.dump(index, f)

# Auxiliary files are copied unchanged from v3.
shutil.copy(os.path.join(v3_BASE, 'metadata.json'), os.path.join(OUT_BASE, 'metadata.json'))
shutil.copy(os.path.join(v3_BASE, 'count_hashes_under_radius_6.pkl'), os.path.join(OUT_BASE, 'count_hashes_under_radius_6.pkl'))
shutil.copy(os.path.join(v3_BASE, 'retrieval.pkl'), os.path.join(OUT_BASE, 'retrieval.pkl'))
os.makedirs(os.path.join(OUT_BASE, 'RankingEntropy'), exist_ok=True)
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'), os.path.join(OUT_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'))
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'rankingset.pt'), os.path.join(OUT_BASE, 'RankingEntropy', 'rankingset.pt'))

51650it [00:01, 33620.08it/s]

173422it [00:04, 38876.17it/s]


Wrote 146375/146375 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset3_jsonl/train.jsonl


21621it [00:00, 39347.91it/s]


Wrote 18228/18228 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset3_jsonl/val.jsonl


21543it [00:00, 38253.02it/s]


Wrote 18245/18245 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset3_jsonl/test.jsonl


'/data/nas-gpu/wang/atong/Datasets/MARINADataset3_jsonl/RankingEntropy/rankingset.pt'

#### Build MARINADataset4

MARINABase1, but filter out all molecules >1000 Da and use simulated C, H, and HSQC NMR for all molecules

Then run
```
cd examples/scripts
python convert.py --to-lmdb ~/Datasets/MARINADataset4_jsonl ~/Datasets/MARINADataset4
```

In [None]:
# Input: JSONL export of MoonshotDatasetv3. Output: directory for MARINADataset4.
v3_BASE = '/data/nas-gpu/wang/atong/Datasets/MoonshotDatasetv3_jsonl'
OUT_BASE = '/data/nas-gpu/wang/atong/Datasets/MARINADataset4_jsonl'

import pickle
import json
import os
from tqdm import tqdm
import shutil

def process_set(set_name: str, index: dict):
    """
    Rewrite one split of the v3 JSONL into OUT_BASE, giving every kept
    molecule simulated C/H NMR and, when available, simulated HSQC.

    Only the first record of each SMILES that has an entry in `nmr_data` is
    kept. H and C shifts (and errors) are always replaced by the simulated
    ones. HSQC is replaced only when the simulated HSQC list is non-empty;
    `has_hsqc` is True exactly in that case, both in the index entry and in
    the record itself. Each kept record is renumbered with
    `idx = len(index)` and a metadata entry is added to `index`.

    Returns the (mutated) `index` dict.
    """
    all_data = []
    processed_smiles = set()
    with open(os.path.join(v3_BASE, f'{set_name}.jsonl'), 'r') as f:
        for line in tqdm(f):
            data = json.loads(line)
            smiles = data['smiles']
            simulated = nmr_data.get(smiles)
            # Skip molecules without a prediction and repeats within this split.
            if simulated is None or smiles in processed_smiles:
                continue
            processed_smiles.add(smiles)

            data['h_nmr'] = simulated['h_nmr']
            data['h_nmr_error'] = simulated['h_nmr_error']
            data['c_nmr'] = simulated['c_nmr']
            data['c_nmr_error'] = simulated['c_nmr_error']

            has_sim_hsqc = len(simulated['hsqc']) > 0
            if has_sim_hsqc:
                data['hsqc'] = simulated['hsqc']
                data['hsqc_error'] = simulated['hsqc_error']
            # NOTE(review): when there is no simulated HSQC the record still
            # holds whatever HSQC list v3 had; only the flag says it is
            # absent. Confirm the loader honours the flag.

            # Keep the record's own flags in step with the index entry below
            # instead of leaving the stale v3 values in the JSONL.
            data['has_hsqc'] = has_sim_hsqc
            data['has_h_nmr'] = True
            data['has_c_nmr'] = True

            data['idx'] = len(index)
            all_data.append(data)
            index[len(index)] = {
                'smiles': smiles,
                'name': data['name'],
                'formula': data['formula'],
                'np_pathway': data['np_pathway'],
                'np_superclass': data['np_superclass'],
                'np_class': data['np_class'],
                'has_hsqc': has_sim_hsqc,
                'has_c_nmr': True,
                'has_h_nmr': True,
                'has_mass_spec': data['has_mass_spec'],
                'has_mw': data['has_mw'],
                'has_formula': data['has_formula'],
                'mw': data['mw'],
                'split': set_name,
            }

    with open(os.path.join(OUT_BASE, f'{set_name}.jsonl'), 'w') as f:
        for data in all_data:
            f.write(json.dumps(data) + '\n')
    print(f'Wrote {len(all_data)}/{len(processed_smiles)} molecules to {os.path.join(OUT_BASE, f"{set_name}.jsonl")}')
    return index
    
# Make sure the output directory exists before any split is written.
os.makedirs(os.path.join(OUT_BASE), exist_ok=True)

# One index shared by the three splits, so `idx` keeps counting up across them.
index = {}
index = process_set('train', index)
index = process_set('val', index)
index = process_set('test', index)

# Write the index with a context manager so the file is flushed and closed
# before anything downstream reads it.
with open(os.path.join(OUT_BASE, 'index.pkl'), 'wb') as f:
    pickle.dump(index, f)

# Auxiliary files are copied unchanged from v3.
shutil.copy(os.path.join(v3_BASE, 'metadata.json'), os.path.join(OUT_BASE, 'metadata.json'))
shutil.copy(os.path.join(v3_BASE, 'count_hashes_under_radius_6.pkl'), os.path.join(OUT_BASE, 'count_hashes_under_radius_6.pkl'))
shutil.copy(os.path.join(v3_BASE, 'retrieval.pkl'), os.path.join(OUT_BASE, 'retrieval.pkl'))
os.makedirs(os.path.join(OUT_BASE, 'RankingEntropy'), exist_ok=True)
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'), os.path.join(OUT_BASE, 'RankingEntropy', 'bitinfo_to_idx.pkl'))
shutil.copy(os.path.join(v3_BASE, 'RankingEntropy', 'rankingset.pt'), os.path.join(OUT_BASE, 'RankingEntropy', 'rankingset.pt'))

173422it [00:04, 36672.93it/s]


Wrote 146375/146375 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset4_jsonl/train.jsonl


21621it [00:00, 39077.62it/s]


Wrote 18228/18228 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset4_jsonl/val.jsonl


21543it [00:00, 38116.69it/s]


Wrote 18245/18245 molecules to /data/nas-gpu/wang/atong/Datasets/MARINADataset4_jsonl/test.jsonl


'/data/nas-gpu/wang/atong/Datasets/MARINADataset4_jsonl/RankingEntropy/rankingset.pt'

In [11]:
import subprocess
from pathlib import Path

# Convert every *_jsonl dataset directory to LMDB by invoking convert.py once
# per dataset. The script is run with scripts_dir as its working directory.
scripts_dir = Path("/data/nas-gpu/wang/atong/SMART-Moonshot/examples/scripts")
DATASET_ROOT = '/data/nas-gpu/wang/atong/Datasets/'
DATASETS = [
    'MARINAControl1',
    'MARINABase1',
    'MARINADataset1',
    'MARINADataset2',
    'MARINADataset3',
    'MARINADataset4',
]

# DATASET_ROOT already ends with '/', so formatting it as
# f'{DATASET_ROOT}/{name}' yields paths with a doubled separator
# ('.../Datasets//name'). Joining through Path produces clean paths.
datasets_root = Path(DATASET_ROOT)
for dataset in DATASETS:
    cmd = [
        "python",
        "convert.py",
        "--to-lmdb",
        str(datasets_root / f'{dataset}_jsonl'),  # source: JSONL dataset directory
        str(datasets_root / dataset),             # destination: LMDB dataset directory
    ]
    # check=True raises CalledProcessError on a non-zero exit, which stops the
    # loop at the first failed conversion.
    subprocess.run(cmd, cwd=scripts_dir, check=True)

Converting to LMDB format: /data/nas-gpu/wang/atong/Datasets//MARINAControl1_jsonl -> /data/nas-gpu/wang/atong/Datasets//MARINAControl1
Copying regular files...
  Copied index.pkl
  Copied retrieval.pkl
  Copied metadata.json
  Copied count_hashes_under_radius_6.pkl
  Copied RankingEntropy/

Converting train from /data/nas-gpu/wang/atong/Datasets//MARINAControl1_jsonl/train.jsonl


train: 124546it [00:11, 14025.89it/s]

train: 146375it [00:12, 11296.81it/s]
val: 0it [00:00, ?it/s]

  Converted 146375 entries for train

Converting val from /data/nas-gpu/wang/atong/Datasets//MARINAControl1_jsonl/val.jsonl


val: 18228it [00:01, 14694.90it/s]
test: 0it [00:00, ?it/s]

  Converted 18228 entries for val

Converting test from /data/nas-gpu/wang/atong/Datasets//MARINAControl1_jsonl/test.jsonl


test: 18245it [00:01, 13835.02it/s]


  Converted 18245 entries for test

Conversion complete!
Converting to LMDB format: /data/nas-gpu/wang/atong/Datasets//MARINABase1_jsonl -> /data/nas-gpu/wang/atong/Datasets//MARINABase1
Copying regular files...
  Copied index.pkl
  Copied retrieval.pkl
  Copied metadata.json
  Copied count_hashes_under_radius_6.pkl
  Copied RankingEntropy/

Converting train from /data/nas-gpu/wang/atong/Datasets//MARINABase1_jsonl/train.jsonl


train: 146375it [00:17, 8414.42it/s] 
val: 1024it [00:00, 8226.69it/s]

  Converted 146375 entries for train

Converting val from /data/nas-gpu/wang/atong/Datasets//MARINABase1_jsonl/val.jsonl


val: 18228it [00:01, 11298.96it/s]
test: 1024it [00:00, 8276.82it/s]

  Converted 18228 entries for val

Converting test from /data/nas-gpu/wang/atong/Datasets//MARINABase1_jsonl/test.jsonl


test: 18245it [00:01, 11029.03it/s]


  Converted 18245 entries for test

Conversion complete!
Converting to LMDB format: /data/nas-gpu/wang/atong/Datasets//MARINADataset1_jsonl -> /data/nas-gpu/wang/atong/Datasets//MARINADataset1
Copying regular files...
  Copied index.pkl
  Copied retrieval.pkl
  Copied metadata.json
  Copied count_hashes_under_radius_6.pkl
  Copied RankingEntropy/

Converting train from /data/nas-gpu/wang/atong/Datasets//MARINADataset1_jsonl/train.jsonl


train: 146375it [00:14, 9789.73it/s] 
val: 0it [00:00, ?it/s]

  Converted 146375 entries for train

Converting val from /data/nas-gpu/wang/atong/Datasets//MARINADataset1_jsonl/val.jsonl


val: 18228it [00:01, 12936.59it/s]
test: 0it [00:00, ?it/s]

  Converted 18228 entries for val

Converting test from /data/nas-gpu/wang/atong/Datasets//MARINADataset1_jsonl/test.jsonl


test: 18245it [00:01, 13124.93it/s]


  Converted 18245 entries for test

Conversion complete!
Converting to LMDB format: /data/nas-gpu/wang/atong/Datasets//MARINADataset2_jsonl -> /data/nas-gpu/wang/atong/Datasets//MARINADataset2
Copying regular files...
  Copied index.pkl
  Copied retrieval.pkl
  Copied metadata.json
  Copied count_hashes_under_radius_6.pkl
  Copied RankingEntropy/

Converting train from /data/nas-gpu/wang/atong/Datasets//MARINADataset2_jsonl/train.jsonl


train: 146375it [00:16, 8614.50it/s] 
val: 0it [00:00, ?it/s]

  Converted 146375 entries for train

Converting val from /data/nas-gpu/wang/atong/Datasets//MARINADataset2_jsonl/val.jsonl


val: 18228it [00:01, 11316.51it/s]
test: 0it [00:00, ?it/s]

  Converted 18228 entries for val

Converting test from /data/nas-gpu/wang/atong/Datasets//MARINADataset2_jsonl/test.jsonl


test: 18245it [00:01, 11286.27it/s]


  Converted 18245 entries for test

Conversion complete!
Converting to LMDB format: /data/nas-gpu/wang/atong/Datasets//MARINADataset3_jsonl -> /data/nas-gpu/wang/atong/Datasets//MARINADataset3
Copying regular files...
  Copied index.pkl
  Copied retrieval.pkl
  Copied metadata.json
  Copied count_hashes_under_radius_6.pkl
  Copied RankingEntropy/

Converting train from /data/nas-gpu/wang/atong/Datasets//MARINADataset3_jsonl/train.jsonl


train: 146375it [00:15, 9448.28it/s] 


  Converted 146375 entries for train

Converting val from /data/nas-gpu/wang/atong/Datasets//MARINADataset3_jsonl/val.jsonl


val: 18228it [00:01, 12322.84it/s]
test: 0it [00:00, ?it/s]

  Converted 18228 entries for val

Converting test from /data/nas-gpu/wang/atong/Datasets//MARINADataset3_jsonl/test.jsonl


test: 18245it [00:01, 12548.71it/s]


  Converted 18245 entries for test

Conversion complete!
Converting to LMDB format: /data/nas-gpu/wang/atong/Datasets//MARINADataset4_jsonl -> /data/nas-gpu/wang/atong/Datasets//MARINADataset4
Copying regular files...
  Copied index.pkl
  Copied retrieval.pkl
  Copied metadata.json
  Copied count_hashes_under_radius_6.pkl
  Copied RankingEntropy/

Converting train from /data/nas-gpu/wang/atong/Datasets//MARINADataset4_jsonl/train.jsonl


train: 146375it [00:19, 7353.77it/s]
val: 0it [00:00, ?it/s]

  Converted 146375 entries for train

Converting val from /data/nas-gpu/wang/atong/Datasets//MARINADataset4_jsonl/val.jsonl


val: 18228it [00:01, 9714.39it/s] 
test: 0it [00:00, ?it/s]

  Converted 18228 entries for val

Converting test from /data/nas-gpu/wang/atong/Datasets//MARINADataset4_jsonl/test.jsonl


test: 18245it [00:01, 9673.47it/s] 


  Converted 18245 entries for test

Conversion complete!


In [12]:
from tqdm import tqdm
import shutil
# For each dataset: archive its directory to <name>.zip via zip_directory
# (defined elsewhere in this notebook), then delete the <name>_jsonl directory.
for dataset in tqdm(DATASETS):
    dataset_dir = os.path.join(DATASET_ROOT, dataset)
    archive_path = os.path.join(DATASET_ROOT, f'{dataset}.zip')
    zip_directory(dataset_dir, archive_path)

    # Removal happens only after zip_directory has returned for this dataset.
    jsonl_dir = os.path.join(DATASET_ROOT, f'{dataset}_jsonl')
    shutil.rmtree(jsonl_dir)

100%|██████████| 6/6 [04:49<00:00, 48.30s/it]


In [6]:
import json

# Spot check: stream the train split line by line and print the first record
# whose 'idx' field matches the target, then stop reading.
train_jsonl_path = '/data/nas-gpu/wang/atong/Datasets/MARINADataset4_jsonl/train.jsonl'
target_idx = 129014

with open(train_jsonl_path, 'r') as jsonl_file:
    for raw_line in jsonl_file:
        data = json.loads(raw_line)
        if data['idx'] != target_idx:
            continue
        print(data)
        break

{'idx': 129014, 'smiles': 'Oc1c(Br)c(O)c(Br)c(O)c1Br', 'split': 'train', 'has_hsqc': False, 'has_c_nmr': True, 'has_h_nmr': True, 'has_mass_spec': True, 'has_iso_dist': True, 'mw': 362.799, 'name': 'tribromobenzene-1,3,5-triol', 'has_mw': True, 'formula': 'C6H3Br3O3', 'has_formula': True, 'np_pathway': ['Shikimates and Phenylpropanoids'], 'np_superclass': [], 'np_class': [], 'h_nmr': [7.1405471106596465, 7.1405471106596465, 7.1405471106596465], 'c_nmr': [153.33137424827768, 153.33137424827768, 153.33137424827768, 97.33674830695483, 97.33674830695483, 97.33674830695483], 'mass_spec': [[281.8521728515625, 15.783791542053223], [342.75994873046875, 3.930712938308716], [359.7626953125, 3.1665396690368652], [360.7705078125, 55.29762649536133]], 'fragidx': [6, 23, 29, 84, 437, 562, 4010], 'h_nmr_error': [3.83, 3.83, 3.83], 'c_nmr_error': [8.23, 8.23, 8.23, 9.6, 9.6, 9.6], 'hsqc': [], 'hsqc_error': []}


In [7]:
# Cross-check: display the nmr_data entry keyed by the same SMILES string as
# the train.jsonl record with idx 129014 printed by the preceding cell, so the
# NMR fields can be compared by eye. nmr_data is defined in an earlier cell.
nmr_data['Oc1c(Br)c(O)c(Br)c(O)c1Br']

{'h_nmr': [7.1405471106596465, 7.1405471106596465, 7.1405471106596465],
 'c_nmr': [153.33137424827768,
  153.33137424827768,
  153.33137424827768,
  97.33674830695483,
  97.33674830695483,
  97.33674830695483],
 'hsqc': [],
 'h_nmr_error': [3.83, 3.83, 3.83],
 'c_nmr_error': [8.23, 8.23, 8.23, 9.6, 9.6, 9.6],
 'hsqc_error': []}