# Let's take a look at the General Service List (a list of roughly 2,000 common English words)

https://www.eapfoundation.com/vocab/general/gsl/

# Corpus of Contemporary American English (COCA) <-- sorted by frequency 
https://www.wordfrequency.info/ 
Ugh, it looks like I have to purchase this? Boo.

# Game plan...
* I want certain reactive groups to correlate with whether words are nouns or verbs.
    * I think that means I first have to separate reactive from unreactive chemicals somehow.
    * Similarly, separate verbs from nouns in English (there should already be a function for that).
* Then I want to maybe match the most common words in each category, using COCA frequencies for English and some comparable frequency ranking for the most common chemicals...
* After the frequency-matched anchors, let's rely on embeddings to map everything else to its nearest-neighbor anchor point. This way we can be sure things are at least somewhat aligned.
    * Alternatively, we skip the anchors entirely and jump straight to the embeddings? Need to talk to David to get his help with alignment.


* Subjunctive / conditional / other fun tenses could be inert gases that I add to the reaction; helium could be a good example for the conditional.
* Electrochemistry would be crazy... maybe that can stand for a tense that is crazy... again I want to say conditional.

* Past, present, and future will be decided by which side of a reaction is given. If it's in equilibrium, then it's present.

I guess stoichiometry is important here :^P

* One thing I wish would be that we have 

# The rest is easy enough... what about getting the top molecules now?

Okay, let's take a swing at Enamine Building Blocks; I found a list of 500K of them. That should be a good start.
Can we create their Morgan fingerprints and cluster them?

Taken from https://zinc12.docking.org/db/byvendor/enaminebb/enaminebb.in


SCREW it, let's just download the ZINC compounds directly...  https://files.docking.org/zinc20-ML/

Double screw it... let's just use what we already have with Enamine


In [29]:
import h5py
import numpy as np
import pickle
import requests
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import time

def deserialize_numpy_array(data):
    """Rebuild the Python object stored in a pickled byte string.

    Args:
        data: Bytes-like object containing a pickle stream.

    Returns:
        Whatever object the pickle stream encodes.
    """
    # NOTE: unpickling can execute code embedded in the stream, so only
    # pass bytes that come from a source you trust.
    restored = pickle.loads(data)
    return restored

def inchikey_to_smiles(inchikey, timeout=30):
    """Look up the canonical SMILES string for an InChIKey via PubChem PUG REST.

    Args:
        inchikey: InChIKey string to resolve.
        timeout: Seconds to wait on the HTTP request before giving up on
            this key.

    Returns:
        The ``CanonicalSMILES`` value of the first property record in the
        response, or ``None`` (after printing a failure message) when the
        request errors out, returns a non-200 status, or the response does
        not contain that value.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/CanonicalSMILES/JSON'
    smiles = None
    try:
        # Without a timeout one stalled connection would hang the whole run.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            properties = response.json().get('PropertyTable', {}).get('Properties', [])
            if properties:
                smiles = properties[0].get('CanonicalSMILES')
    except (requests.RequestException, ValueError) as err:
        # A network error or an unparsable body is treated like any other
        # failed lookup so a long batch run is not aborted by one bad key.
        print(f"Request error for InChIKey {inchikey}: {err}")

    if smiles is None:
        print(f"Failed to retrieve data for InChIKey: {inchikey}")
    return smiles

def get_single_molecule_embedding(smiles, radius=5, nBits=2048):
    """Compute a Morgan fingerprint bit list for one SMILES string.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.
        radius: Morgan radius forwarded to RDKit.
        nBits: Bit-vector length forwarded to RDKit.

    Returns:
        The fingerprint bit vector converted to a Python list, or ``None``
        when RDKit yields no usable molecule for the SMILES.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        return None
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=radius, nBits=nBits)
    return list(bit_vector)

def process_batch(inchikeys, output_file="final_processed_data.csv"):
    """Resolve a batch of InChIKeys and append the results to a CSV file.

    Each key is looked up with ``inchikey_to_smiles``; keys that resolve get
    a Morgan fingerprint from ``get_single_molecule_embedding`` and are saved
    as one row with the columns ``InChIKey``, ``SMILES`` and
    ``Morgan_fingerprint``. Keys that do not resolve are skipped.

    Args:
        inchikeys: Iterable of sequences whose first element is the InChIKey
            string.
        output_file: Path of the CSV file that accumulates the results.

    Returns:
        The number of rows written for this batch.
    """
    import os  # local import: only needed for the file-state check below

    data = []
    for inchikey in inchikeys:
        key = inchikey[0]
        smiles = inchikey_to_smiles(key)
        if smiles:
            fingerprint = get_single_molecule_embedding(smiles)
            data.append({'InChIKey': key, 'SMILES': smiles, 'Morgan_fingerprint': fingerprint})

    if data:
        # Decide append/header from the file itself rather than from a global
        # resume index: always append so earlier batches are never
        # overwritten, and emit the header only when the file is missing or
        # empty so it appears exactly once at the top.
        file_has_content = os.path.exists(output_file) and os.path.getsize(output_file) > 0
        pd.DataFrame(data).to_csv(output_file, mode='a', header=not file_has_content, index=False)
    return len(data)

def check_progress(output_file):
    """Count the data rows already saved in the output CSV.

    Args:
        output_file: Path of the CSV file written by earlier runs.

    Returns:
        The number of data rows in the file (header excluded), or 0 when the
        file does not exist or is completely empty.
    """
    try:
        saved_rows = pd.read_csv(output_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing file and a zero-byte file both mean nothing was saved yet.
        return 0
    return len(saved_rows)

# Driver: load every InChIKey from the HDF5 file, then resolve them in
# batches, saving results to a CSV and resuming from rows saved earlier.

# Load InChIKeys from HDF5 file
file_path = 'zinc_stock.hdf5'
with h5py.File(file_path, 'r') as f:
    block0_values = f['table/block0_values'][:]
    
    # Object-dtype arrays are flattened into one array first so that
    # .tobytes() below yields a single contiguous byte string.
    if isinstance(block0_values, np.ndarray) and block0_values.dtype == 'O':
        block0_values = np.concatenate(block0_values)
    
    # The raw bytes are a pickle stream (see deserialize_numpy_array).
    # NOTE(review): presumably the file was written by pandas' HDF5 store,
    # which pickles object columns -- confirm the file's provenance.
    numpy_data = deserialize_numpy_array(block0_values.tobytes())
    # Each list entry is itself a sequence whose first element is the key
    # (the batch code reads inchikey[0]).
    inchikeys = numpy_data.tolist()  # Convert to list for easier handling

# Check for existing progress and determine where to resume
output_file = "final_processed_data.csv"
# start_index is the number of rows already saved in the CSV.
# NOTE(review): lookups that fail save no row (the run log shows batches
# with "99 records saved"), so this count falls behind the number of input
# keys actually consumed; a resumed run may re-fetch and duplicate keys.
start_index = check_progress(output_file)

batch_size = 100  # Adjust batch size as needed
# Ceiling division so a final partial batch is still processed.
total_batches = (len(inchikeys) + batch_size - 1) // batch_size

# Resume at the batch containing start_index; earlier batches are skipped.
for i in tqdm(range(start_index // batch_size, total_batches)):
    start = i * batch_size
    end = min(start + batch_size, len(inchikeys))
    batch = inchikeys[start:end]
    processed_count = process_batch(batch, output_file)
    print(f"Processed batch {i + 1}/{total_batches}, {processed_count} records saved")
    time.sleep(1)  # Add delay to avoid hitting API rate limits

print("Processing complete.")


  0%|          | 0/174229 [00:00<?, ?it/s]

Processed batch 1/174229, 100 records saved


  0%|          | 1/174229 [00:23<1143:23:44, 23.63s/it]

Processed batch 2/174229, 100 records saved


  0%|          | 2/174229 [00:47<1145:50:54, 23.68s/it]

Failed to retrieve data for InChIKey: RLBVNHKQTOTPJO-UHFFFAOYSA-N
Processed batch 3/174229, 99 records saved


  0%|          | 3/174229 [01:11<1165:04:02, 24.07s/it]

Processed batch 4/174229, 100 records saved


  0%|          | 4/174229 [01:35<1161:29:46, 24.00s/it]

Processed batch 5/174229, 100 records saved


  0%|          | 5/174229 [02:00<1174:34:07, 24.27s/it]

Processed batch 6/174229, 100 records saved


  0%|          | 6/174229 [02:26<1204:44:23, 24.89s/it]

Processed batch 7/174229, 100 records saved


  0%|          | 7/174229 [02:50<1188:55:36, 24.57s/it]

Processed batch 8/174229, 100 records saved


  0%|          | 8/174229 [03:18<1236:07:03, 25.54s/it]

Failed to retrieve data for InChIKey: CPIUTOFQEKHTNJ-UHFFFAOYSA-N
Processed batch 9/174229, 99 records saved


  0%|          | 9/174229 [03:45<1256:14:02, 25.96s/it]

Processed batch 10/174229, 100 records saved


  0%|          | 10/174229 [04:09<1237:22:42, 25.57s/it]

Processed batch 11/174229, 100 records saved


  0%|          | 11/174229 [04:36<1256:30:58, 25.96s/it]

Processed batch 12/174229, 100 records saved


  0%|          | 12/174229 [05:00<1224:52:02, 25.31s/it]

Failed to retrieve data for InChIKey: OLUCSAXZBYCQLW-UHFFFAOYSA-N
Processed batch 13/174229, 99 records saved


  0%|          | 13/174229 [05:24<1203:04:11, 24.86s/it]

Failed to retrieve data for InChIKey: RSPFBUOBABXVGM-UHFFFAOYSA-N


  0%|          | 13/174229 [05:43<1280:21:09, 26.46s/it]


KeyboardInterrupt: 

In [23]:
import h5py
import numpy as np
import pickle
import requests

def deserialize_numpy_array(data):
    """Rebuild the Python object stored in a pickled byte string.

    Args:
        data: Bytes-like object containing a pickle stream.

    Returns:
        Whatever object the pickle stream encodes.
    """
    # NOTE: unpickling can execute code embedded in the stream, so only
    # pass bytes that come from a source you trust.
    restored = pickle.loads(data)
    return restored

def inchikey_to_smiles(inchikey, timeout=30):
    """Look up the canonical SMILES string for an InChIKey via PubChem PUG REST.

    Args:
        inchikey: InChIKey string to resolve.
        timeout: Seconds to wait on the HTTP request before giving up on
            this key.

    Returns:
        The ``CanonicalSMILES`` value of the first property record in the
        response, or ``None`` (after printing a failure message) when the
        request errors out, returns a non-200 status, or the response does
        not contain that value.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/property/CanonicalSMILES/JSON'
    smiles = None
    try:
        # Without a timeout one stalled connection would hang the caller.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            # Walk the response defensively: a 200 reply with an empty or
            # missing property table must not raise KeyError/IndexError.
            properties = response.json().get('PropertyTable', {}).get('Properties', [])
            if properties:
                smiles = properties[0].get('CanonicalSMILES')
    except (requests.RequestException, ValueError) as err:
        # A network error or an unparsable body is reported as a failed
        # lookup instead of propagating.
        print(f"Request error for InChIKey {inchikey}: {err}")

    if smiles is None:
        print(f"Failed to retrieve data for InChIKey: {inchikey}")
    return smiles


# Preview: load the InChIKey list from the HDF5 file and print the SMILES
# that PubChem returns for the first ten keys.
file_path = 'zinc_stock.hdf5'
with h5py.File(file_path, 'r') as f:
    block0_values = f['table/block0_values'][:]
    
    # Object-dtype arrays are flattened into one array first so that
    # .tobytes() below yields a single contiguous byte string.
    if isinstance(block0_values, np.ndarray) and block0_values.dtype == 'O':
        block0_values = np.concatenate(block0_values)
    
    # The raw bytes are a pickle stream (see deserialize_numpy_array).
    numpy_data = deserialize_numpy_array(block0_values.tobytes())
    inchikeys = numpy_data.tolist()  # Convert to list for easier handling
    # An earlier run of len(inchikeys) reported 17,422,831 entries in total.
    
    # Each entry is a one-element sequence, hence the inchikey[0] below.
    for inchikey in inchikeys[:10]:  # Print the first 10 InChIKeys
        print(f"InChIKey: {inchikey} SMILES {inchikey_to_smiles(inchikey[0])}")


total len of inchikeys  17422831
InChIKey: ['RYYVLZVUVIJVGH-UHFFFAOYSA-N'] SMILES CN1C=NC2=C1C(=O)N(C(=O)N2C)C
InChIKey: ['HZZVJAQRINQKSD-RQJHMYQMSA-N'] SMILES C1C2N(C1=O)C(C(=CCO)O2)C(=O)O
InChIKey: ['RRTKVYSLIGQWCO-UHFFFAOYSA-N'] SMILES CN1C2=C(C(=O)N(C1=O)C)N=CN=N2
InChIKey: ['YAPQBXQYLJRXSA-UHFFFAOYSA-N'] SMILES CN1C=NC2=C1C(=O)NC(=O)N2C
InChIKey: ['SZPBAPFUXAADQV-UHFFFAOYSA-N'] SMILES C1=CN=C(C(=O)N1)C(=O)N
InChIKey: ['MFHLWNMYHQMDMV-UHFFFAOYSA-N'] SMILES C1C2=C(C(=O)O1)NC(=O)NC2=O
InChIKey: ['BGQNOPFTJROKJE-UHFFFAOYSA-N'] SMILES CN1C(=C(C(=O)N(C1=O)C)N)N
InChIKey: ['UNVDEKZKNWGICE-BQBZGAKWSA-N'] SMILES CC(=O)N1CN(C(C1O)O)C(=O)C
InChIKey: ['UNVDEKZKNWGICE-RNFRBKRXSA-N'] SMILES CC(=O)N1CN(C(C1O)O)C(=O)C
InChIKey: ['XBAVGYMDOXCWQU-UHFFFAOYSA-N'] SMILES CC1=C(C(=O)NC(=O)N1)CO
