# This is Step 0 in the Pipeline - Embedding
With this notebook we generate molecular fingerprints and Spec2Vec embeddings.

### Imports

In [27]:
import pandas as pd
import os
from spec2vec.model_building import train_new_word2vec_model
from mass_spectra.to_fingerprint import generate_fingerprint, inchikey_to_inchi, AVAILABLE_FINGERPRINTS
from mass_spectra.train_spec2vec import preprocess_file
from spec2vec import Spec2Vec, calc_vector
import gensim
from time import time, sleep
from tqdm.notebook import tqdm
import requests

In [3]:
import importlib
import mass_spectra.train_spec2vec
importlib.reload(mass_spectra.train_spec2vec)

<module 'mass_spectra.train_spec2vec' from 'c:\\Users\\aleks\\Projects\\IJS\\mass_spectra\\mass_spectra\\train_spec2vec.py'>

### Define source files and output folder

In [5]:
metadata = './source/dataset/Metadata_training_TMS derivatives_3.1.xlsx'
spectra = './source/dataset/Train NIST 3.1 dataset_TMS_BS.mgf'
model_folder = './source/spec2vec/nist/'
embedding_folder = './source/embedding/nist_all_fingerprints/'

In [6]:
model_file = None # Can be None if the pipeline should train a new model, otherwise specify path to model file (.model ending)

In [7]:
assert os.path.isfile(metadata)
assert os.path.isfile(spectra)
assert os.path.isdir(model_folder)
assert os.path.isdir(embedding_folder)
assert model_file is None or os.path.isfile(model_file)
assert metadata.endswith('.xlsx')
assert spectra.endswith('.mgf')
assert model_file is None or model_file.endswith('.model')

### Define Parameters

In [8]:
AVAILABLE_FINGERPRINTS

['AtomPairs2DFingerprinter',
 'CircularFingerprinter',
 'EStateFingerprinter',
 'ExtendedFingerprinter',
 'KlekotaRothFingerprinter',
 'MACCSFingerprinter',
 'PubchemFingerprinter',
 'SubstructureFingerprinter']

In [9]:
FINGERPRINTS = ['EStateFingerprinter', 'MACCSFingerprinter', 'PubchemFingerprinter', 'SubstructureFingerprinter']

In [10]:
FINGERPRINT_NAMES = [v.replace('Fingerprinter', '') for v in FINGERPRINTS]

In [11]:
REMOVE_CONSTANT_BITS = True
REMOVE_DUPLICATE_BITS = True

In [12]:
assert set(FINGERPRINTS).issubset(set(AVAILABLE_FINGERPRINTS))

### Read Source Files

In [13]:
metadata_df = pd.read_excel(metadata)
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4648 entries, 0 to 4647
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               4648 non-null   object 
 1   InChIKey           4644 non-null   object 
 2   Molecular formula  4648 non-null   object 
 3   CAS number         4648 non-null   object 
 4   Exact mass         4648 non-null   float64
 5   Mw (g/mol)         4648 non-null   object 
 6   NIST NO            4648 non-null   int64  
 7   ID                 4648 non-null   int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 290.6+ KB


In [14]:
print(f"Dataframe shape is {metadata_df.shape}")
metadata_df = metadata_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {metadata_df.shape}")

Dataframe shape is (4648, 8)
Dataframe shape is (4644, 8)


In [15]:
# Sanitize source columns
def sanitize(s):
    s = s.lower()
    s = s.strip()
    s = s.replace(' ', '_')
    return s
metadata_df.rename(columns={c: sanitize(c) for c in metadata_df.columns}, inplace=True)

# Sanitize data in string columns
metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']] = metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']].applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [16]:
metadata_df.rename(columns={'inchikey': 'inchi_key'}, inplace=True)

In [17]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4644 entries, 0 to 4643
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               4644 non-null   object 
 1   inchi_key          4644 non-null   object 
 2   molecular_formula  4644 non-null   object 
 3   cas_number         4644 non-null   object 
 4   exact_mass         4644 non-null   float64
 5   mw_(g/mol)         4644 non-null   object 
 6   nist_no            4644 non-null   int64  
 7   id                 4644 non-null   int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 326.5+ KB


In [18]:
DESCRIPTIVE_COLUMNS = ['inchi_key', 'inchi', 'name']

In [46]:
metadata_df['inchi'] = metadata_df.inchi_key.map(inchi_key_to_inchi)
print(f"Missing inchi keys: {len(missing_inchi_keys)}")
print(f"Found inchi keys: {len(inchi_key_to_inchi)}")
metadata_df.to_excel(f"{metadata.split('.')[0]}_with_inchi.xlsx", index=False)

Missing inchi keys: 894
Found inchi keys: 1039


In [43]:
if 'inchi' not in metadata_df.columns:
    URL = "https://www.chemspider.com/InChI.asmx/InChIKeyToInChI?inchi_key={}"
    missing_inchi_keys = set()
    try:
        print(len(inchi_key_to_inchi))
    except NameError:
        inchi_key_to_inchi = {}
    
    inchi_keys = metadata_df.inchi_key.unique()
    for inchi_key in tqdm(inchi_keys):
        if inchi_key in inchi_key_to_inchi:
            continue
        xml_response = requests.get(URL.format(inchi_key), timeout=5)
        # xml response will only have xml version and one string tag
        # if inchi key could not be ocnverted string tag is not closed and empty
        if xml_response.status_code == 200 and xml_response.text.endswith('</string>'):
            inchi = xml_response.text.split('>')[-2].split('<')[0].strip()
            inchi_key_to_inchi[inchi_key] = inchi
        else:
            missing_inchi_keys.add(inchi_key)
    metadata_df['inchi'] = metadata_df.inchi_key.map(inchi_key_to_inchi)
    print(f"Missing inchi keys: {len(missing_inchi_keys)}")
    print(f"Found inchi keys: {len(inchi_key_to_inchi)}")
    metadata_df.to_excel(f"{metadata.split('.')[0]}_with_inchi.xlsx", index=False)

434


  0%|          | 0/3752 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
metadata_df

Unnamed: 0,name,inchi_key,molecular_formula,cas_number,exact_mass,mw_(g/mol),nist_no,id
0,"Heparin, hexa(trimethylsilyl)-",SWNJOCYLDHZZBH-UHFFFAOYSA-N,C32H69NO11Si6,0,811.348620,811,161321,30
1,"Deoxyfructosazine, heptakis(trimethylsilyl)- d...",PGVKXFDGUWEAAZ-UHFFFAOYSA-N,C33H76N2O7Si7,68361171,808.403740,808,465574,31
2,"trans-Piceid, 6TMS",ZAYPUEYKPPTDBS-FMQUCBEESA-N,C38H70O8Si6,0,822.368630,822,414022,38
3,"cis-Piceid, 6TMS",ZAYPUEYKPPTDBS-VXPUYCOJSA-N,C38H70O8Si6,0,822.368630,822,414023,41
4,"Azadirachtin, O,O-bis(trimethylsilyl)-",QGPRRBSVFYCNEQ-KGENOOAVSA-N,C41H60O16Si2,37293155,864.341988,864,67466,48
...,...,...,...,...,...,...,...,...
4639,"Formamide, 2TMS derivative",PQCLLXHKPSJTOY-UHFFFAOYSA-N,C7H19NOSi2,15500-60-4,189.100517,189,368600,148144
4640,"Benzamide, TMS derivative",QKZIAZLDMXJTCK-UHFFFAOYSA-N,C10H15NOSi,1011-57-0,193.092291,193,417318,182150
4641,"Benzamide, TMS derivative",DCQRROFFBYXKTH-UHFFFAOYSA-N,C10H15NOSi,1011-57-0,193.092291,193,466168,12389
4642,"Benzamide, TMS derivative",GGCVGNFGUJEXDQ-UHFFFAOYSA-N,C10H15NOSi,1011-57-0,193.092291,193,282481,30959


In [None]:
# metadata_df[DESCRIPTIVE_COLUMNS].head(3)

### Construct Fingerprints

In [None]:
# Generate fingerprints (can take some time depending on the number of molecules)
# If the fingerprint is not available it will be replaced by None values
fingerprints = generate_fingerprint(FINGERPRINTS, metadata_df['inchi_key'])

Generating EStateFingerprinter fingerprint
Converting 4644 InChI keys to 79 bit EStateFingerprinter fingerprint


Converted InChI key SWNJOCYLDHZZBH-UHFFFAOYSA-N to InChI=1S/C32H69NO11Si6/c1-22(34)33-26-29(27(41-47(8,9)10)25(21-36-45(2,3)4)38-31(26)44-50(17,18)19)39-32-28(42-48(11,12)13)23(40-46(5,6)7)20-24(37-32)30(35)43-49(14,15)16/h20,23,25-29,31-32H,21H2,1-19H3,(H,33,34)
Converted InChI key PGVKXFDGUWEAAZ-UHFFFAOYSA-N to InChI=1S/C33H76N2O7Si7/c1-43(2,3)36-25-30(39-46(10,11)12)29(38-45(7,8)9)22-27-23-34-24-28(35-27)32(41-48(16,17)18)33(42-49(19,20)21)31(40-47(13,14)15)26-37-44(4,5)6/h23-24,29-33H,22,25-26H2,1-21H3
Converted InChI key ZAYPUEYKPPTDBS-FMQUCBEESA-N to InChI=1S/C38H70O8Si6/c1-47(2,3)39-28-34-35(44-50(10,11)12)36(45-51(13,14)15)37(46-52(16,17)18)38(41-34)40-32-25-30(26-33(27-32)43-49(7,8)9)20-19-29-21-23-31(24-22-29)42-48(4,5)6/h19-27,34-38H,28H2,1-18H3/b20-19+
Converted InChI key ZAYPUEYKPPTDBS-VXPUYCOJSA-N to InChI=1S/C38H70O8Si6/c1-47(2,3)39-28-34-35(44-50(10,11)12)36(45-51(13,14)15)37(46-52(16,17)18)38(41-34)40-32-25-30(26-33(27-32)43-49(7,8)9)20-19-29-21-23-31(24-22-29)42-48(4,

KeyboardInterrupt: 

In [None]:
# check if inchi is set as index and if not set it
for k, v in fingerprints.items():
    if not v.index.name == 'inchi':
        fingerprints[k].set_index('inchi', inplace=True)

In [None]:
# combine all generated fingerprints with prefixed column names
merged = None
for fp, fp_df in fingerprints.items():
    tmp = fp_df.copy()
    tmp.columns = [f'{fp.lower().replace("fingerprinter", "")}_{c}' for c in tmp.columns]
    if merged is None:
        merged = tmp
    else:
        merged = merged.join(tmp, how='inner')
merged.shape

(104, 1433)

In [None]:
if REMOVE_CONSTANT_BITS:
    merged = merged[merged.columns[merged.nunique() > 1]]
merged.shape

(104, 604)

In [None]:
if REMOVE_DUPLICATE_BITS:
    merged = merged.T.drop_duplicates().T
merged.shape

(104, 302)

In [None]:
descriptive_data = metadata_df[DESCRIPTIVE_COLUMNS].set_index('inchi')
fingerprints_df = descriptive_data.join(merged, how='inner').reset_index()
other_columns = [c for c in fingerprints_df.columns if c not in DESCRIPTIVE_COLUMNS]
fingerprints_df = fingerprints_df[DESCRIPTIVE_COLUMNS + other_columns]
fingerprints_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Columns: 305 entries, inchi_key to substructure_bit_306
dtypes: int64(302), object(3)
memory usage: 247.9+ KB


In [None]:
fingerprints_df.head(3)

Unnamed: 0,inchi_key,inchi,name,estate_bit_6,estate_bit_7,estate_bit_8,estate_bit_9,estate_bit_10,estate_bit_11,estate_bit_12,...,substructure_bit_134,substructure_bit_135,substructure_bit_278,substructure_bit_286,substructure_bit_290,substructure_bit_299,substructure_bit_300,substructure_bit_302,substructure_bit_304,substructure_bit_306
0,FWZOFSHJDAIJQE-UHFFFAOYSA-N,InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,"Cannabidiol, O,O-bis trimethylsilyl ester",1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,VUNXPEWGQXFNOL-UHFFFAOYSA-N,InChI=1S/C24H34O2Si/c1-8-9-10-11-18-15-21-23(2...,"Cannabinol, O-trimethylsilyl-",1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,JFPSLJJGWCHYOE-WOJBJXKFSA-N,InChI=1S/C24H38O2Si/c1-8-9-10-11-18-15-21-23(2...,"Δ9-tetrahydrocannabinol, TMS derivative",1,0,1,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1


In [None]:
print(f"Dataframe shape is {fingerprints_df.shape}")
fingerprints_df = fingerprints_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {fingerprints_df.shape}")

Dataframe shape is (104, 305)
Dataframe shape is (104, 305)


In [None]:
fingerprints_df.to_csv(f'{embedding_folder}/fingerprint.csv', index=False)

### Preprocess Spectra

In [None]:
spectra_documents = preprocess_file(spectra)
len(spectra_documents)

3144

### Train Spec2Vec Model

In [None]:
EPOCHS = 25
WORKERS = 6
PROCESS_LOGGER = True

In [None]:
if model_file is None:
    model = train_new_word2vec_model(spectra_documents, iterations=EPOCHS, workers=WORKERS, progress_logger=PROCESS_LOGGER)

    model_file = f'{model_folder}/spec2vec.model'
    model.save(model_file)

### Construct Spec2Vec Embeddings

In [None]:
model = gensim.models.Word2Vec.load(model_file)
model = Spec2Vec(model=model)

In [None]:
embedding_df = []
for spectra in spectra_documents:
    inchikey = spectra.metadata.get('inchikey')

    embedding = calc_vector(model.model, spectra)
    embedding_df.append((inchikey, *embedding))
embedding_df = pd.DataFrame(embedding_df, columns=['inchi_key', *[str(i) for i in range(len(embedding))]])
embedding_df.set_index('inchi_key', inplace=True)
embedding_df.shape

(3144, 300)

In [None]:
descriptive_data = metadata_df[DESCRIPTIVE_COLUMNS].set_index('inchi_key')
embedding_df = descriptive_data.join(embedding_df, how='inner').reset_index()
other_columns = [c for c in embedding_df.columns if c not in DESCRIPTIVE_COLUMNS]
embedding_df = embedding_df[DESCRIPTIVE_COLUMNS + other_columns]
embedding_df.shape

(3082, 303)

In [None]:
embedding_df.to_csv(f'{embedding_folder}/spec2vec.csv', index=False)

### Merge Fingerprints and Embeddings

In [None]:
fingerprint_df = pd.read_csv(f'{embedding_folder}/fingerprint.csv')
fingerprint_df.shape

(104, 305)

In [None]:
embedding_df = pd.read_csv(f'{embedding_folder}/spec2vec.csv')
embedding_df.shape

(3082, 303)

In [None]:
fingerprint_keys = set(fingerprint_df['inchi_key'])
embedding_keys = set(embedding_df['inchi_key'])

print("Missing in fingerprint:")
print(fingerprint_keys - embedding_keys)
print("Missing in embedding:")
print(embedding_keys - fingerprint_keys)

Missing in fingerprint:
{'OIBARLCQMDCDSG-NSHDSACASA-N', 'AYONZGOWFAKCNA-UHFFFAOYSA-N'}
Missing in embedding:
{'HBWAMRSFAPVOKZ-UHFFFAOYSA-N'}


In [None]:
prefixed_fingerprint_df = fingerprint_df.copy()
prefixed_fingerprint_df.set_index(DESCRIPTIVE_COLUMNS, inplace=True)
prefixed_fingerprint_df = prefixed_fingerprint_df[prefixed_fingerprint_df.columns[prefixed_fingerprint_df.dtypes != 'object']]
prefixed_fingerprint_df = prefixed_fingerprint_df.add_prefix('fingerprint_')

In [None]:
prefixed_embedding_df = embedding_df.copy()
prefixed_embedding_df.set_index(DESCRIPTIVE_COLUMNS, inplace=True)
prefixed_embedding_df = prefixed_embedding_df[prefixed_embedding_df.columns[prefixed_embedding_df.dtypes != 'object']]
prefixed_embedding_df = prefixed_embedding_df.add_prefix('embedding_')

In [None]:
merged_df = prefixed_fingerprint_df.join(prefixed_embedding_df, how='inner')
merged_df.reset_index(inplace=True)
merged_df.shape

(3052, 605)

In [None]:
merged_df.to_csv(f'{embedding_folder}/merged.csv', index=False)