# This is Step 0 in the Pipeline - Embedding
With this notebook we generate molecular fingerprints and Spec2Vec embeddings.

### Imports

In [29]:
import pandas as pd
import os
from spec2vec.model_building import train_new_word2vec_model
from mass_spectra.to_fingerprint import generate_fingerprint, AVAILABLE_FINGERPRINTS
from mass_spectra.train_spec2vec import preprocess_file
from spec2vec import Spec2Vec, calc_vector
import gensim

### Define source files and output folder

In [30]:
metadata = './source/dataset/Metadata_test_TMS_derivatives.xlsx'
spectra = './source/dataset/Test dataset_TMS_RAW.mgf'
model_folder = './source/spec2vec/tms/'
embedding_folder = './source/embedding/tms_maccs/'

In [34]:
model_file = None # Can be None if the pipeline should train a new model, otherwise specify path to model file (.model ending)

In [36]:
assert os.path.isfile(metadata)
assert os.path.isfile(spectra)
assert os.path.isdir(model_folder)
assert os.path.isdir(embedding_folder)
assert model_file is None or os.path.isfile(model_file)
assert metadata.endswith('.xlsx')
assert spectra.endswith('.mgf')
assert model_file is None or model_file.endswith('.model')

### Define Parameters

In [37]:
AVAILABLE_FINGERPRINTS

['AtomPairs2DFingerprinter',
 'CircularFingerprinter',
 'EStateFingerprinter',
 'ExtendedFingerprinter',
 'KlekotaRothFingerprinter',
 'LingoFingerprinter',
 'MACCSFingerprinter',
 'PubchemFingerprinter']

In [38]:
FINGERPRINT = 'MACCSFingerprinter'

In [39]:
assert FINGERPRINT in AVAILABLE_FINGERPRINTS

### Read Source Files

In [40]:
metadata_df = pd.read_excel(metadata)
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      106 non-null    object 
 1   Molecular formula         106 non-null    object 
 2   Exact mass                106 non-null    float64
 3   PubChem ID                106 non-null    object 
 4   InChI                     105 non-null    object 
 5   InChI Key                 106 non-null    object 
 6   SMILEs                    105 non-null    object 
 7   Parent compound           106 non-null    object 
 8   Molecular formula.1       106 non-null    object 
 9   Exact mass.1              106 non-null    float64
 10  PubChem ID.1              106 non-null    object 
 11  InChI parent              106 non-null    object 
 12  InChI Key parent          106 non-null    object 
 13  SMILEs parent             106 non-null    object 
 14  present_in

In [41]:
print(f"Dataframe shape is {metadata_df.shape}")
metadata_df = metadata_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {metadata_df.shape}")

Dataframe shape is (106, 15)
Dataframe shape is (105, 15)


In [42]:
# Sanitize source columns
def sanitize(s):
    s = s.lower()
    s = s.strip()
    s = s.replace(' ', '_')
    return s
metadata_df.rename(columns={c: sanitize(c) for c in metadata_df.columns}, inplace=True)

# Sanitize data in string columns
metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']] = metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']].applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [43]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105 entries, 0 to 105
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      105 non-null    object 
 1   molecular_formula         105 non-null    object 
 2   exact_mass                105 non-null    float64
 3   pubchem_id                105 non-null    object 
 4   inchi                     105 non-null    object 
 5   inchi_key                 105 non-null    object 
 6   smiles                    105 non-null    object 
 7   parent_compound           105 non-null    object 
 8   molecular_formula.1       105 non-null    object 
 9   exact_mass.1              105 non-null    float64
 10  pubchem_id.1              105 non-null    object 
 11  inchi_parent              105 non-null    object 
 12  inchi_key_parent          105 non-null    object 
 13  smiles_parent             105 non-null    object 
 14  present_in_trai

### Construct Fingerprints

In [44]:
# Generate fingerprints (can take some time depending on the number of molecules)
# If the fingerprint is not available it will be replaced by None values
fingerprints = generate_fingerprint([FINGERPRINT], metadata_df['inchi'])

Error Number 0 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 1 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 2 at idx 89 converting InChI ke

In [45]:
fingerprints_df = pd.concat([df[['name', 'inchi', 'inchi_key']], pd.DataFrame(fingerprints[FINGERPRINT])], axis=1)
fingerprints_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 106 entries, 0 to 98
Columns: 169 entries, name to 165
dtypes: float64(166), object(3)
memory usage: 140.8+ KB


In [46]:
print(f"Dataframe shape is {fingerprints_df.shape}")
fingerprints_df = fingerprints_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {fingerprints_df.shape}")

Dataframe shape is (106, 169)
Dataframe shape is (103, 169)


In [47]:
fingerprints_df.to_csv(f'{embedding_folder}/fingerprint.csv', index=False)

### Preprocess Spectra

In [48]:
spectra_documents = preprocess_file(spectra)
len(spectra_documents)

3144

### Train Spec2Vec Model

In [49]:
EPOCHS = 10
WORKERS = 6
PROCESS_LOGGER = True

In [50]:
if model_file is None:
    model = train_new_word2vec_model(spectra_documents, iterations=EPOCHS, workers=WORKERS, progress_logger=PROCESS_LOGGER)

    model_file = f'{model_folder}/spec2vec.model'
    model.save(model_file)

  Epoch 1 of 10.Change in loss after epoch 1: 649373.625
  Epoch 2 of 10.Change in loss after epoch 2: 545320.375
  Epoch 3 of 10.Change in loss after epoch 3: 539171.5
  Epoch 4 of 10.Change in loss after epoch 4: 517972.0
  Epoch 5 of 10.Change in loss after epoch 5: 526741.5
  Epoch 6 of 10.Change in loss after epoch 6: 498357.25
  Epoch 7 of 10.Change in loss after epoch 7: 502363.75
  Epoch 8 of 10.Change in loss after epoch 8: 491027.5
  Epoch 9 of 10.Change in loss after epoch 9: 467041.0
  Epoch 10 of 10.Change in loss after epoch 10: 408274.5


### Construct Spec2Vec Embeddings

In [51]:
model = gensim.models.Word2Vec.load(model_file)
model = Spec2Vec(model=model)

In [52]:
embedding_df = []
for spectra in spectra_documents:
    title = spectra.metadata.get('title')
    inchikey = spectra.metadata.get('inchikey')

    embedding = calc_vector(model.model, spectra)
    embedding_df.append((title, inchikey, *embedding))
embedding_df = pd.DataFrame(embedding_df, columns=['name', 'inchi_key', *[str(i) for i in range(len(embedding))]])

In [53]:
# Sanitize data in string columns
embedding_df[embedding_df.columns[embedding_df.dtypes == 'object']] = embedding_df[embedding_df.columns[embedding_df.dtypes == 'object']].applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [54]:
embedding_df.to_csv(f'{embedding_folder}/spec2vec.csv', index=False)

### Merge Fingerprints and Embeddings

In [55]:
fingerprint_df = pd.read_csv(f'{embedding_folder}/fingerprint.csv')
fingerprint_df.shape

(103, 169)

In [56]:
embedding_df = pd.read_csv(f'{embedding_folder}/spec2vec.csv')
embedding_df.shape

(3144, 302)

In [57]:
fingerprint_keys = set(fingerprint_df['inchi_key'])
embedding_keys = set(embedding_df['inchi_key'])

print("Missing in fingerprint:")
print(fingerprint_keys - embedding_keys)
print("Missing in embedding:")
print(embedding_keys - fingerprint_keys)

Missing in fingerprint:
{'OIBARLCQMDCDSG-NSHDSACASA-N', 'AYONZGOWFAKCNA-UHFFFAOYSA-N'}
Missing in embedding:
{'PXHFJGOVGANQGI-UHFFFAOYSA-N', 'HBWAMRSFAPVOKZ-UHFFFAOYSA-N', 'JZGPZUIFYWMNKG-UHFFFAOYSA-N', 'ORYOBNFVKJSNIY-UHFFFAOYSA-N', 'HGGWBFIRNWOJCL-CPDXTSBQSA-N'}


In [58]:
prefixed_fingerprint_df = fingerprint_df.copy()
prefixed_fingerprint_df.set_index('inchi_key', inplace=True)
prefixed_fingerprint_df = prefixed_fingerprint_df[prefixed_fingerprint_df.columns[prefixed_fingerprint_df.dtypes != 'object']]
prefixed_fingerprint_df = prefixed_fingerprint_df.add_prefix('fingerprint_')

In [59]:
prefixed_embedding_df = embedding_df.copy()
prefixed_embedding_df.set_index('inchi_key', inplace=True)
prefixed_embedding_df = prefixed_embedding_df[prefixed_embedding_df.columns[prefixed_embedding_df.dtypes != 'object']]
prefixed_embedding_df = prefixed_embedding_df.add_prefix('embedding_')

In [60]:
merged_df = prefixed_fingerprint_df.join(prefixed_embedding_df, how='inner')
merged_df.reset_index(inplace=True)
merged_df.shape

(3025, 467)

In [62]:
merged_df.to_csv(f'{embedding_folder}/merged.csv', index=False)