# This is Step 0 in the Pipeline - Embedding
With this notebook we generate molecular fingerprints and Spec2Vec embeddings.

### Imports

In [1]:
import pandas as pd
import os
from spec2vec.model_building import train_new_word2vec_model
from mass_spectra.to_fingerprint import generate_fingerprint, AVAILABLE_FINGERPRINTS
from mass_spectra.train_spec2vec import preprocess_file
from spec2vec import Spec2Vec, calc_vector
import gensim

In [2]:
import importlib
import mass_spectra.train_spec2vec
importlib.reload(mass_spectra.train_spec2vec)

<module 'mass_spectra.train_spec2vec' from 'c:\\Users\\aleks\\Projects\\IJS\\mass_spectra\\mass_spectra\\train_spec2vec.py'>

### Define source files and output folder

In [3]:
metadata = './source/dataset/Metadata_test_TMS_derivatives.xlsx'
spectra = './source/dataset/Test dataset_TMS_RAW.mgf'
model_folder = './source/spec2vec/all_positive/'
embedding_folder = './source/embedding/all_positive_all_fingerprints/'

In [4]:
model_file = './source/spec2vec/all_positive/spec2vec.model' # Can be None if the pipeline should train a new model, otherwise specify path to model file (.model ending)

In [5]:
assert os.path.isfile(metadata)
assert os.path.isfile(spectra)
assert os.path.isdir(model_folder)
assert os.path.isdir(embedding_folder)
assert model_file is None or os.path.isfile(model_file)
assert metadata.endswith('.xlsx')
assert spectra.endswith('.mgf')
assert model_file is None or model_file.endswith('.model')

### Define Parameters

In [6]:
AVAILABLE_FINGERPRINTS

['AtomPairs2DFingerprinter',
 'CircularFingerprinter',
 'EStateFingerprinter',
 'ExtendedFingerprinter',
 'KlekotaRothFingerprinter',
 'MACCSFingerprinter',
 'PubchemFingerprinter',
 'SubstructureFingerprinter']

In [7]:
FINGERPRINTS = ['EStateFingerprinter', 'MACCSFingerprinter', 'PubchemFingerprinter', 'SubstructureFingerprinter']

In [8]:
assert set(FINGERPRINTS).issubset(set(AVAILABLE_FINGERPRINTS))

### Read Source Files

In [9]:
metadata_df = pd.read_excel(metadata)
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      106 non-null    object 
 1   Molecular formula         106 non-null    object 
 2   Exact mass                106 non-null    float64
 3   PubChem ID                106 non-null    object 
 4   InChI                     105 non-null    object 
 5   InChI Key                 106 non-null    object 
 6   SMILEs                    105 non-null    object 
 7   Parent compound           106 non-null    object 
 8   Molecular formula.1       106 non-null    object 
 9   Exact mass.1              106 non-null    float64
 10  PubChem ID.1              106 non-null    object 
 11  InChI parent              106 non-null    object 
 12  InChI Key parent          106 non-null    object 
 13  SMILEs parent             106 non-null    object 
 14  present_in

In [10]:
print(f"Dataframe shape is {metadata_df.shape}")
metadata_df = metadata_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {metadata_df.shape}")

Dataframe shape is (106, 15)
Dataframe shape is (105, 15)


In [11]:
# Sanitize source columns
def sanitize(s):
    s = s.lower()
    s = s.strip()
    s = s.replace(' ', '_')
    return s
metadata_df.rename(columns={c: sanitize(c) for c in metadata_df.columns}, inplace=True)

# Sanitize data in string columns
metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']] = metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']].applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [12]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105 entries, 0 to 105
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      105 non-null    object 
 1   molecular_formula         105 non-null    object 
 2   exact_mass                105 non-null    float64
 3   pubchem_id                105 non-null    object 
 4   inchi                     105 non-null    object 
 5   inchi_key                 105 non-null    object 
 6   smiles                    105 non-null    object 
 7   parent_compound           105 non-null    object 
 8   molecular_formula.1       105 non-null    object 
 9   exact_mass.1              105 non-null    float64
 10  pubchem_id.1              105 non-null    object 
 11  inchi_parent              105 non-null    object 
 12  inchi_key_parent          105 non-null    object 
 13  smiles_parent             105 non-null    object 
 14  present_in_trai

### Construct Fingerprints

In [13]:
# Generate fingerprints (can take some time depending on the number of molecules)
# If the fingerprint is not available it will be replaced by None values
fingerprints = generate_fingerprint(FINGERPRINTS, metadata_df['inchi'])

Generating EStateFingerprinter fingerprint
Converting 105 InChI keys to 79 bit EStateFingerprinter fingerprint
Error Number 0 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 1 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19

In [15]:
# join all fingerprints dataframes by inchi column. Use fingerprint name as prefix without 'Fingerprinter' suffix
merged = fingerprints[FINGERPRINTS[0]]


In [16]:
merged

Unnamed: 0,inchi,bit_0,bit_1,bit_2,bit_3,bit_4,bit_5,bit_6,bit_7,bit_8,...,bit_69,bit_70,bit_71,bit_72,bit_73,bit_74,bit_75,bit_76,bit_77,bit_78
0,InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,InChI=1S/C24H34O2Si/c1-8-9-10-11-18-15-21-23(2...,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,InChI=1S/C24H38O2Si/c1-8-9-10-11-18-15-21-23(2...,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,InChI=1S/C24H38O2Si/c1-8-9-10-13-20-17-22-21(2...,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,InChI=1S/C31H54O4Si3/c1-14-15-16-17-24-21-27(3...,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,InChI=1S/C25H44O3Si2/c1-24-14-13-18(26)15-17(2...,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
101,InChI=1S/C22H34O3Si/c1-21-11-10-15(23)12-14(21...,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
102,"InChI=1S/C25H46O2Si2/c1-24-15-13-19(26-28(3,4)...",0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
103,InChI=1S/C22H38O2Si/c1-21-12-10-16(23)14-15(21...,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [1]:
fingerprints_df = pd.concat([metadata_df[['name', 'inchi', 'inchi_key']], pd.DataFrame(fingerprints[FINGERPRINT])], axis=1)
fingerprints_df.info()

NameError: name 'pd' is not defined

In [None]:
print(f"Dataframe shape is {fingerprints_df.shape}")
fingerprints_df = fingerprints_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {fingerprints_df.shape}")

Dataframe shape is (106, 169)
Dataframe shape is (103, 169)


In [None]:
fingerprints_df.to_csv(f'{embedding_folder}/fingerprint.csv', index=False)

### Preprocess Spectra

In [None]:
spectra_documents = preprocess_file(spectra)
len(spectra_documents)

3144

### Train Spec2Vec Model

In [None]:
EPOCHS = 25
WORKERS = 6
PROCESS_LOGGER = True

In [None]:
if model_file is None:
    model = train_new_word2vec_model(spectra_documents, iterations=EPOCHS, workers=WORKERS, progress_logger=PROCESS_LOGGER)

    model_file = f'{model_folder}/spec2vec.model'
    model.save(model_file)

### Construct Spec2Vec Embeddings

In [None]:
model = gensim.models.Word2Vec.load(model_file)
model = Spec2Vec(model=model)

In [None]:
embedding_df = []
for spectra in spectra_documents:
    title = spectra.metadata.get('title')
    inchikey = spectra.metadata.get('inchikey')

    embedding = calc_vector(model.model, spectra)
    embedding_df.append((title, inchikey, *embedding))
embedding_df = pd.DataFrame(embedding_df, columns=['name', 'inchi_key', *[str(i) for i in range(len(embedding))]])

In [None]:
# Sanitize data in string columns
embedding_df[embedding_df.columns[embedding_df.dtypes == 'object']] = embedding_df[embedding_df.columns[embedding_df.dtypes == 'object']].applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [None]:
embedding_df.to_csv(f'{embedding_folder}/spec2vec.csv', index=False)

### Merge Fingerprints and Embeddings

In [None]:
fingerprint_df = pd.read_csv(f'{embedding_folder}/fingerprint.csv')
fingerprint_df.shape

(103, 169)

In [None]:
embedding_df = pd.read_csv(f'{embedding_folder}/spec2vec.csv')
embedding_df.shape

(3144, 302)

In [None]:
fingerprint_keys = set(fingerprint_df['inchi_key'])
embedding_keys = set(embedding_df['inchi_key'])

print("Missing in fingerprint:")
print(fingerprint_keys - embedding_keys)
print("Missing in embedding:")
print(embedding_keys - fingerprint_keys)

Missing in fingerprint:
{'AYONZGOWFAKCNA-UHFFFAOYSA-N', 'OIBARLCQMDCDSG-NSHDSACASA-N'}
Missing in embedding:
{'ORYOBNFVKJSNIY-UHFFFAOYSA-N', 'PXHFJGOVGANQGI-UHFFFAOYSA-N', 'HGGWBFIRNWOJCL-CPDXTSBQSA-N', 'JZGPZUIFYWMNKG-UHFFFAOYSA-N', 'HBWAMRSFAPVOKZ-UHFFFAOYSA-N'}


In [None]:
prefixed_fingerprint_df = fingerprint_df.copy()
prefixed_fingerprint_df.set_index('inchi_key', inplace=True)
prefixed_fingerprint_df = prefixed_fingerprint_df[prefixed_fingerprint_df.columns[prefixed_fingerprint_df.dtypes != 'object']]
prefixed_fingerprint_df = prefixed_fingerprint_df.add_prefix('fingerprint_')

In [None]:
prefixed_embedding_df = embedding_df.copy()
prefixed_embedding_df.set_index('inchi_key', inplace=True)
prefixed_embedding_df = prefixed_embedding_df[prefixed_embedding_df.columns[prefixed_embedding_df.dtypes != 'object']]
prefixed_embedding_df = prefixed_embedding_df.add_prefix('embedding_')

In [None]:
merged_df = prefixed_fingerprint_df.join(prefixed_embedding_df, how='inner')
merged_df.reset_index(inplace=True)
merged_df.shape

(3025, 467)

In [None]:
merged_df.to_csv(f'{embedding_folder}/merged.csv', index=False)