# This is Step 0 in the Pipeline - Embedding
With this notebook we generate molecular fingerprints and Spec2Vec embeddings.

### Imports

In [108]:
import pandas as pd
import os
from spec2vec.model_building import train_new_word2vec_model
from mass_spectra.to_fingerprint import generate_fingerprint, AVAILABLE_FINGERPRINTS
from mass_spectra.train_spec2vec import preprocess_file
from spec2vec import Spec2Vec, calc_vector
import gensim

In [109]:
import importlib
import mass_spectra.train_spec2vec
importlib.reload(mass_spectra.train_spec2vec)

<module 'mass_spectra.train_spec2vec' from 'c:\\Users\\aleks\\Projects\\IJS\\mass_spectra\\mass_spectra\\train_spec2vec.py'>

### Define source files and output folder

In [110]:
metadata = './source/dataset/Metadata_test_TMS_derivatives.xlsx'
spectra = './source/dataset/Test dataset_TMS_RAW.mgf'
model_folder = './source/spec2vec/all_positive/'
embedding_folder = './source/embedding/all_positive_all_fingerprints/'

In [111]:
model_file = './source/spec2vec/all_positive/spec2vec.model' # Can be None if the pipeline should train a new model, otherwise specify path to model file (.model ending)

In [112]:
assert os.path.isfile(metadata)
assert os.path.isfile(spectra)
assert os.path.isdir(model_folder)
assert os.path.isdir(embedding_folder)
assert model_file is None or os.path.isfile(model_file)
assert metadata.endswith('.xlsx')
assert spectra.endswith('.mgf')
assert model_file is None or model_file.endswith('.model')

### Define Parameters

In [113]:
AVAILABLE_FINGERPRINTS

['AtomPairs2DFingerprinter',
 'CircularFingerprinter',
 'EStateFingerprinter',
 'ExtendedFingerprinter',
 'KlekotaRothFingerprinter',
 'MACCSFingerprinter',
 'PubchemFingerprinter',
 'SubstructureFingerprinter']

In [114]:
FINGERPRINTS = ['EStateFingerprinter', 'MACCSFingerprinter', 'PubchemFingerprinter', 'SubstructureFingerprinter']

In [115]:
FINGERPRINT_NAMES = [v.replace('Fingerprinter', '') for v in FINGERPRINTS]

In [116]:
REMOVE_CONSTANT_BITS = True
REMOVE_DUPLICATE_BITS = True

In [117]:
assert set(FINGERPRINTS).issubset(set(AVAILABLE_FINGERPRINTS))

### Read Source Files

In [118]:
metadata_df = pd.read_excel(metadata)
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      106 non-null    object 
 1   Molecular formula         106 non-null    object 
 2   Exact mass                106 non-null    float64
 3   PubChem ID                106 non-null    object 
 4   InChI                     105 non-null    object 
 5   InChI Key                 106 non-null    object 
 6   SMILEs                    105 non-null    object 
 7   Parent compound           106 non-null    object 
 8   Molecular formula.1       106 non-null    object 
 9   Exact mass.1              106 non-null    float64
 10  PubChem ID.1              106 non-null    object 
 11  InChI parent              106 non-null    object 
 12  InChI Key parent          106 non-null    object 
 13  SMILEs parent             106 non-null    object 
 14  present_in

In [119]:
print(f"Dataframe shape is {metadata_df.shape}")
metadata_df = metadata_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {metadata_df.shape}")

Dataframe shape is (106, 15)
Dataframe shape is (105, 15)


In [120]:
# Sanitize source columns
def sanitize(s):
    s = s.lower()
    s = s.strip()
    s = s.replace(' ', '_')
    return s
metadata_df.rename(columns={c: sanitize(c) for c in metadata_df.columns}, inplace=True)

# Sanitize data in string columns
metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']] = metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']].applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [121]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105 entries, 0 to 105
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      105 non-null    object 
 1   molecular_formula         105 non-null    object 
 2   exact_mass                105 non-null    float64
 3   pubchem_id                105 non-null    object 
 4   inchi                     105 non-null    object 
 5   inchi_key                 105 non-null    object 
 6   smiles                    105 non-null    object 
 7   parent_compound           105 non-null    object 
 8   molecular_formula.1       105 non-null    object 
 9   exact_mass.1              105 non-null    float64
 10  pubchem_id.1              105 non-null    object 
 11  inchi_parent              105 non-null    object 
 12  inchi_key_parent          105 non-null    object 
 13  smiles_parent             105 non-null    object 
 14  present_in_trai

In [122]:
DESCRIPTIVE_COLUMNS = ['inchi_key', 'inchi', 'name']

In [123]:
metadata_df[DESCRIPTIVE_COLUMNS].head(3)

Unnamed: 0,inchi_key,inchi,name
0,FWZOFSHJDAIJQE-UHFFFAOYSA-N,InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,"Cannabidiol, O,O-bis trimethylsilyl ester"
1,VUNXPEWGQXFNOL-UHFFFAOYSA-N,InChI=1S/C24H34O2Si/c1-8-9-10-11-18-15-21-23(2...,"Cannabinol, O-trimethylsilyl-"
2,JFPSLJJGWCHYOE-WOJBJXKFSA-N,InChI=1S/C24H38O2Si/c1-8-9-10-11-18-15-21-23(2...,"Δ9-tetrahydrocannabinol, TMS derivative"


### Construct Fingerprints

In [124]:
# Generate fingerprints (can take some time depending on the number of molecules)
# If the fingerprint is not available it will be replaced by None values
fingerprints = generate_fingerprint(FINGERPRINTS, metadata_df['inchi'])

Generating EStateFingerprinter fingerprint
Converting 105 InChI keys to 79 bit EStateFingerprinter fingerprint


Error Number 0 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 1 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 2 at idx 89 converting InChI ke

In [125]:
# check if inchi is set as index and if not set it
for k, v in fingerprints.items():
    if not v.index.name == 'inchi':
        fingerprints[k].set_index('inchi', inplace=True)

In [126]:
# combine all generated fingerprints with prefixed column names
merged = None
for fp, fp_df in fingerprints.items():
    tmp = fp_df.copy()
    tmp.columns = [f'{fp.lower().replace("fingerprinter", "")}_{c}' for c in tmp.columns]
    if merged is None:
        merged = tmp
    else:
        merged = merged.join(tmp, how='inner')
merged.shape

(104, 1433)

In [127]:
if REMOVE_CONSTANT_BITS:
    merged = merged[merged.columns[merged.nunique() > 1]]
merged.shape

(104, 604)

In [128]:
if REMOVE_DUPLICATE_BITS:
    merged = merged.T.drop_duplicates().T
merged.shape

(104, 302)

In [129]:
descriptive_data = metadata_df[DESCRIPTIVE_COLUMNS].set_index('inchi')
fingerprints_df = descriptive_data.join(merged, how='inner').reset_index()
other_columns = [c for c in fingerprints_df.columns if c not in DESCRIPTIVE_COLUMNS]
fingerprints_df = fingerprints_df[DESCRIPTIVE_COLUMNS + other_columns]
fingerprints_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Columns: 305 entries, inchi_key to substructure_bit_306
dtypes: int64(302), object(3)
memory usage: 247.9+ KB


In [130]:
fingerprints_df.head(3)

Unnamed: 0,inchi_key,inchi,name,estate_bit_6,estate_bit_7,estate_bit_8,estate_bit_9,estate_bit_10,estate_bit_11,estate_bit_12,...,substructure_bit_134,substructure_bit_135,substructure_bit_278,substructure_bit_286,substructure_bit_290,substructure_bit_299,substructure_bit_300,substructure_bit_302,substructure_bit_304,substructure_bit_306
0,FWZOFSHJDAIJQE-UHFFFAOYSA-N,InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,"Cannabidiol, O,O-bis trimethylsilyl ester",1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,VUNXPEWGQXFNOL-UHFFFAOYSA-N,InChI=1S/C24H34O2Si/c1-8-9-10-11-18-15-21-23(2...,"Cannabinol, O-trimethylsilyl-",1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,JFPSLJJGWCHYOE-WOJBJXKFSA-N,InChI=1S/C24H38O2Si/c1-8-9-10-11-18-15-21-23(2...,"Δ9-tetrahydrocannabinol, TMS derivative",1,0,1,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1


In [131]:
print(f"Dataframe shape is {fingerprints_df.shape}")
fingerprints_df = fingerprints_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {fingerprints_df.shape}")

Dataframe shape is (104, 305)
Dataframe shape is (104, 305)


In [132]:
fingerprints_df.to_csv(f'{embedding_folder}/fingerprint.csv', index=False)

### Preprocess Spectra

In [133]:
spectra_documents = preprocess_file(spectra)
len(spectra_documents)

3144

### Train Spec2Vec Model

In [134]:
EPOCHS = 25
WORKERS = 6
PROCESS_LOGGER = True

In [135]:
if model_file is None:
    model = train_new_word2vec_model(spectra_documents, iterations=EPOCHS, workers=WORKERS, progress_logger=PROCESS_LOGGER)

    model_file = f'{model_folder}/spec2vec.model'
    model.save(model_file)

### Construct Spec2Vec Embeddings

In [136]:
model = gensim.models.Word2Vec.load(model_file)
model = Spec2Vec(model=model)

In [137]:
embedding_df = []
for spectra in spectra_documents:
    inchikey = spectra.metadata.get('inchikey')

    embedding = calc_vector(model.model, spectra)
    embedding_df.append((inchikey, *embedding))
embedding_df = pd.DataFrame(embedding_df, columns=['inchi_key', *[str(i) for i in range(len(embedding))]])
embedding_df.set_index('inchi_key', inplace=True)
embedding_df.shape

(3144, 300)

In [138]:
descriptive_data = metadata_df[DESCRIPTIVE_COLUMNS].set_index('inchi_key')
embedding_df = descriptive_data.join(embedding_df, how='inner').reset_index()
other_columns = [c for c in embedding_df.columns if c not in DESCRIPTIVE_COLUMNS]
embedding_df = embedding_df[DESCRIPTIVE_COLUMNS + other_columns]
embedding_df.shape

(3082, 303)

In [139]:
embedding_df.to_csv(f'{embedding_folder}/spec2vec.csv', index=False)

### Merge Fingerprints and Embeddings

In [140]:
fingerprint_df = pd.read_csv(f'{embedding_folder}/fingerprint.csv')
fingerprint_df.shape

(104, 305)

In [141]:
embedding_df = pd.read_csv(f'{embedding_folder}/spec2vec.csv')
embedding_df.shape

(3082, 303)

In [142]:
fingerprint_keys = set(fingerprint_df['inchi_key'])
embedding_keys = set(embedding_df['inchi_key'])

print("Missing in fingerprint:")
print(fingerprint_keys - embedding_keys)
print("Missing in embedding:")
print(embedding_keys - fingerprint_keys)

Missing in fingerprint:
{'OIBARLCQMDCDSG-NSHDSACASA-N', 'AYONZGOWFAKCNA-UHFFFAOYSA-N'}
Missing in embedding:
{'HBWAMRSFAPVOKZ-UHFFFAOYSA-N'}


In [143]:
prefixed_fingerprint_df = fingerprint_df.copy()
prefixed_fingerprint_df.set_index(DESCRIPTIVE_COLUMNS, inplace=True)
prefixed_fingerprint_df = prefixed_fingerprint_df[prefixed_fingerprint_df.columns[prefixed_fingerprint_df.dtypes != 'object']]
prefixed_fingerprint_df = prefixed_fingerprint_df.add_prefix('fingerprint_')

In [144]:
prefixed_embedding_df = embedding_df.copy()
prefixed_embedding_df.set_index(DESCRIPTIVE_COLUMNS, inplace=True)
prefixed_embedding_df = prefixed_embedding_df[prefixed_embedding_df.columns[prefixed_embedding_df.dtypes != 'object']]
prefixed_embedding_df = prefixed_embedding_df.add_prefix('embedding_')

In [145]:
merged_df = prefixed_fingerprint_df.join(prefixed_embedding_df, how='inner')
merged_df.reset_index(inplace=True)
merged_df.shape

(3052, 605)

In [146]:
merged_df.to_csv(f'{embedding_folder}/merged.csv', index=False)