# This is Step 0 in the Pipeline - Embedding
With this notebook we generate molecular fingerprints and Spec2Vec embeddings.

### Imports

In [23]:
import pandas as pd
import os
from spec2vec.model_building import train_new_word2vec_model
from mass_spectra.to_fingerprint import generate_fingerprint, inchikey_to_inchi, AVAILABLE_FINGERPRINTS
from mass_spectra.train_spec2vec import preprocess_file
from spec2vec import Spec2Vec, calc_vector
import gensim
from time import time, sleep
from tqdm.notebook import tqdm
import requests

In [24]:
import importlib
import mass_spectra.train_spec2vec
importlib.reload(mass_spectra.train_spec2vec)

<module 'mass_spectra.train_spec2vec' from 'c:\\Users\\aleks\\Projects\\IJS\\mass_spectra\\mass_spectra\\train_spec2vec.py'>

### Define source files and output folder

In [25]:
metadata = './source/compounds/compounds.pkl'
spectra = './source/dataset/Train NIST 3.1 dataset_TMS_BS.mgf'
model_folder = './source/spec2vec/nist/'
embedding_folder = './source/embedding/nist_all_fingerprints/'

In [26]:
model_file = None # Can be None if the pipeline should train a new model, otherwise specify path to model file (.model ending)

In [27]:
assert os.path.isfile(metadata)
assert os.path.isfile(spectra)
assert os.path.isdir(model_folder)
assert os.path.isdir(embedding_folder)
assert model_file is None or os.path.isfile(model_file)
assert metadata.endswith('.pkl')
assert spectra.endswith('.mgf')
assert model_file is None or model_file.endswith('.model')

### Define Parameters

In [28]:
AVAILABLE_FINGERPRINTS

['AtomPairs2DFingerprinter',
 'CircularFingerprinter',
 'EStateFingerprinter',
 'ExtendedFingerprinter',
 'KlekotaRothFingerprinter',
 'MACCSFingerprinter',
 'PubchemFingerprinter',
 'SubstructureFingerprinter']

In [29]:
FINGERPRINTS = ['EStateFingerprinter', 'MACCSFingerprinter', 'PubchemFingerprinter', 'SubstructureFingerprinter']

In [30]:
FINGERPRINT_NAMES = [v.replace('Fingerprinter', '') for v in FINGERPRINTS]

In [31]:
REMOVE_CONSTANT_BITS = True
REMOVE_DUPLICATE_BITS = True

In [32]:
assert set(FINGERPRINTS).issubset(set(AVAILABLE_FINGERPRINTS))

### Read Source Files

In [33]:
metadata_df = pd.read_pickle(metadata)
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12949 entries, 0 to 12948
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   smiles     12531 non-null  object
 1   inchi      12302 non-null  object
 2   inchi_key  12949 non-null  object
dtypes: object(3)
memory usage: 303.6+ KB


In [34]:
print(f"Dataframe shape is {metadata_df.shape}")
metadata_df = metadata_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {metadata_df.shape}")

Dataframe shape is (12949, 3)
Dataframe shape is (12267, 3)


In [35]:
# Sanitize source columns
def sanitize(s):
    s = s.lower()
    s = s.strip()
    s = s.replace(' ', '_')
    return s
metadata_df.rename(columns={c: sanitize(c) for c in metadata_df.columns}, inplace=True)

# Sanitize data in string columns
metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']] = metadata_df[metadata_df.columns[metadata_df.dtypes == 'object']].applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [36]:
metadata_df.rename(columns={'inchikey': 'inchi_key'}, inplace=True)

In [37]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12267 entries, 0 to 12530
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   smiles     12267 non-null  object
 1   inchi      12267 non-null  object
 2   inchi_key  12267 non-null  object
dtypes: object(3)
memory usage: 383.3+ KB


In [38]:
DESCRIPTIVE_COLUMNS = ['inchi_key', 'inchi', 'smiles']

In [39]:
metadata_df[DESCRIPTIVE_COLUMNS].head(3)

Unnamed: 0,inchi_key,inchi,smiles
0,AGCGTUNOUDOTHI-KRYNHHJKSA-N,"InChI=1S/C54H130O16Si12/c1-71(2,3)55-37-41-47(...",C[Si](C)(C)OC[C@@H]1[C@H]([C@@H]([C@@H]([C@H](...
1,IADLJLVXUGSEBP-UHFFFAOYSA-N,"InChI=1S/C50H92O14Si8/c1-65(2,3)53-32-42-44(61...",C[Si](C)(C)OCC1C(C(C(C(O1)OC2=CC3=C(C(=C2)O[Si...
2,KGNFGWOIKRFMAR-UHFFFAOYSA-N,"InChI=1S/C52H98O15Si8/c1-34-45(62-70(9,10)11)4...",CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3=CC4=C(C(=O)CC...


### Construct Fingerprints

In [40]:
source = metadata_df['inchi']
print(source.isna().sum())
source = source.fillna(metadata_df['inchi_key'])
print(source.isna().sum())

0
0


In [41]:
# Generate fingerprints (can take some time depending on the number of molecules)
# If the fingerprint is not available it will be replaced by None values
fingerprints = generate_fingerprint(FINGERPRINTS, source)

Generating EStateFingerprinter fingerprint
Converting 12267 InChI keys to 79 bit EStateFingerprinter fingerprint
Generating MACCSFingerprinter fingerprint
Converting 12267 InChI keys to 166 bit MACCSFingerprinter fingerprint
Generating PubchemFingerprinter fingerprint
Converting 12267 InChI keys to 881 bit PubchemFingerprinter fingerprint


In [None]:
# check if inchi is set as index and if not set it
for k, v in fingerprints.items():
    if not v.index.name == 'inchi':
        fingerprints[k].set_index('inchi', inplace=True)

In [None]:
# combine all generated fingerprints with prefixed column names
merged = None
for fp, fp_df in fingerprints.items():
    tmp = fp_df.copy()
    tmp.columns = [f'{fp.lower().replace("fingerprinter", "")}_{c}' for c in tmp.columns]
    if merged is None:
        merged = tmp
    else:
        merged = merged.join(tmp, how='inner')
merged.shape

(104, 1433)

In [None]:
if REMOVE_CONSTANT_BITS:
    merged = merged[merged.columns[merged.nunique() > 1]]
merged.shape

(104, 604)

In [None]:
if REMOVE_DUPLICATE_BITS:
    merged = merged.T.drop_duplicates().T
merged.shape

(104, 302)

In [None]:
descriptive_data = metadata_df[DESCRIPTIVE_COLUMNS].set_index('inchi')
fingerprints_df = descriptive_data.join(merged, how='inner').reset_index()
other_columns = [c for c in fingerprints_df.columns if c not in DESCRIPTIVE_COLUMNS]
fingerprints_df = fingerprints_df[DESCRIPTIVE_COLUMNS + other_columns]
fingerprints_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Columns: 305 entries, inchi_key to substructure_bit_306
dtypes: int64(302), object(3)
memory usage: 247.9+ KB


In [None]:
fingerprints_df.head(3)

Unnamed: 0,inchi_key,inchi,name,estate_bit_6,estate_bit_7,estate_bit_8,estate_bit_9,estate_bit_10,estate_bit_11,estate_bit_12,...,substructure_bit_134,substructure_bit_135,substructure_bit_278,substructure_bit_286,substructure_bit_290,substructure_bit_299,substructure_bit_300,substructure_bit_302,substructure_bit_304,substructure_bit_306
0,FWZOFSHJDAIJQE-UHFFFAOYSA-N,InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,"Cannabidiol, O,O-bis trimethylsilyl ester",1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,VUNXPEWGQXFNOL-UHFFFAOYSA-N,InChI=1S/C24H34O2Si/c1-8-9-10-11-18-15-21-23(2...,"Cannabinol, O-trimethylsilyl-",1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,JFPSLJJGWCHYOE-WOJBJXKFSA-N,InChI=1S/C24H38O2Si/c1-8-9-10-11-18-15-21-23(2...,"Δ9-tetrahydrocannabinol, TMS derivative",1,0,1,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1


In [None]:
print(f"Dataframe shape is {fingerprints_df.shape}")
fingerprints_df = fingerprints_df.dropna(axis=0, how='any')
print(f"Dataframe shape is {fingerprints_df.shape}")

Dataframe shape is (104, 305)
Dataframe shape is (104, 305)


In [None]:
fingerprints_df.to_csv(f'{embedding_folder}/fingerprint.csv', index=False)

### Preprocess Spectra

In [None]:
spectra_documents = preprocess_file(spectra)
len(spectra_documents)

3144

### Train Spec2Vec Model

In [None]:
EPOCHS = 25
WORKERS = 6
PROCESS_LOGGER = True

In [None]:
if model_file is None:
    model = train_new_word2vec_model(spectra_documents, iterations=EPOCHS, workers=WORKERS, progress_logger=PROCESS_LOGGER)

    model_file = f'{model_folder}/spec2vec.model'
    model.save(model_file)

### Construct Spec2Vec Embeddings

In [None]:
model = gensim.models.Word2Vec.load(model_file)
model = Spec2Vec(model=model)

In [None]:
embedding_df = []
for spectra in spectra_documents:
    inchikey = spectra.metadata.get('inchikey')

    embedding = calc_vector(model.model, spectra)
    embedding_df.append((inchikey, *embedding))
embedding_df = pd.DataFrame(embedding_df, columns=['inchi_key', *[str(i) for i in range(len(embedding))]])
embedding_df.set_index('inchi_key', inplace=True)
embedding_df.shape

(3144, 300)

In [None]:
descriptive_data = metadata_df[DESCRIPTIVE_COLUMNS].set_index('inchi_key')
embedding_df = descriptive_data.join(embedding_df, how='inner').reset_index()
other_columns = [c for c in embedding_df.columns if c not in DESCRIPTIVE_COLUMNS]
embedding_df = embedding_df[DESCRIPTIVE_COLUMNS + other_columns]
embedding_df.shape

(3082, 303)

In [None]:
embedding_df.to_csv(f'{embedding_folder}/spec2vec.csv', index=False)

### Merge Fingerprints and Embeddings

In [None]:
fingerprint_df = pd.read_csv(f'{embedding_folder}/fingerprint.csv')
fingerprint_df.shape

(104, 305)

In [None]:
embedding_df = pd.read_csv(f'{embedding_folder}/spec2vec.csv')
embedding_df.shape

(3082, 303)

In [None]:
fingerprint_keys = set(fingerprint_df['inchi_key'])
embedding_keys = set(embedding_df['inchi_key'])

print("Missing in fingerprint:")
print(fingerprint_keys - embedding_keys)
print("Missing in embedding:")
print(embedding_keys - fingerprint_keys)

Missing in fingerprint:
{'OIBARLCQMDCDSG-NSHDSACASA-N', 'AYONZGOWFAKCNA-UHFFFAOYSA-N'}
Missing in embedding:
{'HBWAMRSFAPVOKZ-UHFFFAOYSA-N'}


In [None]:
prefixed_fingerprint_df = fingerprint_df.copy()
prefixed_fingerprint_df.set_index(DESCRIPTIVE_COLUMNS, inplace=True)
prefixed_fingerprint_df = prefixed_fingerprint_df[prefixed_fingerprint_df.columns[prefixed_fingerprint_df.dtypes != 'object']]
prefixed_fingerprint_df = prefixed_fingerprint_df.add_prefix('fingerprint_')

In [None]:
prefixed_embedding_df = embedding_df.copy()
prefixed_embedding_df.set_index(DESCRIPTIVE_COLUMNS, inplace=True)
prefixed_embedding_df = prefixed_embedding_df[prefixed_embedding_df.columns[prefixed_embedding_df.dtypes != 'object']]
prefixed_embedding_df = prefixed_embedding_df.add_prefix('embedding_')

In [None]:
merged_df = prefixed_fingerprint_df.join(prefixed_embedding_df, how='inner')
merged_df.reset_index(inplace=True)
merged_df.shape

(3052, 605)

In [None]:
merged_df.to_csv(f'{embedding_folder}/merged.csv', index=False)