In [29]:
import pandas as pd
import os

### Collect all InChIKeys

In [30]:
# Metadata sheet describing the TBDMS-derivative test compounds.
tbdms_df = pd.read_excel(io='dataset/Metadata_test_TBDMS_derivatives.xlsx')

In [31]:
# Metadata sheet describing the TMS-derivative test compounds.
tms_df = pd.read_excel(io='dataset/Metadata_test_TMS_derivatives.xlsx')

In [32]:
# Inspect the TBDMS metadata: column labels and (rows, columns) shape.
tbdms_df.columns, tbdms_df.shape

(Index(['Name', 'Molecular formula', 'Exact mass', 'PubChem ID', 'InChI',
        'InChI Key', 'SMILEs', 'Parent compound', 'Molecular formula.1',
        'Exact mass.1', 'PubChem ID.1', 'InChI parent', 'InChI Key parent',
        'SMILEs parent', 'present_in_train_dataset'],
       dtype='object'),
 (86, 15))

In [33]:
# Inspect the TMS metadata: column labels and (rows, columns) shape.
tms_df.columns, tms_df.shape

(Index(['Name', 'Molecular formula', 'Exact mass', 'PubChem ID', 'InChI',
        'InChI Key', 'SMILEs', 'Parent compound', 'Molecular formula.1',
        'Exact mass.1', 'PubChem ID.1', 'InChI parent', 'InChI Key parent',
        'SMILEs parent', 'present_in_train_dataset'],
       dtype='object'),
 (106, 15))

In [34]:
# Keep only fully populated rows: a row is discarded as soon as any one of its
# columns is missing. Both metadata frames get the same treatment.
tbdms_df, tms_df = (
    frame.dropna(axis=0, how='any') for frame in (tbdms_df, tms_df)
)

In [35]:
# Show both frame shapes again to see how many rows the dropna step removed.
tbdms_df.shape, tms_df.shape

((86, 15), (105, 15))

### Construct fingerprints for all InChIKeys

In [36]:
from cdk_inchi_to_fingerprint import generate_fingerprint, java_bitset_to_python_array

In [37]:
# Name of the fingerprinter requested from generate_fingerprint. The same
# string is used as the key into the returned result and, lower-cased with the
# 'fingerprinter' suffix removed, in the output CSV file names.
FINGERPRINT = 'MACCSFingerprinter'

In [38]:
# Request the FINGERPRINT fingerprint for every TBDMS InChI string, passing
# java_bitset_to_python_array as the parser applied to the results.
tbdms_fingerprints = generate_fingerprint([FINGERPRINT], tbdms_df['InChI'], fingerprint_parser=java_bitset_to_python_array)

In [39]:
# Request the FINGERPRINT fingerprint for every TMS InChI string, passing
# java_bitset_to_python_array as the parser applied to the results.
# NOTE(review): the saved output of this cell shows repeated conversion errors
# at idx 89 (HTTP 400 responses from www.ebi.ac.uk) — confirm how failed
# entries are represented in the result before relying on row alignment.
tms_fingerprints = generate_fingerprint([FINGERPRINT], tms_df['InChI'], fingerprint_parser=java_bitset_to_python_array)

Error Number 0 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 1 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 2 at idx 89 converting InChI ke

In [40]:
# Build the TBDMS fingerprint table: compound identifiers side by side with
# the fingerprint values, one row per compound.
tbdms_temp_df = pd.DataFrame(tbdms_fingerprints[FINGERPRINT]).reset_index(drop=True)
tbdms_selected_columns = tbdms_df[['Name', 'InChI', 'InChI Key']].reset_index(drop=True)
# pd.concat(axis=1) aligns the two frames on their (freshly reset) positional
# index. If the fingerprint result has a different number of rows than the
# metadata, compounds would silently be paired with the wrong fingerprint, so
# stop here instead of writing a misaligned table.
assert len(tbdms_temp_df) == len(tbdms_selected_columns), (
    f'TBDMS fingerprint rows ({len(tbdms_temp_df)}) do not match '
    f'metadata rows ({len(tbdms_selected_columns)})'
)
tbdms_fingerprints_df = pd.concat([tbdms_selected_columns, tbdms_temp_df], axis=1)

In [41]:
# Build the TMS fingerprint table: compound identifiers side by side with
# the fingerprint values, one row per compound.
tms_temp_df = pd.DataFrame(tms_fingerprints[FINGERPRINT]).reset_index(drop=True)
tms_selected_columns = tms_df[['Name', 'InChI', 'InChI Key']].reset_index(drop=True)
# pd.concat(axis=1) aligns the two frames on their (freshly reset) positional
# index. If the fingerprint result has a different number of rows than the
# metadata (e.g. because a failed InChI conversion was skipped), compounds
# would silently be paired with the wrong fingerprint, so stop here instead of
# writing a misaligned table.
# NOTE(review): the saved preview of this table shows float-valued bits, which
# suggests missing values in the fingerprint data — confirm and handle upstream.
assert len(tms_temp_df) == len(tms_selected_columns), (
    f'TMS fingerprint rows ({len(tms_temp_df)}) do not match '
    f'metadata rows ({len(tms_selected_columns)})'
)
tms_fingerprints_df = pd.concat([tms_selected_columns, tms_temp_df], axis=1)

In [42]:
# Preview the first rows of the TBDMS fingerprint table.
tbdms_fingerprints_df.head()

Unnamed: 0,Name,InChI,InChI Key,0,1,2,3,4,5,6,...,182,183,184,185,186,187,188,189,190,191
0,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,SVLCMZXBZOFSQX-MBMZGMDYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"cannabinol, TBDMS",InChI=1S/C27H40O2Si/c1-10-11-12-13-20-17-23-25...,XNSPKWBHHSYZSF-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"delta9-tetrahydrocannabinol, TBDMS",InChI=1S/C27H44O2Si/c1-10-11-12-13-20-17-23-25...,JLRFZWKTGGRERL-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"cannabichromene, TBDMS derivative",InChI=1S/C27H44O2Si/c1-10-11-12-15-22-19-24-23...,LCMQYUXNWHZXNI-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Δ9-tetrahydrocannabinolic acid, 2TBDMS",InChI=1S/C34H58O4Si2/c1-15-16-17-18-24-22-27-2...,RMSAJXDMUASUPI-CLJLJLNGSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Preview the first rows of the TMS fingerprint table.
tms_fingerprints_df.head()

Unnamed: 0,Name,InChI,InChI Key,0,1,2,3,4,5,6,...,182,183,184,185,186,187,188,189,190,191
0,"Cannabidiol, O,O-bis trimethylsilyl ester",InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,FWZOFSHJDAIJQE-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Cannabinol, O-trimethylsilyl-",InChI=1S/C24H34O2Si/c1-8-9-10-11-18-15-21-23(2...,VUNXPEWGQXFNOL-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Δ9-tetrahydrocannabinol, TMS derivative",InChI=1S/C24H38O2Si/c1-8-9-10-11-18-15-21-23(2...,JFPSLJJGWCHYOE-WOJBJXKFSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"cannabichromene, O-trimethylsilyl-",InChI=1S/C24H38O2Si/c1-8-9-10-13-20-17-22-21(2...,VGYQPKLQPQJSQU-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"cannabidiolic acid, TMS derivative",InChI=1S/C31H54O4Si3/c1-14-15-16-17-24-21-27(3...,NLUDHDUQAJYEEH-IZZNHLLZSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Display the TBDMS fingerprint table (the rendering elides the middle rows
# and columns of a frame this large).
tbdms_fingerprints_df

Unnamed: 0,Name,InChI,InChI Key,0,1,2,3,4,5,6,...,182,183,184,185,186,187,188,189,190,191
0,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,SVLCMZXBZOFSQX-MBMZGMDYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"cannabinol, TBDMS",InChI=1S/C27H40O2Si/c1-10-11-12-13-20-17-23-25...,XNSPKWBHHSYZSF-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"delta9-tetrahydrocannabinol, TBDMS",InChI=1S/C27H44O2Si/c1-10-11-12-13-20-17-23-25...,JLRFZWKTGGRERL-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"cannabichromene, TBDMS derivative",InChI=1S/C27H44O2Si/c1-10-11-12-15-22-19-24-23...,LCMQYUXNWHZXNI-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Δ9-tetrahydrocannabinolic acid, 2TBDMS",InChI=1S/C34H58O4Si2/c1-15-16-17-18-24-22-27-2...,RMSAJXDMUASUPI-CLJLJLNGSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,"5alpha-androstan-17beta-ol-3-one (stanolone), ...","InChI=1S/C25H44O2Si/c1-23(2,3)28(6,7)27-22-11-...",BDIJWGREUGQQQQ-PJJIPRPHSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82,"2-anilinophenylacetic acid, TBDMS derivative","InChI=1S/C20H27NO2Si/c1-20(2,3)24(4,5)23-19(22...",QRZZJICWYDLAPK-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83,"l-leucine, O-TBDMS derivative",InChI=1S/C12H27NO2Si/c1-9(2)8-10(13)11(14)15-1...,FVSAHVDVPOAIKE-JTQLQIEISA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84,"l-leucine, 2TBDMS derivative","InChI=1S/C18H41NO2Si2/c1-14(2)13-15(19-22(9,10...",HKPKPAQMCPAPSM-HNNXBMFYSA-N,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Persist both fingerprint tables as CSV under ./embeddings, creating the
# folder first if it does not exist yet.
os.makedirs('./embeddings', exist_ok=True)

# File-name tag: the fingerprinter name in lower case without its
# 'fingerprinter' suffix.
fingerprint_name = FINGERPRINT.lower().replace('fingerprinter', '')

tbdms_fingerprints_df.to_csv(
    f'./embeddings/tbdms_{fingerprint_name}_fingerprint.csv',
    index=False,
)
tms_fingerprints_df.to_csv(
    f'./embeddings/tms_{fingerprint_name}_fingerprint.csv',
    index=False,
)

### Construct Spec2Vec embeddings for all InChIKeys (multiple spectra per InChIKey)

In [46]:
from spec2vec_train import preprocess_file
import gensim
from spec2vec import Spec2Vec, calc_vector

In [47]:
# Run preprocess_file on the raw TBDMS test MGF file and show how many
# items it returned.
tbdms_spectra_documents = preprocess_file("./dataset/Test dataset_TBDMS_RAW.mgf")
len(tbdms_spectra_documents)

1936

In [48]:
# Run preprocess_file on the raw TMS test MGF file and show how many
# items it returned.
tms_spectra_documents = preprocess_file("./dataset/Test dataset_TMS_RAW.mgf")
len(tms_spectra_documents)

3144

In [49]:
# Load the saved Word2Vec model for TBDMS spectra and wrap it in Spec2Vec in
# one step, so tbdms_model is only ever bound to the wrapper.
tbdms_model = Spec2Vec(
    gensim.models.Word2Vec.load("./models/tbdms/spec2vec.model")
)

In [50]:
# Load the saved Word2Vec model for TMS spectra and wrap it in Spec2Vec in
# one step, so tms_model is only ever bound to the wrapper.
tms_model = Spec2Vec(
    gensim.models.Word2Vec.load("./models/tms/spec2vec.model")
)

In [51]:
# One tuple per TBDMS spectrum document: its 'title' and 'inchikey' metadata
# followed by the unpacked values returned by calc_vector.
tbdms_embedding = [
    (
        document.metadata.get('title'),
        document.metadata.get('inchikey'),
        *calc_vector(tbdms_model.model, document),
    )
    for document in tbdms_spectra_documents
]

In [52]:
# One tuple per TMS spectrum document: its 'title' and 'inchikey' metadata
# followed by the unpacked values returned by calc_vector.
tms_embedding = [
    (
        document.metadata.get('title'),
        document.metadata.get('inchikey'),
        *calc_vector(tms_model.model, document),
    )
    for document in tms_spectra_documents
]

In [53]:
# Column labels: the two metadata fields, then 0..k-1 for the k remaining
# values of each row (k is taken from the first row).
tbdms_columns = ['title', 'inchikey', *range(len(tbdms_embedding[0]) - 2)]
tbdms_embedding_df = pd.DataFrame(data=tbdms_embedding, columns=tbdms_columns)

In [54]:
# Column labels: the two metadata fields, then 0..k-1 for the k remaining
# values of each row (k is taken from the first row).
tms_columns = ['title', 'inchikey', *range(len(tms_embedding[0]) - 2)]
tms_embedding_df = pd.DataFrame(data=tms_embedding, columns=tms_columns)

In [55]:
# Save the TBDMS embedding table as CSV without the row index.
tbdms_embedding_df.to_csv(
    path_or_buf='./embeddings/tbdms_spec2vec_embeddings.csv',
    index=False,
)

In [56]:
# Save the TMS embedding table as CSV without the row index.
tms_embedding_df.to_csv(
    path_or_buf='./embeddings/tms_spec2vec_embeddings.csv',
    index=False,
)