In [1]:
import pandas as pd
import os

### Collect all InchiKeys

In [2]:
tbdms_df = pd.read_excel('dataset/Metadata_test_TBDMS_derivatives.xlsx')

In [3]:
tms_df = pd.read_excel('dataset/Metadata_test_TMS_derivatives.xlsx')

In [4]:
tbdms_df.columns, tbdms_df.shape

(Index(['Name', 'Molecular formula', 'Exact mass', 'PubChem ID', 'InChI',
        'InChI Key', 'SMILEs', 'Parent compound', 'Molecular formula.1',
        'Exact mass.1', 'PubChem ID.1', 'InChI parent', 'InChI Key parent',
        'SMILEs parent', 'present_in_train_dataset'],
       dtype='object'),
 (86, 15))

In [5]:
tms_df.columns, tms_df.shape

(Index(['Name', 'Molecular formula', 'Exact mass', 'PubChem ID', 'InChI',
        'InChI Key', 'SMILEs', 'Parent compound', 'Molecular formula.1',
        'Exact mass.1', 'PubChem ID.1', 'InChI parent', 'InChI Key parent',
        'SMILEs parent', 'present_in_train_dataset'],
       dtype='object'),
 (106, 15))

In [6]:
# Drop any rows with missing values
tbdms_df = tbdms_df.dropna(axis=0, how='any')
tms_df = tms_df.dropna(axis=0, how='any')

In [7]:
tbdms_df.shape, tms_df.shape

((86, 15), (105, 15))

### Construct fingerprints for all inchikeys

In [8]:
from cdk_inchi_to_fingerprint import generate_fingerprint

In [9]:
FINGERPRINT = 'MACCSFingerprinter'

In [10]:
tbdms_fingerprints = generate_fingerprint([FINGERPRINT], tbdms_df['InChI'])

In [11]:
tms_fingerprints = generate_fingerprint([FINGERPRINT], tms_df['InChI'])

Error Number 0 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 1 at idx 89 converting InChI key InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /unichem/legacy/rest/inchi/InChI=1S/C25H37NO4Si2/c1-16(27)25(30-32(6,7)8)12-11-18-19-15-17-9-10-20(29-31(3,4)5)22-21(17)24(18,23(25)28-22)13-14-26(19)2/h9-12 (Caused by ResponseError('too many 400 error responses'))
Error Number 2 at idx 89 converting InChI ke

In [12]:
tms_fingerprints[FINGERPRINT][0], len(tms_fingerprints[FINGERPRINT][0])

([0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0],
 166)

In [13]:
tbdms_temp_df = pd.DataFrame(tbdms_fingerprints[FINGERPRINT]).reset_index().drop('index', axis=1)
tbdms_selected_columns = tbdms_df[['Name', 'InChI', 'InChI Key']].reset_index().drop('index', axis=1)
tbdms_fingerprints_df = pd.concat([tbdms_selected_columns, tbdms_temp_df], axis=1)

# Improve df structure
# Rename Name to name, InChI to inchi, InChI Key to inchikey
tbdms_fingerprints_df = tbdms_fingerprints_df.rename(columns={'Name': 'name', 'InChI': 'inchi', 'InChI Key': 'inchikey'})
# Remove leading and trailing whitespaces
tbdms_fingerprints_df['name'] = tbdms_fingerprints_df['name'].str.strip()
tbdms_fingerprints_df['inchi'] = tbdms_fingerprints_df['inchi'].str.strip()
tbdms_fingerprints_df['inchikey'] = tbdms_fingerprints_df['inchikey'].str.strip()

In [14]:
tms_temp_df = pd.DataFrame(tms_fingerprints[FINGERPRINT]).reset_index().drop('index', axis=1)
tms_selected_columns = tms_df[['Name', 'InChI', 'InChI Key']].reset_index().drop('index', axis=1)
tms_fingerprints_df = pd.concat([tms_selected_columns, tms_temp_df], axis=1)

# Improve df structure
# Rename Name to name, InChI to inchi, InChI Key to inchikey
tms_fingerprints_df = tms_fingerprints_df.rename(columns={'Name': 'name', 'InChI': 'inchi', 'InChI Key': 'inchikey'})
# Remove leading and trailing whitespaces
tms_fingerprints_df['name'] = tms_fingerprints_df['name'].str.strip()
tms_fingerprints_df['inchi'] = tms_fingerprints_df['inchi'].str.strip()
tms_fingerprints_df['inchikey'] = tms_fingerprints_df['inchikey'].str.strip()

In [15]:
tbdms_fingerprints_df.head()

Unnamed: 0,name,inchi,inchikey,0,1,2,3,4,5,6,...,156,157,158,159,160,161,162,163,164,165
0,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,SVLCMZXBZOFSQX-MBMZGMDYSA-N,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
1,"cannabinol, TBDMS",InChI=1S/C27H40O2Si/c1-10-11-12-13-20-17-23-25...,XNSPKWBHHSYZSF-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
2,"delta9-tetrahydrocannabinol, TBDMS",InChI=1S/C27H44O2Si/c1-10-11-12-13-20-17-23-25...,JLRFZWKTGGRERL-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
3,"cannabichromene, TBDMS derivative",InChI=1S/C27H44O2Si/c1-10-11-12-15-22-19-24-23...,LCMQYUXNWHZXNI-UHFFFAOYSA-N,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
4,"Δ9-tetrahydrocannabinolic acid, 2TBDMS",InChI=1S/C34H58O4Si2/c1-15-16-17-18-24-22-27-2...,RMSAJXDMUASUPI-CLJLJLNGSA-N,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


In [16]:
tms_fingerprints_df.head()

Unnamed: 0,name,inchi,inchikey,0,1,2,3,4,5,6,...,156,157,158,159,160,161,162,163,164,165
0,"Cannabidiol, O,O-bis trimethylsilyl ester",InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,FWZOFSHJDAIJQE-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
1,"Cannabinol, O-trimethylsilyl-",InChI=1S/C24H34O2Si/c1-8-9-10-11-18-15-21-23(2...,VUNXPEWGQXFNOL-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
2,"Δ9-tetrahydrocannabinol, TMS derivative",InChI=1S/C24H38O2Si/c1-8-9-10-11-18-15-21-23(2...,JFPSLJJGWCHYOE-WOJBJXKFSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
3,"cannabichromene, O-trimethylsilyl-",InChI=1S/C24H38O2Si/c1-8-9-10-13-20-17-22-21(2...,VGYQPKLQPQJSQU-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
4,"cannabidiolic acid, TMS derivative",InChI=1S/C31H54O4Si3/c1-14-15-16-17-24-21-27(3...,NLUDHDUQAJYEEH-IZZNHLLZSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0


In [17]:
tbdms_fingerprints_df = tbdms_fingerprints_df.dropna(axis=0, how='any')
tms_fingerprints_df = tms_fingerprints_df.dropna(axis=0, how='any')

In [18]:
os.makedirs('./embeddings', exist_ok=True)
fingerprint_name = FINGERPRINT.lower().replace('fingerprinter', '')
tbdms_fingerprints_df.to_csv(f'./embeddings/tbdms_{fingerprint_name}_fingerprint.csv', index=False)
tms_fingerprints_df.to_csv(f'./embeddings/tms_{fingerprint_name}_fingerprint.csv', index=False)

### Construct Spec2Vec for all inchikeys (multiple spectra per inchikey)

In [19]:
from spec2vec_train import preprocess_file
import gensim
from spec2vec import Spec2Vec, calc_vector

In [20]:
tbdms_spectra_documents = preprocess_file("./dataset/Test dataset_TBDMS_RAW.mgf")
len(tbdms_spectra_documents)

1936

In [21]:
tms_spectra_documents = preprocess_file("./dataset/Test dataset_TMS_RAW.mgf")
len(tms_spectra_documents)

3144

In [22]:
tbdms_model = gensim.models.Word2Vec.load("./models/tbdms/spec2vec.model")
tbdms_model = Spec2Vec(tbdms_model)

In [23]:
tms_model = gensim.models.Word2Vec.load("./models/tms/spec2vec.model")
tms_model = Spec2Vec(tms_model)

In [None]:
tbdms_embedding = []
for spectra in tbdms_spectra_documents:
    title = spectra.metadata.get('title')
    inchikey = spectra.metadata.get('inchikey')
    embedding = calc_vector(tbdms_model.model, spectra)
    tbdms_embedding.append((title, inchikey, *embedding))

In [25]:
tms_embedding = []
for spectra in tms_spectra_documents:
    title = spectra.metadata.get('title')
    inchikey = spectra.metadata.get('inchikey')
    embedding = calc_vector(tms_model.model, spectra)
    tms_embedding.append((title, inchikey, *embedding))

In [26]:
tbdms_columns = ['title', 'inchikey'] + [i for i in range(0, len(tbdms_embedding[0])-2)]
tbdms_embedding_df = pd.DataFrame(tbdms_embedding, columns=tbdms_columns)

# Improve df structure
# Rename title to name
tbdms_embedding_df = tbdms_embedding_df.rename(columns={'title': 'name'})
# Remove leading and trailing whitespaces
tbdms_embedding_df['name'] = tbdms_embedding_df['name'].str.strip()
tbdms_embedding_df['inchikey'] = tbdms_embedding_df['inchikey'].str.strip()

In [27]:
tms_columns = ['title', 'inchikey'] + [i for i in range(0, len(tms_embedding[0])-2)]
tms_embedding_df = pd.DataFrame(tms_embedding, columns=tms_columns)

# Improve df structure
# Rename title to name
tms_embedding_df = tms_embedding_df.rename(columns={'title': 'name'})
# Remove leading and trailing whitespaces
tms_embedding_df['name'] = tms_embedding_df['name'].str.strip()

In [28]:
tbdms_embedding_df.to_csv('./embeddings/tbdms_spec2vec_embeddings.csv', index=False)

In [29]:
tms_embedding_df.to_csv('./embeddings/tms_spec2vec_embeddings.csv', index=False)

### Construct merged fingerprints and spec2vec embeddings

In [30]:
temp_fingerprint = tbdms_fingerprints_df.set_index(['inchikey', 'name', 'inchi']).add_prefix('fingerprint_').reset_index().set_index('inchikey')
temp_embedding = tbdms_embedding_df.set_index(['inchikey', 'name']).add_prefix('embedding_').reset_index().set_index('inchikey')
temp_embedding = temp_embedding.drop(columns=['name'])
tbdms_merged = pd.merge(temp_fingerprint, temp_embedding, on=['inchikey'], how='inner').reset_index()
tbdms_merged.head()

Unnamed: 0,inchikey,name,inchi,fingerprint_0,fingerprint_1,fingerprint_2,fingerprint_3,fingerprint_4,fingerprint_5,fingerprint_6,...,embedding_290,embedding_291,embedding_292,embedding_293,embedding_294,embedding_295,embedding_296,embedding_297,embedding_298,embedding_299
0,SVLCMZXBZOFSQX-MBMZGMDYSA-N,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,0,0,0,0,0,0,0,...,-154.900948,-172.583792,-7.81164,-143.068135,7.432393,5.882822,-54.81946,-57.532868,95.880766,-9.720895
1,SVLCMZXBZOFSQX-MBMZGMDYSA-N,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,0,0,0,0,0,0,0,...,-119.268463,-133.707017,-14.534004,-136.231737,64.416145,-92.835205,12.543429,1.738854,62.680567,-53.962423
2,SVLCMZXBZOFSQX-MBMZGMDYSA-N,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,0,0,0,0,0,0,0,...,-33.854447,-92.517467,11.971426,-144.79375,0.935474,-30.727656,33.06046,-14.235519,-86.315635,-24.603099
3,SVLCMZXBZOFSQX-MBMZGMDYSA-N,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,0,0,0,0,0,0,0,...,-122.286324,-36.319614,6.536764,-145.511717,20.404416,5.051611,-69.041317,-3.453226,-13.270084,-68.694064
4,SVLCMZXBZOFSQX-MBMZGMDYSA-N,"cannabidiol, 2TBDMS",InChI=1S/C33H58O2Si2/c1-15-16-17-18-26-22-29(3...,0,0,0,0,0,0,0,...,-163.603991,-34.555256,48.720901,-116.323747,68.092744,16.992305,61.134913,42.803688,11.394467,17.871866


In [33]:
tbdms_fingerprints_df.shape, tbdms_embedding_df.shape, tbdms_merged.shape

((86, 169), (1936, 302), (1936, 469))

In [31]:
temp_fingerprint = tms_fingerprints_df.set_index(['inchikey', 'name', 'inchi']).add_prefix('fingerprint_').reset_index().set_index('inchikey')
temp_embedding = tms_embedding_df.set_index(['inchikey', 'name']).add_prefix('embedding_').reset_index().set_index('inchikey')
temp_embedding = temp_embedding.drop(columns=['name'])
tms_merged = pd.merge(temp_fingerprint, temp_embedding, on=['inchikey'], how='inner').reset_index()
tms_merged.head()

Unnamed: 0,inchikey,name,inchi,fingerprint_0,fingerprint_1,fingerprint_2,fingerprint_3,fingerprint_4,fingerprint_5,fingerprint_6,...,embedding_290,embedding_291,embedding_292,embedding_293,embedding_294,embedding_295,embedding_296,embedding_297,embedding_298,embedding_299
0,FWZOFSHJDAIJQE-UHFFFAOYSA-N,"Cannabidiol, O,O-bis trimethylsilyl ester",InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-139.332301,-100.62612,-249.842998,137.309562,186.220807,-17.937769,-119.924224,272.095743,-291.421216,85.683406
1,FWZOFSHJDAIJQE-UHFFFAOYSA-N,"Cannabidiol, O,O-bis trimethylsilyl ester",InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-122.833101,-179.269458,30.020876,14.507649,141.249645,206.728673,-108.762419,131.967769,-67.333607,1.211677
2,FWZOFSHJDAIJQE-UHFFFAOYSA-N,"Cannabidiol, O,O-bis trimethylsilyl ester",InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-40.504577,-137.72339,-36.551587,108.677426,174.345255,227.353214,-193.319703,-16.86005,-273.916022,-0.011723
3,FWZOFSHJDAIJQE-UHFFFAOYSA-N,"Cannabidiol, O,O-bis trimethylsilyl ester",InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21.791783,-46.735659,-76.700534,97.209693,211.574958,235.900725,-187.86013,95.050095,-349.902262,40.688636
4,FWZOFSHJDAIJQE-UHFFFAOYSA-N,"Cannabidiol, O,O-bis trimethylsilyl ester",InChI=1S/C27H46O2Si2/c1-11-12-13-14-22-18-25(2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-87.14504,-94.339602,-131.743793,75.186829,210.267458,174.633966,-33.020011,148.762387,-263.94383,27.794688


In [32]:
tms_fingerprints_df.shape, tms_embedding_df.shape, tms_merged.shape

((104, 169), (3144, 302), (3052, 469))

In [37]:
tbdms_merged.to_csv(f'./embeddings/tbdms_{fingerprint_name}_fingerprint_and_embedding.csv', index=False)
tms_merged.to_csv(f'./embeddings/tms_{fingerprint_name}_fingerprint_and_embedding.csv', index=False)