In [52]:
from dreams.utils.data import MSData
import numpy as np
import pandas as pd
from dreams.api import dreams_embeddings
from sklearn.metrics.pairwise import cosine_similarity
import umap
from dreams.utils.mols import formula_type
import seaborn as sns
import matplotlib.pyplot as plt
from dreams.utils.plots import init_plotting
from matplotlib.colors import LinearSegmentedColormap

In [None]:
# Load the MoNA experimental spectra from the MGF export.
# NOTE(review): in_mem=False presumably keeps the spectra on disk instead of
# loading them all into memory - confirm against the DreaMS MSData docs.
mona_mgf_path = 'data/mgf_MoNA_experimental.mgf'
data_full = MSData.from_mgf(mona_mgf_path, in_mem=False)

# Quick overview of the dataset and the metadata columns it exposes.
print(data_full)
print(data_full.columns())

Get a meaningful subset of the dataset

In [None]:
# ASSUMPTION: SPECTRUM_TYPE is representative of the dataset.
# Open question: some datapoints do not have a SPECTRUM_TYPE. Why?

# The dataset is too large to embed in full -> proportional (stratified)
# subsampling on SPECTRUM_TYPE.
# There are 10 unique SPECTRUM_TYPE values for 167,958 spectra.

# Fixed seed: the subset is written to disk and every later result (embeddings,
# novelty scores, UMAP) depends on it, so the draw must be reproducible.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

spectrum_types = np.array(data_full['SPECTRUM_TYPE'])
unique_types, type_counts = np.unique(spectrum_types, return_counts=True)

print("Spectrum Types and their counts:")
# Loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook.
for spectrum_type, count in zip(unique_types, type_counts):
    print(f"{spectrum_type}: {count}")

type_proportions = type_counts / len(spectrum_types)
# embedding takes approx. 1h for 20k embeddings on local machine
total_samples = 10000
samples_per_type = np.round(type_proportions * total_samples).astype(int)

sampled_indices = []
for spectrum_type, n_samples in zip(unique_types, samples_per_type):
    type_indices = np.where(spectrum_types == spectrum_type)[0]
    # Never request more spectra than this type actually has.
    n_take = min(n_samples, len(type_indices))
    sampled_indices.extend(rng.choice(type_indices, size=n_take, replace=False))

# Shuffle so the subset is not grouped by spectrum type.
rng.shuffle(sampled_indices)

# Per-type rounding can overshoot (or undershoot) total_samples by a few;
# truncating caps the subset at total_samples, it cannot pad it.
sampled_indices = sampled_indices[:total_samples]
sampled_spectra = data_full.get_spectra()[sampled_indices]
print(f"Total sampled indices: {len(sampled_indices)}")
print("Print 3 random samples (sanity check):")
# min(...) keeps the sanity check from failing on a very small subset.
print_indices = rng.choice(sampled_indices, size=min(3, len(sampled_indices)), replace=False)
for i in print_indices:
    print(data_full.at(i))

data_short = data_full.form_subset(sampled_indices, "data/MoNA_experimental_short.hdf5")
print("\nSampled dataset saved!\n")


Non-fine-tuned version

In [None]:

# Embed the subsampled spectra with the pretrained (not fine-tuned) model.
subset_hdf5_path = "data/MoNA_experimental_short.hdf5"
dreams_embs = dreams_embeddings(subset_hdf5_path)


In [6]:
# Pairwise cosine similarity of every embedding against every other one;
# sims[i, j] compares spectrum i with spectrum j.
sims = cosine_similarity(X=dreams_embs)

Proposed novelty score:
One minus the average cosine similarity of spectrum i to the spectra in the subset

Explanation:
If a spectrum is similar to many spectra in the subset, it is not really unique, so it should get a low novelty score. Conversely, a spectrum is considered novel if it is dissimilar to most spectra in the dataset.

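Weaknesses:
- What about a spectrum that is identical to one other sample but different from all the rest? It still receives a high novelty score, even though it is a duplicate.
- The main diagonal of the similarity matrix is 1 (each spectrum compared with itself), which biases the average similarity upward if it is included in the mean.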

In [23]:
# Novelty score = 1 - mean cosine similarity to all *other* spectra.
# The diagonal of `sims` is each spectrum's similarity to itself (1 for any
# non-zero embedding). Averaging it in would bias every score downward, so it
# is subtracted from the row sums before dividing by the number of other spectra.
n_others = max(len(sims) - 1, 1)  # avoid division by zero when there is only one spectrum
avg_sims = (sims.sum(axis=1) - np.diagonal(sims)) / n_others

# Kept as a plain list, one score per spectrum in the row order of `sims`.
novelty_scores = (1 - avg_sims).tolist()

In [None]:
# Work on a pandas copy of the subset so the derived columns sit next to the
# original metadata.
pd_short = data_short.to_pandas()
derived_columns = {
    "NOVELTY_SCORE": novelty_scores,
    "DREAMS_EMBEDDING": dreams_embs.tolist(),  # one list of floats per spectrum
}
for column_name, column_values in derived_columns.items():
    pd_short[column_name] = column_values

# Optional export (currently disabled): convert back to MSData and save as hdf5 and mgf
# pd_short.sort_values("precursor_mz", inplace=True)
# data = MSData.from_pandas(pd_short, hdf5_pth="data/MoNA_experimental_short_annotated.hdf5")
# data.to_mgf("data/MoNA_experimental_short_annotated.mgf")

# Optional (currently disabled): sort by novelty score
#pd_short.sort_values("NOVELTY_SCORE", ascending=False)

UMAP projection

In [None]:
# UMAP is stochastic; a fixed random_state makes the projection reproducible
# across runs (UMAP then runs single-threaded, which is slower).
reducer = umap.UMAP(metric='cosine', min_dist=0.4, n_neighbors=50, random_state=42)
# Embeddings are read back from the dataframe so the points stay aligned with
# the NOVELTY_SCORE column used for colouring.
embs_umap = reducer.fit_transform(np.array(pd_short["DREAMS_EMBEDDING"].to_list()))

# Diverging colormap: blue = low novelty, white = mid, red = high novelty.
colors = ["blue", "white", "red"]
n_bins = 100
cmap = LinearSegmentedColormap.from_list("custom", colors, N=n_bins)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))

# One point per spectrum, coloured by its novelty score.
scatter = ax.scatter(
    x=embs_umap[:, 0],
    y=embs_umap[:, 1],
    c=pd_short["NOVELTY_SCORE"],
    cmap=cmap,
    alpha=1,
    s=1.6
)

# Colorbar for the novelty score.
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label("Novelty Score", rotation=270, labelpad=15)

# Labels and title.
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
ax.set_title("UMAP Projection with Novelty Score")

# Render the figure.
fig.tight_layout()
plt.show()