In [62]:
import os
import sys
from pathlib import Path
# Add the sibling "src" directory to the module search path.
# NOTE(review): "../src/" is a relative path, so it is resolved against the
# kernel's working directory — confirm the notebook is started from its own folder.
sys.path.append("../src/")

In [63]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LogNorm

In [64]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases removed the old names. Prefer the new spelling when this
# installation ships it and fall back to the legacy one otherwise.
available_styles = set(plt.style.available)
style_sheets = []
for legacy_name in ("seaborn-white", "seaborn-paper"):
    modern_name = legacy_name.replace("seaborn-", "seaborn-v0_8-", 1)
    style_sheets.append(modern_name if modern_name in available_styles else legacy_name)
plt.style.use(style_sheets)
plt.rc("font", family="sans-serif")
sns.set_palette(["#9e0059", "#6da7de", "#ee266d", "#dee000", "#eb861e"])
sns.set_context("paper")

# Make sure the figure output directory exists before any plot is saved.
Path("results/img/").mkdir(parents=True, exist_ok=True)

## Finished setup, start plotting

In [66]:
# Result file to analyse. To inspect a different run, swap the active
# assignment with one of the commented alternatives below.
filename = "results/BILELIB19_all_sqrt_False_100pairs_6min_signals_True_requirestruc_4i0-200deltamz_0mods.parquet"
# filename = "results/gnps_lib_sqrt_True_500000pairs_6min_signals_200maxdelta.parquet"
# filename = "results/gnps_lib_sqrt_False_500000pairs_6min_signals_200maxdelta.parquet"
# filename = "results/20220418_ALL_GNPS_NO_PROPOGATED_acetyl_sqrt_False_500000pairs_6min_signals_42i01056specific_delta_0mods.parquet"
# filename = "results/20220418_ALL_GNPS_NO_PROPOGATED_oxygen_sqrt_True_500000pairs_6min_signals_15i9949specific_delta_0mods.parquet"
# filename = "results/BILELIB19_oxygen_sqrt_False_500000pairs_6min_signals_15i9949specific_delta.parquet"
# filename = "results/BILELIB19_as_exchange_sqrt_True_10pairs_6min_signals_200maxdelta_8mods.parquet"
# filename = "results/BILELIB19_as_exchange_sqrt_False_500000pairs_6min_signals_200maxdelta_8mods.parquet"
# filename = "results/BILELIB19_bile_conjugates_sqrt_False_500000pairs_6min_signals_200maxdelta_5mods.parquet"

# The file name without directory and extension is used to tag every exported figure.
analysis_id = Path(filename).with_suffix("").name

similarities = pd.read_parquet(filename)
# Show the first five rows as the cell output.
similarities.head()

Unnamed: 0,id1,id2,delta_mz,tanimoto,cos_score,cos_matched_intensity,cos_max_contribution,cos_n_greq_2p,cos_matches,mod_score,mod_matched_intensity,mod_max_contribution,mod_n_greq_2p,mod_matches,nl_score,nl_matched_intensity,nl_max_contribution,nl_n_greq_2p,nl_matches
0,CCMSLIB00006582618,CCMSLIB00006584912,12.036,0.7827,0.070971,0.723836,0.009573,0,90,0.215335,0.853506,0.139366,1,94,0.157793,0.598486,0.139366,1,56
1,CCMSLIB00006582197,CCMSLIB00006582347,71.052,0.4146,0.092883,0.506133,0.035505,2,58,0.583416,0.961309,0.280779,5,68,0.49063,0.374612,0.280779,3,11
2,CCMSLIB00006584388,CCMSLIB00006582180,57.984,0.3979,0.075893,0.645073,0.028774,1,82,0.591496,0.97641,0.212512,5,93,0.5262,0.710029,0.212512,4,28
3,CCMSLIB00006584082,CCMSLIB00006582786,60.003,0.714,0.03931,0.741386,0.00528,0,93,0.117195,0.950737,0.072315,1,104,0.080784,0.613411,0.072315,1,40
4,CCMSLIB00005465223,CCMSLIB00006584498,78.01,0.4856,0.057475,0.908518,0.006245,0,393,0.068691,0.940469,0.00877,0,477,0.014085,0.417171,0.00877,0,238


In [None]:
# Keep only the rows in which all three scores (cos, mod and nl) are strictly
# positive; a row is dropped as soon as any one of them is not above 0.
# NOTE(review): this is stricter than dropping only rows whose scores are all 0
# — confirm that this is the intended filter.
score_columns = ["cos_score", "mod_score", "nl_score"]
similarities = similarities[(similarities[score_columns] > 0).all(axis=1)]
len(similarities)

In [None]:
# Fraction of pairs where the cos score beats the nl score and vice versa.
# Ties count towards neither fraction.
total = len(similarities)
cos_higher_count = int((similarities["cos_score"] > similarities["nl_score"]).sum())
nl_higher_count = int((similarities["cos_score"] < similarities["nl_score"]).sum())

# After filtering the frame can be empty; report NaN instead of failing with a
# ZeroDivisionError in that case.
stats = {
    'total': total,
    'cos higher': cos_higher_count / total if total else float("nan"),
    'nl higher': nl_higher_count / total if total else float("nan"),
}
print(stats)

In [None]:
# Histogram of the per-pair difference between cos_score and nl_score.
score_difference = similarities["cos_score"] - similarities["nl_score"]
df = score_difference.to_frame(name="cosine-nl")

ax = sns.histplot(data=df, binwidth=0.04)
# Both scores are plotted on a 0..1 scale elsewhere, so the difference is shown on -1..1.
ax.set_xlim(-1, 1)

plt.savefig(f"results/img/diff_{analysis_id}.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# (x column, y column) pairs; one 2D-histogram panel is drawn per pair.
plots = [
        ["nl_score", "cos_score"],
        ["cos_score", "mod_score"],
        ["nl_score", "mod_score"],
        ["nl_max_contribution", "nl_score"],
        ["cos_max_contribution", "cos_score"],
        ["mod_max_contribution", "mod_score"],
        ["nl_matched_intensity", "nl_score"],
        ["cos_matched_intensity", "cos_score"],
        ["mod_matched_intensity", "mod_score"],
        ["nl_matched_intensity", "cos_matched_intensity"],
        ["nl_max_contribution", "cos_max_contribution"],
        ["nl_max_contribution", "mod_max_contribution"],
    ]

fig, axes = plt.subplots(4, 3, figsize=(7, 12 / 1.618))

bins = 50
# The heatmap is drawn in bin-index coordinates (0..bins), so ticks are placed
# at every quarter of the bin range and labelled with the matching 0..1 value.
tick_positions = np.arange(0, bins + 1, bins / 4)
tick_labels = np.asarray([f"{a:.2f}" for a in np.arange(0, 1.01, 0.25)])

# zip stops at the shorter sequence, so surplus axes or surplus column pairs
# are simply skipped.
for ax, (xlabel, ylabel) in zip(axes.flatten(), plots):
    hist, _, _ = np.histogram2d(
        similarities[xlabel],
        similarities[ylabel],
        bins=bins,
        range=[[0, 1], [0, 1]],
    )
    heatmap = sns.heatmap(
        # histogram2d puts x along the first array axis; rotating by 90 degrees
        # makes x run left-to-right and y bottom-to-top in the drawn image.
        np.rot90(hist),
        cmap="viridis",
        cbar=False,
        square=True,
        xticklabels=False,
        yticklabels=False,
        ax=ax,
        norm=LogNorm(),
    )
    # A Locator keeps a reference to the Axis it is attached to, so it must not
    # be shared: every axis gets its own instance.
    ax.yaxis.set_major_locator(mticker.FixedLocator(tick_positions))
    # Row 0 of the heatmap is drawn at the top, hence the reversed labels.
    ax.set_yticklabels(tick_labels[::-1])
    ax.xaxis.set_major_locator(mticker.FixedLocator(tick_positions))
    ax.set_xticklabels(tick_labels)
    # Make the panel frame visible.
    for spine in heatmap.spines.values():
        spine.set_visible(True)
    ax.set_xlabel(xlabel.replace("_", " ").capitalize())
    ax.set_ylabel(ylabel.replace("_", " ").capitalize())


plt.tight_layout()

plt.savefig("results/img/score_contributions_{}.png".format(analysis_id), dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Side-by-side histograms of the three *_n_greq_2p columns on shared axis limits.
fig, axes = plt.subplots(1, 3, figsize=(10, 4))

xlabels = ["cos_n_greq_2p", "nl_n_greq_2p", "mod_n_greq_2p"]
max_x = similarities[xlabels].max().max()
# value_counts() is indexed by the counted values, so `[0]` would look up the
# count of the value 0 (and raise KeyError when no row has 0) rather than the
# largest count. Use .max() to get the tallest bar, plus 2 % headroom.
max_y = max(similarities[xlabel].value_counts().max() for xlabel in xlabels) * 1.02

for ax, xlabel in zip(axes.flatten(), xlabels):
    sns.histplot(data=similarities, x=xlabel, binwidth=1, ax=ax)

    # Identical limits on every panel keep the three distributions comparable.
    ax.set_xlim((0, max_x))
    ax.set_ylim((0, max_y))

plt.tight_layout()

plt.savefig("results/img/greater_2p_signals_{}.png".format(analysis_id), dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Pairwise 2D histograms of the scores and the cos/nl max-contribution columns.
pair_columns = [
    "cos_score",
    "nl_score",
    "mod_score",
    "cos_max_contribution",
    "nl_max_contribution",
]
g = sns.pairplot(similarities, vars=pair_columns, kind="hist")

# Clamp every panel of the grid to the 0..1 range on both axes.
for panel in g.axes.flat:
    panel.set_xlim((0, 1))
    panel.set_ylim((0, 1))

plt.savefig(f"results/img/pair_{analysis_id}.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close()