# Plotting functions for supplementary figure 2 of the manuscript
This figure shows the following:
- example micrograph of a dataset with radiation damage
- example micrograph of a dataset without radiation damage
- 3D reconstruction of spike protein, dataset simulated with radiation damage
- boxplot showing the particle picking precision as a function of exposure
- boxplot showing the particle picking recall as a function of exposure
- plot showing the decreased precision of picked particle locations as a function of exposure

In [None]:
# imports
import os
import mrcfile
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from pipeliner.mrc_image_tools import mrc_thumbnail
from gemmi import cif
from roodmus.analysis.utils import load_data


## panel A
plotting the micrograph with radiation damage

In [None]:
# Panel A: thumbnail of the first motion-corrected micrograph of the
# fractionated (radiation-damage) dataset.
project_dir = "/home/mjoosten1/projects/roodmus/data/DESRES-Trajectory_sarscov2-11021571-all-glueCA_fractionated"
figures_dir = os.path.join(project_dir, "figures")
ugraph_dir = os.path.join(project_dir, "MotionCorr", "job007", "Movies")
movies_dir = os.path.join(project_dir, "Movies")  # not used further in this cell
ugraph_file = os.path.join(ugraph_dir, "000000.mrc")
print(f"found micrograph: {ugraph_file}")

# Build the thumbnail path from the file *name* only. `ugraph_file` is an
# absolute path, and os.path.join() drops every component that precedes an
# absolute one, so joining it directly would silently write the PNG next to
# the micrograph instead of into `figures_dir`.
os.makedirs(figures_dir, exist_ok=True)
thumbnail_stem = os.path.splitext(os.path.basename(ugraph_file))[0]
thumbnail_file = os.path.join(figures_dir, f"{thumbnail_stem}_thumbnail.png")
ugraph_thumbnail = mrc_thumbnail(ugraph_file, 512, thumbnail_file)

# The value returned by mrc_thumbnail is displayed as a grayscale image.
fig, ax = plt.subplots(figsize=(7, 7))
ax.imshow(ugraph_thumbnail, cmap="gray")
ax.axis("off")

## panel B
plotting the micrograph without radiation damage

In [None]:
# Panel B: thumbnail of the first micrograph of the dataset simulated
# without radiation damage.
project_dir = "/tudelft/mjoosten1/staff-umbrella/ajlab/MJ/projects/Roodmus/data/DE-Shaw_covid_spike_protein/DESRES-Trajectory_sarscov2-11021571-all-glueCA"
figures_dir = os.path.join(project_dir, "figures")
ugraph_dir = os.path.join(project_dir, "Micrographs")
ugraph_file = os.path.join(ugraph_dir, "000000.mrc")
print(f"found micrograph: {ugraph_file}")

# Build the thumbnail path from the file *name* only. `ugraph_file` is an
# absolute path, and os.path.join() drops every component that precedes an
# absolute one, so joining it directly would silently write the PNG next to
# the micrograph instead of into `figures_dir`.
os.makedirs(figures_dir, exist_ok=True)
thumbnail_stem = os.path.splitext(os.path.basename(ugraph_file))[0]
thumbnail_file = os.path.join(figures_dir, f"{thumbnail_stem}_thumbnail.png")
ugraph_thumbnail = mrc_thumbnail(ugraph_file, 512, thumbnail_file)

# The value returned by mrc_thumbnail is displayed as a grayscale image.
fig, ax = plt.subplots(figsize=(7, 7))
ax.imshow(ugraph_thumbnail, cmap="gray")
ax.axis("off")


## panel C
The handedness of the refined map, the local resolution map and the post-processed map needs to be flipped before they are rendered.

In [None]:
# Panel C: write mirrored ("_flipped") copies of the maps used for the 3D
# rendering so that they are displayed with the opposite handedness.
project_dir = "/home/mjoosten1/projects/roodmus/data/DESRES-Trajectory_sarscov2-11021571-all-glueCA_fractionated"
refined_map = os.path.join(project_dir, "Refine3D", "job048", "run_class001.mrc")
locres_mask = os.path.join(project_dir, "LocalRes", "job061", "relion_locres.mrc")
postprocess_map = os.path.join(project_dir, "PostProcess", "job050", "postprocess.mrc")


def _write_flipped_copy(mrc_path):
    """Write a mirrored copy of an MRC volume next to the original.

    The data is reversed along the first array axis, cast to float32 and
    saved as ``<name>_flipped.mrc`` (overwriting any existing file).

    Args:
        mrc_path: path of the MRC file to mirror.

    Returns:
        The path of the flipped file that was written.
    """
    flipped_path = mrc_path.replace(".mrc", "_flipped.mrc")

    with mrcfile.open(mrc_path, mode="r") as source:
        volume = source.data
        # Remember the sampling of the source map: mrcfile.new() starts from
        # a default header, so without this the flipped map would not carry
        # the voxel size of the original.
        source_voxel = source.voxel_size
        voxel_size = (
            float(source_voxel.x),
            float(source_voxel.y),
            float(source_voxel.z),
        )

    with mrcfile.new(flipped_path, overwrite=True) as target:
        target.set_data(np.flip(volume, axis=0).astype(np.float32))
        # Must be assigned after set_data(), once the grid dimensions are known.
        target.voxel_size = voxel_size

    return flipped_path


for map_path in (refined_map, locres_mask, postprocess_map):
    _write_flipped_copy(map_path)


## panel D
plotting the boxplot of particle picking precision as a function of exposure

In [None]:
# Panels D and E: for every fluence in the SNR-comparison project, load the
# picked and ground-truth particles and compute picking precision/recall
# using one-to-one matching of picked and truth particles.
project_dir = "/home/mjoosten1/projects/roodmus/data/20231017_EMPIAR_SNR_comparison"
figures_dir = os.path.join(project_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
particle_diameter = 100 # approximate particle diameter in Angstroms
ugraph_shape = (4000, 4000) # shape of the micrograph in pixels. Only needs to be given if the metadata file is a .star file
verbose = True # prints out progress statements
ignore_missing_files = True # if .mrc files are missing, the analysis will still be performed
enable_tqdm = True # enables tqdm progress bars

# One entry per dataset: the fluence ("exposure") and the job directory of
# each processing step. None marks a step that has no job for that dataset.
data = {
    0: {
        "exposure": 45,
        "LoG": "job004",
        "Class2D": "job005",
        "topaz": "job010",
        "homogeneous": "job016"
    },
    1: {
        "exposure": 35,
        "LoG": "job037",
        "Class2D": "job038",
        "topaz": "job042",
        "homogeneous": "job048"
    },
    2: {
        "exposure": 25,
        "LoG": "job054",
        "Class2D": "job055",
        "topaz": "job059",
        "homogeneous": "job065"
    },
    3: {
        "exposure": 15,
        "LoG": "job071",
        "Class2D": "job072",
        "topaz": "job076",
        "homogeneous": "job082"
    },
    4: {
        "exposure": 5,
        "LoG": "job088",
        "Class2D": "job089",
        "topaz": None,
        "homogeneous": None
    },
    5: {
        "exposure": 10,
        "LoG": "job093",
        "Class2D": "job094",
        "topaz": None,
        "homogeneous": None
    },
    6: {
        "exposure": 12,
        "LoG": "job098",
        "Class2D": "job099",
        "topaz": None,
        "homogeneous": None
    },
    7: {
        "exposure": 8,
        "LoG": "job114",
        "Class2D": "job115",
        "topaz": None,
        "homogeneous": None,
    },
}

# Per-dataset tables are collected in lists and concatenated once after the
# loop, so the result does not depend on which dict key happens to come first.
precision_frames = []
picked_frames = []
overlap_frames = []
truth_frames = []

for key, item in data.items():
    # the simulation config directories are named with a zero-padded fluence
    exposure = f"{item['exposure']}".zfill(2)
    config_dir = os.path.join(project_dir, f"mrc_epa_{exposure}")
    print(config_dir)

    # metadata files to analyse; the Class2D job is deliberately left out
    meta_files = [
        os.path.join(project_dir, "Extract", item["LoG"], "particles.star"),
    ]
    if item["topaz"]:
        meta_files.append(os.path.join(project_dir, "Extract", item["topaz"], "particles.star"))
    if item["homogeneous"]:
        meta_files.append(os.path.join(project_dir, "Refine3D", item["homogeneous"], "run_data.star"))

    # the first metadata file creates the analysis object, the others extend it
    analysis = load_data(
        meta_files[0],
        config_dir,
        particle_diameter,
        ugraph_shape=ugraph_shape,
        verbose=verbose,
        enable_tqdm=enable_tqdm,
        ignore_missing_files=ignore_missing_files,
    )
    for meta_file in meta_files[1:]:
        analysis.add_data(meta_file, config_dir, verbose=verbose)

    df_picked = pd.DataFrame(analysis.results_picking)
    df_truth = pd.DataFrame(analysis.results_truth)

    # compute precision and recall from a one-to-one matching of picked and
    # truth particles (note: _match_particles is a private roodmus method)
    p_match, _, p_unmatched, t_unmatched, _ = analysis._match_particles(
        meta_files,
        df_picked,
        df_truth,
        verbose=False,
        enable_tqdm=True,
    )
    df_precision = analysis.compute_1to1_match_precision(
        p_match,
        p_unmatched,
        t_unmatched,
        df_truth,
        verbose=False,
    )

    # tag the tables with the fluence of this dataset
    df_picked["exposure"] = item["exposure"]
    df_precision["exposure"] = item["exposure"]

    df_overlap = analysis.compute_overlap(df_picked, df_truth, verbose=verbose)
    df_overlap["exposure"] = item["exposure"]

    precision_frames.append(df_precision)
    picked_frames.append(df_picked)
    overlap_frames.append(df_overlap)
    truth_frames.append(df_truth)

df_precision_all = pd.concat(precision_frames)
df_picked_all = pd.concat(picked_frames)
df_overlap_all = pd.concat(overlap_frames)
df_truth_all = pd.concat(truth_frames)

# map every job directory name to the processing step it belongs to
# (Class2D jobs are deliberately not included)
jobtypes_all = {}
for value in data.values():
    jobtypes_all[value["LoG"]] = "LoG"
    for step in ("topaz", "homogeneous"):
        if value[step]:
            jobtypes_all[value[step]] = step

# the job name is the directory that contains the metadata file
df_precision_all["job"] = df_precision_all["metadata_filename"].apply(
    lambda path: os.path.basename(os.path.dirname(path))
)
df_precision_all["jobtype"] = df_precision_all["job"].map(jobtypes_all)
df_picked_all["job"] = df_picked_all["metadata_filename"].apply(
    lambda path: os.path.basename(os.path.dirname(path))
)
df_picked_all["jobtype"] = df_picked_all["job"].map(jobtypes_all)

In [None]:
# Panel D: picking precision of the LoG picker per fluence, with the
# individual points coloured by defocus.
df_precision_LoG = df_precision_all[df_precision_all["jobtype"] == "LoG"]

fig, ax = plt.subplots(figsize=(7, 3.5))
# individual points (coloured by defocus) drawn together with one box per fluence
sns.stripplot(x="exposure", y="precision", hue="defocus", data=df_precision_LoG, ax=ax, jitter=0.1, dodge=False, alpha=0.5, legend=False, palette="RdYlBu")
sns.boxplot(x="exposure", y="precision", hue="exposure", data=df_precision_LoG, ax=ax, dodge=False, palette="Blues")

# raw string: "\A" is not a valid escape sequence in a normal string literal
ax.set_xlabel(r"Fluence ($e^-$/$\AA^2$)", fontsize=16)
ax.set_ylabel("Precision", fontsize=16)

# Colour bar for the defocus hue. Its range is taken from the data that is
# actually plotted (not from whatever `df_precision` the loading loop left
# behind) and divided by 10000 for display.
# NOTE(review): the label assumes defocus is stored in Angstrom, so that
# /10000 yields micrometres, as in the recall panel - confirm.
sm = plt.cm.ScalarMappable(
    cmap="RdYlBu",
    norm=plt.Normalize(
        vmin=df_precision_LoG["defocus"].min() / 10000,
        vmax=df_precision_LoG["defocus"].max() / 10000,
    ),
)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label("Defocus (\u03BCm)", rotation=270, labelpad=20, fontsize=16)
cbar.ax.tick_params(labelsize=14)
ax.legend().remove()

# Writing the PDF is opt-in, and the message is only printed when the file
# has really been written.
save_figure = False
precision_pdf = os.path.join(figures_dir, "precision_vs_exposure.pdf")
if save_figure:
    fig.savefig(precision_pdf, bbox_inches="tight")
    print(f"saved figure to: {precision_pdf}")
else:
    print(f"figure not saved (set save_figure = True to write {precision_pdf})")

## panel E
plotting the boxplot of particle picking recall as a function of exposure

In [None]:
# Panel E: picking recall of the LoG picker per fluence, with the
# individual points coloured by defocus.
df_precision_LoG = df_precision_all[df_precision_all["jobtype"] == "LoG"]

fig, ax = plt.subplots(figsize=(7, 3.5))
# individual points (coloured by defocus) drawn together with one box per fluence
sns.stripplot(x="exposure", y="recall", hue="defocus", data=df_precision_LoG, ax=ax, jitter=0.1, dodge=False, alpha=0.5, legend=False, palette="RdYlBu")
sns.boxplot(x="exposure", y="recall", hue="exposure", data=df_precision_LoG, ax=ax, dodge=False, palette="Blues")

# raw string: "\A" is not a valid escape sequence in a normal string literal
ax.set_xlabel(r"Fluence ($e^-$/$\AA^2$)", fontsize=16)
ax.set_ylabel("Recall", fontsize=16)

# Colour bar for the defocus hue. Its range is taken from the data that is
# actually plotted (not from whatever `df_precision` the loading loop left
# behind) and divided by 10000 for display.
# NOTE(review): the label assumes defocus is stored in Angstrom, so that
# /10000 yields micrometres - confirm.
sm = plt.cm.ScalarMappable(
    cmap="RdYlBu",
    norm=plt.Normalize(
        vmin=df_precision_LoG["defocus"].min() / 10000,
        vmax=df_precision_LoG["defocus"].max() / 10000,
    ),
)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label("Defocus (\u03BCm)", rotation=270, labelpad=20, fontsize=16)
cbar.ax.tick_params(labelsize=14)
ax.legend().remove()

# Writing the PDF is opt-in, and the message is only printed when the file
# has really been written.
save_figure = False
recall_pdf = os.path.join(figures_dir, "recall_vs_exposure.pdf")
if save_figure:
    fig.savefig(recall_pdf, bbox_inches="tight")
    print(f"saved plot to {recall_pdf}")
else:
    print(f"figure not saved (set save_figure = True to write {recall_pdf})")



## panel F
Plot of the average distance between a picked particle and the closest truth particle. This acts as a measurement of the accuracy of the particle picking. The data gets filtered to only include TP particles (i.e. particles that are within 1 particle radius of a truth particle). Because of this, the increase in average distance cannot be explained by more FP particle picks, and only by less precise picking of good particles.

In [None]:
# Panel F: for every fluence in the SNR-comparison project, load the picked
# and ground-truth particles and compute picking precision together with
# the picked-particle table used for the distance plot.
project_dir = "/home/mjoosten1/projects/roodmus/data/20231017_EMPIAR_SNR_comparison"
figures_dir = os.path.join(project_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
particle_diameter = 100 # approximate particle diameter in Angstroms
ugraph_shape = (4000, 4000) # shape of the micrograph in pixels. Only needs to be given if the metadata file is a .star file
verbose = True # prints out progress statements
ignore_missing_files = True # if .mrc files are missing, the analysis will still be performed
enable_tqdm = True # enables tqdm progress bars

# One entry per dataset: the fluence ("exposure") and the job directory of
# each processing step. None marks a step that has no job for that dataset.
data = {
    0: {
        "exposure": 45,
        "LoG": "job004",
        "Class2D": "job005",
        "topaz": "job010",
        "homogeneous": "job016"
    },
    1: {
        "exposure": 35,
        "LoG": "job037",
        "Class2D": "job038",
        "topaz": "job042",
        "homogeneous": "job048"
    },
    2: {
        "exposure": 25,
        "LoG": "job054",
        "Class2D": "job055",
        "topaz": "job059",
        "homogeneous": "job065"
    },
    3: {
        "exposure": 15,
        "LoG": "job071",
        "Class2D": "job072",
        "topaz": "job076",
        "homogeneous": "job082"
    },
    4: {
        "exposure": 5,
        "LoG": "job088",
        "Class2D": "job089",
        "topaz": None,
        "homogeneous": None
    },
    5: {
        "exposure": 10,
        "LoG": "job093",
        "Class2D": "job094",
        "topaz": None,
        "homogeneous": None
    },
    6: {
        "exposure": 12,
        "LoG": "job098",
        "Class2D": "job099",
        "topaz": None,
        "homogeneous": None
    },
    7: {
        "exposure": 8,
        "LoG": "job114",
        "Class2D": "job115",
        "topaz": None,
        "homogeneous": None,
    },
}

# Per-dataset tables are collected in lists and concatenated once after the
# loop, so the result does not depend on which dict key happens to come first.
precision_frames = []
picked_frames = []
overlap_frames = []
truth_frames = []

for key, item in data.items():
    # the simulation config directories are named with a zero-padded fluence
    exposure = f"{item['exposure']}".zfill(2)
    config_dir = os.path.join(project_dir, f"mrc_epa_{exposure}")
    print(config_dir)

    # metadata files to analyse; the Class2D job is deliberately left out
    meta_files = [
        os.path.join(project_dir, "Extract", item["LoG"], "particles.star"),
    ]
    if item["topaz"]:
        meta_files.append(os.path.join(project_dir, "Extract", item["topaz"], "particles.star"))
    if item["homogeneous"]:
        meta_files.append(os.path.join(project_dir, "Refine3D", item["homogeneous"], "run_data.star"))

    # the first metadata file creates the analysis object, the others extend it
    analysis = load_data(
        meta_files[0],
        config_dir,
        particle_diameter,
        ugraph_shape=ugraph_shape,
        verbose=verbose,
        enable_tqdm=enable_tqdm,
        ignore_missing_files=ignore_missing_files,
    )
    for meta_file in meta_files[1:]:
        analysis.add_data(meta_file, config_dir, verbose=verbose)

    df_picked = pd.DataFrame(analysis.results_picking)
    df_truth = pd.DataFrame(analysis.results_truth)

    # Compute precision and recall. compute_precision is used here rather
    # than the one-to-one matching route (analysis._match_particles followed
    # by analysis.compute_1to1_match_precision) because it also returns the
    # picked-particle table; the "TP" and "closest_dist" columns needed by
    # the distance plot are presumably added by this call - verify against
    # roodmus.
    df_precision, df_picked = analysis.compute_precision(df_picked, df_truth, verbose=verbose)

    # tag the tables with the fluence of this dataset
    df_picked["exposure"] = item["exposure"]
    df_precision["exposure"] = item["exposure"]

    df_overlap = analysis.compute_overlap(df_picked, df_truth, verbose=verbose)
    df_overlap["exposure"] = item["exposure"]

    precision_frames.append(df_precision)
    picked_frames.append(df_picked)
    overlap_frames.append(df_overlap)
    truth_frames.append(df_truth)

df_precision_all = pd.concat(precision_frames)
df_picked_all = pd.concat(picked_frames)
df_overlap_all = pd.concat(overlap_frames)
df_truth_all = pd.concat(truth_frames)

# map every job directory name to the processing step it belongs to
# (Class2D jobs are deliberately not included)
jobtypes_all = {}
for value in data.values():
    jobtypes_all[value["LoG"]] = "LoG"
    for step in ("topaz", "homogeneous"):
        if value[step]:
            jobtypes_all[value[step]] = step

# the job name is the directory that contains the metadata file
df_precision_all["job"] = df_precision_all["metadata_filename"].apply(
    lambda path: os.path.basename(os.path.dirname(path))
)
df_precision_all["jobtype"] = df_precision_all["job"].map(jobtypes_all)
df_picked_all["job"] = df_picked_all["metadata_filename"].apply(
    lambda path: os.path.basename(os.path.dirname(path))
)
df_picked_all["jobtype"] = df_picked_all["job"].map(jobtypes_all)

In [None]:
# Panel F: distance between each true-positive LoG pick and its closest
# ground-truth particle, as a function of fluence.

# (re)derive the job name and job type of every picked particle from the
# directory that contains its metadata file
df_picked_all["job"] = df_picked_all["metadata_filename"].apply(lambda x: x.split("/")[-2])
df_picked_all["jobtype"] = df_picked_all["job"].map(jobtypes_all)

# keep only LoG picks that are flagged as true positives
df_picked_LoG = df_picked_all[df_picked_all["jobtype"] == "LoG"]
df_picked_LoG_TP = df_picked_LoG.groupby("TP").get_group(True)

fig, ax = plt.subplots(figsize=(3.5, 3.5))
sns.boxplot(x="exposure", y="closest_dist", hue="exposure", data=df_picked_LoG_TP, ax=ax, palette="Blues", dodge=False)

# raw strings: "\A" is not a valid escape sequence in a normal string literal
ax.set_xlabel(r"Fluence ($e^-$/$\AA^2$)", fontsize=16)
ax.set_ylabel(r"Distance ($\AA$)", fontsize=16)
ax.tick_params(labelsize=14)
ax.legend().remove()

# Writing the PDF is opt-in, and the message is only printed when the file
# has really been written.
save_figure = False
distance_pdf = os.path.join(figures_dir, "distance_to_closest_particle_vs_exposure.pdf")
if save_figure:
    fig.savefig(distance_pdf, bbox_inches="tight")
    print(f"saved image to {distance_pdf}")
else:
    print(f"figure not saved (set save_figure = True to write {distance_pdf})")
