## Notebook showcasing the various options in Roodmus for analysis and visualisations
In this notebook, the user can load metadata from one or several jobs of a processing pipeline run in RELION or cryoSPARC. This metadata, along with the ground-truth particle parameters, is loaded into data frames, which allow for easy and convenient plotting. We also provide several convenience functions to make plots.


In [1]:
### imports
# general
import os
import numpy as np
import pandas as pd

# roodmus
from roodmus.analysis.utils import load_data
from roodmus.analysis.plot_ctf import plot_CTF, plot_defocus_scatter
from roodmus.analysis.plot_picking import (
    label_micrograph_picked, 
    label_micrograph_truth,
    label_micrograph_truth_and_picked, 
    plot_precision, plot_recall, 
    plot_boundary_investigation,
    plot_overlap_investigation, 
    plot_precision_and_recall, 
    plot_f1_score
)
from roodmus.analysis.plot_frames import plot_frame_distribution
from roodmus.analysis.plot_classes import plot_2Dclass_precision, plot_2Dclasses_frames
from roodmus.analysis.plot_alignment import plot_picked_pose_distribution, plot_true_pose_distribution


In [5]:
### data loading
config_dir = "path/to/config/dir"

# one entry per job; a nested list groups several metadata files that are fused
meta_files = [
    "path/to/metadata/file1.cs",
    "path/to/metadata/file2.cs",
    ["path/to/metadata/file3.cs", "path/to/metadata/file4.cs"],
]

# label used for each job in the plots (the fused job is keyed by its first file)
jobtypes = {
    "path/to/metadata/file1.cs": "jobtype1",
    "path/to/metadata/file2.cs": "jobtype2",
    "path/to/metadata/file3.cs": "jobtype3",
}

# approximate particle diameter in Angstroms
particle_diameter = 100
# shape of the micrograph in pixels. Only needs to be given if the metadata file is a .star file
ugraph_shape = (4000, 4000)
# prints out progress statements
verbose = True
# if .mrc files are missing, the analysis will still be performed
ignore_missing_files = True
# enables tqdm progress bars
enable_tqdm = True

# keyword options that are only passed when the analysis object is created
load_options = dict(
    ugraph_shape=ugraph_shape,
    verbose=verbose,
    enable_tqdm=enable_tqdm,
    ignore_missing_files=ignore_missing_files,
)

for position, meta_file in enumerate(meta_files):
    if position > 0:
        # every later entry updates the existing class with the next metadata file
        analysis.add_data(meta_file, config_dir, verbose=verbose)
        continue
    # the first entry creates the class
    analysis = load_data(meta_file, config_dir, particle_diameter, **load_options)


debug
loading metadata from data/6xm5_steered_Roodmus_2/cryoSPARC/J515_topaz_picked_particles.cs...
loaded metadata from data/6xm5_steered_Roodmus_2/cryoSPARC/J515_topaz_picked_particles.cs. determined file type: cs


Dictionaries now contain 368967 particles and 0 true particles
added 368967 particles from data/6xm5_steered_Roodmus_2/cryoSPARC/J515_topaz_picked_particles.cs


loading micrographs: 100%|██████████| 800/800 [05:12<00:00,  2.56it/s, micrograph=000799.mrc]


Loaded ground-truth particle positions from config files
Dictionaries now contain 368967 particles and 200000 true particles
Added 200000 particles from /home/mjoosten1/projects/roodmus/data/6xm5_steered_Roodmus_2/mrc/
debug
loading metadata from data/6xm5_steered_Roodmus_2/cryoSPARC/J518_050_particles.cs...
loaded metadata from data/6xm5_steered_Roodmus_2/cryoSPARC/J518_050_particles.cs. determined file type: cs
checking if ugraphs exist...


Dictionaries now contain 582453 particles and 200000 true particles
added 213486 particles from data/6xm5_steered_Roodmus_2/cryoSPARC/J518_050_particles.cs
debug
loading metadata from ['data/6xm5_steered_Roodmus_2/cryoSPARC/J519_class_00_final_particles.cs', 'data/6xm5_steered_Roodmus_2/cryoSPARC/J519_passthrough_particles_class_0.cs']...
loaded metadata from data/6xm5_steered_Roodmus_2/cryoSPARC/J519_class_00_final_particles.cs. determined file type: cs
loaded metadata from data/6xm5_steered_Roodmus_2/cryoSPARC/J519_passthrough_particles_class_0

In [None]:
### turn the loaded data into a pandas dataframe
# one table built from the picking results, one from the ground-truth results
df_picked, df_truth = (
    pd.DataFrame(records)
    for records in (analysis.results_picking, analysis.results_truth)
)
df_picked.tail()


Unnamed: 0,metadata_filename,ugraph_filename,position_x,position_y,euler_phi,euler_theta,euler_psi,ugraph_shape,defocusU,defocusV,class2D
1542279,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000798.mrc,3116.0,2064.0,-0.925144,2.674026,0.604917,"[4000, 4000]",14764.269531,14764.269531,40.0
1542280,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000798.mrc,3572.0,2696.0,0.490432,2.525035,-2.88962,"[4000, 4000]",14764.269531,14764.269531,28.0
1542281,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000798.mrc,3680.0,1124.0,2.343655,2.700262,2.166481,"[4000, 4000]",14764.269531,14764.269531,12.0
1542282,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000798.mrc,3336.0,872.0,0.684857,2.917804,3.091442,"[4000, 4000]",14764.269531,14764.269531,14.0
1542283,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000798.mrc,560.0,520.0,-0.919896,2.510094,-0.627793,"[4000, 4000]",14764.269531,14764.269531,40.0


In [None]:
### saving the dataframes
# it is recommended to save the dataframes after running the rest of the notebook, as they may be modified by downstream analysis

# each table is written to its own csv file in the working directory
for csv_name, frame in (
    ("picked_particles.csv", df_picked),
    ("truth_particles.csv", df_truth),
):
    frame.to_csv(csv_name)

### CTF estimation


In [None]:
### scatter plot of the estimated vs. the true defocus values
meta_index = 0 # index of the metadata file to plot (index into meta_files)

# a fused job is stored as a list in meta_files; select it by its first file
# name, which is how the other cells of this notebook identify a job
metadata_filename = meta_files[meta_index]
if isinstance(metadata_filename, list):
    metadata_filename = metadata_filename[0]

palette = "RdBu"

fig, ax = plot_defocus_scatter(df_picked,
                                metadata_filename,
                                df_truth,
                                palette=palette)

# save high quality figure
fig.savefig("defocus_scatter.png", dpi=600, bbox_inches="tight")
fig.savefig("defocus_scatter.pdf", bbox_inches="tight")


In [None]:
### plot the CTF estimation for a single micrograph
meta_index = 0 # index of the metadata file to plot (index into meta_files)
ugraph_index = 3 # which micrograph to plot

# a fused job is stored as a list in meta_files; select it by its first file
# name, which is how the other cells of this notebook identify a job
metadata_filename = meta_files[meta_index]
if isinstance(metadata_filename, list):
    metadata_filename = metadata_filename[0]

fig, ax = plot_CTF(df_picked, metadata_filename,
                    df_truth, config_dir, ugraph_index)

# save high quality figure
fig.savefig("CTF.png", dpi=600, bbox_inches="tight")
fig.savefig("CTF.pdf", bbox_inches="tight")


In [None]:
### plot the CTF for the micrograph with the largest defocus error (should take no more than a few seconds)
# mean estimated defocus per micrograph, averaged over all picked particles.
# Grouping once (instead of re-grouping inside a loop) keeps this fast, and
# grouping by a plain column name avoids the pandas length-1-list get_group warning.
defocus_estimated = df_picked.groupby("ugraph_filename")["defocusU"].mean()

# mean true defocus per micrograph; the absolute value is compared because the
# sign of the true defocus is dropped before taking the difference
defocus_true = df_truth.groupby("ugraph_filename")["defocus"].mean().abs()

# align the truth with the picked micrographs; a micrograph that is missing
# from df_truth yields NaN and is skipped instead of raising a KeyError
defocus_error = (defocus_estimated - defocus_true.reindex(defocus_estimated.index)).abs()

# defaults used when no positive, finite error is found
max_error_index = 0
max_error = 0
if defocus_error.notna().any() and defocus_error.max() > max_error:
    max_error = defocus_error.max()
    # position of the first micrograph (in sorted ugraph_filename order) with the largest error
    max_error_index = int(np.nanargmax(defocus_error.to_numpy()))

# NOTE(review): this assumes plot_CTF numbers the micrographs in the same sorted
# ugraph_filename order when metadata_filename is None - confirm against plot_CTF
fig, ax = plot_CTF(df_picked, None, df_truth, config_dir, max_error_index)

# save high quality figure
fig.savefig("CTF_max_error.png", dpi=600, bbox_inches="tight")
fig.savefig("CTF_max_error.pdf", bbox_inches="tight")



### Particle picking

In [None]:
### plot the picked particles
ugraph_index = 0 # which micrograph to plot
metadata_index = 0 # which metadata file to plot (must be a valid index into meta_files)

# a fused job is stored as a list in meta_files; it is selected by its first file name
metadata_filename = meta_files[metadata_index]
if isinstance(metadata_filename, list):
    metadata_filename = metadata_filename[0]

# pass the file name selected above, not a stale index left over from an earlier cell
fig, ax = label_micrograph_picked(df_picked, metadata_filename,
                                   ugraph_index, config_dir, box_width=48, box_height=48, verbose=verbose)

# save high quality figure
fig.savefig("picked_particles.png", dpi=600, bbox_inches="tight")
fig.savefig("picked_particles.pdf", bbox_inches="tight")


In [None]:
### plot the truth particles
# which micrograph to plot
ugraph_index = 3
# the boxes drawn around the particles are square
box_size = 32

fig, ax = label_micrograph_truth(
    df_truth,
    ugraph_index,
    config_dir,
    box_width=box_size,
    box_height=box_size,
    verbose=verbose,
)

# raster and vector copies of the same figure
save_options = {"bbox_inches": "tight"}
fig.savefig("truth_particles.png", dpi=600, **save_options)
fig.savefig("truth_particles.pdf", **save_options)


In [None]:
### plot the truth and picked particles
ugraph_index = 3 # which micrograph to plot
metadata_index = 3 # which metadata file to plot (must be a valid index into meta_files)

# a fused job is stored as a list in meta_files; it is selected by its first file name
metadata_filename = meta_files[metadata_index]
if isinstance(metadata_filename, list):
    metadata_filename = metadata_filename[0]

# pass the file name selected above, not a stale index left over from an earlier cell
fig, ax = label_micrograph_truth_and_picked(df_picked, metadata_filename,
                                             df_truth, ugraph_index, config_dir, box_width=48, box_height=48, verbose=verbose)

# save high quality figure
fig.savefig("truth_and_picked_particles.png", dpi=600, bbox_inches="tight")
fig.savefig("truth_and_picked_particles.pdf", bbox_inches="tight")


In [7]:
### compute precision and recall (may take a few minutes)
# compute_precision returns two tables: the precision table and a picked-particle
# table, which replaces df_picked from here on
precision_results = analysis.compute_precision(
    df_picked,
    df_truth,
    verbose=verbose,
)
df_precision, df_picked = precision_results
df_precision.tail()


For each micrograph, for each metadata file, compute the precision, recall and multiplicity
Speed of computation depends on the number of particles in the micrograph. progressbar is not accurate
Total number of groups to loop over: 8798
Number of micgrographs: 800
Number of metadata files: 11
Starting loop over groups


computing precision: 100%|██████████| 8798/8798 [09:13<00:00, 15.88it/s, precision=0.988, recall=0.335, multiplicity=0.348] 


time taken to compute precision: 558.5891480445862


Unnamed: 0,metadata_filename,ugraph_filename,defocus,num_particles_picked,num_particles_truth,TP,FP,FN,precision,recall,multiplicity
8793,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000794.mrc,-14359.149102,299,250,221,78,40,0.73913,0.846743,0.908
8794,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000795.mrc,-20297.050076,254,250,208,46,46,0.818898,0.818898,0.86
8795,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000796.mrc,-4646.860356,187,250,177,10,77,0.946524,0.69685,0.732
8796,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000797.mrc,-10571.399862,293,250,224,69,45,0.764505,0.832714,0.916
8797,data/6xm5_steered_Roodmus_2/cryoSPARC/J577_pas...,000798.mrc,-14799.755446,83,250,82,1,163,0.987952,0.334694,0.348


In [None]:
### plot boxplot for precision and recall
# the plot order follows meta_files; a fused job (list of files) is represented by its first file
order = [r[0] if isinstance(r, (list, tuple)) else r for r in meta_files]

# both metrics are plotted and saved the same way, so they share one loop body
for plot_metric, basename in ((plot_precision, "precision"), (plot_recall, "recall")):
    fig, ax = plot_metric(df_precision, jobtypes, order)
    # re-apply the tick labels with a larger font and drop the default title
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=14)
    ax.set_title("")
    fig.set_size_inches(7, 7)

    # save high quality figure (precision.png/.pdf and recall.png/.pdf)
    fig.savefig(basename + ".png", dpi=600, bbox_inches="tight")
    fig.savefig(basename + ".pdf", bbox_inches="tight")


In [None]:
### alternatively, plot the precision and recall in the same plot
fig, ax = plot_precision_and_recall(df_precision, jobtypes)

# raster and vector copies of the same figure
save_options = {"bbox_inches": "tight"}
fig.savefig("precision_and_recall.png", dpi=600, **save_options)
fig.savefig("precision_and_recall.pdf", **save_options)


In [None]:
### plot f1-score
fig, ax = plot_f1_score(df_precision, jobtypes)

# raster and vector copies of the same figure
save_options = {"bbox_inches": "tight"}
fig.savefig("f1_score.png", dpi=600, **save_options)
fig.savefig("f1_score.pdf", **save_options)


In [None]:
### plot the picked particles, now with the TP and FP marked in green and red
ugraph_index = 3 # which micrograph to plot
metadata_index = 4 # which metadata file to plot (must be a valid index into meta_files)

# a fused job is stored as a list in meta_files; it is selected by its first file name
metadata_filename = meta_files[metadata_index]
if isinstance(metadata_filename, list):
    metadata_filename = metadata_filename[0]

# pass the file name selected above, not a stale index left over from an earlier cell
fig, ax = label_micrograph_picked(df_picked, metadata_filename, ugraph_index, config_dir, box_width=48, box_height=48, verbose=verbose)

# save high quality figure
fig.savefig("picked_particles_TP_FP.png", dpi=600, bbox_inches="tight")
fig.savefig("picked_particles_TP_FP.pdf", bbox_inches="tight")


In [None]:
### plot the distribution of the particles in the ugraphs in x, y, and z directions
metadata_index = 0 # which metadata file to plot
bin_width = [100, 100, 10] # bin width for x, y, z
axis = ["x", "y", "z"]

# a fused job is stored as a list in meta_files; it is selected by its first file name
selected = meta_files[metadata_index]
metadata_filename = selected[0] if isinstance(selected, list) else selected

# one figure per direction, each with its own bin width
for direction, width in zip(axis, bin_width):
    fig, ax = plot_boundary_investigation(
        df_truth,
        df_picked,
        metadata_filename,
        jobtypes,
        width,
        axis=direction,
    )

    png_name = "boundary_investigation_{}.png".format(direction)
    pdf_name = "boundary_investigation_{}.pdf".format(direction)
    fig.savefig(png_name, dpi=600, bbox_inches="tight")
    fig.savefig(pdf_name, bbox_inches="tight")


In [None]:
### compute the overlap table from the picked and truth particles
df_overlap = analysis.compute_overlap(
    df_picked,
    df_truth,
    verbose=verbose,
)
# show the first rows of the result
df_overlap.head(n=5)

In [None]:
### plot the overlap between the picked and truth particles
metadata_index = None # which metadata file to plot. If None, all metadata files are plotted

# stays None unless a specific metadata file was selected above
metadata_filename = None
if metadata_index is not None:
    metadata_filename = meta_files[metadata_index]
    # a fused job is stored as a list; it is selected by its first file name
    if isinstance(metadata_filename, list):
        metadata_filename = metadata_filename[0]

fig, ax = plot_overlap_investigation(
    df_overlap,
    metadata_filename,
    job_types=jobtypes,
)

# raster and vector copies of the same figure
save_options = {"bbox_inches": "tight"}
fig.savefig("overlap_investigation.png", dpi=600, **save_options)
fig.savefig("overlap_investigation.pdf", **save_options)


In [None]:
### plot the distribution of trajectory frames in a metadata file
metadata_index = 6 # which metadata file to plot

# a fused job is stored as a list in meta_files; it is selected by its first file name
selected = meta_files[metadata_index]
metadata_filename = selected[0] if isinstance(selected, list) else selected
print(metadata_filename)

fig, ax = plot_frame_distribution(
    df_picked,
    metadata_filename,
    df_truth,
    particle_diameter,
    jobtypes,
)

# raster and vector copies of the same figure
save_options = {"bbox_inches": "tight"}
fig.savefig("frame_distribution.png", dpi=600, **save_options)
fig.savefig("frame_distribution.pdf", **save_options)


### 2D classification

In [None]:
### plot the precision per class
# plot_2Dclass_precision comes from the imports cell at the top of the notebook;
# the development-time module reload and re-import were removed from this cell
metadata_index = 8 # which metadata file to plot. Must have a class2D column
# NOTE(review): palette is set here but not passed to plot_2Dclass_precision - confirm whether the function accepts it
palette = "coolwarm"

# a fused job is stored as a list in meta_files; it is selected by its first file name
metadata_filename = meta_files[metadata_index]
if isinstance(metadata_filename, list):
    metadata_filename = metadata_filename[0]

fig, ax = plot_2Dclass_precision(df_picked, metadata_filename, jobtypes)

# save high quality figure
fig.savefig("2Dclass_precision.png", dpi=600, bbox_inches="tight")
fig.savefig("2Dclass_precision.pdf", bbox_inches="tight")


In [None]:
### plot the distribution of frames over the 2D classes
# plot_2Dclasses_frames comes from the imports cell at the top of the notebook;
# the development-time module reload and re-import were removed from this cell
metadata_index = 1 # which metadata file to plot. Must have a class2D column
palette = "YlOrRd"

# a fused job is stored as a list in meta_files; it is selected by its first file name
metadata_filename = meta_files[metadata_index]
if isinstance(metadata_filename, list):
    metadata_filename = metadata_filename[0]

fig, ax = plot_2Dclasses_frames(df_picked, metadata_filename, bin_factor=100, palette=palette)
# relabel the axes and hide the x ticks
ax.set_xlabel("trajectory frame", fontsize=14)
ax.set_xticks([])
ax.set_ylabel("2D class", fontsize=14)

# save high quality figure
fig.savefig("2Dclass_frames.png", dpi=600, bbox_inches="tight")
fig.savefig("2Dclass_frames.pdf", bbox_inches="tight")


### 3D alignment

In [None]:
### plot the distribution of particle poses in the picked and truth particles
meta_index = 0

# a fused job is stored as a list in meta_files; it is selected by its first file name
selected = meta_files[meta_index]
metadata_filename = selected[0] if isinstance(selected, list) else selected
print(metadata_filename)

# vmin and vmax are kept so that a later cell can reuse them
grid, vmin, vmax = plot_picked_pose_distribution(df_picked, metadata_filename)

# raster and vector copies of the same figure
pose_fig = grid.fig
pose_fig.savefig("picked_pose_distribution.png", dpi=600, bbox_inches="tight")
pose_fig.savefig("picked_pose_distribution.pdf", bbox_inches="tight")



In [None]:
### plot the pose distribution of the truth particles
# vmin and vmax are the values returned by plot_picked_pose_distribution in the
# previous cell, so that cell has to be run first
grid = plot_true_pose_distribution(df_truth, vmin, vmax)

# raster and vector copies of the same figure
pose_fig = grid.fig
pose_fig.savefig("true_pose_distribution.png", dpi=600, bbox_inches="tight")
pose_fig.savefig("true_pose_distribution.pdf", bbox_inches="tight")
