# MALDI Extraction

## Libraries

In [None]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyimzml.ImzMLParser import ImzMLParser
from maldi_tools import extraction, plotting

## File Paths

In [None]:
# Name of the dataset folder; also reused below to name the per-dataset output folders.
data_name = "panc2055_imzML"
# Path of the imzML file relative to the imzML directory.
data_file = pathlib.Path(data_name) / "panc2055.imzML"

In [None]:
# Root of the data tree; every other directory in this cell is derived from it.
base_dir = pathlib.Path("../data")
# Folder the input imzML file is read from.
imzml_dir = base_dir / "imzml"
# Folder holding the library peak list CSV.
library_dir = base_dir / "libraries"
# Per-dataset output folders, grouped under `data_name`.
extraction_dir = base_dir / data_name / "extracted"
debug_dir = base_dir / data_name / "debug"

In [None]:
# Full path to the imzML file that is parsed later in the notebook.
data_path = imzml_dir / data_file

In [None]:
# Create the library and output directories up front so later cells can use them.
# `mkdir(exist_ok=True)` already tolerates existing directories, so no separate
# existence check is needed; `parents=True` creates missing intermediate folders.
# A path that exists but is not a directory now raises instead of being skipped.
for directory in [base_dir, library_dir, extraction_dir, debug_dir]:
    directory.mkdir(parents=True, exist_ok=True)

## Plotting Parameters

In [None]:
# Global matplotlib style for the notebook: large figures on a black background
# with white ticks, labels and edges; constrained layout is switched off.
plt.rcParams.update(
    {
        "figure.figsize": (20, 13),
        "ytick.color": "w",
        "xtick.color": "w",
        "axes.labelcolor": "w",
        "axes.edgecolor": "w",
        "axes.facecolor": "black",
        "savefig.edgecolor": "w",
        "savefig.facecolor": "black",
        "figure.facecolor": "black",
        "figure.constrained_layout.use": False,
    }
)

## Load necessary files

### ImzML Data file

In [None]:
# Parse the imzML file at `data_path`. The "full" setting for
# `include_spectra_metadata` is defined by pyimzml — see its documentation for
# exactly what is loaded.
imz_data = ImzMLParser(data_path, include_spectra_metadata="full")

### Library Peak List

In [None]:
# Read the library peak list CSV into a DataFrame.
library_peak_list = library_dir / "glycan_peaklist_KL.csv"
library_peak_df = pd.read_csv(library_peak_list)

# Preview the first rows of the library.
library_peak_df.head()

## Constants

In [None]:
# Percentile (0-100 scale, as passed to `np.percentile` later) used for the
# intensity thresholds in the extraction steps.
intensity_percentile = 99

## Spectrum Extraction

Extract the *m/z* and *intensity* values.

In [None]:
# Extract the m/z and intensity values from the parsed imzML data.
# `total_mass_df` is the spectrum table (it has an "intensity" column);
# `thresholds` is forwarded to `extraction.get_peak_widths` later on.
total_mass_df, thresholds = extraction.extract_spectra(
    imz_data=imz_data, intensity_percentile=intensity_percentile
)

In [None]:
# Render the extracted spectrum table in the notebook output.
display(total_mass_df)

### Global Intensity Threshold

Display the $n$ largest intensities, then compute the $m$-th intensity percentile and use it as the *global intensity threshold*.

In [None]:
# How many of the highest-intensity rows to list.
largest_intensity_count = 10

In [None]:
# Show the rows with the largest values in the "intensity" column.
total_mass_df.nlargest(largest_intensity_count, ["intensity"])

In [None]:
# The global intensity threshold is the `intensity_percentile`-th percentile of
# all intensities in the extracted spectrum table.
global_intensity_threshold = np.percentile(total_mass_df["intensity"].values, intensity_percentile)
print(f"Global Intensity Threshold: {global_intensity_threshold}")

## Peak Detection

### Rolling Window Method

In [None]:
# Run the rolling-window step over the spectrum table. Both outputs feed the
# intensity plot; `log_int_percentile` is also used for signal extraction.
# NOTE(review): the unit of `window_size` is defined by `extraction.rolling_window` — confirm there.
log_intensities, log_int_percentile = extraction.rolling_window(
    total_mass_df=total_mass_df, intensity_percentile=intensity_percentile, window_size=5000
)

### Plot Intensities

In [None]:
# Plot the intensities, passing in the spectrum table, the rolling-window
# outputs and the global intensity threshold.
plotting.plot_intensities(
    total_mass_df=total_mass_df,
    log_intensities=log_intensities,
    log_int_percentile=log_int_percentile,
    global_intensity_threshold=global_intensity_threshold,
)

### Signal Extraction

In [None]:
# Extract candidate peaks from the spectrum using the rolling-window percentile.
# Returns the candidate indices and the candidates themselves; both are reused
# by the plotting, peak-width and peak-spectra cells.
peak_candidate_idxs, peak_candidates = extraction.signal_extraction(
    total_mass_df=total_mass_df, log_int_percentile=log_int_percentile
)

In [None]:
# Report how many candidate peaks were found (typo in the label fixed).
print(f"Candidate Peak Count: {len(peak_candidates)}")

In [None]:
# Plot the discovered peak candidates, passing in the spectrum table, the
# candidates with their indices, and the global intensity threshold.
plotting.plot_discovered_peaks(
    total_mass_df=total_mass_df,
    peak_candidate_idxs=peak_candidate_idxs,
    peak_candidates=peak_candidates,
    global_intensity_threshold=global_intensity_threshold,
)

### Get Peak Widths

In [None]:
# Compute peak widths for the candidates. `peak_df` drives coordinate
# integration later; the remaining outputs are forwarded unchanged to
# `extraction.peak_spectra`.
peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths(
    total_mass_df=total_mass_df,
    peak_candidate_idxs=peak_candidate_idxs,
    peak_candidates=peak_candidates,
    thresholds=thresholds,
)

### Save Peak Spectra

In [None]:
# Flag forwarded to `extraction.peak_spectra`.
# NOTE(review): presumably enables writing debug output under `debug_dir` — confirm.
save_peak_spectra_debug = True

In [None]:
# Build the panel table from the spectrum table, the peak candidates and the
# peak-width outputs computed earlier; `debug_dir` and the debug flag are passed along.
panel_df = extraction.peak_spectra(
    total_mass_df=total_mass_df,
    peak_df=peak_df,
    peak_candidate_idxs=peak_candidate_idxs,
    peak_candidates=peak_candidates,
    peak_widths_height=peak_widths_height,
    l_ips_r=l_ips_r,
    r_ips_r=r_ips_r,
    save_peak_spectra_debug=save_peak_spectra_debug,
    debug_dir=debug_dir,
)

In [None]:
# Display the panel table returned by `extraction.peak_spectra`.
panel_df

## Integrate Coordinates

Generate the images and save them as TIFFs in `extraction_dir`. Each file is named after its corresponding peak m/z value, truncated to 4 decimal places. The dimensions of each image correspond to the maximum x- and y-coordinates extracted from the slide.

In [None]:
# Generate one image per peak and save them as TIFFs in `extraction_dir`.
extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data, extraction_dir=extraction_dir)

### Histogram preview of the Intensities of a given Peak

Set a value for `desired_peak_hist` (ideally an m/z value from your library). The nearest extracted peak is located, and a histogram of the intensities of its image is displayed with `bin_count` bins.

In [None]:
# Target m/z value; the nearest extracted peak is the one that gets plotted.
desired_peak_hist = 1809.639659
# Number of bins in the histogram.
bin_count = 40

In [None]:
# Histogram of the image intensities for the peak nearest to `desired_peak_hist`.
# NOTE(review): no `extraction_dir` is passed here, unlike the other calls that
# read peak images — confirm `plot_peak_hist` can locate the images without it.
plotting.plot_peak_hist(peak=desired_peak_hist, bin_count=bin_count)

## Match Glycan Library with Extracted Peaks

Set a value for the maximum ppm tolerance between a peak and its corresponding match in the `library_peak_df` specified. Matched peak images are saved as TIFFs to the `library_matched` subfolder inside `extraction_dir` and are named after their matched peak m/z value.

In [None]:
# Maximum ppm tolerance between an extracted peak and its library match.
ppm = 100

In [None]:
# Match the library peaks against the extracted peaks within the `ppm` tolerance.
matched_peaks_df = extraction.library_matching(
    library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir
)

In [None]:
# Save the matched peak images as TIFFs in the `library_matched` subfolder of
# `extraction_dir`, named after their matched peak m/z value.
plotting.save_matched_peak_images(
    matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir
)

## Core Naming and Cropping

For TMAs, each core is extracted all at once. However, this makes it difficult to locate the exact positions of each core. Additionally, the default names assigned to each core aren't particularly useful because they don't contain any information about their position on the TMA.

This section will help you assign informative names to each core and afterwards, segment out the locations of specific cores to generate FOV-level statistics.

It is helpful first to create an all-encompassing mask that defines the locations of all the cores. This will make it clear where the TMA was scanned for the naming step. You will need to provide the path to one of your extracted glycan images first.

* `glycan_img_path`: path to one glycan image, needed to properly dimension the mask
* `glycan_mask_path`: where the mask will be saved

In [None]:
# Placeholder paths: replace both with real locations before running this cell.
# `glycan_img_path` is one extracted glycan image, used to dimension the mask;
# `glycan_mask_path` is where the mask PNG is written.
glycan_img_path = "path/to/glycan_img.tiff"
glycan_mask_path = "path/to/glycan_mask.png"

# generate and save the glycan mask
extraction.generate_glycan_mask(
    imz_data=imz_data,
    glycan_img_path=glycan_img_path,
    glycan_mask_path=glycan_mask_path
)

Each core on the TMA should be appropriately named by the <a href="https://tsai.stanford.edu/research/maldi_tma/">TSAI MALDI tiler</a>. You will need to provide the PNG saved at `glycan_mask_path` as input. **Ensure that this step is completed before running the following sections.**

The poslog files for your TMA run will contain each scanned coordinate in the exact order it was scanned. This, along with the tiler output, will be needed to map each coordinate to its respective core.

* `centroid_path`: TSAI MALDI tiler output, contains name of each core mapped to respective centroid
* `poslog_paths`: list of poslog files used for the TMA, contains all coordinates in order of acquisition. **Make sure this matches up with the order of acquisition for your run.**

In [None]:
# Placeholder paths: replace with the tiler output and the poslog files of your run.
# The poslog files must be listed in their order of acquisition.
centroid_path = "path/to/centroids.json"
poslog_paths = ["path/to/poslog1.txt", "path/to/poslog2.txt"]

# map coordinates to core names
region_core_info = extraction.map_coordinates_to_core_name(
    imz_data=imz_data,
    centroid_path=centroid_path,
    poslog_paths=poslog_paths
)

To generate FOV-level statistics, an individual mask for each core named by TSAI will be saved. They can then be loaded in as needed in the FOV-level-statistic-generating functions.

* `glycan_crop_save_dir`: the directory where these masks will be saved

In [None]:
# Placeholder path: directory that will hold one mask per named core.
glycan_crop_save_dir = "path/to/glycan/crops"
# `exist_ok=True` makes creation idempotent and avoids the check-then-create
# race of a separate `os.path.exists` test.
os.makedirs(glycan_crop_save_dir, exist_ok=True)

# Save an individual mask for each named core into `glycan_crop_save_dir`.
extraction.generate_glycan_crop_masks(
    glycan_mask_path=glycan_mask_path,
    region_core_info=region_core_info,
    glycan_crop_save_dir=glycan_crop_save_dir
)

Run the following cell to visualize the masks for certain cores for testing.

* `cores_to_crop`: the cores whose masks you want to visualize. If multiple cores are specified, the individual masks are combined. Set to `None` to crop all cores out.

In [None]:
# Names of the cores whose masks should be combined; `None` selects all cores.
cores_to_crop = ["R1C1", "R1C2"]

# extract a binary mask with just the cores specified
core_cropping_mask = extraction.load_glycan_crop_masks(
    glycan_crop_save_dir=glycan_crop_save_dir,
    cores_to_crop=cores_to_crop
)

# visualize the mask (assigning to `_` keeps the returned image object out of the cell output)
_ = plt.imshow(core_cropping_mask)