# MALDI Extraction

## Libraries

In [None]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyimzml.ImzMLParser import ImzMLParser
from maldi_tools import extraction, plotting

## File Paths

In [None]:
# Dataset folder name; also reused below to name the per-dataset output folders.
data_name = "panc2055_imzML"
# imzML file location, relative to the imzML directory it is later joined onto.
data_file = pathlib.Path(data_name, "panc2055.imzML")

In [None]:
# Root of the data tree; every other directory hangs off it.
base_dir = pathlib.Path("../data")

# Shared input locations.
imzml_dir = base_dir.joinpath("imzml")
library_dir = base_dir.joinpath("libraries")

# Per-dataset output locations, keyed by `data_name`.
extraction_dir = base_dir.joinpath(data_name, "extracted")
debug_dir = base_dir.joinpath(data_name, "debug")

In [None]:
# Full path of the imzML file inside the imzML directory.
data_path = imzml_dir.joinpath(data_file)

In [None]:
# Create the working directories (and any missing parents) up front.
# `exist_ok=True` already makes `mkdir` a no-op for directories that exist, so
# no separate existence check is needed. A path that exists but is NOT a
# directory now raises FileExistsError instead of being silently skipped.
# NOTE(review): `imzml_dir` is not created here; presumably it already holds
# the input data — confirm.
for directory in [base_dir, library_dir, extraction_dir, debug_dir]:
    directory.mkdir(parents=True, exist_ok=True)

## Plotting Parameters

In [None]:
# Dark figure theme: black backgrounds with white ticks, labels and edges,
# applied in a single update instead of one assignment per key.
plt.rcParams.update(
    {
        "figure.figsize": (20, 13),
        "ytick.color": "w",
        "xtick.color": "w",
        "axes.labelcolor": "w",
        "axes.edgecolor": "w",
        "axes.facecolor": "black",
        "savefig.edgecolor": "w",
        "savefig.facecolor": "black",
        "figure.facecolor": "black",
        "figure.constrained_layout.use": False,
    }
)

## Load necessary files

### ImzML Data file

In [None]:
# Parse the imzML file at `data_path`, requesting "full" spectra metadata.
# The resulting parser object is reused by the extraction cells below.
imz_data = ImzMLParser(data_path, include_spectra_metadata="full")

### Library Peak List

In [None]:
# Read the library peak list CSV from the libraries directory.
library_peak_list = library_dir.joinpath("glycan_peaklist_KL.csv")
library_peak_df = pd.read_csv(filepath_or_buffer=library_peak_list)

# Preview the first rows of the library table.
library_peak_df.head()

## Constants

In [None]:
# Percentile shared by the thresholding steps below: it is passed to
# `extraction.extract_spectra`, `np.percentile` and `extraction.rolling_window`.
intensity_percentile = 99

## Spectrum Extraction

Extract the *m/z* and *intensity* values.

In [None]:
# Run spectrum extraction on the parsed imzML data. The call returns two
# values, kept as `total_mass_df` and `thresholds`; both are reused by the
# peak-detection cells below.
total_mass_df, thresholds = extraction.extract_spectra(
    imz_data=imz_data, intensity_percentile=intensity_percentile
)

In [None]:
# Show the extracted spectrum table. `display` is not imported in this file;
# it is expected to be provided by the IPython/Jupyter session.
display(total_mass_df)

### Global Intensity Threshold

Display the $n$ largest intensities, then compute the $m$-th intensity percentile and use it as the *global intensity threshold*.

In [None]:
# Number of rows (the n above) to list when showing the largest intensities.
largest_intensity_count = 10

In [None]:
# List the rows holding the highest values of the "intensity" column.
total_mass_df.nlargest(n=largest_intensity_count, columns=["intensity"])

In [None]:
# Global threshold = the `intensity_percentile`-th percentile of all intensities.
global_intensity_threshold = np.percentile(
    a=total_mass_df["intensity"].values,
    q=intensity_percentile,
)
print(f"Global Intensity Threshold: {global_intensity_threshold}")

## Peak Detection

### Rolling Window Method

In [None]:
# Rolling-window pass over the extracted spectrum with a fixed window of 5000.
# NOTE(review): the unit of `window_size` is defined by
# `extraction.rolling_window` — confirm before tuning it.
# Both results feed the intensity plot; `log_int_percentile` is also used for
# signal extraction.
log_intensities, log_int_percentile = extraction.rolling_window(
    total_mass_df=total_mass_df, intensity_percentile=intensity_percentile, window_size=5000
)

### Plot Intensities

In [None]:
# Plot the spectrum using the rolling-window results and the global intensity
# threshold computed above.
plotting.plot_intensities(
    total_mass_df=total_mass_df,
    log_intensities=log_intensities,
    log_int_percentile=log_int_percentile,
    global_intensity_threshold=global_intensity_threshold,
)

### Signal Extraction

In [None]:
# Extract peak candidates from the spectrum using the rolling-window
# percentile. The two results are kept as `peak_candidate_idxs` and
# `peak_candidates` and reused by the plotting and peak-width cells below.
peak_candidate_idxs, peak_candidates = extraction.signal_extraction(
    total_mass_df=total_mass_df, log_int_percentile=log_int_percentile
)

In [None]:
# Report how many peak candidates were found (message typo "Candiate" fixed).
print(f"Candidate Peak Count: {len(peak_candidates)}")

In [None]:
# Plot the peak candidates found above against the spectrum and the global
# intensity threshold.
plotting.plot_discovered_peaks(
    total_mass_df=total_mass_df,
    peak_candidate_idxs=peak_candidate_idxs,
    peak_candidates=peak_candidates,
    global_intensity_threshold=global_intensity_threshold,
)

### Get Peak Widths

In [None]:
# Compute peak widths for the candidate peaks. All four results are passed on
# to `extraction.peak_spectra`; `peak_df` is additionally passed to
# `extraction.coordinate_integration`.
peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths(
    total_mass_df=total_mass_df,
    peak_candidate_idxs=peak_candidate_idxs,
    peak_candidates=peak_candidates,
    thresholds=thresholds,
)

### Save Peak Spectra

In [None]:
# `peak_df`, `l_ips_r`, `r_ips_r` and `peak_widths_height` are already computed
# in the "Get Peak Widths" cell above from exactly the same inputs, so the
# identical `extraction.get_peak_widths` call that used to live here was
# removed to avoid recomputing them.
# NOTE(review): this assumes `get_peak_widths` has no side effects that a
# second call is relied on for — confirm.

In [None]:
# Passed to `extraction.peak_spectra` as `save_peak_spectra_debug`; presumably
# toggles writing debug output into `debug_dir` — confirm.
save_peak_spectra_debug = True

In [None]:
# Build `panel_df` from the spectrum, the candidate peaks and the peak-width
# results. `save_peak_spectra_debug` and `debug_dir` are forwarded as-is.
panel_df = extraction.peak_spectra(
    total_mass_df=total_mass_df,
    peak_df=peak_df,
    peak_candidate_idxs=peak_candidate_idxs,
    peak_candidates=peak_candidates,
    peak_widths_height=peak_widths_height,
    l_ips_r=l_ips_r,
    r_ips_r=r_ips_r,
    save_peak_spectra_debug=save_peak_spectra_debug,
    debug_dir=debug_dir,
)

In [None]:
# Show the resulting panel (a bare last expression is rendered by the notebook).
panel_df

## Integrate Coordinates

Generate the images and save them in an *xarray*, where the dimensions are: Image (indexed by peak value), $x$, and $y$.

In [None]:
# Integrate the detected peaks over the imzML coordinates.
# NOTE(review): the return value is not bound to a name, and the later
# plotting / matching calls do not receive any image data explicitly —
# confirm how the generated images are handed to those steps.
extraction.coordinate_integration(peak_df=peak_df, imz_data=imz_data)

### Histogram preview of the Intensities of a given Peak

Set `desired_peak_hist` to a value (ideally one from your library). The nearest peak is found, and a histogram of that image's intensities is displayed with `bin_count` bins.

In [None]:
# Peak value to preview; passed as `peak` to `plotting.plot_peak_hist`.
desired_peak_hist = 1809.639659
# Number of histogram bins; passed as `bin_count` to `plotting.plot_peak_hist`.
bin_count = 40

In [None]:
# Draw the intensity histogram for the requested peak with `bin_count` bins.
plotting.plot_peak_hist(peak=desired_peak_hist, bin_count=bin_count)

## Match Glycan Library with Extracted Peaks

### Constants

In [None]:
# Passed as `ppm` to `extraction.library_matching`; presumably the matching
# tolerance in parts per million — confirm.
ppm = 100

In [None]:
# Match the library peak list against the extracted peaks using the `ppm`
# setting; the result is kept as `matched_peaks_df` for the image-saving step.
matched_peaks_df = extraction.library_matching(
    library_peak_df=library_peak_df, ppm=ppm, extraction_dir=extraction_dir
)

In [None]:
# Save images for the matched peaks; `extraction_dir` is forwarded as the
# location argument.
plotting.save_matched_peak_images(
    matched_peaks_df=matched_peaks_df, extraction_dir=extraction_dir
)