# Generation of candidate suspect list

- Julia M. Gauglitz
- Wout Bittremieux

In [None]:
import ftplib
import functools
import logging
import math
from typing import Dict

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import scipy.signal as ssignal
import seaborn as sns
import tqdm.notebook as tqdm
from sklearn.neighbors import KernelDensity

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-<name>'
# and later releases dropped the old names. Use the original name when the
# installed Matplotlib still provides it and the renamed style otherwise.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)

In [None]:
# Configure the root logger with timestamped records at INFO level and route
# messages issued through the `warnings` module to the logging system.
logging.basicConfig(
    level=logging.INFO,
    style='{',
    format='{asctime} [{levelname}/{processName}] {message}',
)
logging.captureWarnings(True)
# Logger used throughout this notebook.
logger = logging.getLogger('suspect_list')
logger.setLevel(logging.INFO)

### Suspect wrangling functions

In [None]:
# Read the mass shift annotations from the Google Sheets CSV export and cast
# the delta m/z and priority columns to fixed numeric dtypes.
mass_shift_annotations = pd.read_csv(
    'https://docs.google.com/spreadsheets/d/'
    '1-xh2XpSqdsa4yU-ATpDRxmpZEH6ht982jCCATFOpkyM/'
    'export?format=csv&gid=566878567'
).astype({'mz delta': np.float64, 'priority': np.uint8})

In [None]:
def filter_ids(ids: pd.DataFrame, max_ppm: float = 20,
               min_shared_peaks: int = 6) -> pd.DataFrame:
    """
    Keep only the high-quality identifications.

    An identification is retained when its absolute ppm deviation does not
    exceed `max_ppm`, it has at least `min_shared_peaks` shared peaks, and
    its InChI is not missing.

    Arguments
    ---------
    ids : pd.DataFrame
        The tabular identifications retrieved from GNPS.
    max_ppm : float
        The maximum ppm deviation.
    min_shared_peaks : int
        The minimum number of shared peaks.

    Returns
    -------
    pd.DataFrame
        The identifications retained after filtering.
    """
    within_ppm = ids['MZErrorPPM'].abs() <= max_ppm
    enough_peaks = ids['SharedPeaks'] >= min_shared_peaks
    high_quality = ids[within_ppm & enough_peaks]
    # Identifications without an InChI cannot be used.
    return high_quality.dropna(subset=['INCHI'])

In [None]:
def filter_pairs(pairs: pd.DataFrame, min_cosine: float = 0.8) \
        -> pd.DataFrame:
    """
    Retain the pairs whose cosine similarity is at least the given
    threshold.

    Arguments
    ---------
    pairs : pd.DataFrame
        The tabular pairs retrieved from GNPS.
    min_cosine : float
        The minimum cosine used to retain high-quality pairs.

    Returns
    -------
    pd.DataFrame
        The pairs filtered by minimum cosine similarity.
    """
    is_similar = pairs['Cosine'].ge(min_cosine)
    return pairs[is_similar]

In [None]:
def filter_clusters(cluster_info: pd.DataFrame) -> pd.DataFrame:
    """
    Pick one representative scan per cluster: the scan with the highest
    summed precursor intensity within each (dataset, cluster index) group.

    Arguments
    ---------
    cluster_info : pd.DataFrame
        The tabular cluster info retrieved from GNPS.

    Returns
    -------
    pd.DataFrame
        Clusters without duplicated spectra by keeping only the scan with the
        highest precursor intensity for each cluster.
    """
    # Row label of the most intense scan for every cluster.
    grouped_intensity = cluster_info.groupby(
        ['dataset', 'cluster index'])['sum(precursor intensity)']
    best_rows = grouped_intensity.idxmax()
    # Rows containing any missing value are discarded.
    representatives = (cluster_info.reindex(best_rows)
                       .dropna()
                       .reset_index(drop=True))
    keep_columns = ['dataset', 'cluster index', 'parent mass', 'ScanNumber',
                    'Original_Path']
    cluster_info = representatives[keep_columns]
    cluster_info['cluster index'] = cluster_info['cluster index'].astype(int)
    cluster_info['ScanNumber'] = cluster_info['ScanNumber'].astype(int)
    return cluster_info

In [None]:
def generate_suspects(ids: pd.DataFrame, pairs: pd.DataFrame,
                      summary: pd.DataFrame) -> pd.DataFrame:
    """
    Combine identifications with aligned spectra pairs into suspects and
    annotate them with provenance taken from the cluster summary.

    Arguments
    ---------
    ids : pd.DataFrame
        The filtered identifications.
    pairs : pd.DataFrame
        The filtered pairs.
    summary : pd.DataFrame
        The filtered summary information for the clusters.

    Returns
    -------
    pd.DataFrame
        A DataFrame with information about both spectra forming a suspect
        identification.
    """
    def _match_library(library_col: str, suspect_col: str) -> pd.DataFrame:
        # Pairs whose `library_col` cluster has an identification; the
        # cluster in `suspect_col` becomes the suspect.
        matched = pairs.merge(ids, left_on=['dataset', library_col],
                              right_on=['dataset', '#Scan#'])
        matched = matched.drop(columns=[library_col])
        return matched.rename(columns={suspect_col: 'SuspectIndex'})

    # An identification can sit on either side of a pair, so match both.
    suspects = pd.concat(
        [_match_library('CLUSTERID1', 'CLUSTERID2'),
         _match_library('CLUSTERID2', 'CLUSTERID1')],
        ignore_index=True, sort=False)
    # Columns that contain any missing value are removed.
    suspects = suspects.dropna(axis=1)

    # TODO: Properly handle this warning.
    if not suspects['SuspectIndex'].is_unique:
        logger.warning('Multiple analog matches per suspect scan found')

    # Keep and rename the library-side columns.
    library_columns = ['dataset', 'INCHI', 'Compound_Name', 'Adduct',
                       'Cosine', 'Precursor_MZ', 'SpectrumID', '#Scan#',
                       'SuspectIndex']
    library_names = {'Compound_Name': 'CompoundName',
                     'Precursor_MZ': 'LibraryPrecursorMZ',
                     'SpectrumID': 'LibraryID',
                     '#Scan#': 'ClusterScanNr'}
    suspects = suspects[library_columns].rename(columns=library_names)

    # Attach the provenance of the suspect scans from the cluster summary.
    suspect_names = {'parent mass': 'SuspectPrecursorMZ',
                     'Original_Path': 'SuspectPath',
                     'ScanNumber': 'SuspectScanNr'}
    suspects = suspects.merge(summary,
                              left_on=['dataset', 'SuspectIndex'],
                              right_on=['dataset', 'cluster index'])
    suspects = suspects.drop(columns=['SuspectIndex', 'cluster index'])
    suspects = suspects.rename(columns=suspect_names)
    return suspects.drop(columns=['dataset'])

In [None]:
def group_mass_shifts(
        suspects: pd.DataFrame, mass_shift_annotations: pd.DataFrame,
        min_delta_mz: float = 0.5, interval_width: float = 1.0,
        bin_width: float = 0.002, peak_height: float = 10,
        max_dist: float = 0.01) -> pd.DataFrame:
    """
    Group close mass shifts.

    Mass shifts are binned and the group delta m/z is detected by finding
    peaks in the histogram. Suspects that cannot be assigned to a peak keep
    their own delta m/z as group delta m/z and have a missing atomic
    difference and rationale.

    Note that the `DeltaMZ` column is also added to the `suspects` DataFrame
    that is passed in.

    Arguments
    ---------
    suspects : pd.DataFrame
        The suspects from which mass shifts are grouped.
    mass_shift_annotations : pd.DataFrame
        Mass shift explanations.
    min_delta_mz : float
        The minimum (absolute) delta m/z for suspects to be retained.
    interval_width : float
        The size of the interval in which mass shifts are binned, centered
        around unit masses.
    bin_width : float
        The bin width used to construct the histogram.
    peak_height : float
        The minimum height for a peak to be considered as a group.
    max_dist : float
        The maximum m/z difference that group members can have with the
        group's peak.

    Returns
    -------
    pd.DataFrame
        The suspects with grouped mass shifts, sorted by compound name and
        group delta m/z.
    """
    # The caller's DataFrame receives the `DeltaMZ` column too (existing
    # behavior that is retained).
    suspects['DeltaMZ'] = \
        suspects['SuspectPrecursorMZ'] - suspects['LibraryPrecursorMZ']
    # Remove suspects with an insufficient mass shift.
    suspects = suspects[suspects['DeltaMZ'].abs() > min_delta_mz].copy()
    # Without remaining suspects the delta m/z range is NaN and cannot be
    # converted to interval boundaries, so there is nothing to examine.
    if len(suspects) > 0:
        interval_centers = np.arange(
            math.floor(suspects['DeltaMZ'].min()),
            math.ceil(suspects['DeltaMZ'].max() + interval_width),
            interval_width)
    else:
        interval_centers = np.empty(0)
    # Assign putative identifications to the mass shifts.
    for mz in interval_centers:
        suspects_interval = suspects[suspects['DeltaMZ'].between(
            mz - interval_width / 2, mz + interval_width / 2)]
        if len(suspects_interval) == 0:
            continue
        # Get peaks for frequent deltas in the histogram.
        bins = (np.linspace(mz - interval_width / 2,
                            mz + interval_width / 2,
                            int(interval_width / bin_width) + 1)
                + bin_width / 2)
        hist, _ = np.histogram(suspects_interval['DeltaMZ'], bins=bins)
        # Requesting the prominence makes `find_peaks` report the left and
        # right bases of each peak.
        peaks_i, prominences = ssignal.find_peaks(
            hist, height=peak_height, distance=max_dist / bin_width,
            prominence=(None, None))
        if len(peaks_i) == 0:
            continue
        # Assign deltas to their closest peak.
        mask_peaks = np.unique(np.hstack(
            [suspects_interval.index[suspects_interval['DeltaMZ']
                                     .between(min_mz, max_mz)]
             for min_mz, max_mz in zip(bins[prominences['left_bases']],
                                       bins[prominences['right_bases']])]))
        mz_diffs = np.vstack([
            np.abs(suspects.loc[mask_peaks, 'DeltaMZ'] - peak)
            for peak in bins[peaks_i]])
        # Also make sure that delta assignments don't exceed the maximum
        # distance.
        mask_mz_diffs = mz_diffs.min(axis=0) < max_dist
        mz_diffs = mz_diffs[:, mask_mz_diffs]
        mask_peaks = mask_peaks[mask_mz_diffs]
        peak_assignments = mz_diffs.argmin(axis=0)
        # Assign putative explanations to the grouped mass shifts.
        for peak_i, delta_mz in enumerate(bins[peaks_i]):
            mask_delta_mz = mask_peaks[peak_assignments == peak_i]
            suspects.loc[mask_delta_mz, 'GroupDeltaMZ'] = delta_mz
            putative_id = mass_shift_annotations[
                (mass_shift_annotations['mz delta'].abs()
                 - abs(delta_mz)).abs() < max_dist / 2]
            putative_id = putative_id.sort_values(
                ['priority', 'atomic difference', 'rationale'])
            if len(putative_id) == 0:
                suspects.loc[mask_delta_mz, 'AtomicDifference'] = 'unknown'
                suspects.loc[mask_delta_mz, 'Rationale'] = 'unspecified'
            else:
                # TODO: priority.
                suspects.loc[mask_delta_mz, 'AtomicDifference'] = '|'.join(
                    putative_id['atomic difference'].fillna('unspecified'))
                suspects.loc[mask_delta_mz, 'Rationale'] = '|'.join(
                    putative_id['rationale'].fillna('unspecified'))
    # The group columns are only created above when at least one peak was
    # found; make sure they always exist so the steps below cannot fail.
    for column in ('GroupDeltaMZ', 'AtomicDifference', 'Rationale'):
        if column not in suspects.columns:
            suspects[column] = np.nan
    # Set delta m/z's for ungrouped suspects. Assign the result back instead
    # of an in-place `fillna` on the selected column, which does not update
    # the DataFrame when pandas Copy-on-Write is active.
    suspects['GroupDeltaMZ'] = (suspects['GroupDeltaMZ']
                                .fillna(suspects['DeltaMZ']))

    return (suspects.sort_values(['CompoundName', 'GroupDeltaMZ'])
            .reset_index(drop=True)
            [['INCHI', 'CompoundName', 'Adduct', 'DeltaMZ', 'GroupDeltaMZ',
              'AtomicDifference', 'Rationale', 'Cosine',
              'LibraryPrecursorMZ', 'LibraryID', 'ClusterScanNr',
              'SuspectPrecursorMZ', 'SuspectScanNr', 'SuspectPath']])

## Generate suspect library entries

Criteria to form a suspect:

- Identification has an _m_/_z_ error of at most 20 ppm (absolute value).
- Identification has at least 6 shared peaks.
- Identification has to include an InChI.
- Spectrum pair has a cosine similarity ≥ 0.8.
- The spectrum with maximal precursor intensity is chosen as cluster representative.

Criteria to assign group delta m/z's:

- Only delta _m_/_z_'s whose absolute value exceeds 0.5 Da are considered.
- Delta _m_/_z_'s are examined within each 1 _m_/_z_ window separately (centered around unit _m_/_z_'s).
- Delta _m_/_z_'s are binned with 0.002 bin width.
- Peaks with minimum height 10 are extracted from the delta _m_/_z_ histograms.
- Delta _m_/_z_'s that lie between the left and right bases of a peak and within 0.01 Da of the peak _m_/_z_ are grouped.

Suspect filtering:

- Suspects whose group delta _m_/_z_ occurs fewer than 10 times are discarded.

In [None]:
# Identification filtering (passed to `filter_ids`): maximum absolute ppm
# deviation and minimum number of shared peaks.
max_ppm = 20
min_shared_peaks = 6
# Pair filtering (passed to `filter_pairs`): minimum cosine similarity.
min_cosine = 0.8
# Mass shift grouping (passed to `group_mass_shifts`): minimum absolute delta
# m/z, width of the interval around each unit mass, histogram bin width,
# minimum histogram peak height, and maximum distance of a group member to
# its peak.
min_delta_mz = 0.5
interval_width = 1.0
bin_width = 0.002
peak_height = 10
max_dist = 0.01
# Minimum number of suspects that must share a group delta m/z for that
# group to be retained in the final filtering step.
min_group_size = 10

In [None]:
base_url = 'MSV000084314/updates/2020-10-08_mwang87_d7c866dd/other'
ftp_prefix = f'ftp://massive.ucsd.edu/{base_url}'

# Get the MassIVE IDs for all datasets processed in the living data analyses.
# The connection is used as a context manager so that it is closed again
# once the file listing has been retrieved.
with ftplib.FTP('massive.ucsd.edu') as ftp:
    ftp.login()
    ftp.cwd(f'{base_url}/CLUSTERINFO')
    # The dataset ID is the part of the file name before the first
    # underscore (the full name if there is no underscore).
    msv_ids = [filename.partition('_')[0] for filename in ftp.nlst()]

# Generate the suspects.
ids, pairs, clusters = [], [], []
logger.info('Retrieve cluster information')
for msv_id in tqdm.tqdm(msv_ids, desc='Datasets processed', unit='dataset'):
    max_tries = 5
    while max_tries > 0:
        try:
            # Read all three tables before storing any of them, so that a
            # failure part-way through (followed by a retry) cannot leave
            # duplicated or partial data for this dataset in the lists.
            dataset_ids = pd.read_csv(
                f'{ftp_prefix}/IDENTIFICATIONS/{msv_id}_identifications.tsv',
                sep='\t', usecols=[
                    'Compound_Name', 'Adduct', 'Precursor_MZ', 'INCHI',
                    'SpectrumID', 'LibraryQualityString', '#Scan#',
                    'MZErrorPPM', 'SharedPeaks'])
            dataset_pairs = pd.read_csv(
                f'{ftp_prefix}/PAIRS/{msv_id}_pairs.tsv', sep='\t',
                usecols=['CLUSTERID1', 'CLUSTERID2', 'Cosine'])
            dataset_clusters = pd.read_csv(
                f'{ftp_prefix}/CLUSTERINFO/{msv_id}_clustering.tsv',
                sep='\t', usecols=[
                    'cluster index', 'sum(precursor intensity)',
                    'parent mass', 'Original_Path', 'ScanNumber'])
        except ValueError:
            # The file content is unusable (e.g. expected columns are
            # missing); retrying will not help.
            logger.warning("Couldn't process dataset %s", msv_id)
            max_tries = 0
        except IOError:
            # Possibly a transient download problem: try again.
            max_tries -= 1
            if max_tries == 0:
                logger.warning("Couldn't retrieve dataset %s after repeated "
                               "I/O errors", msv_id)
        else:
            dataset_ids['dataset'] = msv_id
            dataset_pairs['dataset'] = msv_id
            dataset_clusters['dataset'] = msv_id
            ids.append(dataset_ids)
            pairs.append(dataset_pairs)
            clusters.append(dataset_clusters)
            max_tries = 0
    
# Combine the per-dataset tables retrieved from GNPS and apply the quality
# filters to each of them.
logger.info('Compile suspect pairs')
ids = filter_ids(pd.concat(ids, ignore_index=True),
                 max_ppm=max_ppm, min_shared_peaks=min_shared_peaks)
pairs = filter_pairs(pd.concat(pairs, ignore_index=True),
                     min_cosine=min_cosine)
clusters = filter_clusters(pd.concat(clusters, ignore_index=True))
suspects_unfiltered = generate_suspects(ids=ids, pairs=pairs,
                                        summary=clusters)
# Group the suspects by their observed delta m/z and annotate the groups.
logger.info('Assign putative explanations to mass shifts')
suspects = group_mass_shifts(
    suspects_unfiltered, mass_shift_annotations,
    min_delta_mz=min_delta_mz, interval_width=interval_width,
    bin_width=bin_width, peak_height=peak_height, max_dist=max_dist)
# For every combination of compound name and group delta m/z, keep only the
# suspect with the highest cosine score.
suspects_unique = (
    suspects.sort_values(by='Cosine', ascending=False)
    .drop_duplicates(subset=['CompoundName', 'GroupDeltaMZ']))

In [None]:
# Count how often each group delta m/z occurs. The index and the counts are
# named explicitly because the default names produced by `value_counts`
# differ between pandas versions ('index'/'GroupDeltaMZ' before 2.0,
# 'GroupDeltaMZ'/'count' from 2.0 onwards).
delta_mzs = (suspects['GroupDeltaMZ'].value_counts()
             .rename_axis('GroupDeltaMZ').rename('Count').reset_index()
             .sort_values('Count', ascending=False))

In [None]:
# Retain only the unique suspects whose group delta m/z occurs at least
# `min_group_size` times.
suspects_unique_filtered = suspects_unique[
    suspects_unique['GroupDeltaMZ'].isin(
        delta_mzs[delta_mzs['Count'].ge(min_group_size)]['GroupDeltaMZ'])]

In [None]:
# Report the number of collected suspects and the number that remains after
# duplicate removal and group size filtering.
print(f'Total: {len(suspects):,} suspects collected')
print(f'After duplicate removal and filtering (delta m/z occurs at least '
      f'{min_group_size} times): {len(suspects_unique_filtered):,} '
      f'unique suspects')

### Export suspects

In [None]:
# Write the suspects to CSV files without the DataFrame index.
# `suspects_unfiltered` holds all suspects produced by `generate_suspects`;
# `group_mass_shifts` has added a `DeltaMZ` column to it in place.
suspects_unfiltered.to_csv('../../data/suspects_unfiltered.csv', index=False)
# The deduplicated suspects that passed the group size filter.
suspects_unique_filtered.to_csv('../../data/suspects_unique.csv', index=False)