# Generation of candidate suspect list

Julia M. Gauglitz

Date: 4/20/2020

In [None]:
import functools
from typing import Dict

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm.notebook as tqdm

# Functions

Function `filter_ids` used for filtering the IDs table

Define library ID inclusion criteria: 
1. |MZErrorPPM| <= 20 (absolute precursor mass error in ppm)
2. SharedPeaks >= 6
3. Include only entries with an INCHI; INCHI is not equal to 'N/A'

In [None]:
def filter_ids(ids: pd.DataFrame, max_ppm: float = 20,
               min_shared_peaks: int = 6) -> pd.DataFrame:
    """
    Keep only the high-quality library identifications.

    An identification is retained when its absolute ppm error is at most
    `max_ppm`, it has at least `min_shared_peaks` shared peaks, and its InChI
    is not missing.

    Arguments
    ---------
    ids : pd.DataFrame
        The tabular identifications retrieved from GNPS. Must contain the
        columns 'MZErrorPPM', 'SharedPeaks', and 'INCHI'.
    max_ppm : float
        The maximum (absolute) ppm deviation.
    min_shared_peaks : int
        The minimum number of shared peaks.

    Returns
    -------
    pd.DataFrame
        The identifications retained after filtering, with their original
        index labels.
    """
    ppm_ok = ids['MZErrorPPM'].abs().le(max_ppm)
    peaks_ok = ids['SharedPeaks'].ge(min_shared_peaks)
    high_quality = ids.loc[ppm_ok & peaks_ok]
    # Discard identifications whose InChI is missing.
    return high_quality.dropna(subset=['INCHI'])

Editing pairs table

Define criteria for pairs dictionary to include as candidate analog annotations (didn't filter any in example MSV000078547)
1. Cosine >= 0.8
2. Define the DeltaMZ values to be taken into account (e.g. 14, 16, 28); m/z delta +/- 20 ppm difference at 1500 m/z; assign putative_ID

In [None]:
known_mass_diffs = {14.0: 'CH2', 16.0: 'O', 28.0: 'C2H4'}


def get_frequent_mass_diffs(
        delta_mzs: pd.Series, round_digits: int = 1,
        min_mass_diff: float = 1, min_count: int = 20,
        known_mass_diffs: Dict[float, str] = known_mass_diffs) -> pd.DataFrame:
    """
    Get frequently reported delta m/zs.

    The delta m/zs are rounded, counted, and filtered; each remaining delta
    m/z is then labelled with a known explanation when one is available.

    Arguments
    ---------
    delta_mzs : pd.Series
        A Series of observed delta m/zs. The name of the Series is
        irrelevant.
    round_digits : int
        The number of decimals to use for rounding the delta m/zs.
    min_mass_diff : float
        Only delta m/zs whose absolute (rounded) value is strictly greater
        than this threshold are considered (i.e. exclude 0 deltas).
    min_count : int
        Only delta m/zs that occur strictly more often than this threshold
        are considered.
    known_mass_diffs : Dict[float, str]
        A dictionary with as keys (absolute) mass differences and as values
        likely explanations. The dictionary is only read, never modified.

    Returns
    -------
    pd.DataFrame
        A DataFrame with as columns the observed delta m/z ('DeltaMZ'), the
        number of times this delta m/z occurs ('count'), and its likely
        explanation ('PutativeID'; the delta m/z repeated if no explanation
        is known).
    """
    # Name the index and the counts explicitly instead of renaming the
    # columns produced by `reset_index`: those default names differ between
    # pandas versions and depend on the name of the input Series.
    mass_diffs = (delta_mzs
                  .apply(functools.partial(round, ndigits=round_digits))
                  .value_counts()
                  .rename_axis('DeltaMZ')
                  .reset_index(name='count'))
    # Exclude unshifted and infrequent mass differences. Take a copy so the
    # column added below is written to an independent frame.
    is_frequent_shift = ((mass_diffs['DeltaMZ'].abs() > min_mass_diff) &
                         (mass_diffs['count'] > min_count))
    mass_diffs = mass_diffs[is_frequent_shift].copy()
    # Explain known mass differences by their absolute value; unknown mass
    # differences fall back to the (signed) delta m/z itself.
    putative_ids = mass_diffs['DeltaMZ'].abs().map(known_mass_diffs)
    mass_diffs['PutativeID'] = putative_ids.fillna(mass_diffs['DeltaMZ'])
    return mass_diffs
    

def pairs_explain_mass_diff(pairs: pd.DataFrame, delta_mass: float = 0.05,
                            min_cosine: float = 0.8) -> pd.DataFrame:
    """
    Match mass differences between cluster pairs to known elements.

    Pairs below the cosine threshold are discarded, the frequently occurring
    mass differences among the remaining pairs are determined, and each pair
    whose delta m/z lies within `delta_mass` of such a frequent mass
    difference is labelled with the corresponding putative identification.

    Arguments
    ---------
    pairs : pd.DataFrame
        The tabular pairs retrieved from GNPS. Must contain the columns
        'Cosine' and 'DeltaMZ'. The input frame is not modified.
    delta_mass : float
        The delta mass (Da) used to match pairs mass differences to known
        elements (inclusive on both sides).
    min_cosine : float
        The minimum cosine used to retain high-quality pairs.

    Returns
    -------
    pd.DataFrame
        The pairs to which an element could be matched based on their mass
        differences, with an additional 'PutativeID' column.
    """
    # Omit pairs with a low cosine score. Copy the selection so the new
    # column is added to an independent frame rather than to a slice of the
    # caller's data.
    pairs = pairs[pairs['Cosine'] >= min_cosine].copy()
    # Find frequently occurring mass differences.
    mass_diffs = get_frequent_mass_diffs(pairs['DeltaMZ'])
    # Match mass differences to putative identifications. The column is
    # created with object dtype because the labels can be strings (known
    # mass differences) as well as floats (unexplained mass differences).
    pairs['PutativeID'] = np.full(len(pairs), np.nan, dtype=object)
    for mass_shift, putative_id in zip(mass_diffs['DeltaMZ'],
                                       mass_diffs['PutativeID']):
        matched_mass_shift = pairs['DeltaMZ'].between(
            mass_shift - delta_mass, mass_shift + delta_mass)
        pairs.loc[matched_mass_shift, 'PutativeID'] = putative_id
    # Pairs that matched none of the frequent mass differences are dropped.
    return pairs.dropna(subset=['PutativeID'])

Function for editing summary table

In [None]:
def summary_max_intensity_scan_per_cluster(summary: pd.DataFrame) \
        -> pd.DataFrame:
    """
    Pick one representative scan per cluster: the row with the highest
    summed precursor intensity within each (dataset, cluster index) group.

    Arguments
    ---------
    summary : pd.DataFrame
        The tabular summary retrieved from GNPS. Must contain the columns
        'dataset', 'cluster index', 'sum(precursor intensity)',
        'ScanNumber', and 'Original_Path'.

    Returns
    -------
    pd.DataFrame
        One row per cluster with the columns 'dataset', 'cluster index',
        'ScanNumber', and 'Original_Path'. Rows containing missing values
        are omitted and the index is renumbered.
    """
    cluster_keys = ['dataset', 'cluster index']
    # Row label of the most intense entry for every cluster.
    top_row_labels = (summary.groupby(cluster_keys)
                      ['sum(precursor intensity)'].idxmax())
    representatives = (summary.reindex(top_row_labels)
                       .dropna()
                       .reset_index(drop=True))
    representatives = representatives[
        cluster_keys + ['ScanNumber', 'Original_Path']]
    # Make sure the cluster and scan identifiers are integers.
    return representatives.astype({'cluster index': int, 'ScanNumber': int})

# ID clustered spectra to add to library

Identify the clustered spectra to add to the library.

Function: Find the pairs, based on filtered input files. Create a column that contains the opposite clusterid, which is the scan number needed to add a new suspect annotation.

In [None]:
def combine_suspect_pairs(ids: pd.DataFrame, pairs: pd.DataFrame,
                          summary: pd.DataFrame) -> pd.DataFrame:
    """
    Join pairs, identifications, and cluster summaries into suspects.

    A pair becomes a suspect when one of its two clusters has an
    identification; the other cluster of the pair is the suspect. Pairs in
    which the identified cluster is 'CLUSTERID1' are labelled 'addition',
    pairs in which it is 'CLUSTERID2' are labelled 'loss'.

    Arguments
    ---------
    ids : pd.DataFrame
        The filtered identifications.
    pairs : pd.DataFrame
        The pairs with mass difference explanations.
    summary : pd.DataFrame
        The summary information for the clusters.

    Returns
    -------
    pd.DataFrame
        A DataFrame with information about both spectra forming the suspect
        identification: the library match ('CompoundName', 'LibraryID',
        'ClusterScanNr'), the modification ('Mod', 'PutativeID', 'DeltaMZ'),
        and the suspect scan ('SuspectScanNr', 'SuspectPath').
    """
    # Combine pairs with identifications to form suspects. Each orientation
    # is (column of the identified cluster, column of the suspect cluster,
    # modification label).
    orientations = (('CLUSTERID1', 'CLUSTERID2', 'addition'),
                    ('CLUSTERID2', 'CLUSTERID1', 'loss'))
    matched_frames, mod_labels = [], []
    for library_column, suspect_column, mod in orientations:
        matched = pairs.merge(ids, left_on=['dataset', library_column],
                              right_on=['dataset', '#Scan#'])
        matched = matched.drop(columns=[library_column])
        matched = matched.rename(columns={suspect_column: 'SuspectIndex'})
        matched_frames.append(matched)
        mod_labels.extend([mod] * len(matched))
    suspects = pd.concat(matched_frames, ignore_index=True, sort=False)
    # Columns that contain any missing value are discarded.
    suspects = suspects.dropna(axis=1)
    suspects['Mod'] = pd.Series(mod_labels)

    # Only take the top suspect by cosine similarity.
    suspects = suspects.sort_values(['Cosine'], ascending=False)
    suspects = suspects.drop_duplicates(['Compound_Name', 'Mod', 'PutativeID'])
    # TODO: Properly handle this warning.
    if not suspects['SuspectIndex'].is_unique:
        print('Multiple analog matches per suspect scan found')

    # Add provenance information for the library and suspect scans.
    library_columns = ['dataset', 'Compound_Name', 'Mod', 'PutativeID',
                       'DeltaMZ', 'SpectrumID', '#Scan#', 'SuspectIndex']
    suspects = suspects[library_columns].rename(
        columns={'Compound_Name': 'CompoundName', 'SpectrumID': 'LibraryID',
                 '#Scan#': 'ClusterScanNr'})
    suspects = suspects.merge(summary, left_on=['dataset', 'SuspectIndex'],
                              right_on=['dataset', 'cluster index'])
    suspects = suspects.drop(columns=['SuspectIndex', 'cluster index'])
    suspects = suspects.rename(columns={'Original_Path': 'SuspectPath',
                                        'ScanNumber': 'SuspectScanNr'})
    return suspects.drop(columns=['dataset'])

# Generate suspect library using functions defined above

In [None]:
suspects_all = []
ftp_prefix = 'ftp://massive.ucsd.edu/MSV000084314/other'
# Per-dataset tables are collected in lists first and concatenated below.
ids, pairs, summary = [], [], []
# Read the dataset identifiers from the first column of 'datasets.csv' and
# only process the first five. The column is selected explicitly (instead of
# using `squeeze()`) so that a file listing a single dataset still yields a
# Series rather than a scalar string that would be iterated per character.
for msv_id in tqdm.tqdm(
        pd.read_csv('datasets.csv', header=None).iloc[:5, 0],
        desc='Datasets read', unit='dataset'):
    ids.append(pd.read_csv(
        f'{ftp_prefix}/IDENTIFICATIONS/{msv_id}_identifications.tsv',
        sep='\t', usecols=['Compound_Name', 'Adduct', 'Precursor_MZ',
                           'INCHI', 'SpectrumID', 'LibraryQualityString',
                           '#Scan#', 'MZErrorPPM', 'SharedPeaks']))
    # Tag every table with its dataset so rows from different datasets can
    # be told apart after concatenation.
    ids[-1]['dataset'] = msv_id
    pairs.append(pd.read_csv(
        f'{ftp_prefix}/PAIRS/{msv_id}_pairs.tsv', sep='\t',
        usecols=['CLUSTERID1', 'CLUSTERID2', 'DeltaMZ', 'Cosine']))
    pairs[-1]['dataset'] = msv_id
    summary.append(pd.read_csv(
        f'{ftp_prefix}/CLUSTERSUMMARY/{msv_id}_summary.tsv', sep='\t',
        usecols=['cluster index', 'sum(precursor intensity)', 'ScanNumber',
                 'Original_Path']))
    summary[-1]['dataset'] = msv_id

# Filter each combined table and join them into the suspect list.
ids = filter_ids(pd.concat(ids, ignore_index=True))
pairs = pairs_explain_mass_diff(pd.concat(pairs, ignore_index=True))
summary = summary_max_intensity_scan_per_cluster(
    pd.concat(summary, ignore_index=True))
suspects = combine_suspect_pairs(ids, pairs, summary)

In [None]:
# Figure size derived from a golden-ratio aspect, then stretched
# horizontally and flattened vertically.
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(1.5 * width, height / 1.5))

# Bar chart of how often each frequent (rounded) delta m/z occurs among the
# retained pairs.
delta_mzs = get_frequent_mass_diffs(pairs['DeltaMZ'])
ax.bar(x=delta_mzs['DeltaMZ'], height=delta_mzs['count'], width=0.4,
       color='black')

ax.set(xlabel='Delta m/z', ylabel='Number of pairs')
sns.despine(ax=ax)

fig.savefig('delta_mz.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Export the table of frequent delta m/zs (delta m/z, count, putative ID) to
# 'delta_mz.csv' without the DataFrame index.
delta_mzs.to_csv('delta_mz.csv', index=False)

# Create output for suspect library

output: spectral library batch file

#batch upload for adding spectral library
(1 spectrum per analog)

In [None]:
# Export the combined suspect table to CSV without the DataFrame index.
suspects.to_csv('suspect_library_5MSV_20200520.csv', index=False)

To investigate: 

Check if molecular formula varies by the same atoms as proposed based on the nominal mass difference?

In [None]:
# TODO: add conditionals for what to change, or which data to summarize, with regard to overlaps

In [None]:
#columns to get from elsewhere: 'PI', 'Data Collector', 'Instrument', 'Ion_Source', 'IonMode' - based on the Unique Filepath