# Generation of candidate suspect list

Julia M. Gauglitz

Date: 4/20/2020

In [None]:
import numpy as np
import pandas as pd
import tqdm.notebook as tqdm

# Functions

Function `filter_ids` used for filtering the IDs table

Define library ID inclusion criteria: 
1. |MZErrorPPM| <= 20 (the absolute ppm error is used)
2. SharedPeaks >= 6
3. Include only entries with an INCHI; INCHI is not equal to 'N/A'

In [None]:
def filter_ids(ids: pd.DataFrame, max_ppm: float = 20,
               min_shared_peaks: int = 6) -> pd.DataFrame:
    """
    Keep only the confident library identifications.

    A row is retained when its absolute precursor mass error does not exceed
    `max_ppm`, it has at least `min_shared_peaks` shared peaks, and its
    'INCHI' value is not missing.

    Arguments
    ---------
    ids : pd.DataFrame
        The tabular identifications retrieved from GNPS. Must contain the
        'MZErrorPPM', 'SharedPeaks', and 'INCHI' columns.
    max_ppm : float
        The maximum (absolute) ppm deviation.
    min_shared_peaks : int
        The minimum number of shared peaks.

    Returns
    -------
    pd.DataFrame
        The identifications retained after filtering.
    """
    # The ppm error is signed, so compare its magnitude to the threshold.
    within_ppm = ids['MZErrorPPM'].abs() <= max_ppm
    enough_peaks = ids['SharedPeaks'] >= min_shared_peaks
    confident = ids[within_ppm & enough_peaks]
    # Rows whose InChI is missing (NaN/None) are discarded last.
    return confident.dropna(subset=['INCHI'])

Editing pairs table

Define criteria for pairs dictionary to include as candidate analog annotations (didn't filter any in example MSV000078547)
1. Cosine >= 0.8
2. Define the DeltaMZ values to be taken into account (i.e. 14, 16, 28); m/z delta +/- 20 ppm difference at 1500 m/z (the code below defaults to a 0.05 Da tolerance); assign `PutativeID`

In [None]:
# Mass shifts (Da) and the label assigned to a pair whose absolute m/z
# difference matches that shift.
# NOTE(review): the monoisotopic mass of O is ~15.9949 Da rather than 15.9965;
# the default 0.05 Da tolerance absorbs the offset, but confirm the value.
elements = {'CH2': 14.016, 'O': 15.9965, 'C2H4': 28.032}

def pairs_explain_mass_diff(pairs: pd.DataFrame, delta_mass: float = 0.05,
                            min_cosine: float = 0.8) -> pd.DataFrame:
    """
    Match mass differences between cluster pairs to known elements.

    Pairs scoring below `min_cosine` are discarded first. Each remaining pair
    whose absolute 'DeltaMZ' lies within `delta_mass` of one of the shifts in
    `elements` gets that element as its 'PutativeID'; pairs that match no
    shift are dropped. If tolerance windows overlap, the element listed last
    in `elements` wins.

    The input DataFrame is not modified.

    Arguments
    ---------
    pairs : pd.DataFrame
        The tabular pairs retrieved from GNPS. Must contain the 'Cosine' and
        'DeltaMZ' columns.
    delta_mass : float
        The delta mass (Da) used to match pairs mass differences to known
        elements.
    min_cosine : float
        The minimum cosine used to retain high-quality pairs.

    Returns
    -------
    pd.DataFrame
        The pairs to which an element could be matched based on their mass
        differences, with an extra 'PutativeID' column.
    """
    # Omit pairs with a low cosine score. Copy explicitly so that adding a
    # column below does not write to a slice of the caller's DataFrame
    # (which would raise a SettingWithCopyWarning).
    pairs = pairs[pairs['Cosine'] >= min_cosine].copy()
    # The labels are strings, so start from an object column rather than a
    # float NaN column; writing strings into a float column is deprecated.
    pairs['PutativeID'] = pd.Series(None, index=pairs.index, dtype=object)
    # Match mass differences to putative identifications. The sign of the
    # difference is irrelevant for matching, so compute the magnitude once.
    abs_delta_mz = pairs['DeltaMZ'].abs()
    for element, mass_shift in elements.items():
        matched_mass_shift = abs_delta_mz.between(
            mass_shift - delta_mass, mass_shift + delta_mass)
        pairs.loc[matched_mass_shift, 'PutativeID'] = element
    return pairs.dropna(subset=['PutativeID'])

Function for editing summary table

In [None]:
def summary_max_intensity_scan_per_cluster(summary: pd.DataFrame) \
        -> pd.DataFrame:
    """
    Pick one representative scan per cluster: the row with the largest
    'sum(precursor intensity)' within each 'cluster index' group.

    Arguments
    ---------
    summary : pd.DataFrame
        The tabular summary retrieved from GNPS. Must contain the
        'cluster index', 'sum(precursor intensity)', 'ScanNumber', and
        'Original_Path' columns.

    Returns
    -------
    pd.DataFrame
        One row per cluster, restricted to the 'cluster index', 'ScanNumber',
        and 'Original_Path' columns, with a fresh 0-based index.
    """
    intensity_by_cluster = (
        summary.groupby('cluster index')['sum(precursor intensity)'])
    # Row label of the most intense scan in every cluster.
    top_scan_labels = intensity_by_cluster.idxmax()
    representatives = summary.reindex(top_scan_labels).reset_index(drop=True)
    return representatives[['cluster index', 'ScanNumber', 'Original_Path']]

# ID clustered spectra to add to library

ID clustered spectra to add to library

Function: Find the pairs, based on filtered input files. Create a column that contains the opposite clusterid, which is the scan number needed to add a new suspect annotation.

In [None]:
def combine_suspect_pairs(ids: pd.DataFrame, pairs: pd.DataFrame,
                          summary: pd.DataFrame) -> pd.DataFrame:
    """
    Join pairs, identifications, and cluster summaries into suspect records.

    An identified cluster can be either member of a pair, so the pairs are
    matched to the identifications once per orientation. In both cases the
    other member of the pair becomes the 'SuspectIndex'. Each record gets a
    'Suspect' description built from the library compound name, the sign of
    'DeltaMZ', and the 'PutativeID'. Summary information is then attached for
    the suspect cluster and, with a '_library' suffix, for the identified
    cluster.

    Arguments
    ---------
    ids : pd.DataFrame
        The filtered identifications.
    pairs : pd.DataFrame
        The pairs with mass difference explanations.
    summary : pd.DataFrame
        The summary information for the clusters.

    Returns
    -------
    pd.DataFrame
        A DataFrame with information about both spectra forming the suspect
        identification.
    """
    oriented_matches = []
    for library_col, suspect_col in (('CLUSTERID1', 'CLUSTERID2'),
                                     ('CLUSTERID2', 'CLUSTERID1')):
        matched = pd.merge(pairs, ids, left_on=library_col,
                           right_on='#Scan#')
        # The library side duplicates '#Scan#'; the other side is the suspect.
        matched = matched.drop(columns=[library_col])
        matched = matched.rename(columns={suspect_col: 'SuspectIndex'})
        oriented_matches.append(matched)
    suspects = pd.concat(oriented_matches, ignore_index=True, sort=False)
    # Any column that contains a missing value is removed entirely.
    suspects = suspects.dropna(axis=1)

    is_positive_shift = suspects['DeltaMZ'] > 0
    sign = is_positive_shift.map({True: '+', False: '-'})
    compound_name = suspects['Compound_Name']
    putative_id = suspects['PutativeID']
    suspects['Suspect'] = ('Suspect related to ' + compound_name + ' ' +
                           sign + ' ' + putative_id)
    # TODO: Properly handle these warnings.
    if not suspects['Suspect'].is_unique:
        print('Multiple suspect matches per LibraryID found')
    if not suspects['SuspectIndex'].is_unique:
        print('Multiple analog matches per suspect scan found')

    # First attach the summary of the suspect cluster, then that of the
    # identified (library) cluster; clashing columns get '_library'.
    with_suspect_info = pd.merge(suspects, summary, left_on='SuspectIndex',
                                 right_on='cluster index')
    return pd.merge(with_suspect_info, summary, left_on='#Scan#',
                    right_on='cluster index', suffixes=('', '_library'))

# Generate suspect library using functions defined above

In [None]:
# Build the suspect table dataset by dataset: download the identifications,
# pairs, and cluster summary files, filter them, and combine them.
suspects_all = []
ftp_prefix = 'ftp://massive.ucsd.edu/MSV000084314/other'
# Squeeze only the column axis so that a file with a single data row still
# yields a Series of identifiers instead of a scalar string (whose slice
# would be its first characters). Only the first five datasets are processed.
msv_ids = pd.read_csv('filenames3.csv').squeeze('columns').head(5)
for msv_id in tqdm.tqdm(msv_ids, desc='Datasets processed', unit='dataset'):
    ids = pd.read_csv(
        f'{ftp_prefix}/IDENTIFICATIONS/{msv_id}_identifications.tsv',
        sep='\t', usecols=['Compound_Name', 'Adduct', 'Precursor_MZ',
                           'INCHI', 'SpectrumID', 'LibraryQualityString',
                           '#Scan#', 'MZErrorPPM', 'SharedPeaks'])
    pairs = pd.read_csv(
        f'{ftp_prefix}/PAIRS/{msv_id}_pairs.tsv', sep='\t',
        usecols=['CLUSTERID1', 'CLUSTERID2', 'DeltaMZ', 'Cosine'])
    summary = pd.read_csv(
        f'{ftp_prefix}/CLUSTERSUMMARY/{msv_id}_summary.tsv', sep='\t',
        usecols=['cluster index', 'sum(precursor intensity)', 'ScanNumber',
                 'Original_Path'])

    ids = filter_ids(ids)
    pairs = pairs_explain_mass_diff(pairs)
    summary = summary_max_intensity_scan_per_cluster(summary)
    suspects = combine_suspect_pairs(ids, pairs, summary)
    # Datasets that contribute no suspects are skipped.
    if len(suspects) > 0:
        suspects_all.append(suspects)
# pd.concat raises a ValueError on an empty list, so fall back to an empty
# DataFrame when no dataset produced any suspects.
suspects_all = (pd.concat(suspects_all, ignore_index=True)
                if suspects_all else pd.DataFrame())

In [None]:
# Bare expression as the last statement of a notebook cell: shows the combined
# suspects table as the cell output.
suspects_all

# Create output for suspect library

output: spectral library batch file

#batch upload for adding spectral library
(1 spectrum per analog)

In [None]:
# suspects_all.to_csv('suspect_library_5MSV_20200423.txt', sep='\t',
#                     index=False)

To investigate: 

Check if molecular formula varies by the same atoms as proposed based on the nominal mass difference?

In [None]:
# TODO: add conditionals for what to change, or for which data to summarize, with regard to overlaps

In [None]:
#columns to get from elsewhere: 'PI', 'Data Collector', 'Instrument', 'Ion_Source', 'IonMode' - based on the Unique Filepath