# Generation of candidate suspect list

Julia M. Gauglitz

Date: 4/20/2020

In [1]:
import pandas as pd
import numpy as np

# Functions

Function 'id_clean' used for editing IDs table

Define library ID inclusion criteria: 
1. MZErrorPPM <= 20
2. SharedPeaks >= 6
3. Include only entries with an INCHI; INCHI is not equal to 'N/A'

In [2]:
#example call: id_clean(IDs)
def id_clean(IDs):
    INCHI = IDs['INCHI']
    INCHI_TF = INCHI.notna()
    IDs['INCHI_TF'] = INCHI_TF
    IDs_subset = IDs.loc[(IDs['MZErrorPPM'] <= 20.0) & (IDs['SharedPeaks'] >= 6) & (IDs['INCHI_TF'] == True)]
    return IDs_subset

Editing pairs table

Define criteria for pairs dictionary to include as candidate analog annotations (didn't filter any in example MSV000078547)
1. Cosine >= 0.8
2. Define DeltaMZ to be taken into account (i.e. 14, 16, 28); m/z delta +/- 20ppm difference at 1500 m/z ; assign putative_ID

In [3]:
#example call: pairs_clean(pairs)
def pairs_clean(pairs):
    #create new column for abs of deltamz
    pairs['abs_DeltaMZ'] = pairs['DeltaMZ'].abs()
    #add putative ID of mass difference into column 'putative_ID'
    pairs.loc[(pairs['abs_DeltaMZ'] > 13.956) & (pairs['abs_DeltaMZ'] < 14.076), 'putative_ID'] = 'CH2'
    pairs.loc[(pairs['abs_DeltaMZ'] > 15.936) & (pairs['abs_DeltaMZ'] < 16.056), 'putative_ID'] = 'O'
    pairs.loc[(pairs['abs_DeltaMZ'] > 27.972) & (pairs['abs_DeltaMZ'] < 28.092), 'putative_ID'] = 'C2H4'
    #add in additional - if name not known, then make equal to the mean mass
    #drop rows that don't have an entry in putative_ID
    pairs['putative_ID'].replace('', np.nan, inplace=True)
    pairs_subset = pairs.dropna(subset=['putative_ID'])
    #subset pairs df to omit low value cosine scores
    pairs_subset = pairs_subset.loc[(pairs_subset['Cosine'] >= 0.8)]
    return pairs_subset

Function for editing summary table

In [4]:
#subset summary to only include the spectrum with the max precursor intensity. 
#group by cluster index, then get the index of the max column value and then select that row from the dataframe.
#example call: summary_clean(summary)
def summary_clean(summary):
    summary_max_filter = summary.loc[summary.groupby('cluster index')['sum(precursor intensity)'].idxmax()]
    #copying table
    summary_max_lib = summary_max_filter.copy()
    #adding in columns to merge
    summary_max_lib['cluster_index_lib'] = summary_max_lib['cluster index']
    summary_max_lib['ScanNumber_lib'] = summary_max_lib['ScanNumber']
    summary_max_lib['Original_Path_lib'] = summary_max_lib['Original_Path']
    summary_max_lib = summary_max_lib[['cluster_index_lib','ScanNumber_lib','Original_Path_lib']].copy()
    return summary_max_filter, summary_max_lib

# ID clustered spectra to add to library

ID clustered spectra to add to library

Function: Find the pairs, based on filtered input files. Create a column that contains the opposite clusterid, which is the scan number needed to add a new suspect annotation.

In [5]:
#example call: concat_pairs(IDs_subset,pairs_subset,summary_max_filter) ; as long as the inputs have been defined previously
def concat_pairs(IDs_subset,pairs_subset,summary_max_filter,summary_max_lib):
    clusterid1_pairs = pd.merge(pairs_subset, IDs_subset, left_on='CLUSTERID1', right_on='#Scan#')
    clusterid1_pairs['suspect_index'] = clusterid1_pairs['CLUSTERID2']
    clusterid1_pairs['sign'] = '+'
    clusterid2_pairs = pd.merge(pairs_subset, IDs_subset, left_on='CLUSTERID2', right_on='#Scan#')
    clusterid2_pairs['suspect_index'] = clusterid2_pairs['CLUSTERID1']
    clusterid2_pairs['sign'] = '-'
    #concatenate the two dataframes
    suspect_pairs = pd.concat([clusterid1_pairs, clusterid2_pairs])
    suspect_pairs.reset_index(drop=True, inplace=True)
    #define suspect compound name
    suspect_pairs["Suspect Name"] = "Suspect related to " + suspect_pairs["Compound_Name"] + " " + suspect_pairs["sign"] + suspect_pairs["putative_ID"]
    
    #if this is a function, should spit out a warning
    boolean = not suspect_pairs['Suspect Name'].is_unique
    boolean

    if boolean == True:
        print('There is more than one suspect match per LibraryID')
    
    #if this is a function, should spit out a warning
    boolean = not suspect_pairs['suspect_index'].is_unique
    boolean

    if boolean == True:
        print('There is more than one analog match per suspect scan')
    
    #the first columns refer to the annotation that was used to propagate to the suspect
    #then come the suspect name, index
    new_suspect_pairs = suspect_pairs[['#Scan#', 'abs_DeltaMZ', 'Compound_Name', 'Precursor_MZ', 'Adduct', 'LibraryQualityString', 'SpectrumID', 'Suspect Name', 'suspect_index']].copy()
    
    #to add: add in the filepath and scan number for the identified compound - 
    #will be the feature with highest precursor intensity that matches to #Scan#
    suspect_library = pd.merge(new_suspect_pairs, summary_max_filter, left_on='suspect_index', right_on='cluster index')
    suspect_library2 = pd.merge(suspect_library, summary_max_lib, left_on='#Scan#', right_on='cluster_index_lib')
    
    return suspect_library2

# Generate suspect library using functions defined above

In [6]:
#extract list of MSV IDs from CLUSTERSUMMARY folder
MSV923_apr232020 = pd.read_csv('filenames3.csv')

In [7]:
results = []
msv_ids = MSV923_apr232020['filenames3.txt'].tolist()
#msv_ids = ['MSV000078547', 'MSV000078548', 'MSV000078551', 'MSV000078552', 'MSV000078556']
for msv_id in msv_ids:
    IDs = pd.read_csv(f'ftp://massive.ucsd.edu/MSV000084314/other/IDENTIFICATIONS/{msv_id}_identifications.tsv', sep='\t')
    pairs = pd.read_csv(f'ftp://massive.ucsd.edu/MSV000084314/other/PAIRS/{msv_id}_pairs.tsv', sep='\t')
    summary = pd.read_csv(f'ftp://massive.ucsd.edu/MSV000084314/other/CLUSTERSUMMARY/{msv_id}_summary.tsv', sep='\t')
    
    IDs_subset = id_clean(IDs)
    pairs_subset = pairs_clean(pairs)
    summary_max_filter, summary_max_lib = summary_clean(summary)
    suspect_library2 = concat_pairs(IDs_subset,pairs_subset,summary_max_filter,summary_max_lib)
    results.append(suspect_library2)
    
final_result = pd.concat(results)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """


There is more than one suspect match per LibraryID
There is more than one analog match per suspect scan
There is more than one suspect match per LibraryID
There is more than one analog match per suspect scan
There is more than one analog match per suspect scan
There is more than one suspect match per LibraryID
There is more than one suspect match per LibraryID
There is more than one suspect match per LibraryID
There is more than one analog match per suspect scan
There is more than one suspect match per LibraryID
There is more than one analog match per suspect scan


ValueError: cannot set a frame with no defined index and a scalar

In [None]:
final_result.reset_index(drop=True, inplace=True)
final_result

# Create output for suspect library

output: spectral library batch file

#batch upload for adding spectral library
(1 spectrum per analog)

In [None]:
#final_result.to_csv('suspect_library_5MSV_20200423.txt', sep='\t',index=False)

To investigate: 

Check if molecular formula varies by the same atoms as proposed based on the nominal mass difference?

In [None]:
#add in conditionals of what to change / or data to summarize with regards to overlaps

In [None]:
#columns to get from elsewhere: 'PI', 'Data Collector', 'Instrument', 'Ion_Source', 'IonMode' - based on the Unique Filepath