# Generation of candidate suspect list

Julia M. Gauglitz

Date: 4/20/2020

In [1]:
import pandas as pd
import numpy as np

Read in GNPS files

In [2]:
#load the three tab-separated GNPS result tables (identifications, pairs, summary)
IDs, pairs, summary = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "MSV000078547_identifications.tsv",
        "MSV000078547_pairs.tsv",
        "MSV000078547_summary.tsv",
    )
]

Define library ID inclusion criteria: 
1. MZErrorPPM <= 20
2. SharedPeaks >= 6
3. Include only entries that have an INCHI (i.e. the INCHI is not 'N/A' / missing)

In [3]:
#boolean mask: True where a library hit has a non-missing INCHI value
#(pandas' default NA parsing in read_csv already turns the literal 'N/A' into NaN)
INCHI = IDs.loc[:, 'INCHI']
INCHI_TF = ~INCHI.isna()

In [4]:
#store the INCHI presence mask on the identifications table so it can be used as a filter
IDs.loc[:, 'INCHI_TF'] = INCHI_TF

In [5]:
#keep only library hits that pass all three inclusion criteria:
#ppm error of at most 20, at least 6 shared peaks, and an INCHI present
low_ppm_error = IDs['MZErrorPPM'].le(20.0)
enough_shared_peaks = IDs['SharedPeaks'].ge(6)
has_inchi = IDs['INCHI_TF'].eq(True)
IDs_subset = IDs.loc[low_ppm_error & enough_shared_peaks & has_inchi]

In [6]:
#preview the first five filtered library identifications
IDs_subset.head(n=5)

Unnamed: 0,Compound_Name,Ion_Source,Instrument,Compound_Source,PI,Data_Collector,Adduct,Scan,Precursor_MZ,ExactMass,...,Organism,TIC_Query,RT_Query,MZErrorPPM,SharedPeaks,MassDiff,LibMZ,SpecMZ,SpecCharge,INCHI_TF
10,C17-Sphinganine,LC-ESI,qTof,Lysate,Dorrestein,Quinn,M+H,-1,288.291,287.481,...,GNPS-LIBRARY,7763.21,0,3.38742,7,0.000977,288.291,288.29,0,True
11,Beauvericin,ESI,HCD,Isolated,NIST,NIST,M+Na,-1,806.399,0.0,...,nist_17_gnps_library,3850.47,0,1.66515,7,0.001343,806.399,806.4,0,True
30,Ethylenediaminetetraacetic acid,ESI,IT/ion trap,Isolated,NIST,NIST,M+Na,-1,315.08,0.0,...,nist_17_gnps_library,1832.68,0,0.29057,7,9.2e-05,315.08,315.08,0,True
41,Ethylenediaminetetraacetic acid,ESI,IT/ion trap,Isolated,NIST,NIST,M-H+2Na,-1,337.062,0.0,...,nist_17_gnps_library,1944.67,0,5.34186,8,0.001801,337.062,337.06,0,True
42,Erucamide,ESI,IT/ion trap,Isolated,NIST,NIST,M+H,-1,338.342,0.0,...,nist_17_gnps_library,21550.6,0,4.23928,7,0.001434,338.342,338.343,1,True


Define criteria for pairs dictionary to include as candidate analog annotations (didn't filter any in example MSV000078547)
1. Cosine >= 0.8
2. Define the DeltaMZ values to be taken into account (e.g. 14, 16, 28), allowing an m/z delta tolerance of +/- 20 ppm at 1500 m/z; assign a putative_ID to each matching pair

In [7]:
#unsigned mass difference, so pairs match a window regardless of which cluster is heavier
pairs['abs_DeltaMZ'] = np.abs(pairs['DeltaMZ'])

In [8]:
#label known mass differences in column 'putative_ID'
#each entry is (lower bound, upper bound, label); both bounds are exclusive
abs_delta = pairs['abs_DeltaMZ']
for window_low, window_high, delta_label in [
    (13.956, 14.076, 'CH2'),
    (15.936, 16.056, 'O'),
    (27.972, 28.092, 'C2H4'),
]:
    pairs.loc[abs_delta.gt(window_low) & abs_delta.lt(window_high), 'putative_ID'] = delta_label
#add in additional - if name not known, then make equal to the mean mass

In [10]:
#drop rows that don't have an entry in putative_ID
#empty strings are first normalised to NaN so that dropna removes them as well.
#the result is assigned back to the column rather than using
#pairs['putative_ID'].replace(..., inplace=True): that chained in-place call works on a
#temporary Series under pandas Copy-on-Write (and warns in pandas 2.x), leaving `pairs` unchanged
pairs['putative_ID'] = pairs['putative_ID'].replace('', np.nan)
pairs_subset = pairs.dropna(subset=['putative_ID'])

In [12]:
#preview the first five pairs that received a putative_ID
pairs_subset.head(n=5)

Unnamed: 0,CLUSTERID1,CLUSTERID2,DeltaMZ,MEH,Cosine,OtherScore,abs_DeltaMZ,putative_ID
6,91,123,-28.029,0.0,0.905455,0.704403,28.029,C2H4
10,111,149,-28.0277,0.0,0.912133,0.938232,28.0277,C2H4
19,147,172,-14.02,0.0,0.8185,0.678544,14.02,CH2
21,149,176,-15.9978,0.0,0.818981,0.667743,15.9978,O
22,149,199,-28.0378,0.0,0.821735,0.831735,28.0378,C2H4


In [13]:
#discard pairs whose cosine score is below the 0.8 inclusion threshold
high_cosine = pairs_subset['Cosine'].ge(0.8)
pairs_subset = pairs_subset[high_cosine]

# ID clustered spectra to add to library

Find the pairs in which one cluster has a library identification, based on the filtered input files. Create a column that contains the opposite cluster ID, which is the scan number needed to add a new suspect annotation.

In [14]:
#pairs whose CLUSTERID1 carries the library identification; the partner CLUSTERID2 is the suspect
clusterid1_pairs = pd.merge(pairs_subset, IDs_subset, left_on='CLUSTERID1', right_on='#Scan#')
clusterid1_pairs['suspect_index'] = clusterid1_pairs['CLUSTERID2']
#derive the sign from DeltaMZ instead of hard-coding '+':
#in the tables shown in this notebook DeltaMZ = m/z(CLUSTERID1) - m/z(CLUSTERID2), so the suspect
#(CLUSTERID2) is heavier than the library compound when DeltaMZ is negative ('+') and lighter otherwise ('-')
clusterid1_pairs['sign'] = np.where(clusterid1_pairs['DeltaMZ'] < 0, '+', '-')

In [16]:
#pairs whose CLUSTERID2 carries the library identification; the partner CLUSTERID1 is the suspect
clusterid2_pairs = pd.merge(pairs_subset, IDs_subset, left_on='CLUSTERID2', right_on='#Scan#')
clusterid2_pairs['suspect_index'] = clusterid2_pairs['CLUSTERID1']
#derive the sign from DeltaMZ instead of hard-coding '-':
#in the tables shown in this notebook DeltaMZ = m/z(CLUSTERID1) - m/z(CLUSTERID2), so the suspect
#(CLUSTERID1) is heavier than the library compound when DeltaMZ is positive ('+') and lighter otherwise ('-')
clusterid2_pairs['sign'] = np.where(clusterid2_pairs['DeltaMZ'] > 0, '+', '-')

In [17]:
#stack both orientations (library hit on CLUSTERID1 / on CLUSTERID2) into one table
#with a fresh 0..n-1 row index
suspect_pairs = pd.concat([clusterid1_pairs, clusterid2_pairs], ignore_index=True)

In [18]:
#build the suspect compound name: "Suspect related to <library compound> <sign><putative_ID>"
parent_compound = suspect_pairs["Compound_Name"]
modification = suspect_pairs["sign"] + suspect_pairs["putative_ID"]
suspect_pairs["Suspect Name"] = "Suspect related to " + parent_compound + " " + modification

In [19]:
#display the combined suspect table (pair columns, matched library annotation, suspect_index, sign, Suspect Name)
suspect_pairs

Unnamed: 0,CLUSTERID1,CLUSTERID2,DeltaMZ,MEH,Cosine,OtherScore,abs_DeltaMZ,putative_ID,Compound_Name,Ion_Source,...,MZErrorPPM,SharedPeaks,MassDiff,LibMZ,SpecMZ,SpecCharge,INCHI_TF,suspect_index,sign,Suspect Name
0,147,172,-14.02,0.0,0.8185,0.678544,14.02,CH2,C17-Sphinganine,LC-ESI,...,3.38742,7,0.000977,288.291,288.29,0,True,172,+,Suspect related to C17-Sphinganine +CH2
1,1520,1551,-15.998,0.0,0.835552,0.631343,15.998,O,Beauvericin,ESI,...,1.66515,7,0.001343,806.399,806.4,0,True,1551,+,Suspect related to Beauvericin +O
2,386,410,-15.99,0.0,0.831178,0.5988,15.99,O,Tyr-Pro-Phe,ESI,...,18.0441,6,0.00769,426.202,426.21,0,True,386,-,Suspect related to Tyr-Pro-Phe -O


In [20]:
#list every column available on the combined suspect table
suspect_pairs.columns

Index(['CLUSTERID1', 'CLUSTERID2', 'DeltaMZ', 'MEH', 'Cosine', 'OtherScore',
       'abs_DeltaMZ', 'putative_ID', 'Compound_Name', 'Ion_Source',
       'Instrument', 'Compound_Source', 'PI', 'Data_Collector', 'Adduct',
       'Scan', 'Precursor_MZ', 'ExactMass', 'Charge', 'CAS_Number',
       'Pubmed_ID', 'Smiles', 'INCHI', 'INCHI_AUX', 'Library_Class',
       'SpectrumID', 'IonMode', 'UpdateWorkflowName', 'LibraryQualityString',
       'TaskID', '#Scan#', 'SpectrumFile', 'LibraryName', 'MQScore',
       'Organism', 'TIC_Query', 'RT_Query', 'MZErrorPPM', 'SharedPeaks',
       'MassDiff', 'LibMZ', 'SpecMZ', 'SpecCharge', 'INCHI_TF',
       'suspect_index', 'sign', 'Suspect Name'],
      dtype='object')

In [23]:
#if this is a function, should spit out a warning
#True when at least one Suspect Name occurs more than once
boolean = bool(suspect_pairs['Suspect Name'].duplicated().any())

if boolean:
    print('There is more than one suspect match per LibraryID')

In [24]:
#if this is a function, should spit out a warning
#True when at least one suspect scan is linked to more than one library annotation
boolean = bool(suspect_pairs['suspect_index'].duplicated().any())

if boolean:
    print('There is more than one analog match per suspect scan')

In [None]:
#TODO: add conditionals for what to change, or which data to summarize, when overlaps are found

# Create output for suspect library

output: spectral library batch file

#batch upload for adding spectral library
(1 spectrum per analog)

In [25]:
#list the available columns again before selecting the ones needed for the library output
suspect_pairs.columns

Index(['CLUSTERID1', 'CLUSTERID2', 'DeltaMZ', 'MEH', 'Cosine', 'OtherScore',
       'abs_DeltaMZ', 'putative_ID', 'Compound_Name', 'Ion_Source',
       'Instrument', 'Compound_Source', 'PI', 'Data_Collector', 'Adduct',
       'Scan', 'Precursor_MZ', 'ExactMass', 'Charge', 'CAS_Number',
       'Pubmed_ID', 'Smiles', 'INCHI', 'INCHI_AUX', 'Library_Class',
       'SpectrumID', 'IonMode', 'UpdateWorkflowName', 'LibraryQualityString',
       'TaskID', '#Scan#', 'SpectrumFile', 'LibraryName', 'MQScore',
       'Organism', 'TIC_Query', 'RT_Query', 'MZErrorPPM', 'SharedPeaks',
       'MassDiff', 'LibMZ', 'SpecMZ', 'SpecCharge', 'INCHI_TF',
       'suspect_index', 'sign', 'Suspect Name'],
      dtype='object')

In [27]:
#columns carried into the suspect library: the source annotation first, then the suspect name and index
output_columns = [
    '#Scan#',
    'abs_DeltaMZ',
    'Compound_Name',
    'Precursor_MZ',
    'Adduct',
    'LibraryQualityString',
    'SpectrumID',
    'Suspect Name',
    'suspect_index',
]
#independent copy, so later edits do not touch suspect_pairs
new_suspect_pairs = suspect_pairs.loc[:, output_columns].copy()

#columns to get from elsewhere: 'PI', 'Data Collector', 'Instrument', 'Ion_Source', 'IonMode' - based on the Unique Filepath

In [28]:
#the first columns refer to the annotation that was used to propagate to the suspect
#then come the suspect name, index
#display the reduced table selected from suspect_pairs
new_suspect_pairs

Unnamed: 0,#Scan#,abs_DeltaMZ,Compound_Name,Precursor_MZ,Adduct,LibraryQualityString,SpectrumID,Suspect Name,suspect_index
0,147,14.02,C17-Sphinganine,288.291,M+H,Bronze,CCMSLIB00000579284,Suspect related to C17-Sphinganine +CH2,172
1,1520,15.998,Beauvericin,806.399,M+Na,Bronze,CCMSLIB00003416348,Suspect related to Beauvericin +O,1551
2,410,15.99,Tyr-Pro-Phe,426.202,M+H,Bronze,CCMSLIB00003729791,Suspect related to Tyr-Pro-Phe -O,386


In [29]:
#attach the summary rows of each suspect cluster (suspect_index == 'cluster index') to its annotation
#NOTE(review): the displayed result repeats cluster 172 across several ScanNumber values, so summary
#appears to hold several rows per cluster and this join yields more than one spectrum per suspect -
#confirm against the "1 spectrum per analog" goal
suspect_library = new_suspect_pairs.merge(summary, left_on='suspect_index', right_on='cluster index')

In [30]:
#preview the first five rows of the suspect library table
suspect_library.head(n=5)

Unnamed: 0,#Scan#,abs_DeltaMZ,Compound_Name,Precursor_MZ,Adduct,LibraryQualityString,SpectrumID,Suspect Name,suspect_index,cluster index,AllFiles,sum(precursor intensity),RTMean,RTStdErr,parent mass,ScanNumber,ProteosafeFilePath,Original_Path
0,147,14.02,C17-Sphinganine,288.291,M+H,Bronze,CCMSLIB00000579284,Suspect related to C17-Sphinganine +CH2,172,172,inputspectra/spec-00012.mzXML,85082.3,216.752,0,302.31,102,spec/spec-00012.mzXML,f.MSV000078547/spectrum/DLab/121207_aby_B_DD_M...
1,147,14.02,C17-Sphinganine,288.291,M+H,Bronze,CCMSLIB00000579284,Suspect related to C17-Sphinganine +CH2,172,172,inputspectra/spec-00012.mzXML,88048.4,227.979,0,302.31,109,spec/spec-00012.mzXML,f.MSV000078547/spectrum/DLab/121207_aby_B_DD_M...
2,147,14.02,C17-Sphinganine,288.291,M+H,Bronze,CCMSLIB00000579284,Suspect related to C17-Sphinganine +CH2,172,172,inputspectra/spec-00016.mzXML,119687.0,85.3721,0,302.31,55,spec/spec-00016.mzXML,f.MSV000078547/spectrum/DLab/121207_aby_D_DD_M...
3,147,14.02,C17-Sphinganine,288.291,M+H,Bronze,CCMSLIB00000579284,Suspect related to C17-Sphinganine +CH2,172,172,inputspectra/spec-00016.mzXML,105935.0,92.1508,0,302.309,60,spec/spec-00016.mzXML,f.MSV000078547/spectrum/DLab/121207_aby_D_DD_M...
4,147,14.02,C17-Sphinganine,288.291,M+H,Bronze,CCMSLIB00000579284,Suspect related to C17-Sphinganine +CH2,172,172,inputspectra/spec-00016.mzXML,33657.3,313.94,0,303.31,184,spec/spec-00016.mzXML,f.MSV000078547/spectrum/DLab/121207_aby_D_DD_M...


In [31]:
#write the suspect library as a tab-separated text file, without the row index
suspect_library.to_csv(path_or_buf='suspect_library_20200421.txt', index=False, sep='\t')

To investigate: 

Check whether the molecular formula differs by the same atoms as those proposed from the nominal mass difference.