# Generate MSMS-Chooser Annotation Table

In [1]:
import pandas as pd
import os
import requests

## Match Filenames with Compounds

In [23]:
standards_dir = 'plant_hormones'

worklist_dir = os.path.join(standards_dir, '20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602.csv') #worklist used to collect standards data
names_dir = os.path.join(standards_dir,'plant-hormones_names_hilic.csv') #where to save .csv with filenames, compound names, and GNPS names
inchi_name_map_dir = os.path.join(standards_dir, '20211208_plant-hormones_inchi-smiles.csv') #map of worklist compound_names (group names), inchi keys, smiles, inchis

worklist_df = pd.read_csv(worklist_dir, header=1)
inchi_name_map = pd.read_csv(inchi_name_map_dir)

In [24]:
filenames_full = worklist_df['File Name'].to_dict()
filenames = []

for name in filenames_full.values():
    ms_level = name.split('_')[10]
    group = name.split('_')[12]
    
    if ms_level=='MSMS' and group!='QC':
        filenames.append(name)

In [25]:
names_df = pd.DataFrame(columns=['filename', 'compound_name', 'gnps_compound_name', 'inchi_key', 'inchi', 'smiles'])
names_df['filename'] = filenames
for idx, row in names_df.iterrows():
    compound_name = row['filename'].split('_')[12]
    optional_field = row['filename'].split('_')[14]
    collision_energy = optional_field.split('-')[1]
    
    gnps_compound_name = compound_name+'-'+collision_energy

    row['compound_name'] = compound_name
    row['gnps_compound_name'] = gnps_compound_name
names_df.to_csv(names_dir, index=False)

In [26]:
inchi_name_map['smiles'].loc[inchi_name_map['worklist_name']=='MethylIndole3AceticAcid-MeIAA'].values[0]

'CN1C=C(C2=CC=CC=C21)CC(=O)O'

In [27]:
inchi_name_map.head()

Unnamed: 0,name,formatted_name,abbreviation,worklist_name,formula,monoisotopic_mass,inchi_key,smiles,inchi
0,indole-3-acetic acid,Indole-3-Acetic Acid,IAA,Indole3AceticAcid-IAA,C10H9NO2,175.0633,SEOVTRFCIGRIMH-UHFFFAOYSA-N,C1=CC=C2C(=C1)C(=CN2)CC(=O)O,InChI=1S/C10H9NO2/c12-10(13)5-7-6-11-9-4-2-1-3...
1,methyl indole-3-acetic acid,Methyl Indole-3-Acetic Acid,MeIAA,MethylIndole3AceticAcid-MeIAA,C11H11NO2,189.079,NAIPEFIYIQFVFC-UHFFFAOYSA-N,CN1C=C(C2=CC=CC=C21)CC(=O)O,InChI=1S/C11H11NO2/c1-12-7-8(6-11(13)14)9-4-2-...
2,\nMELATONIN,Melatonin,TRA,Tryptamine-TRA,C13H16N2O2,232.1212,DRLFMBDRBRZALE-UHFFFAOYSA-N,CC(=O)NCCC1=CNC2=C1C=C(C=C2)OC,InChI=1S/C13H16N2O2/c1-9(16)14-6-5-10-8-15-13-...
3,\nL-Tryptophan,L-Tryptophan,TRP,Tryptophan-TRP,C11H12N2O2,204.0899,QIVBCDIJIAJPQS-VIFPVBQESA-N,C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N,InChI=1S/C11H12N2O2/c12-9(11(14)15)5-7-6-13-10...
4,indole-3-acetamide,Indole-3-Acetamide,IAM,Indole3Acetamide-IAM,C10H10N2O,174.0793,ZOAMBXDOGPRZLP-UHFFFAOYSA-N,C1=CC=C2C(=C1)C(=CN2)CC(=O)N,InChI=1S/C10H10N2O/c11-10(13)5-7-6-12-9-4-2-1-...


In [28]:
names_df.head()

Unnamed: 0,filename,compound_name,gnps_compound_name,inchi_key,inchi,smiles
0,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE102040,,,
1,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE205060,,,
2,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE102040,,,
3,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE205060,,,
4,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,MethylIndole3AceticAcid-MeIAA,MethylIndole3AceticAcid-MeIAA-CE102040,,,


In [31]:
for idx, row in names_df.iterrows():
    for filename in filenames:
        if row['compound_name'] in filename:
            try:
                names_df.at[idx, 'inchi_key'] = inchi_name_map['inchi_key'].loc[inchi_name_map['worklist_name']==row['compound_name']].values[0]
                names_df.at[idx, 'inchi'] = inchi_name_map['inchi'].loc[inchi_name_map['worklist_name']==row['compound_name']].values[0]
                names_df.at[idx, 'smiles'] = inchi_name_map['smiles'].loc[inchi_name_map['worklist_name']==row['compound_name']].values[0]
            except:
                print('unable to find ' + str(filename))
        else:
            continue

unable to find 20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602_POS_MSMS_18_PhenylAceticAcid-PAA_1_Rg70to1050-CE102040-10ugmL-S1_Run99
unable to find 20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602_POS_MSMS_18_PhenylAceticAcid-PAA_1_Rg70to1050-CE205060-10ugmL-S1_Run100
unable to find 20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602_NEG_MSMS_18_PhenylAceticAcid-PAA_1_Rg70to1050-CE102040-10ugmL-S1_Run101
unable to find 20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602_NEG_MSMS_18_PhenylAceticAcid-PAA_1_Rg70to1050-CE205060-10ugmL-S1_Run102
unable to find 20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602_POS_MSMS_18_PhenylAceticAcid-PAA_1_Rg70to1050-CE102040-10ugmL-S1_Run99
unable to find 20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602_POS_MSMS_18_PhenylAceticAcid-PAA_1_Rg70to1050-CE205060-10ugmL-S1_Run100
unable to find 20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_

In [32]:
names_df.head()

Unnamed: 0,filename,compound_name,gnps_compound_name,inchi_key,inchi,smiles
0,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE102040,SEOVTRFCIGRIMH-UHFFFAOYSA-N,InChI=1S/C10H9NO2/c12-10(13)5-7-6-11-9-4-2-1-3...,C1=CC=C2C(=C1)C(=CN2)CC(=O)O
1,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE205060,SEOVTRFCIGRIMH-UHFFFAOYSA-N,InChI=1S/C10H9NO2/c12-10(13)5-7-6-11-9-4-2-1-3...,C1=CC=C2C(=C1)C(=CN2)CC(=O)O
2,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE102040,SEOVTRFCIGRIMH-UHFFFAOYSA-N,InChI=1S/C10H9NO2/c12-10(13)5-7-6-11-9-4-2-1-3...,C1=CC=C2C(=C1)C(=CN2)CC(=O)O
3,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,Indole3AceticAcid-IAA,Indole3AceticAcid-IAA-CE205060,SEOVTRFCIGRIMH-UHFFFAOYSA-N,InChI=1S/C10H9NO2/c12-10(13)5-7-6-11-9-4-2-1-3...,C1=CC=C2C(=C1)C(=CN2)CC(=O)O
4,20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE...,MethylIndole3AceticAcid-MeIAA,MethylIndole3AceticAcid-MeIAA-CE102040,NAIPEFIYIQFVFC-UHFFFAOYSA-N,InChI=1S/C11H11NO2/c1-12-7-8(6-11(13)14)9-4-2-...,CN1C=C(C2=CC=CC=C21)CC(=O)O


In [34]:
names_df.to_csv(os.path.join(standards_dir, 'matched_names.csv'))

In [None]:
names_df['gnps_compound_name']

## Generate MSMS-Chooser Submission Sheet

In [None]:
msms_chooser_df = pd.read_csv('plant_hormones/msms_chooser_submission_template.tsv', sep='\t')

In [None]:
compound_names = names_df['gnps_compound_name'].tolist()
inchi_keys = names_df['inchi_key'].tolist()
inchis = names_df['inchi'].tolist()
smiles = names_df['smiles'].tolist()

In [None]:
filenames_mzml = []
polarities = []
for filename in filenames:
    polarity = filename.split('_')[9]
    filenames_mzml.append(filename+'.mzML')
    if polarity == 'POS':
        polarities.append('Positive')
    if polarity == 'NEG':
        polarities.append('Negative')

In [None]:
msms_chooser_df['FILENAME'] = filenames_mzml
msms_chooser_df['IONMODE'] = polarities
msms_chooser_df['COMPOUND_NAME'] = compound_names
msms_chooser_df['SMILES'] = smiles
msms_chooser_df['INCHI'] = inchis

In [None]:
msms_chooser_df.head()

In [None]:
msms_chooser_df.to_csv('plant_hormones/MSMS-Chooser Submission.tsv', sep='\t')