# Generate MSMS-Chooser Annotation Table

In [None]:
import pandas as pd
import os
import requests

## Match Filenames with Compounds

In [None]:
# Folder that holds every input and output file for this set of standards.
standards_dir = 'plant_hormones'

# NOTE: despite the `_dir` suffix, the next three names hold file paths.
# Worklist that was used to collect the standards data.
worklist_dir = os.path.join(
    standards_dir,
    '20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602.csv',
)
# Destination .csv for filenames, compound names, and GNPS names.
names_dir = os.path.join(
    standards_dir,
    'plant-hormones_names_hilic.csv',
)
# Map of worklist compound names (group names) to InChIKeys, SMILES, and InChIs.
inchi_name_map_dir = os.path.join(
    standards_dir,
    '20211208_plant-hormones_inchi-smiles.csv',
)

# header=1: column names are taken from the second line of the worklist file.
worklist_df = pd.read_csv(worklist_dir, header=1)
inchi_name_map = pd.read_csv(inchi_name_map_dir)

In [None]:
# Keep only the MSMS runs that are not QC injections.
# Filenames are underscore-delimited: field 10 is read as the MS level and
# field 12 as the group name.
filenames_full = worklist_df['File Name'].to_dict()
filenames = []

for raw_name in filenames_full.values():
    fields = raw_name.split('_')
    ms_level, group = fields[10], fields[12]

    if ms_level != 'MSMS' or group == 'QC':
        continue
    filenames.append(raw_name)

In [None]:
# Build the filename -> compound-name table and save it to `names_dir`.
#
# Each filename is underscore-delimited: field 12 is used as the compound
# (group) name, and field 14 is an optional field whose second '-'-separated
# token is used as the collision energy. The GNPS name is
# '<compound_name>-<collision_energy>'.
names_df = pd.DataFrame(columns=['filename', 'compound_name', 'gnps_compound_name', 'inchi_key', 'inchi', 'smiles'])
names_df['filename'] = filenames

# Write through `names_df.at[...]` rather than into the Series yielded by
# `iterrows()`: that Series may be a copy, in which case the assignments are
# silently lost and the name columns stay NaN.
for idx, filename in zip(names_df.index, filenames):
    fields = filename.split('_')
    compound_name = fields[12]
    collision_energy = fields[14].split('-')[1]

    names_df.at[idx, 'compound_name'] = compound_name
    names_df.at[idx, 'gnps_compound_name'] = compound_name + '-' + collision_energy

names_df.to_csv(names_dir, index=False)

In [None]:
# Spot-check the map: show the SMILES stored for one known worklist name.
meiaa_rows = inchi_name_map['worklist_name'] == 'MethylIndole3AceticAcid-MeIAA'
inchi_name_map.loc[meiaa_rows, 'smiles'].values[0]

In [None]:
# Preview the first rows of the compound-name -> structure map.
inchi_name_map.head()

In [None]:
# Preview the first rows of the filename/compound-name table.
names_df.head()

In [None]:
# Fill in InChIKey, InChI and SMILES for every row of `names_df` by looking
# up the row's compound name in the `worklist_name` column of `inchi_name_map`.
#
# The lookup depends only on the row itself, so it is done once per row
# (no inner pass over all filenames), and a miss is reported once with the
# row's own filename. Rows without a match keep their empty structure columns.
structure_columns = ('inchi_key', 'inchi', 'smiles')

# Iterate over snapshots of the columns so that `names_df` is not mutated
# while it is being iterated.
row_iter = zip(
    names_df.index,
    names_df['filename'].tolist(),
    names_df['compound_name'].tolist(),
)
for idx, filename, compound_name in row_iter:
    matches = inchi_name_map.loc[inchi_name_map['worklist_name'] == compound_name]

    # An explicit emptiness check replaces the former bare `except:`, so
    # unrelated errors (e.g. a missing column) are no longer hidden.
    if matches.empty:
        print('unable to find ' + str(filename))
        continue

    # If several map rows share the name, the first one is used.
    for column in structure_columns:
        names_df.at[idx, column] = matches[column].values[0]

In [None]:
# Preview the table again to inspect the structure columns.
names_df.head()

In [None]:
# Save the matched table next to the other files of this standards set.
# The row index is written as the first column (to_csv default).
matched_names_path = os.path.join(standards_dir, 'matched_names.csv')
names_df.to_csv(matched_names_path)

## Generate MSMS-Chooser Submission Sheet

In [None]:
# Load the MSMS-Chooser submission template (tab-separated); its columns are
# filled in by the cells below.
template_path = 'plant_hormones/msms_chooser_submission_template.tsv'
msms_chooser_df = pd.read_csv(template_path, sep='\t')

In [None]:
# Pull the per-file annotation columns out of `names_df` as plain lists,
# in row order.
compound_names, inchi_keys, inchis, smiles = (
    names_df[source_column].tolist()
    for source_column in ('gnps_compound_name', 'inchi_key', 'inchi', 'smiles')
)

In [None]:
# Build, in the same order as `filenames`, the .mzML file names and the
# ion-mode labels ('Positive' / 'Negative') for the submission sheet.
# Field 9 of the underscore-delimited filename is read as the polarity token.
ionmode_by_polarity = {'POS': 'Positive', 'NEG': 'Negative'}

filenames_mzml = []
polarities = []
for filename in filenames:
    polarity = filename.split('_')[9]

    # Fail here, naming the file, instead of silently skipping the entry:
    # a skipped entry would leave `polarities` shorter than `filenames_mzml`
    # and only surface later as a length mismatch.
    if polarity not in ionmode_by_polarity:
        raise ValueError(
            'unrecognized polarity ' + repr(polarity) + ' in filename ' + repr(filename)
        )

    filenames_mzml.append(filename + '.mzML')
    polarities.append(ionmode_by_polarity[polarity])

In [None]:
# Copy the prepared lists into the submission sheet, one column at a time.
# The dict preserves insertion order, so FILENAME is assigned first.
submission_columns = {
    'FILENAME': filenames_mzml,
    'IONMODE': polarities,
    'COMPOUND_NAME': compound_names,
    'SMILES': smiles,
    'INCHI': inchis,
}
for column, column_values in submission_columns.items():
    msms_chooser_df[column] = column_values

In [None]:
# Preview the first rows of the filled-in submission sheet.
msms_chooser_df.head()

In [None]:
# Write the completed submission sheet as a tab-separated file.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (to_csv default) — confirm MSMS-Chooser accepts that, else pass index=False.
submission_path = os.path.join(standards_dir, 'MSMS-Chooser Submission.tsv')
msms_chooser_df.to_csv(submission_path, sep='\t')