In [9]:
import pandas as pd
import requests
import requests_cache
from tqdm import tqdm
import os
import glob
from markdown import markdown

requests_cache.install_cache('demo_cache', allowable_codes=(200,400,404,500))

In [10]:
# Reading frequent data
#suspect_list_df = pd.read_csv("suspect_library_freq.csv")
suspect_list_df = pd.read_csv("suspect_library_infreq.csv")

In [11]:
suspect_list_df.head()

Unnamed: 0,INCHI,CompoundName,Adduct,DeltaMZ,GroupDeltaMZ,AtomicDifference,Rationale,Cosine,LibraryPrecursorMZ,LibraryID,ClusterScanNr,SuspectPrecursorMZ,SuspectScanNr,SuspectPath
0,InChI=1S/C29H50O2/c1-20(2)12-9-13-21(3)14-10-1...,(+)-.alpha.-Tocopherol,M+H,-193.059,-193.13,,,0.873882,431.388,CCMSLIB00003201507,244167,624.447,2802,f.MSV000083475/ccms_peak/RAW/PLATE7/7F5_8_55_s...
1,InChI=1S/C27H36O10/c1-4-5-6-7-13-10-17(28)19-1...,(+)-11-Nor-.DELTA.9-tetrahydrocannabinol-9-car...,M+H,200.156,200.13,unknown|unknown,2-amino-8-oxo-decanoic acid|N-methyl-4-butenyl...,0.817689,521.238,CCMSLIB00003493335,603029,321.082,167,f.MSV000082493/ccms_peak/urine/DM000088438_RC1...
2,InChI=1S/C15H20O4/c1-10(7-13(17)18)5-6-15(19)1...,(+)-Abscisic acid,2M+H,184.093,184.11,unknown,2-carboxy-6-hydroxyoctahydroindole,0.801917,529.28,CCMSLIB00003704198,164267,345.187,1473,f.MSV000079098/spectrum/GNPS_Cichewicz_Fungi_C...
3,InChI=1S/C15H20O4/c1-10(7-13(17)18)5-6-15(19)1...,(+)-Abscisic acid,2M+H,246.99,246.99,,,0.892768,529.28,CCMSLIB00003226713,92338,282.29,3264,f.MSV000079356/ccms_peak/Plant/Plate2_BC4_01_8...
4,InChI=1S/C15H20O4/c1-10(7-13(17)18)5-6-15(19)1...,(+)-Abscisic acid,2M+H,247.11,247.11,,,0.906836,529.28,CCMSLIB00003226714,56136,282.17,536,f.MSV000081810/ccms_peak/Exudates library/2015...


In [12]:
suspect_list_records = suspect_list_df.to_dict(orient="records")

In [13]:
for record in tqdm(suspect_list_records):
    inchi = record["INCHI"]
    if len(inchi) < 10:
        continue
    url = "https://gnps-structure.ucsd.edu/classyfire?inchi={}".format(inchi)
    r = requests.get(url)
    if r.status_code == 200:
        try:
            classyfire_json = r.json()
            record["superclass"] = classyfire_json["superclass"]["name"]
            record["class"] = classyfire_json["class"]["name"]
        except:
            continue

100%|██████████| 7929/7929 [00:23<00:00, 331.01it/s]


In [14]:
# Getting Dataset Information
datasets_df = pd.DataFrame(requests.get("https://massive.ucsd.edu/ProteoSAFe/datasets_json.jsp").json()["datasets"])
datasets_df["DatasetInstrument"] = datasets_df["instrument"]
datasets_df = datasets_df[["dataset", "DatasetInstrument"]]

In [15]:
# Getting Library Information
libraries_df = pd.DataFrame(requests.get("https://gnps-external.ucsd.edu/gnpslibraryjson").json())
libraries_df["LibraryInstrument"] = libraries_df["Instrument"]
libraries_df = libraries_df[["spectrum_id", "LibraryInstrument"]]

In [16]:
suspect_list_df = pd.DataFrame(suspect_list_records)

suspect_list_df.to_csv("suspect_enriched.csv", sep=",", index=False)

def process_enrich(suspect_list_df):
    suspects = suspect_list_df
    
    suspects['dataset'] = suspects['SuspectPath'].str.slice(2, 14)
    suspects['usi1'] = ('mzspec:' + suspects['dataset'] + ':' +
                        suspects['SuspectPath'].apply(os.path.basename) +
                        ':scan:' + suspects['SuspectScanNr'].astype(str))
    suspects['usi2'] = 'mzspec:GNPSLIBRARY:' + suspects['LibraryID']
    # TODO: New style USI.
    # suspects['usi2'] = ('mzdraft:GNPS:GNPS-LIBRARY:accession:' +
    #                     suspects['LibraryID'])
    suspects['usi3'] = ('mzspec:MSV000084314:' + suspects['dataset'] +
                        '.mgf:scan:' + suspects['ClusterScanNr'].astype(str))
    
    mirror_urls = ('https://metabolomics-usi.ucsd.edu/mirror?usi1=' +
                   suspects['usi1'] + '&usi2=' + suspects['usi2'] +
                   '&mz_min=50&mz_max=500')
    mirror_alt_urls = ('https://metabolomics-usi.ucsd.edu/mirror?usi1=' +
                       suspects['usi1'] + '&usi2=' + suspects['usi3'] +
                       '&mz_min=50&mz_max=500')
    spectrum_urls = ('https://metabolomics-usi.ucsd.edu/spectrum/?usi=' +
                     suspects['usi1'] + '&mz_min=50&mz_max=500')
    
    suspects['mirror_urls'] = mirror_urls
    suspects['mirror_alt_urls'] = mirror_alt_urls
    suspects['spectrum_urls'] = spectrum_urls
    
    suspects = suspects.merge(datasets_df, how='left', on="dataset")
    suspects = suspects.merge(libraries_df, how='left', left_on="LibraryID", right_on="spectrum_id")
    
    return suspects
    
# Doing a filtration on the scan number to make sure its not -1
suspect_list_df = suspect_list_df[suspect_list_df["SuspectScanNr"] != -1]
suspect_list_df = process_enrich(suspect_list_df)

suspect_list_df = suspect_list_df[['CompoundName', 'Adduct', 'LibraryPrecursorMZ', 'DeltaMZ', "GroupDeltaMZ", "AtomicDifference", "Rationale", "superclass", "class", "LibraryInstrument", "DatasetInstrument", "mirror_urls", "mirror_alt_urls", "spectrum_urls", "INCHI", "SuspectPath"]]
suspect_list_df.to_csv("suspect_enriched_sheets.csv", sep=",", index=False)

In [40]:
all_classes = list(set(suspect_list_df["class"]))

In [41]:
output_folder = "class_split_suspects"
for class_name in all_classes:
    output_filename = os.path.join(output_folder, str(class_name) + ".csv")
    filtered_df = suspect_list_df[suspect_list_df["class"] == class_name]
    filtered_df.to_csv(output_filename, sep=",", index=False)

In [50]:
### Outputting mark down files

def process_md(input_filename, output_md, output_html):
    #suspects = pd.read_csv(input_filename).iloc
    suspects = pd.read_csv(input_filename)
    suspects['dataset'] = suspects['SuspectPath'].str.slice(2, 14)
    suspects['usi1'] = ('mzspec:' + suspects['dataset'] + ':' +
                        suspects['SuspectPath'].apply(os.path.basename) +
                        ':scan:' + suspects['SuspectScanNr'].astype(str))
    suspects['usi2'] = 'mzspec:GNPSLIBRARY:' + suspects['LibraryID']
    # TODO: New style USI.
    # suspects['usi2'] = ('mzdraft:GNPS:GNPS-LIBRARY:accession:' +
    #                     suspects['LibraryID'])
    suspects['usi3'] = ('mzspec:MSV000084314:' + suspects['dataset'] +
                        '.mgf:scan:' + suspects['ClusterScanNr'].astype(str))
    
    explanations = [
        ' / '.join([f'{atomic_diffs} ({rationales})'
                    for atomic_diffs, rationales in zip(atomic_diffs.split('|'),
                                                        rationales.split('|'))])
        for atomic_diffs, rationales in zip(
            suspects['AtomicDifference'].fillna('unknown'),
            suspects['Rationale'].fillna('unknown'))]

    output_list = ['| Suspect | Mirror Library | Mirror Dataset Cluster | Image |',
                   '| --- | --- | --- | --- |']
    suspects_str = ('<ul><li><b>Suspect:</b> ' + suspects['CompoundName'] + ' ['
                    + suspects['Adduct'] + '] '
                    + suspects['DeltaMZ'].map('{:=+9.3f}'.format) + ' ['
                    + suspects['GroupDeltaMZ'].map('{:+.2f}'.format) + ']'
                    + '</li>' +
                    '<li><b>Library:</b> [' + suspects['LibraryID'] + ']'
                    '(https://gnps.ucsd.edu/ProteoSAFe/gnpslibraryspectrum.jsp?'
                    'SpectrumID=' + suspects['LibraryID'] + ')'
                    + '</li>'
                    + '<li><b>Putative explanation:</b> ' + explanations
                    + '</li></ul>')
    mirror_urls = ('https://metabolomics-usi.ucsd.edu/svg/mirror?usi1=' +
                   suspects['usi1'] + '&usi2=' + suspects['usi2'] +
                   '&mz_min=50&mz_max=500')
    mirror_alt_urls = ('https://metabolomics-usi.ucsd.edu/svg/mirror?usi1=' +
                       suspects['usi1'] + '&usi2=' + suspects['usi3'] +
                       '&mz_min=50&mz_max=500')
    spectrum_urls = ('https://metabolomics-usi.ucsd.edu/svg/?usi=' +
                     suspects['usi1'] + '&mz_min=50&mz_max=500')
    for sus, url1, url2, url3 in zip(
            suspects_str, mirror_urls, mirror_alt_urls, spectrum_urls):
        output_list.append(f'| {sus} | ![]({url1}) | ![]({url2}) | [View USI]({url3})| ')
        
    with open(output_md, 'w') as f_out:
        f_out.write('\n'.join(output_list))
    with open(output_html, 'w') as f_out:
        f_out.write(markdown('\n'.join(output_list), extensions=['tables']))

        


In [54]:
input_csv_files = glob.glob(os.path.join(output_folder, "*.csv"))
output_md_folder = "class_split_suspects_md"
output_html_folder = "class_split_suspects_html"
for input_filename in input_csv_files:
    print(input_filename)
    output_md_filename = os.path.join(output_md_folder, os.path.basename(input_filename.replace('.csv', '.md')))
    output_html_filename = os.path.join(output_html_folder, os.path.basename(input_filename.replace('.csv', '.html')))
    
    process_md(input_filename, output_md_filename, output_html_filename)
    

class_split_suspects/Emetine alkaloids.csv
class_split_suspects/Pteridines and derivatives.csv
class_split_suspects/Kavalactones.csv
class_split_suspects/Aporphines.csv
class_split_suspects/5'-deoxyribonucleosides.csv
class_split_suspects/Benzoxazines.csv
class_split_suspects/Macrolides and analogues.csv
class_split_suspects/Benzene and substituted derivatives.csv
class_split_suspects/nan.csv
class_split_suspects/Triphenyl compounds.csv
class_split_suspects/Fatty Acyls.csv
class_split_suspects/Thiadiazinanes.csv
class_split_suspects/Pyrimidine nucleosides.csv
class_split_suspects/Protopine alkaloids.csv
class_split_suspects/Prenol lipids.csv
class_split_suspects/Glycerolipids.csv
class_split_suspects/Indanes.csv
class_split_suspects/Naphthofurans.csv
class_split_suspects/Benzothiazepines.csv
class_split_suspects/Biotin and derivatives.csv
class_split_suspects/Peptidomimetics.csv
class_split_suspects/Stilbenes.csv
class_split_suspects/Dithiolanes.csv
class_split_suspects/Rhoeadine alkal