In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import spectrum_utils.utils as suu
import tqdm.notebook as tqdm
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use the renamed style sheet
# when this Matplotlib provides it and fall back to the old name otherwise.
plt.style.use([
    f'seaborn-v0_8-{style}' if f'seaborn-v0_8-{style}' in plt.style.available
    else f'seaborn-{style}'
    for style in ('white', 'paper')])
plt.rc('font', family='serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Load the unique suspects and keep the ones whose compound name contains
# "carnitine" (case-insensitive, literal substring match).
suspects = pd.read_parquet('../../data/interim/suspects_unique.parquet')

acylcarnitines = suspects.loc[
    lambda df: (df['CompoundName'].str.lower()
                .str.contains('carnitine', regex=False))]

In [None]:
# Task identifiers of the two library searches whose result tables are read
# below (presumably the standard and the suspect library search, going by the
# `std`/`sus` suffixes -- confirm against the workflow that produced them).
task_id_std, task_id_sus = 'b55aef34', '50e3d8ae'
filename = ('../../data/processed/MOLECULAR-LIBRARYSEARCH-V2-{}-'
            'view_all_annotations_DB-main.tsv')


def _read_library_matches(task_id):
    """
    Read the library search annotations table of a single task.

    Parameters
    ----------
    task_id : str
        Task identifier that is substituted into the `filename` template.

    Returns
    -------
    pd.DataFrame
        One row per annotation with the columns `Filename`, `CompoundName`,
        `PrecursorMz`, `RT`, and `Dataset`, where `Dataset` is the part of
        `Filename` before its first '/'.
    """
    matches = (
        pd.read_csv(
            filename.format(task_id),
            usecols=['full_CCMS_path', 'Compound_Name', 'SpecMZ', 'RT_Query'],
            sep='\t')
        .rename(columns={'full_CCMS_path': 'Filename',
                         'Compound_Name': 'CompoundName',
                         'SpecMZ': 'PrecursorMz',
                         'RT_Query': 'RT'}))
    # `n` has to be passed by keyword: it is keyword-only in pandas >= 2.0.
    matches['Dataset'] = matches['Filename'].str.split('/', n=1).str[0]
    return matches


filename_ids_std = _read_library_matches(task_id_std)
filename_ids_sus = _read_library_matches(task_id_sus)

In [None]:
# Read the file name and CERAD score of each sample from the metadata table.
rosmap_metadata = pd.read_csv(
    '../../data/external/metadata_ROSMAP/U19_ROSMAP_METADATA_FINALIZED_CSV.csv',
    usecols=['filename', 'ATTRIBUTE_ceradsc'])
rosmap_metadata = rosmap_metadata.rename(
    columns={'filename': 'Filename', 'ATTRIBUTE_ceradsc': 'CeradScore'})
# Normalize the file names: keep the first whitespace-separated token, turn a
# trailing "mzXML" into "mzML", and left-pad with zeros to 13 characters.
rosmap_metadata['Filename'] = (
    rosmap_metadata['Filename']
    .str.split().str[0]
    .str.replace(r'mzXML$', 'mzML', regex=True)
    .str.zfill(13))
# Label every sample as "No" first, then overwrite the samples with a CERAD
# score of at most 2 as "Yes".
mask = rosmap_metadata['CeradScore'] <= 2
rosmap_metadata['Alzheimer'] = 'No  (CERAD ≥ 3)'
rosmap_metadata.loc[mask, 'Alzheimer'] = 'Yes (CERAD ≤ 2)'

In [None]:
def clean_labels(labels):
    """
    Simplify library annotation names into short, consistently cased labels.

    The following clean-up steps are applied in order:

    1. Strip surrounding whitespace.
    2. Remove the "Spectral Match to " prefix and " from NIST14" suffix.
    3. Reduce "Massbank:RP<6 digits> <name>..." tags to the name.
    4. Remove " - <energy> eV" fragmentation information.
    5. Rewrite "Suspect related to <name> (...) with delta m/z <delta> (...)"
       as "<name> + <delta> Da" or "<name> - <delta> Da" (three decimals).
    6. Convert to title case, restoring the "DL-" prefix.

    Parameters
    ----------
    labels : pd.Series
        The string labels to clean.

    Returns
    -------
    pd.Series
        The cleaned labels, with the same index as the input.
    """
    def _format_suspect(match):
        # A delta that is not strictly positive (including zero) gets "-".
        delta_mz = float(match.group(2))
        sign = '+' if delta_mz > 0 else '-'
        return f'{match.group(1)} {sign} {abs(delta_mz):.3f} Da'

    labels = labels.str.strip()
    # Clean spectral matches to NIST tags.
    labels = (labels.str.replace('Spectral Match to ', '', regex=False)
              .str.replace(' from NIST14', '', regex=False))
    # Clean MASSBANK tags.
    labels = labels.str.replace(r'Massbank:RP\d{6} ([^ |]+)[^ ]*',
                                lambda m: m.group(1), regex=True)
    # Remove fragmentation info.
    labels = labels.str.replace(r' - (\d+\.\d+)? (eV|Ev)', '', regex=True)
    # Clean suspect tags.
    # Both pattern parts are raw strings so that the regex escapes (`\(`)
    # are not treated as (invalid) string escape sequences.
    labels = labels.str.replace(
        r'Suspect related to (.+) \(predicted molecular formula: [^)]+\) '
        r'with delta m/z (-?\d+\.\d+) \(putative explanation: .+\)',
        _format_suspect, regex=True)
    # Use consistent title case.
    # `regex` is given explicitly because its default differs between pandas
    # versions; the pattern is a plain literal.
    labels = labels.str.title().str.replace('Dl-', 'DL-', regex=False)

    return labels

In [None]:
def _median_acylcarnitine_ids(filename_ids, dataset='MSV000086415'):
    """
    Summarize the acylcarnitine annotations of a single dataset.

    Parameters
    ----------
    filename_ids : pd.DataFrame
        Annotations with (at least) the columns `Dataset`, `CompoundName`,
        `PrecursorMz`, and `RT`.
    dataset : str
        Only annotations whose `Dataset` equals this value are used.

    Returns
    -------
    pd.DataFrame
        One row per cleaned compound name with the median `PrecursorMz` and
        median `RT` over all of its annotations.
    """
    ids = filename_ids[filename_ids['Dataset'] == dataset]
    # Keep annotations whose name contains any of the acylcarnitine names as
    # a literal substring; missing names never match.
    is_acylcarnitine = np.logical_or.reduce(
        [ids['CompoundName'].str.contains(compound, regex=False, na=False)
         for compound in acylcarnitines['CompoundName'].unique()])
    # Copy after the last filtering step, so that the column assignment below
    # operates on an independent frame rather than on a slice.
    ids = ids[is_acylcarnitine].copy()
    ids['CompoundName'] = clean_labels(ids['CompoundName'])
    return (ids.groupby('CompoundName')
            [['PrecursorMz', 'RT']]
            .agg('median').reset_index())


rosmap_ids_std = _median_acylcarnitine_ids(filename_ids_std)

rosmap_ids_sus = _median_acylcarnitine_ids(filename_ids_sus)

In [None]:
# Tolerances within which two features are considered to be the same.
mz_tol = 100    # Precursor m/z tolerance (ppm).
rt_tol = 20     # Retention time tolerance (seconds).

In [None]:
# Merge annotations with near-identical precursor m/z and RT.
# The annotations are visited in sorted order (precursor m/z first) and each
# one is compared to the annotation visited immediately before it: if both the
# m/z difference (ppm) and the RT difference are below their tolerance, it is
# added to the group of that previous annotation, otherwise it starts a new
# group. The very first annotation is compared against -inf, which never
# passes the tolerance checks.
rosmap_ids_sus_merged = []
prev_mz = prev_rt = -np.inf
for _, row in rosmap_ids_sus.sort_values(
        ['PrecursorMz', 'CompoundName', 'RT']).iterrows():
    cur_name, cur_mz, cur_rt = (row['CompoundName'], row['PrecursorMz'],
                                row['RT'])
    same_mz = abs(suu.mass_diff(cur_mz, prev_mz, False)) < mz_tol
    same_rt = abs(cur_rt - prev_rt) < rt_tol
    if same_mz and same_rt:
        group_names, group_mzs, group_rts = rosmap_ids_sus_merged[-1]
        group_names.append(cur_name)
        group_mzs.append(cur_mz)
        group_rts.append(cur_rt)
    else:
        rosmap_ids_sus_merged.append([[cur_name], [cur_mz], [cur_rt]])
    prev_mz, prev_rt = cur_mz, cur_rt
rosmap_ids_sus_merged = pd.DataFrame(
    rosmap_ids_sus_merged, columns=['CompoundName', 'PrecursorMz', 'RT'])


def _key(label):
    """
    Sort key that lists non-suspect compounds first.

    Labels that contain ' - ' or ' + ' are used as-is; all other labels are
    prefixed with '0'.
    """
    is_suspect = ' - ' in label or ' + ' in label
    return label if is_suspect else f'0{label}'


# Collapse each group into a single row: the sorted names joined into one
# multi-line label and the mean precursor m/z and RT of the group.
rosmap_ids_sus_merged['CompoundName'] = (
    rosmap_ids_sus_merged['CompoundName']
    .apply(lambda names: '\n/ '.join(sorted(names, key=_key))))
for column in ['PrecursorMz', 'RT']:
    rosmap_ids_sus_merged[column] = (rosmap_ids_sus_merged[column]
                                     .apply(np.mean))

In [None]:
# Set up an empty (all-NaN) table of XIC values with one row per merged
# feature, indexed by (CompoundName, PrecursorMz, RT), and one column per
# metadata file name.
index_col = list(rosmap_ids_sus_merged.columns)
file_columns = sorted(rosmap_metadata['Filename'].unique())
rosmap_xic_sus = pd.DataFrame(columns=index_col + file_columns)
rosmap_xic_sus[index_col] = rosmap_ids_sus_merged
rosmap_xic_sus = rosmap_xic_sus.set_index(index_col)
rosmap_xic_sus = rosmap_xic_sus.sort_index().astype(float)

In [None]:
# Extract XICs for all features.
# One request is made per feature; the response is iterated as a list of
# per-file records from which the 'filename' and 'integral' fields are read.
# Generous upper bound (in seconds) so that a stalled connection fails
# instead of blocking the notebook forever.
xic_timeout = 600
for compound, precursor_mz, rt in tqdm.tqdm(rosmap_xic_sus.index):
    # Convert the ppm tolerance to an absolute m/z tolerance for this feature.
    tolerance_ppm_to_mz = mz_tol / 10**6 * precursor_mz
    # The RT bounds are divided by 60 (the RT tolerance is in seconds).
    req = requests.get(f'http://gnps-xic.ucsd.edu/api/integrate/'
                       f'MSV000086415?xictarget={precursor_mz}&'
                       f'mztolerance={tolerance_ppm_to_mz}&'
                       f'rtrange={(rt - rt_tol) / 60}-{(rt + rt_tol) / 60}',
                       timeout=xic_timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    req.raise_for_status()
    for file_xic in req.json():
        # Ignore files that are not part of the metadata.
        if file_xic['filename'] in rosmap_xic_sus.columns:
            rosmap_xic_sus.loc[(compound, precursor_mz, rt),
                               file_xic['filename']] = float(file_xic['integral'])
# Only keep the files for which every feature received a value.
rosmap_xic_sus = rosmap_xic_sus.dropna(axis=1)

In [None]:
# Test each feature for a difference in XIC between the two Alzheimer groups.
# Files become rows and features become columns labelled by compound name
# only (the precursor m/z and RT index levels are dropped).
rosmap_xic_t = rosmap_xic_sus.T.droplevel([1, 2], axis='columns')
pvalues = []
for compound in rosmap_xic_t:
    xic = rosmap_metadata.merge(
        rosmap_xic_t[compound].to_frame().rename(columns={compound: 'XIC'}),
        left_on='Filename', right_index=True)
    # Only use files with a positive XIC and without any missing value.
    xic = xic[xic['XIC'] > 0].dropna()
    xic_ad = xic.loc[xic['Alzheimer'] == 'Yes (CERAD ≤ 2)', 'XIC'].values
    xic_healthy = xic.loc[xic['Alzheimer'] == 'No  (CERAD ≥ 3)', 'XIC'].values
    if len(xic_ad) < 50 or len(xic_healthy) < 50:
        # Too few observations in a group: the feature is not tested, but it
        # still takes part in the multiple testing correction with p = 1.
        pvalue = 1
    else:
        # State the alternative explicitly: older SciPy releases default to
        # a deprecated mode that reports half of the two-sided p-value.
        _, pvalue = mannwhitneyu(xic_ad, xic_healthy,
                                 alternative='two-sided')
    pvalues.append((compound, pvalue))
pvalues = pd.DataFrame(pvalues, columns=['CompoundName', 'pValue'])
# Benjamini-Hochberg FDR correction; element [1] holds the corrected p-values.
pvalues['pValueCorr'] = multipletests(
    pvalues['pValue'], alpha=0.05, method='fdr_bh')[1]

In [None]:
# Long-format table with one row per (feature, file) pair, annotated with the
# file metadata; only positive XICs without missing values are kept.
rosmap_xic_plot = pd.merge(
    rosmap_xic_sus.reset_index().melt(id_vars='CompoundName',
                                      value_vars=rosmap_xic_sus.columns,
                                      var_name='Filename', value_name='XIC'),
    rosmap_metadata, on='Filename')
rosmap_xic_plot = rosmap_xic_plot[rosmap_xic_plot['XIC'] > 0].dropna()
# Names of the features with a corrected p-value below 0.05, most significant
# first. The filter is evaluated on the merged and sorted frame itself
# instead of on `pvalues`, so it does not depend on both frames happening to
# share the same row labels.
sign_compounds = (pd.merge(pvalues, (rosmap_xic_sus.index.to_frame()
                                     .reset_index(drop=True)),
                           on='CompoundName')
                  .sort_values(['pValueCorr'])
                  .loc[lambda df: df['pValueCorr'] < 0.05, 'CompoundName'])
rosmap_xic_sign = rosmap_xic_plot[rosmap_xic_plot['CompoundName']
                                  .isin(sign_compounds)]

In [None]:
# Show the features with a corrected p-value below 0.05 together with their
# precursor m/z and RT, most significant first. The filter is evaluated on
# the merged and sorted frame itself rather than on `pvalues`, so it does not
# rely on both frames sharing the same row labels.
(pd.merge(pvalues, (rosmap_xic_sus.index.to_frame().reset_index(drop=True)),
          on='CompoundName')
          .sort_values(['pValueCorr', 'pValue'])
          .loc[lambda df: df['pValueCorr'] < 0.05])

In [None]:
# Box plot with overlaid individual points of the XIC of each significant
# feature, split by Alzheimer group.
# The width is derived from the height by dividing by 1.618 (approximately
# the golden ratio).
height = 7
width = height / 1.618
fig, ax = plt.subplots(figsize=(width, height * 0.75))

hue_order = ['No  (CERAD ≥ 3)', 'Yes (CERAD ≤ 2)']
# Arguments shared by both plot layers so that they line up.
shared_kwargs = dict(data=rosmap_xic_sign, y='CompoundName', x='XIC',
                     hue='Alzheimer', order=sign_compounds,
                     hue_order=hue_order, ax=ax)
# Outlier markers are given size zero in the box plot; the strip plot draws
# the individual observations.
sns.boxplot(fliersize=0, **shared_kwargs)
sns.stripplot(dodge=True, edgecolor='black', linewidth=0.8, marker='.',
              **shared_kwargs)

ax.set_xscale('log')

ax.set_xlabel('Abundance (XIC)')
ax.set_ylabel(None)

for tick_label in ax.yaxis.get_ticklabels():
    tick_label.set_size('x-small')

# Build a single figure-level legend from the first two legend entries, using
# only the first word of each label, and hide the axes-level legend.
handles, legend_labels = ax.get_legend_handles_labels()
legend_labels = [text.split()[0] for text in legend_labels]
fig.legend(handles[:2], legend_labels[:2], loc='upper center',
           bbox_to_anchor=(0.5, 1), ncol=2, bbox_transform=fig.transFigure,
           title='Alzheimer')
ax.legend().set_visible(False)

sns.despine(ax=ax)

plt.savefig('acylcarnitine.png', dpi=300, bbox_inches='tight',
            facecolor='white')
plt.show()
plt.close()