# MSMS Standards Evaluation Tool (MSMS-Set)

In [None]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import glob
#want to save pdf fonts? then do this:
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

#path to metatlas repo:
metatlas_dir = '/global/homes/t/tharwood/repos/metatlas'

from sklearn.linear_model import LinearRegression

import sys
sys.path.insert(0,metatlas_dir)
from metatlas.untargeted import tools as mzm

sys.path.insert(0,metatlas_dir)
from metatlas.io import feature_tools as ft

# import ray
# ray.init()

In [None]:
# Inputs: the MSMS-Chooser batch table and the experiment it belongs to.
chooser_output_dir = 'msms-chooser_outputs/20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602_full/output_batch.tsv'
experiment = '20220308_JGI-AK-TH_TN_507992_PlantStds_Set1_QE-HF_HILICZ_USHXG01602'

# s = os.path.join('tharwood/MSMS-Chooser_507992_PlantStds_Set1_QE-HF',experiment)

# Read the tab-separated table and lower-case every column name.
df_chooser = pd.read_csv(chooser_output_dir, sep='\t')
df_chooser.columns = df_chooser.columns.str.lower()
df_chooser = df_chooser.rename(columns={'filename': 'gnps_filename'})

# Merge key for the local files: last path component with .mzML swapped for .h5.
df_chooser['basename'] = [
    gnps_path.split('/')[-1].replace('.mzML', '.h5')
    for gnps_path in df_chooser['gnps_filename']
]

# Make the compound name adduct-specific: "<compound_name>-<adduct>".
adduct_suffix = '-' + df_chooser['adduct']
df_chooser['compound_name'] = df_chooser['compound_name'] + adduct_suffix
df_chooser.head()

In [None]:
raw_data_dir = '/global/cfs/cdirs/metatlas/raw_data/jgi/'

# List every .h5 file stored under this experiment's raw-data directory.
mydir = os.path.join(raw_data_dir, experiment)
files = glob.glob(os.path.join(mydir, '*.h5'))
print(len(files))

df_files = pd.DataFrame({'full_filename': files})
df_files['basename'] = df_files['full_filename'].map(os.path.basename)

# The run order is the last '_'-separated token of the file name with the
# '.h5' extension and the 'Run' prefix stripped.
run_tokens = [
    name.split('_')[-1].replace('.h5', '').replace('Run', '')
    for name in df_files['basename']
]
df_files['run_order'] = pd.Series(run_tokens, index=df_files.index).astype(int)
df_files = df_files.sort_values('run_order', ascending=True).reset_index(drop=True)

# Left merge keeps every MSMS-Chooser row; rows without a local file get NaN paths.
df_chooser = df_chooser.merge(df_files, on='basename', how='left')

In [None]:
# Number of MSMS-Chooser rows whose 'full_filename' is NaN, i.e. rows that
# found no matching local .h5 file in the left merge on 'basename'.
sum(df_chooser['full_filename'].isna())

In [None]:
# Display the resolved file path of the first row as a quick sanity check.
df_chooser.loc[0,'full_filename']

# pre-filter MSMS-Chooser output

Let's wait for this.

# translate scan number to retention time


In [None]:
# Build a (scan_number, rt, file) lookup table for every run referenced by
# df_chooser. For each file the distinct retention times of the MS1 and MS2
# tables are pooled and sorted; the position in that sorted list is used as
# the scan number.
# NOTE(review): this assumes every scan has a distinct rt and that the
# 'extractscan' values it is later matched against use the same 0-based
# numbering -- confirm against the MSMS-Chooser output.
scan_rt = []
# dropna(): rows that matched no local file carry NaN here, and
# os.path.isfile(NaN) raises TypeError instead of returning False.
for f in df_chooser['full_filename'].dropna().unique():
    if not os.path.isfile(f):
        print('no file: %s' % f)
        continue
    # The 10th '_'-separated token of the file name selects the HDF5 keys
    # (e.g. 'ms1_pos' / 'ms2_pos').
    polarity_key = os.path.basename(f).split('_')[9].lower()
    df = ft.df_container_from_metatlas_file(f, desired_key='ms1_%s' % polarity_key)
    df2 = ft.df_container_from_metatlas_file(f, desired_key='ms2_%s' % polarity_key)
    temp = df[['rt', 'i']].drop_duplicates('rt').reset_index(drop=True).copy()
    temp2 = df2[['rt', 'i']].drop_duplicates('rt').reset_index(drop=True).copy()
    temp = pd.concat([temp, temp2])
    temp.sort_values('rt', inplace=True)
    temp.reset_index(drop=True, inplace=True)
    # Promote the sorted position to an explicit 'scan_number' column.
    temp.index.name = 'scan_number'
    temp.reset_index(drop=False, inplace=True)
    temp = temp[['scan_number', 'rt']]
    temp['file'] = f
    scan_rt.append(temp)
scan_rt = pd.concat(scan_rt)

In [None]:
# Look up the retention time of each selected scan: match on (file, scan number),
# then discard the helper key columns contributed by scan_rt.
df_chooser = df_chooser.merge(
    scan_rt,
    how='left',
    left_on=['full_filename', 'extractscan'],
    right_on=['file', 'scan_number'],
)
df_chooser = df_chooser.drop(columns=['file', 'scan_number'])
df_chooser.head()

In [None]:
# fig,ax = plt.subplots(nrows=2,ncols=2,figsize=(18,18),sharex=True,sharey=True)
# ax = ax.flatten()
# counter = 0
# for g in scan_rt[:4]:
#     # print(g['file'].unique())
#     x = g[['scan_number']].values
#     y = g[['rt']].values
#     ax[counter].plot(x,y,'.',label='Data')
#     f = LinearRegression().fit(x, y)
#     slope = f.coef_[0][0]
#     intercept = f.intercept_[0]
#     y2 = x*slope + intercept
#     ax[counter].plot(x,y2,'-',label='Fit')
#     ax[counter].legend()
#     counter += 1
#     print(slope,intercept)
# plt.tight_layout()

In [None]:
# slope = 0.0020046951007204436 
# intercept = 0.17953672418209976
ppm_tolerance = 10.0
extra_time = 12
# Half-width of the retention-time window placed around rt_peak.
rt_window = 0.15

# One atlas row per distinct combination of the columns below.
cols = ['moleculemass', 'extractscan', 'inchi', 'charge',
       'ionmode', 'adduct','compound_name','full_filename','rt']
df_standards = df_chooser[cols].copy()
df_standards.drop_duplicates(inplace=True)
df_standards.rename(columns={'moleculemass':'mz','compound_name':'label','rt':'rt_peak'},inplace=True)

# NOTE THAT RT_PEAK IS ACTUALLY THE RT OF WHEN THE MSMS WAS COLLECTED NOT THE PEAK
# df_standards['rt_peak'] = df_standards['extractscan']*slope + intercept

df_standards['rt_min'] = df_standards['rt_peak'] - rt_window
df_standards['rt_max'] = df_standards['rt_peak'] + rt_window
df_standards['ppm_tolerance'] = ppm_tolerance
df_standards['extra_time'] = extra_time

# .copy() makes each polarity table an independent frame; assigning new
# columns to a boolean-mask slice triggers SettingWithCopyWarning and the
# write is not guaranteed to land on the object that is used afterwards.
df_standards_pos = df_standards[df_standards['ionmode']=='Positive'].copy()
df_standards_neg = df_standards[df_standards['ionmode']=='Negative'].copy()
df_standards_pos['polarity'] = 'positive'
df_standards_neg['polarity'] = 'negative'

# group_index comes from ft.group_consecutive applied to the m/z values with
# a ppm step equal to ppm_tolerance.
df_standards_pos['group_index'] = ft.group_consecutive(df_standards_pos['mz'].values,
                                         stepsize=ppm_tolerance,
                                         do_ppm=True)

df_standards_neg['group_index'] = ft.group_consecutive(df_standards_neg['mz'].values,
                                         stepsize=ppm_tolerance,
                                         do_ppm=True)



In [None]:
# polarity,mz,scan_num = df_chooser.loc[0,['ionmode','moleculemass','extractscan']]
# print(polarity,mz)
# fig,ax = plt.subplots()
# idx = abs(df['mz']-mz)<0.01
# ax.plot(df.loc[idx,'scan_number'],df.loc[idx,'i'],'.-')
# ax.set_xlim([scan_num-30,scan_num+30])
# ax.axvline(scan_num,color='k')
# plt.show()

In [None]:
# One job description per run file: the path, its run order, its polarity and
# the atlas of that polarity.
data_list = []
# dropna(): chooser rows without a matching local file have NaN here, and
# os.path.basename(NaN) raises TypeError.
for f in df_chooser['full_filename'].dropna().unique():
    name_tokens = os.path.basename(f).split('_')
    # The 10th token is the polarity tag; anything other than 'POS' is
    # handled as negative mode.
    is_positive = name_tokens[9] == 'POS'
    data_list.append({
        'lcmsrun': f,
        # Last token looks like 'Run<N>.h5'; keep N as an integer.
        'file_index': int(name_tokens[-1].replace('.h5', '').replace('Run', '')),
        'polarity': 'positive' if is_positive else 'negative',
        'atlas': df_standards_pos if is_positive else df_standards_neg,
    })



In [None]:
# @ray.remote
def unmap_vars(x):
    """Extract data for one run and keep only the rows for the compound it contains.

    Parameters
    ----------
    x : dict
        Job description. This function reads ``x['lcmsrun']`` (path of the run
        file) and ``x['file_index']`` (run order); the whole dict is also
        handed to ``ft.get_data``.

    Returns
    -------
    dict
        The object returned by ``ft.get_data`` with an added ``'file_info'``
        dict (``filename``, ``group``, ``sample_blank``, ``label``,
        ``run_order``) and with ``'ms1_data'``, ``'ms1_summary'`` and
        ``'ms2_data'`` reduced to rows whose ``label`` contains the run label.

    Raises
    ------
    IndexError
        If the file name has fewer than 15 '_'-separated fields.
    """
    d = ft.get_data(x, return_data=True, save_file=False)

    run_name = os.path.basename(x['lcmsrun'])
    name_tokens = run_name.split('_')
    group_str = name_tokens[12]
    sample_str = name_tokens[14]
    d['file_info'] = {
        'filename': run_name,
        'group': group_str,
        'sample_blank': sample_str,
        # Run label = group field plus the 2nd and 3rd '-'-separated pieces
        # of the sample field.
        'label': '%s-%s' % (group_str, '-'.join(sample_str.split('-')[1:3])),
        'run_order': x['file_index'],
    }

    # keep atlas entries for all adducts for a particular compound being run
    run_label = d['file_info']['label']
    for key in ('ms1_data', 'ms1_summary', 'ms2_data'):
        # regex=False: the label is literal text, not a pattern.
        # na=False: a missing label is a non-match rather than a NaN that
        # would make the boolean mask unusable.
        keep = d[key]['label'].str.contains(run_label, regex=False, na=False)
        d[key] = d[key][keep]
    return d


# results = pd.concat(results)
# results.head()

In [None]:
# Accumulator for the per-file outputs appended by the processing loop;
# re-run this cell first if that loop is executed again, otherwise entries
# are appended a second time.
results = []

In [None]:
# Process the runs one at a time, printing each path once it has finished.
for run_setup in data_list:
    results.append(unmap_vars(run_setup))
    print(run_setup['lcmsrun'])

In [None]:
# %%time
# futures = [unmap_vars.remote(x) for x in data_list]
# results = ray.get(futures)

In [None]:
# Number of per-file result entries collected so far.
len(results)

In [None]:
# Inspect the filtered MS1 table of the first processed file.
results[0]['ms1_data']

In [None]:
def get_closest_spectrum(rt_peak,rt_msms):
    """
    Return the positions in ``rt_msms`` that hold the retention time closest
    to ``rt_peak``.

    Parameters
    ----------
    rt_peak : float
        Reference retention time.
    rt_msms : array-like of float
        One retention time per MS/MS data point; a list, tuple or ndarray.
        The same value normally repeats for every point of one spectrum.

    Returns
    -------
    numpy.ndarray
        Integer positions of every element equal to the closest distinct
        retention time. Empty when ``rt_msms`` is empty. If two distinct
        times are equally close, the smaller one is used.
    """
    # Coerce first: comparing a plain list with a scalar gives a single False
    # rather than an element-wise mask, which silently produced no hits.
    rt_msms = np.asarray(rt_msms)
    if rt_msms.size == 0:
        # argmin would raise on an empty array; there is nothing to select.
        return np.array([], dtype=int)
    u_rt = np.unique(rt_msms)
    closest_rt = u_rt[np.argmin(np.abs(rt_peak - u_rt))]
    return np.argwhere(rt_msms == closest_rt).flatten()

out = []
# Each entry of `results` is one file. It holds the atlas hits that share the
# compound name and collision energy encoded in the file name, potentially
# one hit per adduct that was seen in other files.
for r in results:
    ms1_points = r['ms1_data']
    ms2_points = r['ms2_data']
    for _, row in r['ms1_summary'].iterrows():
        hit_label = row['label']

        # Chromatogram (EIC) of this hit, collapsed to a single row.
        eic = ft.group_duplicates(
            ms1_points.loc[ms1_points['label'] == hit_label, ['label', 'rt', 'i', 'in_feature']],
            'label',
            make_string=False,
        )

        # MS/MS points of this hit that are flagged as inside the feature.
        in_feature_hit = (ms2_points['label'] == hit_label) & (ms2_points['in_feature'] == True)
        msms = ms2_points[in_feature_hit].copy()
        if msms.shape[0] == 0:
            continue

        # Keep only the spectrum acquired closest to the MS1 peak time.
        msms = msms.reset_index(drop=True)
        nearest = get_closest_spectrum(row['rt_peak'], msms['rt'].values)
        msms = msms.loc[nearest]
        msms = ft.group_duplicates(msms[['label', 'mz', 'rt', 'i']], 'label', make_string=False)

        # One record: summary fields + '*_msms' spectrum fields + '*_eic' fields.
        out.append(pd.concat([
            row,
            msms.add_suffix('_msms').loc[0],
            eic.add_suffix('_eic').loc[0],
        ]))
# Records were built as Series (columns after concat); transpose to one row each.
out = pd.concat(out, axis=1).T
out



In [None]:
# Outer merge keeps chooser rows without a hit and hits without a chooser row.
temp = df_chooser.merge(out, how='outer', left_on='compound_name', right_on='label')
# Write the combined table without the index column.
temp.to_csv('diagnostic_peakheight_and_centroids.csv', index=False)

# Old GridSpec Subplot Stuff

In [None]:
# fig=plt.figure(figsize=(13,17))
#     gs=GridSpec(gg.shape[0],2)
#     ax1=fig.add_subplot(gs[:,0])
#     ax = []
#     for i in range(gs.nrows):
#         if i==0:
#             ax.append(fig.add_subplot(gs[i,1]))
#         else:
#             ax.append(fig.add_subplot(gs[i,1],sharex=ax[0]))

#     ax.insert(0,ax1)

In [None]:
g = temp.groupby('inchi')
df_g = [gg for _, gg in g]

from matplotlib.gridspec import GridSpec
outdir = '/global/homes/t/tharwood/msms_set/downloads/plant_standards_diagnostic_plots_hilic'


def _link_x(anchor, follower):
    """Put `follower` into the same shared-x group as `anchor`.

    `Axes.get_shared_x_axes().join(...)` was deprecated in Matplotlib 3.6 and
    removed in 3.8; `Axes.sharex` is the replacement. The old call is kept as
    a fallback for Matplotlib versions that predate `sharex`.
    """
    if hasattr(follower, 'sharex'):
        follower.sharex(anchor)
    else:
        anchor.get_shared_x_axes().join(anchor, follower)


# One figure per compound (InChI). Panel layout, top to bottom:
#   0: in-feature EIC points, linear y      1: same, log y
#   2: full EIC, linear y                   3: same, log y
#   4+: one MS/MS spectrum per plotted row
for gg in df_g:
    nrows = gg.shape[0]+4
    fig,ax = plt.subplots(nrows=nrows,ncols=1,figsize=(13,4*nrows))
    count = 0
    labels = []
    # Sort into a new frame: the group is a slice of `temp`, so an in-place
    # sort raises SettingWithCopyWarning.
    gg = gg.sort_values('peak_height',ascending=False)
    for _,row in gg.iterrows():
        # Rows without a hit carry NaN instead of arrays; nothing to draw.
        if not (isinstance(row['mz_msms'],np.ndarray) and isinstance(row['rt_eic'],np.ndarray)):
            continue
        x = row['rt_eic']
        y = row['i_eic']
        in_feature = row['in_feature_eic']
        idx = np.argsort(x)
        x = x[idx]
        y = y[idx]
        in_feature = in_feature[idx]
        # Output file name = label up to the first '-CE' (whole label if absent).
        labels.append(row['label'].partition('-CE')[0])
        idx_infeature = in_feature==True
        h = ax[0].plot(x[idx_infeature],y[idx_infeature],label=row['label'],linewidth=2)
        trace_color = h[-1].get_color()
        # Vertical band marks row['rt'] in each chromatogram panel.
        ax[0].axvline(row['rt'],linewidth=6,alpha=0.23,color=trace_color)

        ax[1].plot(x[idx_infeature],y[idx_infeature],color=trace_color,linewidth=2)
        ax[1].axvline(row['rt'],linewidth=6,alpha=0.23,color=trace_color)
        ax[1].set_yscale('log')

        ax[2].plot(x,y,color=trace_color,linewidth=2)
        ax[2].axvline(row['rt'],linewidth=6,alpha=0.23,)

        ax[3].plot(x,y,color=trace_color,linewidth=2)
        ax[3].axvline(row['rt'],linewidth=6,alpha=0.23,)
        ax[3].set_yscale('log')

        # MS/MS panel: grey band at 'moleculemass', brown band at 'exactmass',
        # black sticks for the fragment peaks; frame coloured like the trace.
        y = row['i_msms']
        ax[count+4].axvline(row['moleculemass'],color='grey',alpha=0.4,linewidth=6)
        ax[count+4].axvline(row['exactmass'],color='brown',alpha=0.4,linewidth=6)
        ax[count+4].vlines(row['mz_msms'],0*y,y,color='k',linewidth=3)
        for spine in ax[count+4].spines.values():
            spine.set_edgecolor(trace_color)
        count += 1
    if count>0: # plots were made!
        ax[0].legend()
        ax[0].get_xaxis().set_visible(False)
        _link_x(ax[0], ax[1])

        ax[2].get_xaxis().set_visible(False)
        # NOTE(review): this links panel 1 with panel 2, so panels 0-2 share
        # one x-range and panel 3 is independent. The hidden x-axis on panel 2
        # and the x-labels on panels 1 and 3 suggest pairs (0,1) and (2,3) may
        # have been intended -- grouping kept as it was; confirm.
        _link_x(ax[1], ax[2])
        # Chain all MS/MS panels together; only the last one shows its x-axis.
        for i in range(4, len(ax)-1):
            ax[i].get_xaxis().set_visible(False)
            _link_x(ax[i], ax[i+1])

        for a in ax:
            a.yaxis.get_offset_text().set_fontsize(14)
            for spine in a.spines.values():
                spine.set_linewidth(2)
        for a in ax[4:]:
            a.ticklabel_format(axis='y',style='sci', scilimits=(0,0))
            a.set_ylim(bottom=0)

        for a in ax:
            a.tick_params(axis='both',length=10, width=2, which='major', labelsize=14)
            a.tick_params(axis='both', which='minor', labelsize=14)
        plt.setp(ax[0].get_legend().get_texts(), fontsize=12) # for legend text
        ax[1].set_xlabel('Retention Time (min)',fontsize=20)
        ax[3].set_xlabel('Retention Time (min)',fontsize=20)
        for a in ax:
            a.set_ylabel('Intensity',fontsize=20)
        ax[-1].set_xlabel('m/z',fontsize=20)
        plt.tight_layout()
        # makedirs also creates missing parent directories and is a no-op
        # when the directory already exists.
        os.makedirs(outdir, exist_ok=True)
        # Distinct names in first-seen order; the same figure is written once
        # per name.
        for basename in dict.fromkeys(labels):
            print(basename)
            filename = '%s.pdf'%os.path.join(outdir,basename)
            fig.savefig(filename)
    # Release the figure on every iteration, including groups where nothing
    # was plotted; otherwise those figures stay open until the loop ends.
    fig.clear()
    plt.close(fig)
plt.close('all')