## 04 - Understanding the Results

Let's take a look at the outputs generated in part 03. First, we again need to load the libraries we'll use.

In [2]:
'''
To import pyhxexpress and test_config from a directory other than the 
current working directory, we need to add the location of those files to the system path.

In this case, I am working in the Documentation folder; pyHXexpress and the data are both one level up.
'''

import sys
import os
import pathlib

# Resolve the folder one level above the current working directory (where
# pyHXexpress lives) and the example-data folder beside it, then make both
# importable. Both variables are kept as plain strings.
parent_dir = pathlib.Path(os.getcwd()).parent
hxex_path = str(parent_dir)
data_path = os.path.join(hxex_path, 'Bimodal_HDX_Data')
for extra_path in (hxex_path, data_path):
    # Only append when missing so re-running this cell does not keep
    # growing sys.path with duplicate entries.
    if extra_path not in sys.path:
        sys.path.append(extra_path)

import numpy as np, pandas as pd
import importlib
# Remove pandas' display limits so wide frames and long cell contents
# (e.g. the Fit_Params strings) are shown in full.
for option_name in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

import pyhxexpress.hxex as hxex
import test_config as config

def hxex_reload():
    """Reload pyhxexpress.hxex and test_config, then re-attach the config.

    Both modules are re-executed via importlib.reload (hxex first, then
    config); afterwards hxex.config is pointed at the config module so the
    library uses this notebook's settings.
    """
    for module in (hxex, config):
        importlib.reload(module)
    hxex.config = config

# Run once now so hxex.config is set to the test_config module imported above.
hxex_reload()

Let's read in the outputs we generated in part 03. These are the metadf_asrun, data_fits, and fitparamsAll files.<p>
We didn't save the raw and peak-picked data to separate csv files, but we can quickly gather those using the 'get_data()' function.

In [17]:
def _load_output(csv_name):
    """Read one saved CSV from config.Output_DIR and drop its 'Index' column."""
    csv_path = os.path.join(config.Output_DIR, csv_name)
    return pd.read_csv(csv_path).drop(columns='Index')

# Outputs written in part 03 (file names carry that run's date stamp).
metadf_run = _load_output('metadf_asrun_15Apr2024.csv')
datafits = _load_output('data_fits15Apr2024.csv')
fitparams = _load_output('fitparamsAll_asrun_15Apr2024.csv')

# The peak-picked and raw data were not saved to CSV, so gather them again
# from the files listed in metadf_run.
deutdata, rawdata = hxex.get_data(metadf_run)

In [11]:
# Files that were run, with the timepoints found to be polymodal summarized in the 'polymodal' column
display(metadf_run)

Unnamed: 0,file,sample,start_seq,end_seq,peptide_range,charge,peptide,polymodal,dataset_run
0,Angio_2_HI.xlsx,AngioII_HI,1,8,0001-0008,2,DRVYIHPF,60 180 240 300 360 420 480 540 600 660 720 780 840 900 960 1020 1080 1140 1200 1260,Yes
1,GluFib_2_HI.xlsx,GluFib_HI,1,14,0001-0014,2,EGVNDNEEGFFSAR,240 300 360 420 480 540 600 660 720 780 840 900 960 1020 1080 1140 1200 1260,Yes


In [12]:
'''
This is the peak-picked data for every peptide/timepoint/charge/replicate corresponding to the files listed in metadf_run.
Additional columns may include 'env_width', 'env_symm', and 'TD_env_width'; these will be discussed further
in an advanced topics tutorial. They are 'features' from the unfit peak-picked data that I have used to train an ML model
for predicting whether one or more populations are present.
'''
display(deutdata)

Unnamed: 0,mz,Intensity,n_deut,env_width,env_symm,max_namides,time,data_id,sample,peptide,charge,rep,peptide_range,start_seq,end_seq,file,time_idx
0,523.774534,574000.0,0,2.568142,2.0,6,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
1,524.276211,352400.0,1,2.568142,2.0,6,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
2,524.777889,103200.0,2,2.568142,2.0,6,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
3,525.280297,21230.0,3,2.568142,2.0,6,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
4,525.782705,4580.0,4,2.568142,2.0,6,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,793.384753,31890.0,15,15.138171,1.1,13,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21
726,793.887892,11210.0,16,15.138171,1.1,13,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21
727,794.391030,3765.0,17,15.138171,1.1,13,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21
728,794.894169,1515.0,18,15.138171,1.1,13,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21


In [13]:
# Raw spectral data (mz vs. Intensity) for each file listed in metadf_run
display(rawdata)

Unnamed: 0,index,mz,Intensity,time,data_id,sample,peptide,charge,rep,peptide_range,start_seq,end_seq,file,time_idx
0,0,523.435,277.00,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
1,1,523.445,236.80,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
2,2,523.455,164.80,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
3,3,523.465,126.00,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
4,4,523.475,107.70,0.0,0,AngioII_HI,DRVYIHPF,2,1,0001-0008,1,8,Angio_2_HI.xlsx,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31954,799,795.230,108.30,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21
31955,800,795.243,86.75,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21
31956,801,795.255,62.75,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21
31957,802,795.267,68.00,1260.0,1,GluFib_HI,EGVNDNEEGFFSAR,2,1,0001-0014,1,14,GluFib_2_HI.xlsx,21


In [14]:
'''
This dataframe contains the values we're usually most interested in, such as Dabs_i, pop_i, and the centroids.
Dabs_i is the TD-UN corrected deuterium uptake, with a corresponding population pop_i.

The p-value for any fit_pops > min_pops should be less than the specified 'Ncurve_p_accept' value. When
fit_pops = min_pops the p-value is reported as 1.0, meaning that adding populations did not improve the fit
enough to pass the p-value test.
'''
display(datafits)

Unnamed: 0,data_id,sample,peptide,peptide_range,start_seq,end_seq,charge,time,time_idx,rep,centroid,env_width,env_symm,max_namides,UN_TD_corr,fit_pops,p-value,centroid_1,Dabs_1,Dabs_std_1,pop_1,pop_std_1,centroid_2,Dabs_2,Dabs_std_2,pop_2,pop_std_2,centroid_3,Dabs_3,Dabs_std_3,pop_3,pop_std_3,solution_npops
0,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,0,0,1,524.089187,2.568142,2.0,6,0.998076,1,1.0,524.097421,0.00121,0.002597,1.0,0.0,,,,,,,,,,,1
1,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,60,1,1,524.914705,5.455503,1.251208,6,0.998076,2,1.755999e-05,524.579653,0.937311,0.185094,0.372437,0.224799,525.106505,2.094926,0.406878,0.627563,0.224799,,,,,,1
2,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,120,2,1,525.673768,6.604516,1.1,6,0.998076,1,1.0,525.669472,3.141915,0.025602,1.0,0.0,,,,,,,,,,,1
3,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,180,3,1,526.46316,6.505287,1.1,6,0.998076,2,0.000911323,526.307572,3.84335,0.418237,0.172044,0.186133,526.479448,4.887078,0.403899,0.827956,0.186133,,,,,,1
4,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,240,4,1,525.442186,8.882579,2.0,6,0.998076,2,6.028915e-10,524.113171,0.032427,0.015051,0.541973,0.009328,526.987554,5.73631,0.062991,0.458027,0.009328,,,,,,2
5,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,300,5,1,524.468802,4.656814,2.0,6,0.998076,2,1.322471e-09,524.147822,0.098293,0.061848,0.596614,0.056376,524.91847,1.69477,0.165017,0.403386,0.056376,,,,,,2
6,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,360,6,1,524.855596,6.485148,2.0,6,0.998076,2,2.312817e-12,524.138327,0.070334,0.029699,0.54763,0.015435,525.716244,3.221905,0.067759,0.45237,0.015435,,,,,,2
7,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,420,7,1,525.166911,7.807363,2.0,6,0.998076,2,6.838968e-10,524.112965,0.028802,0.018787,0.534899,0.009965,526.35922,4.518993,0.056029,0.465101,0.009965,,,,,,2
8,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,480,8,1,525.940605,9.390881,1.24685,6,0.998076,2,5.173639e-14,524.882038,1.585012,0.068917,0.492128,0.015388,526.921423,5.636786,0.06197,0.507872,0.015388,,,,,,2
9,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,540,9,1,525.335343,6.846569,1.157764,6,0.998076,2,1.058869e-08,524.534347,0.916561,0.275635,0.201926,0.092254,525.488404,2.828565,0.115392,0.798074,0.092254,,,,,,2


In [15]:
'''
The fitparams dataframe contains all of the fit variables determined during the full run. Each fit spectrum has
fit_pops + Nboot entries. Consider the first fit spectrum: the first row is the n=1 fit and the second row is
the n=2 fit. The n=2 fit did not sufficiently reduce the residual sum of squares (rss), i.e. it failed the p-value
test for adding another population, so the Nboot fits are performed with n=1 population.
'''
display(fitparams)

Unnamed: 0,data_id,sample,peptide,peptide_range,start_seq,end_seq,charge,time,time_idx,rep,ncurves,nboot,rss,Fit_Params,solution_npops,p-value
0,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,0,0,1,1,0,0.000240,0.005248293817372022 0.006212793309226296 1.5043220389690426e-13 0.9999999999,1,1.0
1,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,0,0,1,2,0,0.000240,0.0052482921220392215 5.058496763284646e-10 2.007019488251341e-07 2.34048760037862e-10 2.205235731210712e-09 0.9998422983882923 0.9999999980008685,1,1.0
2,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,0,0,1,1,1,0.000021,0.0031182892067406717 2.7813906152341382e-06 1.5052454614255506e-05 1.0,1,1.0
3,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,0,0,1,1,2,0.000021,0.00741689730941397 7.984697672087362e-09 0.0033748187076754737 1.0,1,1.0
4,2,AngioII_HI,DRVYIHPF,0001-0008,1,8,2,0,0,1,1,3,0.000022,0.0038748301032982867 0.3791551748821007 0.009766879604335393 1.0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,5,GluFib_HI,EGVNDNEEGFFSAR,0001-0014,1,14,2,1000000,22,1,1,16,0.000001,2.2721483766869113e-12 13.713903859940148 0.8675585769794234 1.0,1,1.0
1020,5,GluFib_HI,EGVNDNEEGFFSAR,0001-0014,1,14,2,1000000,22,1,1,17,0.000009,0.008966830546923196 13.882357936511912 0.8547220930982877 1.0,1,1.0
1021,5,GluFib_HI,EGVNDNEEGFFSAR,0001-0014,1,14,2,1000000,22,1,1,18,0.000006,0.0013641315711673586 13.592670958027625 0.8755815473790861 1.0,1,1.0
1022,5,GluFib_HI,EGVNDNEEGFFSAR,0001-0014,1,14,2,1000000,22,1,1,19,0.000001,7.521754239115574e-15 13.716731974152749 0.868444944601406 1.0,1,1.0
