In [1]:
# CompoundCalculator - Interpretation
# 20201105 Designed to look at the output from the matching routines and visualize/interpret them
# This is exploratory and lifted directly from the Calculator
    

In [2]:
import os
from statistics import median, stdev
# import pandas as pd
from collections import defaultdict, namedtuple
# import re

# look at deltas re multimers

In [3]:
# Stolen from Match.ipynb

# One peak from a peak list: mass, intensity and retention time, in that field order
Peak = namedtuple('Peak', ['Mass', 'Inten', 'RT'])

def values_from_line(line):
    """
    Split a line on whitespace and try to convert every field to a float.

    Returns a (success, vals) pair:
      - (True, list of floats) when every field converts (an empty or blank line gives (True, []))
      - (False, list of the original text fields) when any field is not numeric, e.g. a header line
    """

    parts = line.split()

    try:
        return True, [float(field) for field in parts]  # convert all to numbers
    except ValueError:
        # Only a failed numeric conversion is expected here; anything else
        # (e.g. KeyboardInterrupt) should propagate rather than be swallowed.
        return False, parts      # return the fields if the conversion fails
    
def read_peak_list(peak_file_path):
    """
    Reads a whitespace/tab-delimited text file generating a list of Peak tuples (Mass, Inten, RT).
    Mass must be present but the other fields are optional and will be stored internally as zero if absent.

    Column layout:
      - No header line: the first column is Mass and the second (if present) is Inten; RT is absent.
        A file whose first data line has only one column is treated as masses only.
      - Header line (first non-blank line that is not fully numeric): the columns are located by matching
        common labels, e.g. mz, m/z, Mass for masses. The RT column can only be used via a header
        containing 'RT' or 'rt'. If no intensity label is recognised, column 1 is used as the intensity
        unless it does not exist or is already the mass or RT column, in which case there is no intensity.

    Blank lines are skipped. Lines that cannot be converted to numbers, or that have too few columns,
    are printed and ignored.

    Returns (peaks, tic, base_peak_mass, base_peak_inten, has_rt) where peaks is sorted by mass, tic is
    the sum of the intensities and the base peak is the most intense peak. The intensity values are 0
    if there is no intensity column, and an empty file gives ([], 0, 0, 0, False).
    """
    mass_col = 0
    inten_col = 1
    rt_col = -1
    has_rt = False
    has_inten = True

    peaks = []    # list of (mass, inten, RT) tuples

    # read all the lines so we can process them one-by-one (the with block closes the file)
    with open(peak_file_path, 'r') as f:
        lines = f.readlines()

    # locate the first line with any content; an empty/blank file has no peaks
    first_index = next((i for i, text in enumerate(lines) if text.strip()), None)
    if first_index is None:
        return peaks, 0, 0, 0, has_rt

    start_line = first_index

    success, vals = values_from_line(lines[first_index])  # try to convert the first line

    if not success:   # couldn't get values; probably a header....vals is a list of the parts

        inten_found = False

        # see if we can figure out what the columns are...lists can be extended if needed
        for col_index, col in enumerate(vals):
            if col in ['mz', 'm/z', 'mass', 'Mass', 'Mass/Charge']: mass_col = col_index
            if col in ['Int', 'Inten', 'inten', 'Height']:
                inten_col = col_index
                inten_found = True
            if col in ['RT', 'rt']: rt_col = col_index

        has_rt = rt_col > -1

        # The default intensity column (1) is only a usable fallback when it exists
        # and has not already been identified as the mass or RT column.
        if not inten_found and (inten_col >= len(vals) or inten_col in (mass_col, rt_col)):
            inten_col = -1
        has_inten = inten_col > -1

        start_line = first_index + 1     # can skip this line

        print('m:', mass_col, 'Int:', inten_col, 'RT:', rt_col)
    else:
        # no header: a single-column file contains masses only
        has_inten = len(vals) > inten_col

    # highest column index a data line must provide
    last_col = max(mass_col, inten_col if has_inten else -1, rt_col if has_rt else -1)

    base_peak_mass, base_peak_inten = 0, 0

    # Process line by line. Lines that cannot be converted to numbers are reported
    # Note: the first line will be reprocessed if it is numeric
    for line in lines[start_line:]:

        if not line.strip():
            continue    # blank line (often a trailing newline at the end of the file)

        # get a list of numbers from the fields in the line - success will be false if this fails and vals will be the actual text parts
        success, vals = values_from_line(line)

        if success and len(vals) > last_col:
            mass = vals[mass_col]
            rt = vals[rt_col] if has_rt else 0
            inten = vals[inten_col] if has_inten else 0

            peaks.append(Peak(mass, inten, rt))

            if inten > base_peak_inten:
                base_peak_inten = inten
                base_peak_mass = mass

        else:
            print('Problem in line:', line, vals)   # vals will be the list of fields if there's a problem

    peaks = sorted(peaks, key=lambda x: x.Mass)     # ensure the list is sorted by mass

    tic = sum(p.Inten for p in peaks)   # 0 when there are no peaks or no intensity column

    # Note: intensity params will be 0 if there is no intensity column
    return peaks, tic, base_peak_mass, base_peak_inten, has_rt

In [4]:
# Name of the peak-list file and its full path under <root>Users/ronbonner/Data/SharedData/Test
data_file = 'S_4 MeOH FA pks 0.2 percent.txt'

data_path = os.path.join(
    os.sep + os.path.join('Users', 'ronbonner', 'Data', 'SharedData', 'Test'),
    data_file,
)

# Confirm the file can be found before trying to read it, then show the path used
print('Path exists:', os.path.exists(data_path))

print(data_path)

Path exists: True
/Users/ronbonner/Data/SharedData/Test/S_4 MeOH FA pks 0.2 percent.txt


In [5]:
# Load the peak list, then report the peak count, summed intensity (TIC) and base peak
peaks, raw_tic, base_peak_mass, base_peak_inten, has_rt = read_peak_list(data_path)

print(
    f'{len(peaks)} peaks, TIC {raw_tic:.3e}',
    f'Base peak {base_peak_mass:.4f}, {base_peak_inten:.3e}',
    sep='\n',
)

m: 0 Int: 2 RT: -1
2932 peaks, TIC 3.972e+05
Base peak 169.0455, 1.080e+05


In [12]:
# Target (label, mass) pairs searched for in the delta list, ordered by increasing mass.
# NOTE(review): labels read as "metal minus n H"; presumably each mass is the metal's
# mass less n hydrogen masses - confirm against the source of the table.
metals = [
    ('Be-2H',   6.99653),
    ('Mg-2H',  21.96939),
    ('Na-H',   21.98194),
    ('Al-3H',  23.95806),
    ('Ca-2H',  37.94694),
    ('K-H',    37.95588),
    ('Ti-2H',  45.93229),
    ('V-2H',   48.92831),
    ('Fe-3H',  52.91146),
    ('Mn-2H',  52.92239),
    ('Ni-2H',  55.91969),
    ('Co-2H',  56.91754),
    ('Cu-2H',  60.91395),
    ('Zn-2H',  61.91349),
    ('Ge-2H',  71.90553),
    ('Zr-4H',  85.87339),
    ('Sr-2H',  85.88996),
    ('Mo-3H',  94.88193),
    ('Ag-H',  105.89727),
    ('Cd-2H', 111.88771),
    ('Ba-2H', 135.88960),
    ('Tl-5H', 199.93530),
    ('Bi-3H', 205.95692),
    ('Pb-2H', 205.96100),
]


In [18]:
# monomer_mass: mass of the monomer used in the multimer search (original note: dimesa)
# proton: proton mass, subtracted from an observed m/z when computing deltas
monomer_mass, proton = 146.057909, 1.00727

In [22]:
# Need to do some work here....currently all deltas are stored
# could either search for each as found or remove those that are wildly out of range
def get_possible_ma_list(p_list, monomer_mass, multimer_limit, charge_limit):
    """
    Build every candidate (m-vH) delta for a list of peaks.

    For each peak, each monomer count cn in 1..multimer_limit and each charge z in 1..charge_limit,
    the delta is z*(peak mass - proton) - cn*monomer_mass.

    Returns a list of (delta, peak mass, z, peak intensity, cn) tuples sorted by delta.
    """

    proton = 1.00727

    candidates = [
        (charge * (peak.Mass - proton) - (n_monomers * monomer_mass), peak.Mass, charge, peak.Inten, n_monomers)
        for peak in p_list
        for n_monomers in range(1, multimer_limit + 1)    # 1..multimer_limit inclusive
        for charge in range(1, charge_limit + 1)          # 1..charge_limit inclusive
    ]

    candidates.sort(key=lambda entry: entry[0])

    return candidates

# Candidate deltas for every peak, allowing up to 3 monomers and up to 3 charges
deltas = get_possible_ma_list(peaks, monomer_mass, multimer_limit=3, charge_limit=3)

print(len(deltas))

26388


In [27]:
def find_delta(deltas, diff, error):
    """
    Search a list of delta entries for a target value within an absolute error window.

    deltas: sequence of entries whose first element is the delta value
    diff:   the target value
    error:  half-width of the window; both limits (diff-error, diff+error) are exclusive

    Returns the matching entries in their original order.
    """

    lower = diff - error
    upper = diff + error

    matches = []
    for entry in deltas:
        if lower < entry[0] < upper:
            matches.append(entry)

    return matches

# Search the delta list for each target mass, print the matches and collect them for the summaries.
#
# The result containers are created here so that the cell works on a fresh kernel and
# re-running it does not append a second copy of every match.
res_summary = []                # one (label, mass, n matches, summed inten, err range, median err, err stdev) per target
by_mass = defaultdict(list)     # observed mass -> [(label, mass, z, monomer_count, inten, err_in_mmu), ...]

# NOTE(review): this threshold was not defined anywhere in the visible notebook; 0 reports
# every match, which agrees with the saved output - confirm the intended value.
min_inten_to_report = 0

for label, mass in metals:

    res = find_delta(deltas, mass, 0.01)    # absolute window of +/- 0.01 around the target

    if not res: continue

    inten_sum = sum(r[3] for r in res)

    errors_mmu = [(r[0]-mass)*1000 for r in res]

    median_err = median(errors_mmu)

    err_range = max(errors_mmu) - min(errors_mmu)

    err_stdev = stdev(errors_mmu) if len(res) > 1 else 0    # stdev needs at least two values

    desc = f'{label} ({mass:.5f}), {len(res)} pks, tic {inten_sum:.1f}'
    desc += f', errors (mmu): range {err_range:.2f}. median {median_err:.2f}, st_dev {err_stdev:.2f}'
    print(desc)

    res_summary.append((label, mass, len(res), inten_sum, err_range, median_err, err_stdev))

    # list the individual matches in order of observed mass
    for r in sorted(res, key=lambda x: x[1]):
        delta, obs_mass, z, inten, monomer_count = r
        if inten < min_inten_to_report: continue
        err_in_mmu = (delta-mass)*1000
        print(f'{obs_mass:14.4f}, {monomer_count}M {z}+ {inten:8.1f} (m-vH)_calc: {delta:.5f} error: {err_in_mmu:.2f}')

        by_mass[obs_mass].append((label, mass, z, monomer_count, inten, err_in_mmu))

    #print()

Be-2H
Be-2H (6.99653), 1 pks, tic 34.3, errors (mmu): range 0.00. median -2.14, st_dev 0.00
      154.0596, 1M 1+     34.3 (m-vH)_calc: 6.99439 error: -2.14
Mg-2H
Mg-2H (21.96939), 3 pks, tic 13372.6, errors (mmu): range 15.20. median 1.28, st_dev 7.60
      231.0795, 3M 2+     19.7 (m-vH)_calc: 21.97067 error: 1.28
      315.1017, 2M 1+  12866.1 (m-vH)_calc: 21.97859 error: 9.20
      461.1444, 3M 1+    486.9 (m-vH)_calc: 21.96339 error: -6.00
Na-H
Na-H (21.98194), 2 pks, tic 120861.3, errors (mmu): range 1.77. median -2.47, st_dev 1.25
      169.0455, 1M 1+ 107995.3 (m-vH)_calc: 21.98036 error: -1.58
      315.1017, 2M 1+  12866.1 (m-vH)_calc: 21.97859 error: -3.35
Al-3H
Al-3H (23.95806), 3 pks, tic 45923.1, errors (mmu): range 1.19. median -3.86, st_dev 0.62
      171.0194, 1M 1+     41.9 (m-vH)_calc: 23.95420 error: -3.86
      317.0776, 2M 1+   3303.8 (m-vH)_calc: 23.95450 error: -3.56
      463.1343, 3M 1+  42577.4 (m-vH)_calc: 23.95331 error: -4.75
Ca-2H
Ca-2H (37.94694), 5 pks,

In [24]:
# Rank the per-target summaries by summed intensity (tuple index 3), largest first
summary = list(res_summary)
summary.sort(key=lambda row: row[3], reverse=True)

In [25]:

# Print one line per target: match count, summed intensity (and its share of the raw TIC) and error statistics
for s in summary:
    label, mass, res_count, tic, err_range, median_err, err_stdev = s
    # raw_tic is 0 when the peak list has no intensity column; avoid dividing by zero
    tic_percent = tic * 100 / raw_tic if raw_tic else 0.0
    print(f'{label:6}({mass:.3f})\t{res_count} matches, TIC {tic:.3e} ({tic_percent:.2f}%),'
          f'    errors (mmu): med{median_err:7.2f}, stdev {err_stdev:.2f}')

Na-H  (21.982)	2 matches, TIC 1.209e+05 (30.43%),    errors (mmu): med  -2.47, stdev 1.25
Al-3H (23.958)	3 matches, TIC 4.592e+04 (11.56%),    errors (mmu): med  -3.86, stdev 0.62
Ca-2H (37.947)	5 matches, TIC 2.793e+04 (7.03%),    errors (mmu): med  -4.72, stdev 2.13
Mg-2H (21.969)	3 matches, TIC 1.337e+04 (3.37%),    errors (mmu): med   1.28, stdev 7.60
Ba-2H (135.890)	6 matches, TIC 1.244e+04 (3.13%),    errors (mmu): med  -6.14, stdev 1.91
Fe-3H (52.911)	2 matches, TIC 6.244e+03 (1.57%),    errors (mmu): med  -4.93, stdev 0.83
Ti-2H (45.932)	4 matches, TIC 1.367e+03 (0.34%),    errors (mmu): med   1.71, stdev 5.65
Sr-2H (85.890)	5 matches, TIC 9.157e+02 (0.23%),    errors (mmu): med  -6.47, stdev 2.38
Ni-2H (55.920)	2 matches, TIC 4.914e+02 (0.12%),    errors (mmu): med  -4.23, stdev 1.49
Co-2H (56.918)	3 matches, TIC 2.869e+02 (0.07%),    errors (mmu): med  -1.59, stdev 3.54
Zr-4H (85.873)	3 matches, TIC 1.936e+02 (0.05%),    errors (mmu): med   7.05, stdev 3.19
Zn-2H (61.913)	1 m

In [26]:
# List the assignments grouped by observed mass; the observed mass is printed only on the
# first line of each group and later lines are padded to the same width.
spacer = ' ' * 13

for m in sorted(by_mass):
    for position, (label, mass, z, monomer_count, inten, err_in_mmu) in enumerate(by_mass[m]):
        lead = f'{m:12.5f}:' if position == 0 else spacer
        print(f'{lead}{mass:12.5f}: {label:6} {monomer_count}M {z}+ {inten:9.1f} {err_in_mmu:5.2f}')

   116.97410:    85.87339: Zr-4H  1M 2+      18.7  2.36
   141.97813:   135.88960: Ba-2H  1M 2+     196.0 -5.79
   154.05957:     6.99653: Be-2H  1M 1+      34.3 -2.14
   166.03533:    37.94694: Ca-2H  2M 2+     170.8 -6.63
                 56.91754: Co-2H  3M 3+     170.8 -7.07
   167.03161:   205.95692: Bi-3H  2M 3+      17.3  0.27
                205.96100: Pb-2H  2M 3+      17.3 -3.81
   169.04554:    21.98194: Na-H   1M 1+  107995.3 -1.58
   171.01938:    23.95806: Al-3H  1M 1+      41.9 -3.86
   171.53151:    48.92831: V-2H   2M 2+      25.4  4.35
   174.00233:   199.93530: Tl-5H  1M 2+      14.9 -3.09
   185.00996:    37.94694: Ca-2H  1M 1+    2279.0 -2.16
   190.00610:    85.87339: Zr-4H  2M 2+      42.2  8.45
                 85.88996: Sr-2H  2M 2+      42.2 -8.12
   193.00055:    45.93229: Ti-2H  1M 1+      11.8  3.08
   215.00674:   135.88960: Ba-2H  2M 2+     662.3 -6.48
   231.07947:    21.96939: Mg-2H  3M 2+      19.7  1.28
   232.95150:    85.88996: Sr-2H  1M 1+     132.