# Prospector - preliminary
---------

Provides a way to look for possible adducts of a known compound with different multimer counts and charges states.

Given a monomer mass and ranges of monomer counts and charge states, calculates an 'efective adduct mass' for each ion in a peak list and compares them to a list of known values to determine possible unusual adducts. Putative adduct ions are grouped together and summarized

Note: these calculations do not include combinations of adducts and are most uesful for determing possible adducts to be used with the CompoundCalculator (which can look at combinations). Including zero in the list of effective adduct masses allows the software to find protonated forms too.
    

# Imports and function definitions
---------

In [1]:
import os

from statistics import median, stdev
from collections import namedtuple, defaultdict


In [2]:
# Code to read a peak list
# This is from the Match notebook so the same peak lists can be read

Peak = namedtuple('Peak', 'Mass Inten RT')

def values_from_line(line):
    """
    Split a line into parts and try to convert them to numbers...return the list of numbers or fields
    Returns True if all fields were numbers else False and the list of fields
    """
    
    parts = line.split()
    
    try:
        vals = [float(field) for field in parts]  # convert all to numbers
        success = True
    except:
        vals = parts      # return the fields if the conversion fails
        success = False
    
    return success, vals
    
def read_peak_list(peak_file_path):
    """
    Reads a tab-delimited text file generating a list of Peak tuples (Mass, Inten, RT). Mass must be present but the other fields are optional
    and will be stored internally as zero if absent.
    If the file has only one column it is assumed to contain masses otherwise the code assumes that the first column is Mass,
    the second is Inten and the RT is absent.
    If the file contains a header line it is used to define the order of the columns by looking for matches with common labels
    e.g. mz, m/z, Mass, etc. for masses. The RT column can only be used via a header line containi 'RT' or 'rt'
    """
    mass_col = 0
    inten_col = 1
    rt_col = -1
    has_rt = False
    has_inten = True
    start_line= 0
    
    peaks = []    #list of (mass, inten, RT) tuples

    # read all the lines so we can process them one-by-one
    with open(peak_file_path, 'r') as f:  
    
        lines = f.readlines()
        
        f.close()
    
    success, vals = values_from_line(lines[0])  # try to convert the first line
    
    if not success:   # couldn't get values; probably a header....vals is a list of the parts

        # see if we can figure out what the columns are...lsist can be extende if needed
        for col_index, col in enumerate(vals):
            if col in ['mz', 'm/z', 'mass', 'Mass', 'Mass/Charge']: mass_col = col_index
            if col in ['Int', 'Inten', 'inten', 'Height']: inten_col = col_index
            if col in ['RT', 'rt']: rt_col = col_index

        has_rt = rt_col > -1
        has_inten = inten_col > -1
            
        start_line = 1     # can skip this line

        print('m:', mass_col, 'Int:', inten_col, 'RT:', rt_col)

    base_peak_mass, base_peak_inten = 0,0
    
    # Process line by line. Lines that cannot be converted to numbers are reported 
    # Note: the first line will be reprocessed if it is numeric
    for line in lines[start_line:]:        

        # get a list of numbers from the fields in the line - success will be false if this fails and vals will be the actual text parts
        success, vals = values_from_line(line)  
               
        if success:
            mass = vals[mass_col]
            rt = vals[rt_col] if has_rt else 0
            inten = vals[inten_col] if has_inten else 0
            
            p = Peak(mass, inten, rt)
              
            peaks.append(p)
            
            if inten > base_peak_inten:
                base_peak_inten = inten
                base_peak_mass = mass
            
        else:
            print('Problem in line:', line, vals)   # vals will be the list of fields if there's a problem
    
    peaks = sorted(peaks, key = lambda x: x.Mass)     # ensure the list is sorted by mass

    masses, intens, rts = zip(*peaks)

    # Note: intensity params will be 0 if there is no intensity column
    return peaks, sum(intens), base_peak_mass, base_peak_inten, has_rt    # peaks, tic, base_peak_inten, sum rts

In [3]:
def get_possible_ma_list(p_list, monomer_mass, multimer_limit, charge_limit, low_mass_limit, high_mass_limit):
    """
    Given a monomer mass, a limit for the largest multimer, and a maximum charge values, calculates
    the 'effective adduct mass' for each entry in the peak list provided. Retains values where the mass is
    positive and less than mass_limit.
    
    The lower and upper mass_limits are usually derived from the lightest and heaviest adducts and the
    maximum allowed error. We could skip the cases where z = cn > 1, but it's good to have a reminder
    tocheck for multiply charged ions.
    
    Returns a list of (effective adduct mass, peak mass, charge, peak intensity, moonomer count) tuples
    """
    proton = 1.00727
    
    effective_adduct_masses = []    # list of potential (m-vH)

    for p in p_list:
        for cn in range(1,multimer_limit+1):         # monomer range + 1
            for z in range(1,charge_limit+1):        # charge range + 1
                        
                d = z*(p.Mass - proton) - (cn*monomer_mass)
                
                if low_mass_limit < d < high_mass_limit:
                    effective_adduct_masses.append((d, p.Mass, z, p.Inten, cn))

    effective_adduct_masses = sorted(effective_adduct_masses,key= lambda x: x[0])

    return effective_adduct_masses

# Setup
-----

## Define peak file path and read peak list

In [4]:
data_path = os.sep + os.path.join('Users','ronbonner','Data', 'SharedData', 'Test')

data_file = 'S_4 MeOH FA pks 0.2 percent.txt'

data_path = os.path.join(data_path, data_file)

if not os.path.exists(data_path):
    print('No such path:', data_path)

print(data_path)

peaks, raw_tic, base_peak_mass, base_peak_inten, has_rt = read_peak_list(data_path)

print(f'{len(peaks)} peaks, TIC {raw_tic:.3e}')
print(f'Base peak {base_peak_mass:.4f}, {base_peak_inten:.3e}')

/Users/ronbonner/Data/SharedData/Test/S_4 MeOH FA pks 0.2 percent.txt
m: 0 Int: 2 RT: -1
2932 peaks, TIC 3.972e+05
Base peak 169.0455, 1.080e+05


## Define list of target adducts

This is a list of (label, effective adduct mass) tuples

In [36]:
metals =[
    ('H', 0),            # to find MH+, 2MH+, etc.
    ('Be-2H',6.99653),
    ('Mg-2H',21.96939),
    ('Na-H',21.98194),
    ('Al-3H',23.95806),
    ('Ca-2H',37.94694),
    ('K-H',37.95588),
    ('Ti-2H',45.93229),
    ('V-2H',48.92831),
    ('Fe-3H',52.91146),
    ('Fe-2H',52.91146+1.0078246),
    ('Mn-2H',52.92239),
    ('Ni-2H',55.91969),
    ('Co-2H',56.91754),
    ('Cu-2H',60.91395),
    ('Zn-2H',61.91349),
    ('Ge-2H',71.90553),
    ('Zr-4H',85.87339),
    ('Sr-2H',85.88996),
    ('Mo-3H',94.88193),
    ('Ag-H',105.89727),
    ('Cd-2H',111.88771),
    ('Ba-2H',135.88960),
    ('Tl-5H',199.93530),
    ('Bi-3H',205.95692),
    ('Pb-2H',205.96100)
]

# Find the heaviest so we can set an upper mass limit
heaviest = max(metals, key=lambda x:x[1])
lightest = min(metals, key=lambda x:x[1])

print(heaviest, lightest)

('Pb-2H', 205.961) ('H', 0)


## Parameters

In [37]:
monomer_mass = 146.057909 #dimesa

max_multimer = 5
max_charge = 3
min_inten_to_include = 20   # for each match
min_count_to_include = 2    # for one adduct
max_error_in_amu = 0.01

# Process and report
------

Calculate a list of (effective adduct mass, peak mass, charge, peak intensity, moonomer count) tuples for each peak in the peak list. The mass limits are determined from the heaviest and lightest and the maximum allowed error

In [38]:

low, high = lightest[1] - max_error_in_amu, heaviest[1] + max_error_in_amu
ma_list = get_possible_ma_list(peaks, monomer_mass, max_multimer, max_charge, low, high)

print(len(ma_list), 'possible Ma' )

# results summary
res_summary = []    

# a dictionary of putative ma's for each mass in the peak list that has at least one
by_mass = defaultdict(list) 

for label, eam in metals:    # for each (label, efective adduct mass)

    # get a list of the putative ma's that are in range of this eam value and filterby intensity
    res = [d for d in ma_list if (eam-max_error_in_amu) < d[0] < (eam+max_error_in_amu)]
    
    res = [d for d in res if d[3] > min_inten_to_include]
    
    if len(res) < min_count_to_include: continue    #not enough

    inten_sum = sum([r[3] for r in res])

    # get errors and stats
    errors_mmu = [(r[0]-eam)*1000 for r in res]   # matching list of errors in mmu

    median_err = median(errors_mmu)
    err_range = max(errors_mmu) - min(errors_mmu)
    err_stdev = stdev(errors_mmu) if len(res) > 1 else 0

    # generate and print a one line summary and add values to the results summary
    desc = f'{label} ({eam:.5f}), {len(res)} pks, tic {inten_sum:.1f}'
    desc += f', errors (mmu): range {err_range:.2f}. median {median_err:.2f}, st_dev {err_stdev:.2f}'
    print(desc)

    res_summary.append((label, eam, len(res), inten_sum, err_range, median_err, err_stdev))

    # print the details for each putative assignment and add to our by_mass peak summary
    for r in sorted(res, key=lambda x: x[1]):
        delta, obs_mass, z, inten, monomer_count = r
        err_in_mmu = (delta-eam)*1000
        print(f'{obs_mass:12.4f}, {monomer_count}M {z}+ {inten:8.1f} (m-vH)_calc: {delta:.5f} error: {err_in_mmu:.2f}')

        by_mass[obs_mass].append((label, eam, z, monomer_count, inten, err_in_mmu))


8459 possible Ma
H (0.00000), 5 pks, tic 4715.6, errors (mmu): range 5.95. median -5.95, st_dev 2.48
    147.0622, 3M 3+   1505.1 (m-vH)_calc: -0.00893 error: -8.93
    147.0622, 2M 2+   1505.1 (m-vH)_calc: -0.00595 error: -5.95
    147.0622, 1M 1+   1505.1 (m-vH)_calc: -0.00298 error: -2.98
    293.1200, 4M 2+    100.2 (m-vH)_calc: -0.00617 error: -6.17
    293.1200, 2M 1+    100.2 (m-vH)_calc: -0.00309 error: -3.09
Mg-2H (21.96939), 4 pks, tic 13443.5, errors (mmu): range 17.15. median -4.88, st_dev 7.74
    315.1017, 2M 1+  12866.1 (m-vH)_calc: 21.97859 error: 9.20
    377.1349, 5M 2+     56.4 (m-vH)_calc: 21.96563 error: -3.76
    461.1444, 3M 1+    486.9 (m-vH)_calc: 21.96339 error: -6.00
    607.2003, 4M 1+     34.2 (m-vH)_calc: 21.96143 error: -7.96
Na-H (21.98194), 2 pks, tic 120861.3, errors (mmu): range 1.77. median -2.47, st_dev 1.25
    169.0455, 1M 1+ 107995.3 (m-vH)_calc: 21.98036 error: -1.58
    315.1017, 2M 1+  12866.1 (m-vH)_calc: 21.97859 error: -3.35
Al-3H (23.95806

In [39]:
# print a summary of the results for each possible adduct
summary = sorted(res_summary, key=lambda x: x[3], reverse=True)

for s in summary:
    label, eam, res_count, tic, err_range, median_err, err_stdev = s
    tic_percent = tic * 100 / raw_tic
    
    desc = f'{label:6}({eam:.3f})\t{res_count} matches, TIC {tic:.3e} ({tic_percent:5.2f}%)'
    desc += f'. Errors (mmu): range {err_range:0.2f}, med{median_err:7.2f}, stdev {err_stdev:.2f}'
    print(desc)


Na-H  (21.982)	2 matches, TIC 1.209e+05 (30.43%). Errors (mmu): range 1.77, med  -2.47, stdev 1.25
Al-3H (23.958)	3 matches, TIC 4.592e+04 (11.56%). Errors (mmu): range 1.19, med  -3.86, stdev 0.62
Ca-2H (37.947)	6 matches, TIC 2.944e+04 ( 7.41%). Errors (mmu): range 5.42, med  -5.67, stdev 2.10
Mg-2H (21.969)	4 matches, TIC 1.344e+04 ( 3.38%). Errors (mmu): range 17.15, med  -4.88, stdev 7.74
Ba-2H (135.890)	7 matches, TIC 1.277e+04 ( 3.21%). Errors (mmu): range 5.64, med  -6.48, stdev 2.00
Fe-3H (52.911)	2 matches, TIC 6.244e+03 ( 1.57%). Errors (mmu): range 1.18, med  -4.93, stdev 0.83
H     (0.000)	5 matches, TIC 4.716e+03 ( 1.19%). Errors (mmu): range 5.95, med  -5.95, stdev 2.48
Fe-2H (53.919)	5 matches, TIC 3.531e+03 ( 0.89%). Errors (mmu): range 5.38, med  -5.05, stdev 2.30
Ti-2H (45.932)	2 matches, TIC 1.337e+03 ( 0.34%). Errors (mmu): range 0.01, med   1.71, stdev 0.01
Sr-2H (85.890)	6 matches, TIC 1.008e+03 ( 0.25%). Errors (mmu): range 5.88, med  -7.29, stdev 2.26
Ni-2H (55

In [40]:
# print the matches organized by mass...this highlights possible conflicts
spacer = ' ' * 13

for m in sorted(by_mass.keys()):
    first = True
    for item in by_mass[m]:
        label, mass, z, monomer_count, inten, err_in_mmu = item  #unpack
        if first:
            desc = f'{m:12.5f}:{mass:12.5f}: {label:6} {monomer_count}M {z}+ {inten:9.1f} {err_in_mmu:5.2f}'
        else:
            desc = f'{spacer}{mass:12.5f}: {label:6} {monomer_count}M {z}+ {inten:9.1f} {err_in_mmu:5.2f}'
        print(desc)
        first = False

   141.97813:   135.88960: Ba-2H  1M 2+     196.0 -5.79
   147.06220:     0.00000: H      3M 3+    1505.1 -8.93
                  0.00000: H      2M 2+    1505.1 -5.95
                  0.00000: H      1M 1+    1505.1 -2.98
   166.03533:    37.94694: Ca-2H  2M 2+     170.8 -6.63
                 56.91754: Co-2H  3M 3+     170.8 -7.07
   169.04554:    21.98194: Na-H   1M 1+  107995.3 -1.58
   171.01938:    23.95806: Al-3H  1M 1+      41.9 -3.86
   171.53151:    48.92831: V-2H   2M 2+      25.4  4.35
   185.00996:    37.94694: Ca-2H  1M 1+    2279.0 -2.16
   190.00610:    85.87339: Zr-4H  2M 2+      42.2  8.45
                 85.88996: Sr-2H  2M 2+      42.2 -8.12
   200.98081:    53.91928: Fe-2H  1M 1+      64.7 -3.66
   212.05857:    48.92831: V-2H   4M 3+      20.4 -6.06
   215.00674:   135.88960: Ba-2H  2M 2+     662.3 -6.48
   232.95150:    85.88996: Sr-2H  1M 1+     132.0 -3.64
   239.06382:    37.94694: Ca-2H  3M 2+     490.8 -7.57
   263.03435:    85.87339: Zr-4H  3M 2+     132.