In [1]:
# 201105 Match
# Compares calculated ion list from CompoundCalculator with a peak list of m/z, inten (optionally RT)

# Step 5 - Matching

Compare the list we generated to an input peak list (assumed to contain mass and intensity columns).

## 5.1 Imports and function definitions

In [11]:
def get_match_stats(matches, peaks, tic):
    """Summarise how much of the spectrum is explained by the matches.

    matches -- iterable of tuples whose first element is a peak index
    peaks   -- sequence indexed by peak index; element [1] of each entry is the intensity
    tic     -- total intensity used as the denominator of the percentage

    Returns (set of unique matched peak indices, matched intensity as a percentage of tic).
    """
    # A peak may carry several matches, so collapse to unique indices before summing.
    matched_indices = {match[0] for match in matches}

    matched_inten = sum(peaks[index][1] for index in matched_indices)

    return matched_indices, matched_inten * 100 / tic

In [12]:
def read_peak_list(peak_file_path):
    """Read a whitespace-delimited peak list and return it sorted by mass.

    Every non-blank line must start with two numeric fields: mass, then
    intensity.  Any further fields on the line are ignored.  Blank lines
    (for example a trailing empty line) are skipped.

    Returns (peaks, tic, base_peak_inten):
      peaks           -- list of (mass, inten) float tuples, ascending by mass
      tic             -- sum of all intensities
      base_peak_inten -- largest intensity

    Raises ValueError if the file holds no peaks or a field is not numeric,
    and IndexError if a non-blank line has fewer than two fields.
    """
    peaks = []    # list of (mass, inten) tuples

    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(peak_file_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue    # blank line - nothing to parse
            peaks.append((float(parts[0]), float(parts[1])))

    if not peaks:
        # Fail with a clear message instead of an obscure unpacking error further down.
        raise ValueError(f'No peaks found in {peak_file_path}')

    peaks.sort(key=lambda x: x[0])     # ensure the list is sorted by mass

    intens = [inten for _, inten in peaks]

    return peaks, sum(intens), max(intens)    # peaks, tic, base_peak_inten

In [13]:
def match_as_str(m):
    """Format one match as a fixed-width report line.

    m is a (peak index, ion) pair; the ion must expose Mass, Root and Name.
    The peak's mass and intensity are looked up in the notebook-level `peaks` list.

    The line holds: peak index, peak mass, mass error in mmu (peak - ion),
    peak intensity, ion mass, ion root and ion name.
    """
    p_index, ion = m

    # Measured values for this peak come from the notebook-level peak list.
    pm, pi = peaks[p_index]

    # Mass error converted from amu to mmu.
    delta = 1000 * (pm - ion.Mass)

    return f'{p_index:5}:{pm:10.4f} ({delta:5.1f}) {pi:12.1f} {ion.Mass:10.4f}  {ion.Root:8}{ion.Name}'


## 5.2 Setup

In [14]:
# Options for the save step.
save_matches = True                 # write the matched peaks (as mass, inten, match name) to a file?
include_large_unmatched = True      # also write the larger unmatched peaks (by default > 1% base peak inten)?

# Underscore-joined first element of every entry in base_compounds.
compounds_as_string = '_'.join(c[0] for c in base_compounds)


In [15]:
# Set up file names and paths. os.path.join supplies the separator character,
# so the path is built the same way on every platform.
# NOTE(review): the directory itself is a hard-coded, user-specific absolute location.
calc_data = os.path.join(os.sep, 'Users', 'ronbonner', 'Data', 'Calculator')
peak_file = "201023 Erngren guanosine peaklist.txt"
peak_file_path = os.path.join(calc_data, peak_file)

# Load the peak list together with its summed and maximum intensity.
peaks, tic, base_peak_inten = read_peak_list(peak_file_path)

print(f'{len(peaks)}, peaks read. TIC {tic}, base peak inten {base_peak_inten}')
print(peak_file_path)

1862, peaks read. TIC 266939600.0, base peak inten 30100000.0
/Users/ronbonner/Data/Calculator/201023 Erngren guanosine peaklist.txt


## 5.3 Match ions

We first match the ions generated by the calculator. In a subsequent step we look specifically for the 13C forms of matched peaks.

**Note**: to repeat the match without changing the ion forms, run this cell and all below

In [26]:
import datetime 

# Timestamp of this matching run, truncated to whole seconds.
current_time = datetime.datetime.now().replace(microsecond=0)

ions = sorted(ion_forms, key = lambda x: x.Mass)   # sort ions by mass so ions and peaks can be walked in step

# peaks_matched counts every match appended below (it is not read again in this cell).
peak_index, ion_index, peaks_matched = 0, 0, 0
peak_half_window = 0.005   # an ion matches a peak when its mass lies within +/- this value of the peak mass
matches = []   # this is going to end up as a list of tuples : (peak index, matched composition)

# Walk the mass-sorted ions and peaks together, looking for matches within the specified window.
# The loop ends as soon as either list is exhausted.
while (ion_index < len(ions)) and (peak_index < len(peaks)):

    this_peak, this_ion = peaks[peak_index], ions[ion_index]
    low_peak, high_peak = this_peak[0]-peak_half_window, this_peak[0]+peak_half_window
  
    # Ion below the window of the current peak: move on to the next ion.
    if this_ion.Mass < low_peak:
        ion_index += 1
        continue

    # Ion above the window of the current peak: move on to the next peak.
    if this_ion.Mass > high_peak:
        peak_index += 1
        continue

    # The ion lies inside the current peak's window: save peak index and ion composition.
    # Since there may be more than one peak that matches this ion, we look ahead at the following peaks
    # using a separate index, so the current peak is still available for the next ion.
    # peaks_matched is incremented once per saved match.

    matches.append((peak_index, this_ion))    # reference to peak and this composition
    peaks_matched += 1   
    
    look_ahead = peak_index + 1
 
    # look ahead at the peaks while they're still within the search window and add any matches to the list
    while (look_ahead < len(peaks)):
                
        look_ahead_peak = peaks[look_ahead]
        
        # First peak that is more than the half window above the ion ends the look-ahead.
        if(look_ahead_peak[0] - this_ion.Mass) > peak_half_window:
            break
            
        matches.append((look_ahead, this_ion))
        look_ahead +=1
        peaks_matched += 1 


    ion_index += 1 # increment ion index but not peak_index - there may be more than one ion within the window..

# Unique matched peak indices and the percentage of the TIC they account for.
matched_indices, percent_tic_matched = get_match_stats(matches, peaks, tic)

matched_indices = sorted(matched_indices)   # the set becomes an ascending list here
# Summary line kept in a variable as well as printed.
initial_matches = f'{len(matched_indices)} peaks matched ({percent_tic_matched:.1f}% tic), {len(matches)} total matches from {len(peaks)} peaks'
print(initial_matches)

107 peaks matched (45.2% tic), 107 total matches from 1862 peaks


In [17]:
# print( len(matched_indices), len(matches))
# print(matches)

In [18]:
# Look for C13 isotopes of matched peaks.
# For every matched peak, scan the higher-mass peaks for a peak at +1, +2, ... times the
# 13C spacing; the series for a peak is abandoned at the first isotope that is not found.

c13_matches = []

last_matched_mass = 0     # not used below in this cell
c13_half_window = 0.005   # an isotope peak must lie within +/- this value of the expected mass
max_C13_count = 4         # largest number of 13C looked for per matched peak
last_peak_index = -1      # index handled in the previous iteration, used to skip repeats

# for peak_index in list(matched_indices):    #only need to look at each peak once
# Sorting by peak index puts multiple matches of the same peak next to each other.
for peak_index, m in sorted(matches, key=lambda x: x[0]):    #only need to look at each peak once
    
    # Same peak as the previous iteration: its isotopes have already been searched.
    if peak_index == last_peak_index:
#         print('  dup', peak_index, peaks[peak_index], m)
        continue

    last_peak_index = peak_index
        
    m_mass, _ = peaks[peak_index]     # don't need inten
    
    next_peak_index = peak_index      # incremented before use, so the scan starts at the next higher peak
        
    keep_going = True
    
    for c13_count in range(1, max_C13_count+1):  #look for 1,2,3... C13
    
        # NOTE(review): 1.004 is used as the spacing per 13C; the 13C-12C mass difference is
        # about 1.00335, so this looks like a rounded value - confirm it is intended.
        c13_mass = m_mass + (c13_count * 1.004)   #expected c13 mass
        c13_name = f'{m_mass:.4f}(+{c13_count})'  # name is the monoisotopic peak mass plus the 13C count
        # Composition is defined outside this cell; it is given the name, 1, the expected mass and
        # the Root of the matched ion - TODO confirm the meaning of the second argument.
        c13_comp = Composition(c13_name, 1, c13_mass, m.Root)  
        
        # Step through the following peaks; the position is kept between isotope counts,
        # so the search for the next isotope resumes after the peak just matched.
        while next_peak_index < len(peaks) - 1:
            
            next_peak_index += 1  # point at next value in list
                
            next_peak_mass, next_peak_inten = peaks[next_peak_index]
            
            # Already past the upper edge of the window: this isotope has no peak.
            if next_peak_mass > (c13_mass + c13_half_window):
                keep_going = False       # when one isotope is not matched we abort and stop looking for more
                break
                
            # Above the lower edge (and not past the upper edge): inside the window, record it.
            if next_peak_mass > (c13_mass - c13_half_window):
                c13_matches.append((next_peak_index, c13_comp))
                break
        
        if not keep_going:
            break     # leave 13c for loop   

# for c13m in c13_matches:
#     print(c13m)

# matches is extended in place: running this cell again without re-running the matching
# cell would append the 13C matches a second time.
matches += c13_matches

matches = sorted(matches, key = lambda x: x[0])  # sort by peak index...

# Recompute the statistics now that the isotope matches are included.
matched_indices, percent_tic_matched = get_match_stats(matches, peaks, tic)

after_13c_match = f'{len(matched_indices)} peaks matched ({percent_tic_matched:.1f}% tic), {len(matches)} total matches from {len(peaks)} peaks'
print(after_13c_match)

164 peaks matched (50.6% tic), 164 total matches from 1862 peaks


In [19]:
def print_match_list(matches, simplify=False):
    """Print the matches grouped by peak index, followed by one blank line.

    matches  -- iterable of (peak index, ion) pairs; each ion must expose Name
    simplify -- when true, print one line per peak: the match with the shortest
                ion name, suffixed with ' [1/N]' if the peak has N > 1 matches.
                When false, print every match of every peak.

    Matches of the same peak are ordered by the length of the ion name.
    """
    by_peak = groupby(sorted(matches, key=lambda x: x[0]), lambda x: x[0])

    for _, members in by_peak:
        candidates = list(members)
        if len(candidates) > 1:
            # Shortest name first; the stable sort keeps the input order among equal lengths.
            candidates.sort(key=lambda x: len(x[1].Name))

        if not simplify:
            for candidate in candidates:
                print(match_as_str(candidate))
            continue

        line = match_as_str(candidates[0])
        if len(candidates) > 1:
            line += f' [1/{len(candidates)}]'
        print(line)

    print()

# Notebook-level flag; print_match_list does not read it (it takes its own simplify argument).
simplify = True

In [20]:
# Full listing: every match of every matched peak is printed (no collapsing to one line per peak).
print_match_list(matches, simplify=False)

    8:   90.9762 ( -0.4)     239000.0    90.9766  CH2O2   CH2O2.(Na-H)2.H+
   25:  106.9501 ( -0.5)     387000.0   106.9506  CH2O2   CH2O2.Na-H.K-H.H+
   28:  108.9479 ( -0.8)      33100.0   108.9487  CH2O2   CH2O2.Na-H.K*H.H+
   51:  122.9241 ( -0.4)    1880000.0   122.9245  CH2O2   CH2O2.(K-H)2.H+
   55:  123.9271 ( -1.0)      21500.0   123.9281  CH2O2   122.9241(+1)
   58:  124.9222 ( -0.4)     277000.0   124.9226  CH2O2   CH2O2.K-H.K*H.H+
  127:  158.9633 ( -0.8)     233000.0   158.9641  CH2O2   (CH2O2)2.(Na-H)3.H+
  157:  174.9370 ( -1.0)     276000.0   174.9380  CH2O2   (CH2O2)2.(Na-H)2.K-H.H+
  165:  176.9361 ( -0.0)      25300.0   176.9361  CH2O2   (CH2O2)2.(Na-H)2.K*H.H+
  192:  190.9112 ( -0.7)     498000.0   190.9119  CH2O2   (CH2O2)2.Na-H.(K-H)2.H+
  198:  192.9095 ( -0.6)      80900.0   192.9101  CH2O2   (CH2O2)2.Na-H.K-H.K*H.H+
  225:  206.8851 ( -0.8)     369000.0   206.8859  CH2O2   (CH2O2)2.(K-H)3.H+
  229:  208.8837 ( -0.3)      88700.0   208.8840  CH2O2   (CH2O2)2.(K

In [21]:
def get_unmatched_indices(matched_indices, peaks, threshold):
    """Return the indices of unmatched peaks whose intensity is at least threshold.

    matched_indices -- iterable of the peak indices that have a match (set, list, ...)
    peaks           -- sequence of peak tuples; element [1] is the intensity
    threshold       -- minimum intensity (inclusive) for a peak to be reported

    Returns a list of peak indices in ascending order.
    """
    # Build a set once so every membership test is O(1), even when a list is passed in.
    matched = set(matched_indices)

    return [i for i, peak in enumerate(peaks) if i not in matched and peak[1] >= threshold]

In [36]:
threshold_percent = 1    # report unmatched peaks at or above this percentage of the base peak intensity

bpi_percent_thresh = threshold_percent * base_peak_inten / 100    # the same threshold as an absolute intensity

unmatched_indices = get_unmatched_indices(matched_indices, peaks, bpi_percent_thresh)  # get the unmatched peaks >= 1% base peak

# Mass and intensity of the most intense unmatched peak; both stay 0 if no peak is listed.
unmatched_mass, largest_unmatched_inten = 0,0

for pi in unmatched_indices:
    m,inten = peaks[pi]
    percent_base_peak = inten * 100/ base_peak_inten
    # One line per unmatched peak: mass, intensity, intensity relative to the base peak.
    print(f'{m:10.4f} {inten:10.0f} {percent_base_peak:8.2f}% base peak')
    
    # Keep the first peak that has the highest intensity seen so far.
    if inten > largest_unmatched_inten:
        unmatched_mass, largest_unmatched_inten = m,inten

large_inten_rel = largest_unmatched_inten * 100/ base_peak_inten    # largest unmatched intensity as % of base peak

# Summary line kept in a variable as well as printed.
largest_unmatched_string = f'Largest unmatched {unmatched_mass}, {largest_unmatched_inten} {large_inten_rel:.1f}% base'

print(largest_unmatched_string)

  102.1272    1480000     4.92% base peak
  103.9550     385000     1.28% base peak
  118.0857     561000     1.86% base peak
  118.1218     523000     1.74% base peak
  135.0296    1900000     6.31% base peak
  141.1127     320000     1.06% base peak
  152.0562   18800000    62.46% base peak
  153.0578    1330000     4.42% base peak
  155.1286     883000     2.93% base peak
  156.0415     601000     2.00% base peak
  174.0380    1650000     5.48% base peak
  190.0117     351000     1.17% base peak
  196.0197     457000     1.52% base peak
  217.1040     351000     1.17% base peak
  230.2476     943000     3.13% base peak
  267.1693     630000     2.09% base peak
  273.1669     724000     2.41% base peak
  277.1044     493000     1.64% base peak
  279.0946     369000     1.23% base peak
  285.1298     325000     1.08% base peak
  288.2896     379000     1.26% base peak
  289.1430     371000     1.23% base peak
  301.1055     562000     1.87% base peak
  307.0837    1790000     5.95% ba

In [41]:
def save_matches_to_file(out_path, matches_to_save, peaks):
    """Write one tab-separated line per match: peak mass, peak intensity, ion name.

    out_path        -- path of the text file to create (overwritten if it exists)
    matches_to_save -- iterable of (peak index, ion) pairs; each ion must expose Name
    peaks           -- sequence of (mass, inten) tuples indexed by peak index

    Returns the number of lines written.
    """
    lines_written = 0

    # The file is opened in text mode, which already converts '\n' to the platform's
    # line ending; writing os.linesep here would produce '\r\r\n' on Windows.
    # The with-block closes the file, so no explicit close() is needed.
    with open(out_path, 'w') as f:

        for peak_index, matched_ion in matches_to_save:

            pm, pi = peaks[peak_index]    # peak mass and intensity

            f.write(f'{pm:.4f}\t{pi:.1f}\t{matched_ion.Name}\n')
            lines_written += 1

    # Count what this call wrote rather than reading a notebook-level variable.
    return lines_written


In [44]:

if save_matches:

    out_path, _ = os.path.splitext(peak_file_path)    # path without extension

    # Output name: peak-file base name, the compound names and the time stamp of the matching run.
    # NOTE(review): str(current_time) contains ':' characters, which are not allowed in
    # Windows file names - confirm this only needs to run on macOS/Linux.
    out_path = f'{out_path} {compounds_as_string} matches {current_time}.txt'
    
    to_save = matches
    
    if include_large_unmatched:
        # Each unmatched peak is paired with a placeholder Composition named 'None'.
        to_save = matches + [(i, Composition('None',1, 0.1)) for i in unmatched_indices]  # append a list of the unmatched ions as empty matches
    
    to_save = sorted(to_save, key = lambda x: x[0])  # was x[1] peak mass, now x[0], peak index

    lines_written_count = save_matches_to_file(out_path, to_save, peaks)
    
    print(out_path)
    print(lines_written_count, 'written')

/Users/ronbonner/Data/Calculator/201023 Erngren guanosine peaklist Guan matches 2020-11-05 06:29:29.txt
165 written


In [24]:
# summarize matches

def limits_as_string(limits):
    """Describe the limits whose second element is greater than zero.

    limits -- iterable of entries whose element [1] is numeric

    Returns the string form of every entry with element [1] > 0, joined by
    commas, or an empty string when there is no such entry.
    """
    return ",".join(f'{entry}' for entry in limits if entry[1] > 0)

# Run header: time stamp and the compounds with their masses.
print(current_time)

print('Compounds:', ';'.join(f'{c}, {m:.4f}' for (c, m) in base_compounds))

# Calculator settings; optional items are only reported when they are switched on.
if multimer_limit > 1:
    print(f'Up to {multimer_limit} multimers')
if include_hetero_dimers:
    print('Include heterodimers')
print(f'{ionization} mode')

# Each group of limits is reported only when it has at least one non-zero entry.
desc = limits_as_string(phase1_limits)
if desc:
    print(f'Phase 1: {desc}')

desc = limits_as_string(phase2_limits)
if desc:
    print(f'Phase 2: {desc}')

desc = limits_as_string(adduct_limits)
if desc:
    print(f'Adducts: {desc}, max count = {max_adduct_count}')

desc = limits_as_string(loss_limits)
if desc:
    print(f'Losses: {desc}')

if include_adducts_as_compounds:
    desc = (
        f'{len(adducts_to_add)} adducts with'
        f' mass < {max_adduct_as_compound_mass}'
        f' and {adduct_as_compound_must_have} in name included'
    )
    print(desc)

print(len(ion_forms), 'ion forms')
print()

# Peak list and matching parameters.
print(f'{peak_file_path}')
print(f'{len(peaks)} peaks, matched with half window {peak_half_window} amu')
print(f'Looking for <= {max_C13_count} 13C isotopes with half window {c13_half_window}')

# Match results before and after the 13C step, then the unmatched summary.
print(initial_matches)
print('After 13C match', after_13c_match)

print(f'{len(unmatched_indices)} unmatched peaks gt {threshold_percent}%, {largest_unmatched_string}')

2020-11-05 06:24:30
Compounds: Guan, 283.0917
Up to 3 multimers
Include heterodimers
positive mode
Adducts: ('CH2O2', 4),('Na-H', 6),('K-H', 6),('K*H', 2), max count = 10
364 adducts with mass < 800 and CH2O2 in name included
1840 ion forms

/Users/ronbonner/Data/Calculator/201023 Erngren guanosine peaklist.txt
1862 peaks, matched with half window 0.005 amu
Looking for <= 4 13C isotopes with half window 0.005
107 peaks matched (45.2% tic), 107 total matches from 1862 peaks
After 13C match 164 peaks matched (50.6% tic), 164 total matches from 1862 peaks
53 unmatched peaks gt 1%, Largest unmatched 152.0562, 18800000.0 62.5% base
