# CompoundCalulator 20210705

Given a list of base compound names and molecular weights, this notebook calculates possible derivatives (e.g. metabolites) and adds adducts and losses, resulting in a list of possible ion masses with labels. These target ion lists are used with the Match notebook to explain peaks in peak lists and are part of Multi-layered Analysis (MLA).
In MLA, a peak list is matched to a target ion list; matches are visualized/verified and the residual spectrum is matched against a new target list modified by adding more compounds or adducts. These are added so that combinations with earlier targets are generated.

In [1]:
from collections import defaultdict
from itertools import groupby
import datetime
import os
import re

# Class and function definitions
------------------------------
The basic entity is a 'Composition'. NB: this is not an elemental composition but simply a text label, a count, a root name and a mass. When compositions are combined, the labels are concatenated (using a specified separator character) and the masses are added. The root name is used to track compounds and can be updated following modification.

In [2]:
from dataclasses import dataclass

@dataclass
class Composition:
    """
    A labelled mass used to build target ion lists.

    NB. this is not an elemental composition: it is simply a text label (Name), a count,
    a mass and a root name. Combining two compositions concatenates the labels (with a
    separator) and adds the masses; the root is carried along so derived compositions
    can be traced back to the compound they came from.
    """
    Name: str = ""
    Count: int = 1
    Mass: float = -1
    Root: str = ""       # the composition this one is based on - for tracking

    # Mass of a proton; used whenever a composition is charged
    PROTON_MASS = 1.00727

    # For adducts the mass values are the 'Effective Adduct Mass'
    # The dictionary can be changed with: Composition.Mods = {new dictionary}
    Mods = {
        'OH': 15.99492,
        'COOH': 29.97418,     # COOH is CH3->COOH, i.e. +O2, -H2
        'Gluc': 176.032088,
        'Sulphate': 79.956815,
        'Hex': 180.0633,
        'C6H10O5': 162.052823,

        'H2O': -18.010565,    # losses are negative
        'CO2': -43.989829,
        'CO': -27.994915,
        'HCOOH': -46.005479,
        'HCl': -35.976678,
        'H2': -2.015650,
        'Rib': -132.0425,

        'C2H4O2': 60.021129,  # neutrals can be added directly
        'CH2O2': 46.005479,
        'CHO2': 44.997654,
        'NH3': 17.026549,     # 'Effective adduct masses'
        'Na-H': 21.981944,

        'K-H': 37.955881,
        'K*H': 39.9540,       # 41K - H
        'Ca-2H': 37.946941,
        'Ba-2H': 135.889597,
        'Fe-3H': 52.910913,
        'Fe-2H': 53.919286,
        # 'Al-3H' used to be listed twice; only the later value (23.95806) ever took effect,
        # so that is the one kept here, at the position of the first entry
        'Al-3H': 23.95806,
        'Be-2H': 6.99653,
        'Mg-2H': 21.96939,
        'Ti-2H': 45.93229,
        'V-2H': 48.92831,
        'Mn-2H': 52.92239,
        'Ni-2H': 55.91969,
        'Co-2H': 56.91754,
        'Cu-2H': 60.91395,
        'Zn-2H': 61.91349,
        'Ge-2H': 71.90553,
        'Sr-2H': 85.88996,
        'Zr-4H': 85.87339,
        'Mo-3H': 94.88193,
        'Ag-H': 105.89727,
        'Cd-2H': 111.88771,
        'Tl-5H': 199.93530,
        'Pb-2H': 205.96100,
        'Bi-3H': 205.95692,
    }

    def __init__(self, name, count, mass=None, root=None):
        """
        Create a composition.

        name:  the label; also the key into Mods when no mass is supplied
        count: a multiplier - it is shown in the label as '(name)count' when it is not 1
        mass:  the total mass; if None, the mass is Mods[name] * count
        root:  the root name; defaults to name when empty or None

        Raises KeyError if no mass is given and name is not in Mods.
        """
        self.Name = f'{name}' if count == 1 else f'({name}){count}'

        self.Count = 1    # there's only one of these even if the 'count' (really a multiplier) is greater

        # Test against None rather than truthiness: a mass of exactly 0 is legitimate
        # (e.g. an addition and a loss that cancel) and must not trigger a Mods lookup
        if mass is not None:
            self.Mass = mass
        else:
            try:
                self.Mass = self.Mods[name] * count
            except KeyError:
                raise KeyError(f"no mass given for '{name}' and it is not in Composition.Mods") from None

        self.Root = root if root else name

    # Make the Composition from a (Name, Count) tuple
    @classmethod
    def from_tuple(cls, t):
        """Return a composition for a (name, count) tuple; the mass comes from Mods."""
        return cls(t[0], t[1])

    # make a composition from a list of (Name,Count) tuples
    @classmethod
    def from_tuple_list(cls, t_list):
        """
        Return the '.'-joined combination of every (name, count) tuple in t_list.
        The root is that of the first entry. Returns None for an empty list.
        """
        comp = None

        for t in t_list:
            part = cls.from_tuple(t)
            # the first entry seeds the result; later ones are appended to it
            comp = part if comp is None else comp.add_comp(part, sep='.')

        return comp

    @classmethod
    def proton(cls):
        """Return a composition representing a single proton."""
        return cls('H+', 1, cls.PROTON_MASS)

    # Some basic sanity checks...
    @classmethod
    def test(cls):
        """Print a few example compositions so the results can be checked by eye."""
        print('Proton: ', Composition.proton())

        a = Composition('Na-H', 2)
        print('Normal init:', a)

        b = Composition.from_tuple(('K-H', 2))
        print('From tuple:', b)

        ab = a.add_comp(b, sep='.')
        print('From merge:', ab)

        t_list = [('Na-H', 2), ('K-H', 2), ('NH3', 1)]
        abc = Composition.from_tuple_list(t_list)
        print('From tuple list:', abc)

    # prints the current list of available modifications
    @classmethod
    def get_mods_as_strings(cls):
        """Print each modification label and its mass, one per line."""
        for label, mod_mass in cls.Mods.items():
            print(label, mod_mass)

    def get_proton_comp(self, z):
        """Return a composition for z protons, labelled 'H+' or '<z>H+'."""
        name = 'H+' if z == 1 else f'{z}H+'
        return Composition(name, 1, self.PROTON_MASS * z)

    def protonate(self):
        """Return a new composition with one proton added (label suffix '.H+')."""
        return self.add_comp(Composition('H+', 1, self.PROTON_MASS), sep='.')

    def deprotonate(self):
        """Return a new composition with one proton removed (label suffix '.[-H+]-')."""
        return self.add_comp(Composition('[-H+]-', 1, -self.PROTON_MASS), sep='.')

    def make_copy(self, mult=1):
        """
        Return a copy with the mass multiplied by mult; the root is kept.
        When mult is not 1 the label becomes '(Name)mult'.
        """
        return Composition(self.Name, self.Count * mult, self.Mass * mult, self.Root)

    def label(self):
        """Return the text label."""
        return self.Name

    # Merge two compositions to generate a new one with a new mass
    def add_comp(self, comp1, sep='_', z=1):
        """
        Return a new composition whose label is self + sep + comp1 and whose mass is the
        sum of the two masses divided by z. The root of self is kept.
        """
        new_name = self.label() + sep + comp1.label()
        new_mass = (self.Mass + comp1.Mass) / z
        return Composition(new_name, 1, root=self.Root, mass=new_mass)

# Run the sanity checks; each one prints a composition so the values can be checked by eye
Composition.test()
# Composition.get_mods_as_strings()   # uncomment to print the available modifications and their masses

Proton:  Composition(Name='H+', Count=1, Mass=1.00727, Root='H+')
Normal init: Composition(Name='(Na-H)2', Count=1, Mass=43.963888, Root='Na-H')
From tuple: Composition(Name='(K-H)2', Count=1, Mass=75.911762, Root='K-H')
From merge: Composition(Name='(Na-H)2.(K-H)2', Count=1, Mass=119.87565, Root='Na-H')
From tuple list: Composition(Name='(Na-H)2.(K-H)2.NH3', Count=1, Mass=136.902199, Root='Na-H')


In [25]:
# recursive routine to find combinations
def get_combs(maxima, item_count, pos, seed, take, res):
    """
    Recursively enumerate the ways of taking exactly `take` items from the entries at
    index `pos` onwards, never taking more than maxima[i] of entry i.

    A combination is written as a list of integers, one count per entry, e.g. [1,0,0] or
    [0,1,0]. `seed` is the working list that is filled in as we go: this level tries every
    count for entry `pos`, from the largest allowed down to zero, and hands whatever is
    still needed to the next entry. E.g. if we are to take 5 and 3 are used for this
    entry, 2 are passed on. A copy of the seed is appended to `res` each time the
    requested total is reached exactly.

    maxima:     the maximum count allowed for each entry
    item_count: the number of entries
    pos:        index of the entry handled at this level
    seed:       working list of counts (modified in place)
    take:       how many items are still to be taken
    res:        output list that collects the completed combinations
    """
    if take == 0:
        return

    following = pos + 1

    # start from the most this entry can supply and work down to none
    for amount in range(min(take, maxima[pos]), -1, -1):

        # reset this entry and everything after it, then record the current count
        seed[pos:] = [0] * (len(seed) - pos)
        seed[pos] = amount

        remaining = take - amount

        if remaining == 0:
            # the total is reached, so keep a copy of the current seed
            res.append(seed[:])
        elif following == item_count:
            # no entries are left to supply the rest, and smaller counts only need more
            break
        else:
            get_combs(maxima, item_count, following, seed, remaining, res)

def get_comps_as_str(cleaned_list):
    """
    Process a list of (adduct, count) limits to generate a simple string of adducts and counts.

    Entries with a zero count are skipped. Hydrogen losses are removed from each adduct
    name - these are identified by "-H" where the "H" can be followed or preceded by a
    number, n. Note: the minus sign is required since an adduct may itself contain H atoms,
    e.g. HCOOH or C2H3O2. An entry that is nothing but a hydrogen loss contributes nothing.
    If the last char of the name is a digit (as in C2H3O2) and the count is > 1, an 'x' is
    placed between the adduct and the count.

    Returns the concatenated string, in the order of cleaned_list.
    """
    parts = []

    for adduct, count in cleaned_list:

        if not count:
            continue

        # strip "-H" together with an optional number before or after the H
        label = re.sub(r"-\d*H\d*", "", adduct)

        if not label:
            # nothing is left of the name, so there is no last character to inspect
            continue

        if count == 1:
            parts.append(label)
        elif label[-1].isdigit():
            parts.append(f'{label}x{count}')
        else:
            parts.append(f'{label}{count}')

    return "".join(parts)
    
    
def make_combinations(limit_list, max_combinations):
    """
    Generate every combination of 1, 2, 3...max_combinations compositions that respects
    the per-composition limits.

    limit_list:       a list of (composition, limit) tuples; a composition may appear more
                      than once, in which case its limits are added together, and entries
                      whose limit is not positive are ignored
    max_combinations: the largest total number of items in one combination

    Returns (combs, summary):
      combs   - a list of combinations, each a list of (composition, count) tuples
      summary - a string summarizing the cleaned limits, generated by removing hydrogen
                losses from the names (only where there is a minus sign); it follows the
                order in which the compositions are encountered and is not sorted further

    If no composition has a positive limit the result is ([], "").
    The cleaned limit list is printed so it can be reviewed.
    """
    # first make sure the compositions are unique and the limits are non-zero; this is
    # needed because the user may specify the same composition more than once, which
    # would otherwise be treated as a separate limit
    cleaned = defaultdict(int)

    for comp, limit in limit_list:
        if limit > 0:
            cleaned[comp] += limit

    clean_list = list(cleaned.items())
    print(clean_list)

    # nothing is enabled, so there is nothing to combine (and zip(*[]) cannot be unpacked)
    if not clean_list:
        return [], ""

    comps, limits = zip(*clean_list)

    # each combination is found as a list holding the count for the composition at that index
    item_count = len(limits)
    seed = [0] * item_count
    res = []

    # take 1, 2, 3...max_combinations items, accumulating the count lists in res
    for take in range(1, max_combinations + 1):
        get_combs(limits, item_count, 0, seed, take, res)

    # finally pair the compositions with each list of counts, i.e. [('x',2), ('y',3)] etc.,
    # leaving out the compositions that are not used
    combs = [[(comp, n) for comp, n in zip(comps, counts) if n > 0] for counts in res]

    return combs, get_comps_as_str(clean_list)

# Test code for make_combinations
# Note: x is deliberately present twice, so its limits (2 and 1) are added to give 3
combs, comps_as_str = make_combinations([('x-2H', 2), ('y-H', 2), ('CH2O2', 2), ('q-H3',2), ('x-2H',1)], 3)

# the expected values are printed alongside the results so they can be compared by eye
print(len(combs), 'should be 31')
print(comps_as_str, 'should be x3y2CH2O2x2q2')
# to view the individual combinations:
# for c in combs:
#     print(c)

[('x-2H', 3), ('y-H', 2), ('CH2O2', 2), ('q-H3', 2)]
31 should be 31
x3y2CH2O2x2q2 should be x3y2CH2O2x2q2


In [4]:
def add_mods(compounds, limits, sep='_', update_root=False):
    """
    Add modifications to each compound in the list and return the extended list.

    compounds:   a list of Compositions; the new compounds are appended to this same list
    limits:      a list of (modification, max count) tuples; every count from 1 to the
                 maximum is generated for every compound
    sep:         the separator placed between the compound label and the modification
    update_root: by default the root is not updated, so it stays the same as the original
                 compound; if True the root is changed to the name of the new compound.
                 This allows the root to reflect the compounds at a different level,
                 e.g. after phase 1
    """
    derived = []

    # copy each base compound and attach 1..max of every modification to the copy
    for base in compounds:
        for limit in limits:
            mod_name = limit[0]
            for n in range(1, limit[1] + 1):
                modified = base.make_copy().add_comp(Composition(mod_name, n), sep=sep)

                if update_root:
                    modified.Root = modified.Name

                derived.append(modified)

    # the originals stay in place, followed by everything derived from them
    compounds += derived

    return compounds


In [5]:
# convert compositions to a printable string
def limits_as_string(limits):
    """
    Convert the composition limits for a particular type (adducts, losses, phase 1...)
    to a comma separated string of the (composition, limit) entries.
    Compositions can be switched off by setting the limit to zero, so those are left out;
    the result is an empty string if nothing is switched on.
    """
    enabled = (entry for entry in limits if entry[1] > 0)
    return ",".join(format(entry) for entry in enabled)

In [6]:
def get_comp_adduct_str(comp_names, mult_limit, hetero_dimers, adduct_str, max_adducts):
    """
    Build a string describing the compounds and adducts.

    The string starts with the compound names and is followed, where they apply, by
    '_m<mult_limit>', an 'h' for hetero dimers and '_<max_adducts>-<adduct_str>'.
    """
    pieces = [f'{comp_names}']

    if mult_limit:
        pieces.append(f'_m{mult_limit}')

    if hetero_dimers:
        pieces.append('h')

    if adduct_str:
        pieces.append(f'_{max_adducts}-{adduct_str}')

    return ''.join(pieces)


# generates a unique file name given the parameters and a string representing the date
# if xic_width is non zero the user wants a list of masses and widthes for use with PeakView
def get_ouput_file_name(comp_names, ionization, time_str, include_date_in_file_name,
                        mult_limit, hetero_dimers, adduct_str, max_adducts, xic_width):
    """
    Generate a file name from the compounds used (as a string Comp1_comp2.. etc.), the
    multimer/adduct settings and the polarity ('neg' only for "negative", otherwise 'pos').
    ' xic' is added when the file is intended to extract XICs in PeakView (xic_width > 0)
    and the date/time string is added if required; the format used by the main code
    is YYMMDD_HHMMSS.

    Returns (wants_xic, file_name), where file_name ends in '.txt'.
    """
    wants_xic = xic_width > 0

    # the space-separated parts of the name, in order
    pieces = [
        get_comp_adduct_str(comp_names, mult_limit, hetero_dimers, adduct_str, max_adducts),
        'neg' if ionization == "negative" else 'pos',
    ]

    if wants_xic:
        pieces.append('xic')

    if include_date_in_file_name:
        pieces.append(time_str)

    return wants_xic, ' '.join(pieces) + '.txt'



# Setup
-----

Provide the base compound information and other parameters.
The base compounds are supplied as a list of (name, mass) tuples.
The mass need not belong to a known compound: it can be an observed, unexplained peak, in which case its potential derivatives are generated.

All user-defined parameters are set here so that, once they are set, the code can be executed with 'Run selected cell and all below'.
## Shared path

In [7]:
# Define the shared path for data files.
# This allows the Calculator and Match notebooks to easily share data.
# The path is built from its components with os.path.join and prefixed with os.sep,
# which keeps it platform independent, but Windows users must start with 'C:'
shared_path = os.sep + os.path.join('Users','ronbonner','Data', 'SharedData')

print(shared_path)   # echo the path so it can be checked

/Users/ronbonner/Data/SharedData


## Compounds and adducts

In [41]:
# Define the compound(s) we want to work with as (name, mass) tuples.
# These can be known compounds or unknown observed peaks, here treated as MH+ by subtracting the mass of H+
base_compounds = [('DiMeSA', 146.057909)]   # must be a list

# Alternative compound sets (commented out):
# base_compounds = [('Guan', 283.091669),  # Guanosine
# base_compounds = [('x116', 116.0711-1.00727), ('x114', 114.0668-1.00727),('x132', 132.07690-1.00727),
#                   ('y114', 114.09040-1.00727), ('x190', 190.11950-1.00727)
#                  ]

# Define the limits for metabolites and adducts as (name, maximum count) tuples.
# Defining them this way is not required but allows metabolite and adduct sets to be easily changed depending on polarity.
# Unwanted compositions can be removed or the limit can be set to zero.
# Each name with a non-zero limit must be a key of Composition.Mods, which supplies its mass.

ionization = 'positive'          # only 'negative' changes the settings...anything else is 'positive'

phase1_limits = [('OH', 0), ('COOH', 0)]  # metabolite modifications - phase 1

if ionization == 'negative':
    phase2_limits = [('Gluc', 1), ('Sulphate', 0)]
    adduct_limits = [('Na-H', 2), ('K-H', 2), ('C2H4O2',1), ('CH2O2', 1)]
    loss_limits = [('H2O',0), ('CO2',0)]
else:
    phase2_limits = [('Gluc', 0)]
    adduct_limits = [('Na-H', 3), ('K-H',0), ('K*H',0), ('NH3',0), ('Ca-2H', 2)]
    # NOTE: 'Am' has no entry in Composition.Mods, so giving it a non-zero limit raises a KeyError
    loss_limits = [('H2O',1), ('HCOOH', 0), ('Am', 0), ('Rib', 0)]

# it can be useful to summarize here - to allow review before proceeding
# (entries with a zero limit are left out of the summary strings)
print('Adducts:', get_comps_as_str(adduct_limits))
print('Losses:', get_comps_as_str(loss_limits))

Adducts: Na3Ca2
Losses: H2O


## Parameters

In [42]:
multimer_limit = 3              # maximum multimer count; multimers from 2 up to this value are generated
max_adduct_count = 5            # total number of adducts allowed in one combination
include_hetero_dimers = False     # if True, calculate dimers of *different* compounds

## Output

In [43]:

output_mass_limit = 1000  # masses greater than this are not written to the file
xic_width = 0.0           # if 0 the normal output form is used...set a width, e.g. 0.01, to generate the PeakView compatible form

save_ion_list = True      # True: write the results to a file; False: print them here instead
include_date_in_file_name = False   # include the date_time in the file name

# Generate the output path; optional - add a subfolder to the shared path
# otherwise use: data_path = shared_path
data_path = os.path.join(shared_path,'Test')
print(data_path)   # echo the folder so it can be checked

/Users/ronbonner/Data/SharedData/Test


# Step 1 - Adduct generation
---------------------------

Generate a list of possible adduct forms by generating all combinations of adducts (up to the specified limit) and selecting the unique forms (i.e. as far as we are concerned, a+b+a is the same as a+a+b). Note: this approach would also work if we wanted to allow combinations of the metabolites. These will be added to each compound.

In [44]:
# Enumerate the allowed adduct combinations (this also prints the cleaned limit list)
adduct_combs, adducts_as_str = make_combinations(adduct_limits, max_adduct_count)

# the summary string is the adducts followed by the losses, separated by '-'
adducts_as_str = '-'.join([adducts_as_str, get_comps_as_str(loss_limits)])

# one Composition per combination, in order of increasing mass
adduct_comps = [Composition.from_tuple_list(comb) for comb in adduct_combs]
adduct_comps.sort(key=lambda comp: comp.Mass)

print(len(adduct_comps), 'adduct forms')
print(adducts_as_str)

# to view the compositions:
# for a in adduct_comps:
#     print(a)

[('Na-H', 3), ('Ca-2H', 2)]
11 adduct forms
Na3Ca2-H2O


# Step 2 - Compound generation
-----------------------------

We convert the base compound list to a list of compositions and then successively apply the various modifications, generating extended compound lists, in phase order.

Finally we calculate the dimers and heterodimers (if desired)

In [45]:
# Turn each (name, mass) pair into a Composition.
# The root is set equal to the name unless it is given explicitly.
compounds = [Composition(name, 1, mass) for name, mass in base_compounds]

# Apply the phase 1 and then the phase 2 modifications.
# With update_root=True the root name is changed to the modified name, otherwise it is left alone;
# this allows the user to choose whether to keep the root as the base compound or change it to a modified form.
for phase_limits, message in ((phase1_limits, 'compounds after phase 1'),
                              (phase2_limits, 'after phase 2')):
    compounds = add_mods(compounds, phase_limits, update_root=True)
    print(len(compounds), message)

# homo-multimers: 2, 3...multimer_limit copies of each compound
multimers = [c.make_copy(m) for c in compounds for m in range(2, multimer_limit + 1)]

# hetero-dimers: each compound paired once with every compound that follows it in the list
if include_hetero_dimers:
    for i, first in enumerate(compounds):
        for second in compounds[i + 1:]:
            multimers.append(first.make_copy().add_comp(second.make_copy(), sep='+'))

compounds.extend(multimers)

print(len(compounds), 'with multimers')

# the losses are applied last, to the multimers as well; the root is left alone here
compounds = add_mods(compounds, loss_limits, sep='-')
print(len(compounds), 'after losses')

# to view the compounds:
# for c in compounds:
#     print(c)

1 compounds after phase 1
1 after phase 2
3 with multimers
6 after losses


# Step 3 - Generate ion forms
----------------------------

We now add all the adduct forms to each of the compounds. The approach relies on adducts being formed by replacing labile protons, so they are independent of the polarity; the final form is determined by providing a charge agent, i.e. adding or subtracting protons.


In [46]:
ion_forms = []

for c in compounds:

    # the neutral forms of this compound: on its own first, then with each adduct form
    neutral_forms = [c.make_copy()]
    neutral_forms += [c.make_copy().add_comp(a, sep='.') for a in adduct_comps]

    # charge each one by removing a proton in negative mode, or adding one otherwise
    for neutral in neutral_forms:
        if ionization == 'negative':
            ion_forms.append(neutral.deprotonate())
        else:
            ion_forms.append(neutral.protonate())

print(len(ion_forms), 'ion forms')
    

72 ion forms


# Step 4 - Summarize results and conditions
-----------------------------------------

In [47]:
# Summarize the calculation: each setting is printed and also appended to cond_str,
# a ';' separated record of the conditions

current_time = datetime.datetime.now()
time_str = current_time.strftime('%y%m%d_%H%M%S')

# a string of the compound names separated by '-'
comp_names = '-'.join(f'{name}' for name, _mass in base_compounds)

# the record starts with a summary of the compounds and adducts
cond_str = get_comp_adduct_str(comp_names, multimer_limit, include_hetero_dimers, adducts_as_str, max_adduct_count)
print(cond_str)

print(time_str)
print('Compounds:', comp_names)
cond_str = f'{cond_str};Time:{time_str};Compounds:{comp_names}'

if multimer_limit > 1:
    print(f'Up to {multimer_limit} multimers')
    cond_str = f'{cond_str};Multimer_limit:{multimer_limit}'

if include_hetero_dimers:
    print('Include heterodimers')
    cond_str = f'{cond_str};Heterodimers:True'

print(f'{ionization} mode')
cond_str = f'{cond_str};Polarity:{ionization}'

# the two metabolite phases are reported in the same way; a phase with nothing enabled is left out
for heading, key, phase_limits in (('Phase 1', 'Phase_1', phase1_limits),
                                   ('Phase 2', 'Phase_2', phase2_limits)):
    desc = limits_as_string(phase_limits)
    if desc:
        print(f'{heading}: {desc}')
        cond_str = f'{cond_str};{key}:{desc}'

# the adducts also report the maximum number allowed in one combination
desc = limits_as_string(adduct_limits)
if desc:
    print(f'Adducts: {desc}, max count = {max_adduct_count}')
    cond_str = f'{cond_str};Adducts:{desc}; Max adduct count:{max_adduct_count}'

desc = limits_as_string(loss_limits)
if desc:
    print(f'Losses: {desc}')
    cond_str = f'{cond_str};Losses:{desc}'

print(len(ion_forms), 'ion forms')

print(cond_str)

DiMeSA_m3_5-Na3Ca2-H2O
210705_085506
Compounds: DiMeSA
Up to 3 multimers
positive mode
Adducts: ('Na-H', 3),('Ca-2H', 2), max count = 5
Losses: ('H2O', 1)
72 ion forms
DiMeSA_m3_5-Na3Ca2-H2O;Time:210705_085506;Compounds:DiMeSA;Multimer_limit:3;Polarity:positive;Adducts:('Na-H', 3),('Ca-2H', 2); Max adduct count:5;Losses:('H2O', 1)


# Step 5 - Save the mass/name list
--------------------------------

Optionally save the ion forms as a simple tab delimited text file.
- the main format is: mass, root, label
- an additional format: mass, xic width, name is intended to be used with PeakView Extract XIC (by importing it)

The list can also be truncated to an upper mass limit.

To be sure the file exists, we re-open it and count the number of lines.

In [48]:
# Write the ion list to a tab delimited file, or print it here if it is not being saved

if save_ion_list:

    # set up the file name and path
    wants_xic, out_name = get_ouput_file_name(
        comp_names, ionization, time_str, include_date_in_file_name,
        multimer_limit, include_hetero_dimers, adducts_as_str, max_adduct_count, xic_width)

    line_count = 1      # first line is conditions

    ion_forms = sorted(ion_forms, key=lambda x: x.Mass)

    output_path = os.path.join(data_path, out_name)

    print(output_path)

    # the 'with' statement closes the file on exit, so no explicit close() is needed
    with open(output_path, 'w') as f:

        # the conditions go on the first line, marked with '#'
        print(f'#{cond_str}', file=f)

        for ion in ion_forms:

            # the list is sorted by mass, so everything after this is over the limit too
            if ion.Mass > output_mass_limit:
                break

            if wants_xic:
                # PeakView form: mass, xic width, name
                print(f'{ion.Mass:10.4f}\t{xic_width}\t{ion.Name}', file=f)
            else:
                # main form: mass, root, label
                print(f'{ion.Mass:10.4f}\t{ion.Root}\t{ion.Name}', file=f)

            line_count += 1

    print(time_str)
    print(line_count, 'lines written to', output_path)

    # re-open the file and count the lines to confirm that it was written
    with open(output_path, 'r') as f:
        lines_read = f.readlines()

    print(len(lines_read), 'read', lines_read[0])

#     if lines_read[0][0] == '#':
#         print("Conditions:")
#         print(lines_read[0][1:])
else:
    for ion in sorted(ion_forms, key=lambda x: x.Mass):        # sort list by mass
        print(f'{ion.Mass:12.4f}     {ion.Root:14} {ion.Name}')

/Users/ronbonner/Data/SharedData/Test/DiMeSA_m3_5-Na3Ca2-H2O pos.txt
210705_085506
73 lines written to /Users/ronbonner/Data/SharedData/Test/DiMeSA_m3_5-Na3Ca2-H2O pos.txt
73 read #DiMeSA_m3_5-Na3Ca2-H2O;Time:210705_085506;Compounds:DiMeSA;Multimer_limit:3;Polarity:positive;Adducts:('Na-H', 3),('Ca-2H', 2); Max adduct count:5;Losses:('H2O', 1)

