In [13]:
import pandas as pd
from termcolor import colored

In [14]:
# Load the two input tables.
# `polarity` selects which C18 standards file is read; the second file is
# merged onto the standards by inchi_key in a later cell.
polarity = "positive"

# float_precision='round_trip' selects pandas' round-trip float converter.
c18_standards_df = pd.read_csv(
    './C18_standards_' + polarity + '.tsv',
    sep='\t',
    float_precision='round_trip',
)
mass_file = pd.read_csv(
    './select_single_adduct_per_compound_data/inchi-key_to_mass.csv',
    sep=',',
    float_precision='round_trip',
)

In [15]:
# Merge the mass table onto the standards and collect, per feature (column),
# the values that occur on more than one row.

def _repeated_values(frame, column, key=None):
    """Return the distinct values of ``column`` found on rows flagged as repeats.

    A row is flagged when its ``key`` value (default: the column itself) has
    already been seen on an earlier row, i.e. ``Series.duplicated()`` with its
    default ``keep='first'``. The values returned always come from ``column``,
    even when a different ``key`` is used for the flagging.
    """
    if key is None:
        key = frame[column]
    return list(frame[key.duplicated()][column].drop_duplicates().values)


# Inner join: only standards whose inchi_key appears in mass_file are kept.
c18_standards_df_dups = c18_standards_df.merge(mass_file, how="inner", on="inchi_key")

# Helper key columns: inchi key / label concatenated with the adduct string.
c18_standards_df_dups['inchi-adduct'] = (
    c18_standards_df_dups['inchi_key'].astype(str) + c18_standards_df_dups['adduct']
)
c18_standards_df_dups['label-adduct'] = (
    c18_standards_df_dups['label'].astype(str) + c18_standards_df_dups['adduct']
)

duplicated_inchi_adducts = _repeated_values(c18_standards_df_dups, 'inchi-adduct')
duplicated_inchis = _repeated_values(c18_standards_df_dups, 'inchi_key')
duplicated_label_adducts = _repeated_values(c18_standards_df_dups, 'label-adduct')

# Shared masses are detected on values rounded to 6 decimals, but the list
# holds the unrounded masses of the flagged rows.
duplicated_mz_isomers = _repeated_values(
    c18_standards_df_dups,
    'mono_isotopic_molecular_weight',
    key=c18_standards_df_dups['mono_isotopic_molecular_weight'].round(decimals=6),
)

In [None]:
# Identify inchi keys that have multiple adducts and choose hydrogen (most common) or the highest intensity
#
# Inputs : c18_standards_df_dups, duplicated_inchis, polarity
# Output : c18_standards_df_filt1 -- every row whose inchi key is unique, plus
#          one chosen row for each duplicated inchi key with differing adducts.
# Choice : the first row carrying the polarity's preferred adduct if present,
#          otherwise the row with the highest 'intensity'.

preferred_adduct_by_polarity = {"positive": '[M+H]+', "negative": '[M-H]-'}

# Fail fast on an unexpected polarity instead of hitting a NameError on
# `preferred_adduct` somewhere inside the loop.
if polarity not in preferred_adduct_by_polarity:
    raise ValueError("Unsupported polarity " + repr(polarity) + "; expected 'positive' or 'negative'")

preferred_adduct = preferred_adduct_by_polarity[polarity]

starting_compounds = c18_standards_df_dups.shape
removed_compounds = 0

# Rows with a unique inchi key pass through untouched.
unique_rows = c18_standards_df_dups[~c18_standards_df_dups['inchi_key'].isin(duplicated_inchis)].copy()

# One-row frames chosen for the duplicated keys; concatenated once after the
# loop rather than growing the frame on every iteration.
chosen_rows = []

for inchi in duplicated_inchis:

    inchi_sub = c18_standards_df_dups[c18_standards_df_dups['inchi_key'] == inchi]
    adducts = list(inchi_sub['adduct'])
    number_of_duplicates = len(adducts)

    if all(i == adducts[0] for i in adducts):

        # NOTE: nothing is added back for this key, so all of its rows are
        # absent from the result and the row-count check below will report a
        # mismatch that has to be investigated by hand.
        print(colored("Warning: all adducts for duplicated inchi key are the same!", "red"))
        continue

    if preferred_adduct in adducts:

        best_adduct = inchi_sub[inchi_sub['adduct'] == preferred_adduct].iloc[[0]]
        print(colored("Choosing " + preferred_adduct + " for duplicated " + inchi, "green"))

    else:

        best_adduct = inchi_sub.sort_values('intensity', ascending = False).iloc[[0]]
        print(colored("Warning: No " + preferred_adduct + " for " + inchi + ". Choosing " + str(best_adduct['adduct'].to_list()) + " based on intensity", "red"))

    chosen_rows.append(best_adduct)
    removed_compounds += number_of_duplicates-1

if chosen_rows:
    c18_standards_df_filt1 = pd.concat([unique_rows] + chosen_rows, ignore_index=True)
else:
    # Nothing to append: keep the filtered frame (and its index) as it is.
    c18_standards_df_filt1 = unique_rows

# Sanity check: rows kept must equal starting rows minus the rows discarded above.
if c18_standards_df_filt1.shape[0] == starting_compounds[0]-removed_compounds:

    print(colored("\nCorrect number of duplicated inchis removed!", "green"))

else:

    print(colored("\nDifferent number of duplicated inchis removed than expected! Need to investigate", "red"))

In [None]:
# Check if isomers have different adducts
#
# For every mass shared by more than one row (duplicated_mz_isomers), look up
# the rows of c18_standards_df_filt1 with that mass and warn when their adducts
# differ. The inchi keys of each such group are appended (as one list per
# mass) to isomers_with_differing_adducts.

isomers_with_differing_adducts = []

# duplicated_mz_isomers was built by comparing masses rounded to 6 decimals,
# so rows have to be matched with the same rounding; an exact float comparison
# would miss isomers whose stored masses differ beyond the 6th decimal.
filt1_mass_keys = c18_standards_df_filt1['mono_isotopic_molecular_weight'].round(decimals=6)
isomer_mass_keys = pd.Series(duplicated_mz_isomers, dtype='float64').round(decimals=6)

# Several unrounded masses can share one rounded key; report each group once.
checked_mass_keys = set()

for isomer_mz, mass_key in zip(duplicated_mz_isomers, isomer_mass_keys):

    if mass_key in checked_mass_keys:
        continue
    checked_mass_keys.add(mass_key)

    subset = c18_standards_df_filt1[filt1_mass_keys == mass_key]
    inchi = list(subset['inchi_key'])
    adducts = list(subset['adduct'])

    # True for an empty or single-row subset, so those are skipped silently.
    same_adducts = all(i == adducts[0] for i in adducts)

    if not same_adducts:

        print(colored("Warning! Isomers (" + str(inchi) + ") of mass " + str(isomer_mz) + " have different adducts (" + str(adducts) + ")!", "red"))
        isomers_with_differing_adducts.append(inchi)

        # The co-elution check is only done for pairs.
        if len(adducts) == 2:

            diff = subset['rt_peak'].iloc[0] - subset['rt_peak'].iloc[1]

            # 0.5 is the retention-time tolerance, in the units of 'rt_peak'
            # (units are not visible here -- presumably minutes; confirm).
            if abs(diff) <= 0.5:

                print(colored("Warning! Isomers with differing adducts elute together", "cyan"))

def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

# The loop above appended one list of inchi keys per mass; collapse those
# lists into a single flat list of inchi keys.
isomers_with_differing_adducts = flatten(isomers_with_differing_adducts)

In [19]:
# Identify inchi_keys that have multiple adducts and choose the highest intensity
#
# Inputs : c18_standards_df_dups, duplicated_inchis and
#          isomers_with_differing_adducts (expected to be a flat list of inchi keys)
# Output : c18_standards_df_filt1 (rebuilt from c18_standards_df_dups) holding
#          - every row whose inchi key is unique,
#          - ALL rows of a duplicated inchi key that has isomers with differing adducts,
#          - otherwise only the highest-'intensity' row of a duplicated inchi
#            key whose adducts differ.

starting_compounds = c18_standards_df_dups.shape
removed_compounds = 0

# Rows with a unique inchi key pass through untouched.
unique_rows = c18_standards_df_dups[~c18_standards_df_dups['inchi_key'].isin(duplicated_inchis)].copy()

# Frames to add back for the duplicated keys; concatenated once after the loop.
kept_rows = []

for inchi in duplicated_inchis:

    inchi_sub = c18_standards_df_dups[c18_standards_df_dups['inchi_key'] == inchi]
    adducts = list(inchi_sub['adduct'])
    number_of_duplicates = len(adducts)

    # Membership must be tested on the whole key: iterating over `inchi` would
    # walk its characters and never match an entry of the list.
    if inchi in isomers_with_differing_adducts: # Does this inchi key have isomers with differing adducts? If so, keep all

        print(colored("Retaining all adducts (" + str(adducts) + ") for " + inchi + " because there are isomers with differing adducts", "red"))
        kept_rows.append(inchi_sub)

    # `elif`, so that a key retained above is not also reduced to its best
    # adduct (which would duplicate a row and skew the removal count).
    elif not all(i == adducts[0] for i in adducts):

        best_adduct = inchi_sub.sort_values('intensity', ascending = False).iloc[[0]]
        kept_rows.append(best_adduct)
        print(colored("Filtering to best adduct (" + str(best_adduct['adduct'].to_list()) + ")" + " for duplicated " + inchi + " by intensity", "green"))
        removed_compounds += number_of_duplicates-1

    else:

        # Nothing is added back for this key, so its rows are absent from the
        # result; say so instead of dropping them silently. The row-count
        # check below will also report the mismatch.
        print(colored("Warning: all adducts for duplicated inchi key " + str(inchi) + " are the same!", "red"))

if kept_rows:
    c18_standards_df_filt1 = pd.concat([unique_rows] + kept_rows, ignore_index=True)
else:
    # Nothing to append: keep the filtered frame (and its index) as it is.
    c18_standards_df_filt1 = unique_rows

# Sanity check: rows kept must equal starting rows minus the rows discarded above.
if c18_standards_df_filt1.shape[0] == starting_compounds[0]-removed_compounds:

    print(colored("\nCorrect number of duplicated inchis removed!", "green"))

else:

    print(colored("\nDifferent number of duplicated inchis removed than expected! Need to investigate", "red"))

In [None]:
# Check if label-adduct pairs have different MZs
# Look for red to indicate filtering needs to be done
#
# duplicated_label_adducts was computed before the adduct filtering, so a pair
# may have fewer rows in c18_standards_df_filt1 than it had originally -- or
# none at all.

for label_adduct in duplicated_label_adducts:

    mzs = list(c18_standards_df_filt1[c18_standards_df_filt1['label-adduct'] == label_adduct]['mono_isotopic_molecular_weight'])

    if not mzs:

        # Every row of this pair was filtered out earlier; without this guard
        # the mzs[0] lookups below would raise IndexError.
        print(colored("Label-adduct pair " + str(label_adduct) + " is no longer present after adduct filtering; skipping", "yellow"))
        continue

    same_mz = all(i == mzs[0] for i in mzs)

    if not same_mz:

        # The comparison is on masses, so the message reports masses.
        print(colored("Warning! Identical label-adduct pair " + str(label_adduct) + " has different masses (" + str(mzs) + ")!", "red"))

    else:

        print(colored("Identical label-adduct pair " + str(label_adduct) + " has the same mass (" + str(mzs[0]) + ")!", "green"))

In [None]:
# Check if compounds with same label but different inchi_keys have the same adduct
# Look for red to indicate filtering needs to be done

# Labels that occur on more than one row of the adduct-filtered table.
duplicated_labels = list(c18_standards_df_filt1[c18_standards_df_filt1['label'].duplicated()]['label'].drop_duplicates().values)

for label in duplicated_labels:

    adducts = list(c18_standards_df_filt1[c18_standards_df_filt1['label'] == label]['adduct'])
    same_adducts = all(i == adducts[0] for i in adducts)

    # Branch on this cell's own result (`same_adducts`), not on a leftover
    # flag from the mass check in the previous cell.
    if not same_adducts:

        print(colored("Warning! Duplicated label " + label + " has different adducts!", "red"))

    else:

        print(colored("Duplicated label " + label + " has the same adduct!", "green"))

In [None]:
# Collapse duplicated labels whose inchi keys all share the same prefix
# (the part before the first '-'); labels whose prefixes differ are kept whole.
#
# Inputs : c18_standards_df_filt1, duplicated_labels
# Output : c18_standards_df_filt2 -- for each collapsed label only its
#          highest-'intensity' row remains.

c18_standards_df_filt2 = c18_standards_df_filt1.copy()

starting_compounds = c18_standards_df_filt2.shape
removed_compounds = 0

# Labels to collapse and the single row kept for each; applied in one step
# after the loop so the frame is not rebuilt on every iteration.
collapsed_labels = []
best_rows = []

for label in duplicated_labels:

    label_rows = c18_standards_df_filt2[c18_standards_df_filt2['label'] == label]
    inchis = list(label_rows['inchi_key'])

    if inchis:

        number_of_duplicates = len(inchis)
        first14 = [i.split('-', 1)[0] for i in inchis]

        if all(i == first14[0] for i in first14):

            # Pick the survivor from the already-filtered rows of this label.
            # Using the raw, un-merged input table here would bring back rows
            # removed by earlier filtering and would lack the merged and
            # helper columns (NaN in the output).
            best_label = label_rows.sort_values('intensity', ascending = False).iloc[[0]]
            print(colored("Multiple entries for " + label + " have identical inchi key prefixes " + "(" + first14[0] + ")" + "; " + "Returning only highest intensity entry: " + best_label['inchi_key'].to_list()[0], "red"))

            collapsed_labels.append(label)
            best_rows.append(best_label)
            removed_compounds += number_of_duplicates-1

        else:

            print(colored("Inchi keys for duplicated label " + label + " do not have the same prefix (" + str(first14) + "): Retaining all", "green"))

if best_rows:
    untouched_rows = c18_standards_df_filt2[~c18_standards_df_filt2['label'].isin(collapsed_labels)]
    c18_standards_df_filt2 = pd.concat([untouched_rows] + best_rows, ignore_index=True)

# Sanity check: rows kept must equal starting rows minus the rows discarded above.
if c18_standards_df_filt2.shape[0] == starting_compounds[0]-removed_compounds:

    print(colored("\nCorrect number of duplicated labels removed!", "green"))

else:

    print(colored("\nDifferent number of duplicated labels removed than expected! Need to investigate", "red"))

In [23]:
# Drop the helper 'label-adduct' column and write the reduced table as a
# tab-separated file named after the polarity (row index not written).
c18_standards_df_filt3 = c18_standards_df_filt2.drop(columns=['label-adduct'])

c18_standards_df_filt3.to_csv(
    'C18_standards_' + polarity + '_reduced.tsv',
    sep='\t',
    index=False,
)