In [1]:
import pandas as pd
from termcolor import colored
import os
from tqdm.notebook import tqdm

In [2]:
polarity = "positive"
atlas_type = "EMA"

In [3]:
current_working_directory = os.getcwd()
print("You're working in the directory: " + current_working_directory)

You're working in the directory: /Users/BKieft/Metabolomics/metatlas-data/notebooks


In [4]:
# Read in data
C18_atlas_df = pd.read_csv(current_working_directory + '/../C18/C18_' + atlas_type + '_' + polarity + '_all-adducts_renamed_deduplicated.tsv', sep='\t', float_precision='round_trip')
mass_file = pd.read_csv(current_working_directory + '/select_single_adduct_per_compound_data/inchi-key_to_mass.csv', sep=',', float_precision='round_trip')

In [5]:
# Find duplicates of different features (columns)

C18_atlas_df_dups = C18_atlas_df.copy()
C18_atlas_df_dups = pd.merge(C18_atlas_df_dups, mass_file, how="inner", on="inchi_key")

C18_atlas_df_dups['inchi-adduct'] = C18_atlas_df_dups['inchi_key'].astype(str) + C18_atlas_df_dups['adduct']
C18_atlas_df_dups['label-adduct'] = C18_atlas_df_dups['label'].astype(str) + C18_atlas_df_dups['adduct']
duplicated_inchi_adducts = list(C18_atlas_df_dups[C18_atlas_df_dups['inchi-adduct'].duplicated()]['inchi-adduct'].drop_duplicates().values)
duplicated_inchis = list(C18_atlas_df_dups[C18_atlas_df_dups['inchi_key'].duplicated()]['inchi_key'].drop_duplicates().values)
duplicated_label_adducts = list(C18_atlas_df_dups[C18_atlas_df_dups['label-adduct'].duplicated()]['label-adduct'].drop_duplicates().values)


In [6]:
# Identify inchi keys that have multiple adducts and choose hydrogen (most common) or the highest intensity

if polarity == "positive":
    preferred_adduct = '[M+H]+'
elif polarity == "negative":
    preferred_adduct = '[M-H]-'

C18_atlas_df_filt1 = C18_atlas_df_dups.copy()

starting_compounds = C18_atlas_df_filt1.shape
removed_compounds = 0

C18_atlas_df_filt1 = C18_atlas_df_filt1[~C18_atlas_df_filt1['inchi_key'].isin(duplicated_inchis)]

for inchi in tqdm(duplicated_inchis):

    adducts = list(C18_atlas_df_dups[C18_atlas_df_dups['inchi_key'] == inchi]['adduct'])
    number_of_duplicates = len(adducts)

    if not all(i == adducts[0] for i in adducts):

        if preferred_adduct in adducts:

            inchi_sub = C18_atlas_df_dups[C18_atlas_df_dups['inchi_key'] == inchi]
            best_adduct = inchi_sub[inchi_sub['adduct'] == preferred_adduct].iloc[[0]]
            C18_atlas_df_filt1 = pd.concat([C18_atlas_df_filt1, best_adduct], ignore_index=True)
            #print(colored("Choosing " + preferred_adduct + " for duplicated " + inchi, "green"))
            removed_compounds += number_of_duplicates-1

        else:

            best_adduct = C18_atlas_df_dups[C18_atlas_df_dups['inchi_key'] == inchi].sort_values('intensity', ascending = False).iloc[[0]]
            C18_atlas_df_filt1 = pd.concat([C18_atlas_df_filt1, best_adduct], ignore_index=True)
            #print(colored("Warning: No " + preferred_adduct + " for " + inchi + ". Choosing " + str(best_adduct['adduct'].to_list()) + " based on intensity", "red"))
            removed_compounds += number_of_duplicates-1

    else:
        
        best_adduct = C18_atlas_df_dups[C18_atlas_df_dups['inchi_key'] == inchi].iloc[[0]]  # Choose the first one
        removed_compounds += number_of_duplicates
        #print(colored("Warning: all adducts for duplicated inchi key " + inchi + " are the same!", "red"))

if C18_atlas_df_filt1.shape[0] == starting_compounds[0]-removed_compounds:

    print(colored("\nCorrect number of duplicated inchis (" + str(removed_compounds) + ") removed from starting number of " + str(starting_compounds[0]), "green"))

else:

    print(colored("\nDifferent number of duplicated inchis removed (" + str(removed_compounds) + ") than expected! Need to investigate", "red"))

  0%|          | 0/3664 [00:00<?, ?it/s]

[32m
Correct number of duplicated inchis (3750) removed from starting number of 7622[0m


In [7]:
# Check if label-adduct pairs have different masses
# Look for red to indicate filtering needs to be done

duplicated_label_adducts = list(C18_atlas_df_filt1[C18_atlas_df_filt1['label-adduct'].duplicated()]['label-adduct'].drop_duplicates().values)

for label_adduct in duplicated_label_adducts:

    mzs = list(C18_atlas_df_filt1[C18_atlas_df_filt1['label-adduct'] == label_adduct]['mono_isotopic_molecular_weight'])
    same_mz = all(i == mzs[0] for i in mzs)

    if same_mz == False:

        print(colored("Warning! Identical compound labels at mass " + str(mzs[0]) + " have different adducts!", "red"))

    else:

        print(colored("Identical compound labels at mass " + str(mzs[0]) + " have the same adduct!", "green"))

[32mIdentical compound labels at mass 131.0946287 have the same adduct![0m
[32mIdentical compound labels at mass 131.0946287 have the same adduct![0m
[32mIdentical compound labels at mass 264.1361591 have the same adduct![0m
[32mIdentical compound labels at mass 353.1263227 have the same adduct![0m
[32mIdentical compound labels at mass 265.1314081 have the same adduct![0m
[32mIdentical compound labels at mass 327.1470582 have the same adduct![0m
[32mIdentical compound labels at mass 399.1681875 have the same adduct![0m
[32mIdentical compound labels at mass 254.1419132 have the same adduct![0m
[32mIdentical compound labels at mass 410.2304534 have the same adduct![0m
[32mIdentical compound labels at mass 248.1412445 have the same adduct![0m
[32mIdentical compound labels at mass 356.2562743 have the same adduct![0m
[32mIdentical compound labels at mass 413.3293796 have the same adduct![0m
[32mIdentical compound labels at mass 244.0881634 have the same adduct![0m

In [8]:
# Check if compounds with same label but different inchi_keys have the same adduct
# Look for red to indicate filtering needs to be done

duplicated_labels = list(C18_atlas_df_filt1[C18_atlas_df_filt1['label'].duplicated()]['label'].drop_duplicates().values)

for label in duplicated_labels:

    adducts = list(C18_atlas_df_filt1[C18_atlas_df_filt1['label'] == label]['adduct'])
    same_adducts = all(i == adducts[0] for i in adducts)

    if same_mz == False:

        print(colored("Warning! Duplicated label " + label + " has different adducts!", "red"))

    else:

        print(colored("Duplicated label " + label + " has the same adduct!", "green"))

[32mDuplicated label leucine has the same adduct![0m
[32mDuplicated label isoleucine has the same adduct![0m
[32mDuplicated label abscisic acid has the same adduct![0m
[32mDuplicated label chelidonine has the same adduct![0m
[32mDuplicated label anisomycin has the same adduct![0m
[32mDuplicated label boldine has the same adduct![0m
[32mDuplicated label colchicine has the same adduct![0m
[32mDuplicated label lysergol has the same adduct![0m
[32mDuplicated label forskolin has the same adduct![0m
[32mDuplicated label parthenolide has the same adduct![0m
[32mDuplicated label lagochiline has the same adduct![0m
[32mDuplicated label solasodine has the same adduct![0m
[32mDuplicated label biotin has the same adduct![0m
[32mDuplicated label naringenin has the same adduct![0m
[32mDuplicated label reserpine has the same adduct![0m
[32mDuplicated label pantothenic acid has the same adduct![0m
[32mDuplicated label (+)-usnic acid has the same adduct![0m
[32mDuplic

In [9]:
# Identify duplicated labels which have different first-14 inchi keys

C18_atlas_df_filt2 = C18_atlas_df_filt1.copy()

starting_compounds = C18_atlas_df_filt2.shape
removed_compounds = 0

for label in duplicated_labels:

    inchis = list(C18_atlas_df_filt2[C18_atlas_df_filt2['label'] == label]['inchi_key'])
    
    if inchis:

        number_of_duplicates = len(inchis)
        first14 = [i.split('-', 1)[0] for i in inchis]
        
        if all(i == first14[0] for i in first14):

            C18_atlas_df_filt2 = C18_atlas_df_filt2[~C18_atlas_df_filt2['label'].isin([label])]

            best_label = C18_atlas_df[C18_atlas_df['label'] == label].sort_values('intensity', ascending = False).iloc[[0]]
            print(colored("Multiple entries for " + label + " have identical inchi key prefixes " + "(" + first14[0] + ")" + "; " + "Returning only highest intensity entry: " + best_label['inchi_key'].to_list()[0], "red"))

            C18_atlas_df_filt2 = pd.concat([C18_atlas_df_filt2, best_label], ignore_index=True)
            removed_compounds += number_of_duplicates-1

        else:

            print(colored("Inchi keys for duplicated label " + label + " do not have the same prefix (" + str(first14) + "): Retaining all", "green"))

if C18_atlas_df_filt2.shape[0] == starting_compounds[0]-removed_compounds:

    print(colored("\nCorrect number of duplicated labels removed!", "green"))

else:

    print(colored("\nDifferent number of duplicated labels removed than expected! Need to investigate", "red"))

[31mMultiple entries for leucine have identical inchi key prefixes (ROHFNLRQFUQHCH); Returning only highest intensity entry: ROHFNLRQFUQHCH-UHFFFAOYSA-N[0m
[31mMultiple entries for isoleucine have identical inchi key prefixes (AGPKZVBTJJNPAG); Returning only highest intensity entry: AGPKZVBTJJNPAG-WHFBIAKZSA-N[0m
[31mMultiple entries for abscisic acid have identical inchi key prefixes (JLIDBLDQVAYHNE); Returning only highest intensity entry: JLIDBLDQVAYHNE-YKALOCIXSA-N[0m
[31mMultiple entries for chelidonine have identical inchi key prefixes (GHKISGDRQRSCII); Returning only highest intensity entry: GHKISGDRQRSCII-UHFFFAOYSA-N[0m
[32mInchi keys for duplicated label anisomycin do not have the same prefix (['YKJYKKNCCRKFSL', 'JPQPWCGQXFYELU']): Retaining all[0m
[31mMultiple entries for boldine have identical inchi key prefixes (LZJRNLRASBVRRX); Returning only highest intensity entry: LZJRNLRASBVRRX-UHFFFAOYSA-N[0m
[31mMultiple entries for colchicine have identical inchi key p

In [10]:
C18_atlas_df_filt2.shape

(3846, 57)

In [13]:
# Check if isomers have different adducts

isomers_with_differing_adducts = []

duplicated_mz_isomers = list(C18_atlas_df_filt2[C18_atlas_df_filt2['mono_isotopic_molecular_weight'].round(decimals=6).duplicated()]['mono_isotopic_molecular_weight'].drop_duplicates().values)

for isomer_mass in duplicated_mz_isomers:

    subset = C18_atlas_df_filt2[C18_atlas_df_filt2['mono_isotopic_molecular_weight'] == isomer_mass]
    inchi = list(subset['inchi_key'])
    adducts = list(subset['adduct'])
    same_adducts = all(i == adducts[0] for i in adducts)

    if same_adducts == False:

        print(colored("Warning! Isomers (" + str(inchi) + ") of mass " + str(isomer_mass) + " have different adducts (" + str(adducts) + ")!", "red"))
        isomers_with_differing_adducts.append(inchi)

        if len(adducts) == 2:

            diff = (subset['rt_peak'].iloc[0] - subset['rt_peak'].iloc[1])

            if abs(diff) <= 0.5:

                print(colored("\tNote: These isomers elute together", "yellow"))
    else:

        pass
        #print(colored("All good! Isomers at MZ " + str(isomer_mass) + " have the same adduct!", "green"))

def flatten(xss):
    return [x for xs in xss for x in xs]

isomers_with_differing_adducts = flatten(isomers_with_differing_adducts)

[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m
[33m	Note: These isomers elute together[0m


In [14]:
C18_atlas_df_filt3 = C18_atlas_df_filt2.drop('label-adduct', axis=1)

C18_atlas_df_filt3.to_csv(current_working_directory + '/../C18/C18_' + atlas_type + '_' + polarity + '_all-adducts_renamed_deduplicated_reduced.tsv', sep='\t', index = False)