In [1]:
import pandas as pd
import os

In [2]:
def find_rt_differences(tsv_file1, tsv_file2, tolerance=0.1):
    """
    Report compounds whose retention time disagrees between two TSV files.

    Rows from both files are pooled and grouped by (inchi_key, adduct).
    A group is reported when BOTH files contribute at least one non-missing
    rt_peak to it and the spread (max - min) of those rt_peak values is
    greater than ``tolerance``.

    Parameters
    ----------
    tsv_file1, tsv_file2 : str or path-like
        Paths to tab-separated files that contain at least the columns
        'inchi_key', 'adduct' and 'rt_peak'.
    tolerance : float, default 0.1
        Largest rt_peak spread within a group that still counts as
        agreement (same units as the rt_peak column).

    Returns
    -------
    pandas.DataFrame
        Every row of each reported group (rows with a missing rt_peak
        included), with the input columns plus 'source_file' (the basename
        of the file the row was read from), sorted by 'inchi_key' and then
        'source_file'. An empty frame with the same columns is returned
        when no group differs.

    Notes
    -----
    Rows whose inchi_key or adduct is missing never form a group, because
    groupby drops NaN keys by default; such rows are therefore not reported.
    """
    # Read the TSV files
    df1 = pd.read_csv(tsv_file1, sep='\t')
    df2 = pd.read_csv(tsv_file2, sep='\t')

    # Add a column to indicate the source file
    df1['source_file'] = os.path.basename(tsv_file1)
    df2['source_file'] = os.path.basename(tsv_file2)

    # Pool the rows; ignore_index gives positions 0..len(df1)-1 to file 1
    df_all = pd.concat([df1, df2], ignore_index=True)

    # Track the origin by position instead of by 'source_file': the two
    # inputs can share a basename when they live in different directories.
    from_first = pd.Series(df_all.index < len(df1), index=df_all.index)

    diff_rows = []
    for _, group in df_all.groupby(['inchi_key', 'adduct']):
        rt_peaks = group['rt_peak'].dropna()
        in_first = from_first.loc[rt_peaks.index]

        # Only a group measured in both files is a cross-file comparison;
        # duplicates inside a single file must not be reported.
        if not (in_first.any() and (~in_first).any()):
            continue

        if rt_peaks.max() - rt_peaks.min() > tolerance:
            diff_rows.append(group)

    # Concatenate all differing rows into a new DataFrame
    if diff_rows:
        df_diff = pd.concat(diff_rows, ignore_index=True)
    else:
        df_diff = pd.DataFrame(columns=df_all.columns)

    return df_diff.sort_values(by=['inchi_key', 'source_file'])

In [3]:
# Compare the HILIC positive EMA-standards and ISTDsEtcV7 TSV files.
ema_tsv = '/Users/BKieft/Metabolomics/metatlas-data/HILIC/HILIC_EMA-standards_positive.tsv'
istd_tsv = '/Users/BKieft/Metabolomics/metatlas-data/HILIC/HILIC_ISTDsEtcV7_positive.tsv'
result_df = find_rt_differences(ema_tsv, istd_tsv)

# Display only the identifying and retention-time columns.
report_columns = ['label', 'inchi_key', 'adduct', 'rt_peak', 'rt_min', 'rt_max', 'source_file']
result_df[report_columns]

Unnamed: 0,label,inchi_key,adduct,rt_peak,rt_min,rt_max,source_file
0,N-acetyl-glucosamine,OVRNDRQMDRJTHS-RTRLPJTCSA-N,[M+H]+,6.94819,6.19819,7.69819,HILIC_EMA-standards_positive.tsv
1,N-acetyl-glucosamine (unlabeled),OVRNDRQMDRJTHS-RTRLPJTCSA-N,[M+H]+,6.707815,6.407815,7.007815,HILIC_ISTDsEtcV7_positive.tsv


In [4]:
# Compare the HILIC negative EMA-standards and ISTDsEtcV7 TSV files.
ema_tsv = '/Users/BKieft/Metabolomics/metatlas-data/HILIC/HILIC_EMA-standards_negative.tsv'
istd_tsv = '/Users/BKieft/Metabolomics/metatlas-data/HILIC/HILIC_ISTDsEtcV7_negative.tsv'
result_df = find_rt_differences(ema_tsv, istd_tsv)

# Display only the identifying and retention-time columns.
report_columns = ['label', 'inchi_key', 'adduct', 'rt_peak', 'rt_min', 'rt_max', 'source_file']
result_df[report_columns]

Unnamed: 0,label,inchi_key,adduct,rt_peak,rt_min,rt_max,source_file
0,galactitol,FBPFZTCFMRRESA-GUCUJZIJSA-N,[M-H]-,9.713578,8.963578,10.463578,HILIC_EMA-standards_negative.tsv
1,galactitol (unlabeled),FBPFZTCFMRRESA-GUCUJZIJSA-N,[M-H]-,9.513187,9.213187,9.813187,HILIC_ISTDsEtcV7_negative.tsv
2,arabitol,HEBKCHPVOIAQTA-IMJSIDKUSA-N,[M-H]-,5.358367,4.608367,6.108367,HILIC_EMA-standards_negative.tsv
3,arabinitol (unlabeled),HEBKCHPVOIAQTA-IMJSIDKUSA-N,[M-H]-,5.187916,4.887916,5.487916,HILIC_ISTDsEtcV7_negative.tsv
4,xylitol,HEBKCHPVOIAQTA-NGQZWQHPSA-N,[M-H]-,5.023195,4.273195,5.773195,HILIC_EMA-standards_negative.tsv
5,xylitol (unlabeled),HEBKCHPVOIAQTA-NGQZWQHPSA-N,[M-H]-,4.846595,4.546595,5.146595,HILIC_ISTDsEtcV7_negative.tsv
6,ribitol,HEBKCHPVOIAQTA-ZXFHETKHSA-N,[M-H]-,4.934977,4.184977,5.684977,HILIC_EMA-standards_negative.tsv
7,ribitol (unlabeled),HEBKCHPVOIAQTA-ZXFHETKHSA-N,[M-H]-,4.75339,4.45339,5.05339,HILIC_ISTDsEtcV7_negative.tsv
8,pyruvic acid,LCTONWCANYUPML-UHFFFAOYSA-N,[M-H]-,6.207383,5.457383,6.957383,HILIC_EMA-standards_negative.tsv
9,pyruvic acid (unlabeled),LCTONWCANYUPML-UHFFFAOYSA-N,[M-H]-,5.487306,5.187306,5.787306,HILIC_ISTDsEtcV7_negative.tsv


In [5]:
# Compare the C18 negative EMA-standards and ISTD TSV files.
ema_tsv = '/Users/BKieft/Metabolomics/metatlas-data/C18/C18_EMA-standards_negative.tsv'
istd_tsv = '/Users/BKieft/Metabolomics/metatlas-data/C18/C18_ISTD_negative.tsv'
result_df = find_rt_differences(ema_tsv, istd_tsv)

# Display only the identifying and retention-time columns.
report_columns = ['label', 'inchi_key', 'adduct', 'rt_peak', 'rt_min', 'rt_max', 'source_file']
result_df[report_columns]

Unnamed: 0,label,inchi_key,adduct,rt_peak,rt_min,rt_max,source_file
0,inosine,UGQMRVRMYYASKQ-KQYNXXCUSA-N,[M-H]-,0.916547,0.716547,1.116547,C18_EMA-standards_negative.tsv
1,inosine (unlabeled),UGQMRVRMYYASKQ-KQYNXXCUSA-N,[M-H]-,1.02,0.72,1.32,C18_ISTD_negative.tsv


In [6]:
# Compare the C18 positive EMA-standards and ISTD TSV files.
ema_tsv = '/Users/BKieft/Metabolomics/metatlas-data/C18/C18_EMA-standards_positive.tsv'
istd_tsv = '/Users/BKieft/Metabolomics/metatlas-data/C18/C18_ISTD_positive.tsv'
result_df = find_rt_differences(ema_tsv, istd_tsv)

# Display only the identifying and retention-time columns.
report_columns = ['label', 'inchi_key', 'adduct', 'rt_peak', 'rt_min', 'rt_max', 'source_file']
result_df[report_columns]

Unnamed: 0,label,inchi_key,adduct,rt_peak,rt_min,rt_max,source_file
0,inosine,UGQMRVRMYYASKQ-KQYNXXCUSA-N,[M+H]+,0.884954,0.684954,1.084954,C18_EMA-standards_positive.tsv
1,inosine (unlabeled),UGQMRVRMYYASKQ-KQYNXXCUSA-N,[M+H]+,1.02,0.72,1.32,C18_ISTD_positive.tsv
