In [15]:
import pandas as pd
import numpy as np

In [16]:
# TraceFinder results file: contains the m/z and rt of the native compounds
# (which were found using the IS).
IS_file = "all_no_Br_Cl.csv"
# Results file from the feature finder.
fl_file = "results/process_feature_output_formatting_openms/output_start_parameter_worked_9_FL_param_QT_3/linked_features_quantification.tsv"

In [17]:
# Load both input tables. The feature-finder output is tab-separated; the
# TraceFinder file is comma-separated, which is read_csv's default separator.
fl_df = pd.read_csv(fl_file, sep="\t")
is_df = pd.read_csv(IS_file)

def rt_range_check(rt_1,rt_2,threshold=10):
    """Tell whether two retention times lie within `threshold` of each other.

    Args:
        rt_1: First retention time.
        rt_2: Second retention time (same unit as `rt_1`).
        threshold: Largest absolute difference still counted as a match
            (inclusive). Defaults to 10.

    Returns:
        True when abs(rt_1 - rt_2) <= threshold, otherwise False. A NaN on
        either side makes the comparison fail, so the result is False.
    """
    return bool(abs(rt_1 - rt_2) <= threshold)

def calculate_ppm_error_between_two_masses(exact_mass,observed_mass):
    """Calculate the ppm error between two masses.

    Args:
        exact_mass: Reference mass; the error is expressed relative to it.
        observed_mass: Mass to compare against the reference.

    Returns:
        The signed error (observed_mass - exact_mass) * 1e6 / exact_mass.
        It is positive when the observed mass is the heavier one.
    """
    parts_per_million = 1000000
    return (observed_mass - exact_mass) * parts_per_million / exact_mass

In [18]:
# Pull the m/z and retention-time columns of both tables out as plain Python
# lists; the order of each list follows the row order of its DataFrame.
is_mz_list, is_rt_list = (is_df[column].to_list() for column in ("m/z-H", "Rt(s)"))
fl_mz_list, fl_rt_list = (fl_df[column].to_list() for column in ("mz_cf", "rt_cf"))

In [19]:
import math
import statistics

# Finding features matching the results from TraceFinder.
#
# Every target row of is_df is compared with every feature row of fl_df. A pair
# is a match when the retention times differ by at most RT_TOLERANCE and the
# absolute ppm error between the two m/z values is below PPM_TOLERANCE.
# For each match one record is stored with:
#   coverage - fraction of "mz_Sample" columns that hold a value for the feature
#   cv       - stdev/mean of the non-missing "intensity_Sample" values, or "ND"
#              when it cannot be computed (fewer than two values, or mean of 0)
RT_TOLERANCE = 30  # same unit as the "Rt(s)" / "rt_cf" columns
PPM_TOLERANCE = 5
OUTPUT_COLUMNS = ["met_id", "mz_fl", "mz_is", "rt_fl", "rt_is", "rt_diff",
                  "ppm_error", "cmpd_type", "cmpd_name", "coverage", "cv"]

# The per-sample columns do not depend on the matched pair, so find them once.
# Columns whose name contains "cf" are skipped, as in the per-match scan before.
sample_columns = [col for col in fl_df.columns if "cf" not in col]
mz_sample_columns = [col for col in sample_columns if "mz_Sample" in col]
intensity_sample_columns = [col for col in sample_columns if "intensity_Sample" in col]

# Name and type are read from the same is_df row as the m/z and rt being
# matched. Looking them up by m/z alone returns the first row with that m/z,
# which is the wrong compound when two targets share an m/z.
is_name_list = is_df["Abbreviation"].to_list()
is_type_list = is_df["Native/IS"].to_list()

i = 0  # running count of target/feature matches
output_list = []
for mz, rt, raw_name, raw_type in zip(is_mz_list, is_rt_list, is_name_list, is_type_list):
    for row_pos, (mz_fl, rt_fl) in enumerate(zip(fl_mz_list, fl_rt_list)):
        if not rt_range_check(rt, rt_fl, threshold=RT_TOLERANCE):
            continue
        ppm_error = calculate_ppm_error_between_two_masses(mz, mz_fl)
        # Written as "not <" so that a NaN ppm error is rejected.
        if not abs(ppm_error) < PPM_TOLERANCE:
            continue

        # Take the feature row by position rather than re-finding it through
        # float equality on rt/mz; the double brackets keep a one-row DataFrame
        # so every column keeps its own dtype.
        subset_df = fl_df.iloc[[row_pos]]
        met_id = subset_df["Met_ID"].to_list()[0]

        mz_values = [float(subset_df[col].iloc[0]) for col in mz_sample_columns]
        count = sum(1 for value in mz_values if not math.isnan(value))
        num_rows = len(mz_sample_columns)

        intensity_values = [float(subset_df[col].iloc[0]) for col in intensity_sample_columns]
        intensity_list = [value for value in intensity_values if not math.isnan(value)]

        i += 1
        try:
            cv = str(round(statistics.stdev(intensity_list) / statistics.mean(intensity_list), 4))
        except (statistics.StatisticsError, ZeroDivisionError):
            # Too few intensities for a standard deviation, or a mean of zero.
            cv = "ND"

        output_list.append({
            "met_id": met_id,
            "mz_fl": mz_fl,
            "mz_is": mz,
            "rt_fl": rt_fl,
            "rt_is": rt,
            "rt_diff": abs(rt - rt_fl),
            "ppm_error": ppm_error,
            "cmpd_type": raw_type.strip(),
            "cmpd_name": raw_name.strip(),
            "coverage": str(count / num_rows),
            "cv": cv,
        })

# Passing the column names keeps the frame usable when nothing matched
# (otherwise it has no columns and df["cmpd_name"] raises KeyError).
df = pd.DataFrame(output_list, columns=OUTPUT_COLUMNS)
# unique() keeps first-appearance order, so the compound order (and with it the
# row order of anything built from it) is the same on every run.
all_compounds = df["cmpd_name"].unique().tolist()

In [20]:
# Selecting features based on criteria
def check_df_and_sort(input_df):
    """Pick the best candidate row(s) for one compound.

    Selection rule:
      1. Keep the rows with the highest coverage.
      2. If more than one row is left, keep those with the smallest rt_diff
         among the rows that survived step 1.

    The rt_diff minimum is taken within the highest-coverage rows. Taking it
    over all rows can leave no row satisfying both conditions, which returns
    an empty frame. Coverage is compared numerically, so it may be stored
    either as numbers or as numeric strings such as "0.56". The ppm_error
    column is not used for the selection.

    Args:
        input_df: DataFrame with at least the columns "coverage" and "rt_diff".

    Returns:
        A DataFrame holding the selected row(s) of `input_df`, unchanged. More
        than one row is returned only when rows tie on both coverage and
        rt_diff. An empty `input_df` is returned as is.
    """
    if len(input_df) == 0:
        return input_df

    coverage = input_df["coverage"].astype(float)
    best_coverage_rows = input_df[coverage == coverage.max()]
    if len(best_coverage_rows) == 1:
        return best_coverage_rows

    lowest_rt_diff = best_coverage_rows["rt_diff"].min()
    return best_coverage_rows[best_coverage_rows["rt_diff"] == lowest_rt_diff]

In [11]:
# Writing results to a file
# For every compound, the IS candidates are handled first and the Native
# candidates second. A group with more than one candidate is reduced with
# check_df_and_sort; a group with zero or one row is kept as it is.
list_df = []
for cmpd in all_compounds:
    name_matches = df["cmpd_name"] == cmpd
    native_rows = name_matches & (df["cmpd_type"] == "Native")
    if cmpd == "N-MeFOSAA":
        # For this compound only, Native candidates must also be within 10 of the expected rt.
        native_rows = native_rows & (df["rt_diff"] <= 10)
    subset_native_df = df[native_rows]
    subset_is_df = df[name_matches & (df["cmpd_type"] == "IS")]

    for subset in (subset_is_df, subset_native_df):
        list_df.append(check_df_and_sort(subset) if len(subset) > 1 else subset)

df_out = pd.concat(list_df)
df_out.to_csv("found_features.csv")

In [21]:
# Display the selected features (the frame that was written to found_features.csv).
df_out

Unnamed: 0,met_id,mz_fl,mz_is,rt_fl,rt_is,rt_diff,ppm_error,cmpd_type,cmpd_name,coverage,cv
17,12,506.95749,506.95704,470.563173,468.0,2.563173,0.887987,IS,M8PFOS,1.0,0.145
12,1747,267.993683,267.992765,279.963379,262.2,17.763379,3.426358,IS,M5PFPeA,1.0,0.1668
34,26008,298.943945,298.94299,305.47155,292.8,12.67155,3.195319,Native,L-PFBS,0.04,0.2593
23,173,512.960155,512.96004,494.488819,492.6,1.888819,0.224434,Native,PFDA,1.0,0.5797
0,1779,614.960118,614.96035,533.97238,533.4,0.57238,-0.377892,IS,MPFDoA,1.0,0.1438
9,2123,366.983206,366.98302,405.241211,399.6,5.641211,0.506333,IS,M4PFHpA,1.0,0.1438
35,9071,348.940907,348.9398,367.092377,360.0,7.092377,3.173651,Native,L-PFPeS,0.56,0.8319
36,38885,326.974326,326.97429,324.397134,341.4,17.002866,0.109099,Native,4:2FTS,0.01,ND
22,2154,589.014326,589.01436,518.541674,517.8,0.741674,-0.057384,IS,d5-N-EtFOSAA,1.0,0.1504
4,1790,428.97573,428.9746,437.218487,434.4,2.818487,2.633674,IS,M2-6:2FTS,1.0,0.1803
