In [None]:
import numpy as np
import pandas as pd
import glob
import os

In [None]:
# Load the feature matrix, index it by (mz, RT), discard the bookkeeping
# columns and treat missing intensities as 0.
FeatureMatrix = os.path.join("results", "features", "FeatureMatrix.tsv")
DF_features = (
    pd.read_csv(FeatureMatrix, sep="\t")
    .set_index(["mz", "RT"])
    .drop(columns=["charge", "quality", "id"])
    .fillna(0)
)
# Strip the first and last character of every "feature_ids" entry
# (presumably enclosing brackets -- confirm against the file) and split on commas.
DF_features["feature_ids"] = DF_features["feature_ids"].map(
    lambda raw_ids: raw_ids[1:-1].split(",")
)
DF_features

`1) Filter the feature matrix (optional)`

In [None]:
# Folder that receives all intermediate tables written by the cells below.
# makedirs also creates missing parent folders (e.g. "results/interim") and,
# with exist_ok=True, is a no-op when the folder is already there.
path = os.path.join("results", "interim", "analysis")
os.makedirs(path, exist_ok=True)

a (i) Remove all features detected in negative controls (make sure there is no cross-contamination)

In [None]:
media_list= [r"ISP2", r"FPY12", r"DNPM"] # different media/conditions (treatments)
# A feature counts as background when it is detected (non-zero) in at least
# this share of a medium's negative controls. With a single blank per medium
# this is the same as "detected in the blank".
min_blank_fraction = 0.5
for medium in media_list:
    # Sample columns of this medium; rows stay indexed by (mz, RT)
    Features_flt = DF_features.filter(regex=fr"{medium}")
    blanks = Features_flt.filter(regex="blank", axis=1).fillna(0)
    blank_cols = blanks.columns

    # Share of the blanks in which each feature has a non-zero intensity.
    # A medium without any blank column removes nothing instead of dividing by zero.
    n_blanks = blanks.shape[1]
    if n_blanks:
        blank_fraction = np.count_nonzero(blanks.to_numpy(), axis=1) / n_blanks
    else:
        blank_fraction = np.zeros(len(blanks))
    in_blanks = blank_fraction >= min_blank_fraction
    print(f"{medium}: removing {int(in_blanks.sum())} of {len(in_blanks)} features detected in the blanks")

    # Drop the background features (rows) first, then the blank samples (columns).
    # The table keeps (mz, RT) as its index so the CSV carries "mz" and "RT" columns,
    # which the noise filter further down reads back.
    Features_nb = Features_flt.loc[~in_blanks].drop(columns=blank_cols)
    filename = os.path.join(path, "No_NC_"+ medium + "_DF_features.csv")
    Features_nb.to_csv(filename, sep="\t")

a (ii) Or if there are multiple replicates, remove only the features detected in more than 50% of all the negative controls (or blanks)

In [None]:
media_list= [r"ISP2", r"FPY12", r"DNPM"] # different media/conditions (treatments)
for medium in media_list:
    # Sample columns of this medium; rows stay indexed by (mz, RT)
    Features_flt = DF_features.filter(regex=fr"{medium}")
    blanks = Features_flt.filter(regex="blank", axis=1).fillna(0)
    blank_cols = blanks.columns

    # Share of the blank replicates in which each feature has a non-zero intensity.
    # A medium without any blank column removes nothing.
    n_blanks = blanks.shape[1]
    if n_blanks:
        blank_fraction = np.count_nonzero(blanks.to_numpy(), axis=1) / n_blanks
    else:
        blank_fraction = np.zeros(len(blanks))
    # Only features found in MORE than 50% of the blanks are treated as background
    in_most_blanks = blank_fraction > 0.5
    print(f"{medium}: removing {int(in_most_blanks.sum())} of {len(in_most_blanks)} features detected in >50% of the blanks")

    # Drop the background features (rows) first, then the blank samples (columns).
    # NOTE: the file names are the same as those written by the a (i) cell,
    # so running this cell replaces that cell's output.
    Features_nb = Features_flt.loc[~in_most_blanks].drop(columns=blank_cols)
    filename = os.path.join(path, "No_NC_"+ medium + "_DF_features.csv")
    Features_nb.to_csv(filename, sep="\t")

b) Replace feature intensities lower than 10^4 with NaN (noise level for Orbitrap instruments)

In [None]:
def noise_flt(csvfile, threshold=10000):
    """Blank out intensities below the noise threshold in a feature table.

    Reads a tab-separated table that has "mz" and "RT" columns, treats zeros
    and every intensity below ``threshold`` as missing, removes the features
    (rows) that are left without any intensity and writes the result, with
    the remaining gaps filled with 0, next to the input file.

    Parameters
    ----------
    csvfile : str
        Path of the input table. A leading "No_NC_" in the file name is
        replaced by "noise_thr_" for the output file.
    threshold : float, optional
        Minimum intensity that is kept (default 10000).

    Returns
    -------
    None
        The filtered table is written to disk. The row index is written as
        well, so the file starts with an unnamed counter column.
    """
    Features = (
        pd.read_csv(csvfile, sep="\t")
        .set_index(["mz", "RT"])
        .sort_index(axis=1)
    )
    # Zeros mean "not detected"; make them NaN so they can never pass the threshold
    Features = Features.replace(0, np.nan)
    # Keep values >= threshold; everything else (including NaN) becomes NaN
    Features = Features.where(Features >= threshold)
    # Features without a single remaining intensity are dropped entirely
    DF = Features.dropna(how="all").fillna(0).reset_index()

    in_prefix = "No_NC_"
    name = os.path.basename(csvfile)
    if name.startswith(in_prefix):
        name = name[len(in_prefix):]
    file_path = os.path.join(os.path.dirname(csvfile), "noise_thr_" + name)
    DF.to_csv(file_path, sep="\t")

In [None]:
# Apply the noise threshold to every blank-filtered table in the analysis folder
no_blank_pattern = os.path.join(path, "No_NC_*.csv")
csvfiles = glob.glob(no_blank_pattern)
for csvfile in csvfiles:
    noise_flt(csvfile)

c) Set the intensities of a feature to NaN within a replicate group if the feature is detected in only 1 out of 3 replicates

In [None]:
def rep_filter(csvfile):
    """Blank out features that are detected in fewer than two samples of a group.

    Reads the tab-separated table ``csvfile`` (its "Unnamed: 0" column is used
    as the row index and then discarded by the re-indexing on "mz"/"RT"),
    groups the sample columns by a genome ID parsed from the column names and,
    per group and feature, replaces the values with NaN when fewer than two of
    them are non-zero. The result is written, tab-separated and without the
    row index, to the same folder under the input file name minus its first
    10 characters (the length of a "noise_thr_" prefix).

    Returns nothing; the output is the file written to disk.
    """
    Features= pd.read_csv(csvfile, sep="\t", index_col="Unnamed: 0")
    Features= Features.set_index(["mz", "RT"])
    Features= Features.sort_index(axis=1)
    # NOTE(review): `cols` is assigned here but not used anywhere in this function
    cols= Features.columns
    Features= Features.fillna(0)
    # Transpose so that every former column (presumably one sample) is a row;
    # reset_index then exposes the former column names in an "index" column
    Features= Features.transpose()
    Features= Features.reset_index()
    # Genome ID = first match of "NBC" + optional "_" + digits in the name ...
    Features['genomeID']=Features['index'].str.extract(r'(NBC_?\d*)')
    # ... or, where that did not match, an MDNAWGS / MDNA_WGS style ID
    Features['genomeID_MDNA']=Features['index'].str.extract(r'(MDNAWGS?\d*|MDNA_WGS_?\d*)')
    Features['genomeID']=Features['genomeID'].fillna(Features['genomeID_MDNA'])
    Features= Features.drop(columns=["genomeID_MDNA"])
    Features=Features.set_index(["index"])
    # NOTE(review): names matching neither pattern end up with a NaN genomeID;
    # how groupby/transform treats those rows should be verified
    Grouped= Features.groupby("genomeID")
    # Per group and per feature column: fewer than 2 non-zero values -> NaN,
    # otherwise the values are kept unchanged
    DF= Grouped.transform(lambda x: np.nan if np.count_nonzero(x)<2 else x)
    # Back to features as rows before writing
    DF=DF.transpose()
    DF=DF.reset_index()
    # Drop the first 10 characters of the file name to build the output name
    file_path = os.path.join(os.path.dirname(csvfile), os.path.basename(csvfile)[10:])
    DF.to_csv(file_path, sep="\t", index=None)

In [None]:
# Apply the replicate filter to every noise-thresholded table in the analysis folder
noise_thr_pattern = os.path.join(path, "noise_thr_*.csv")
csvfiles = glob.glob(noise_thr_pattern)
for csvfile in csvfiles:
    rep_filter(csvfile)

- Merge all tables on mz and RT:

In [None]:
def _load_filtered_matrix(file_name):
    """Read one filtered table from `path`, indexed by (mz, RT), gaps as 0, columns sorted."""
    table = pd.read_csv(os.path.join(path, file_name), sep="\t")
    return table.set_index(["mz", "RT"]).fillna(0).sort_index(axis=1)


Matrix_FPY12 = _load_filtered_matrix("FPY12_DF_features.csv")
Matrix_ISP2 = _load_filtered_matrix("ISP2_DF_features.csv")
Matrix_DNPM = _load_filtered_matrix("DNPM_DF_features.csv")

# Outer joins on (mz, RT) keep every feature seen in at least one medium
Matrix_ISP2_FPY12 = Matrix_FPY12.merge(Matrix_ISP2, on=["mz", "RT"], how="outer")
Matrix = Matrix_ISP2_FPY12.merge(Matrix_DNPM, on=["mz", "RT"], how="outer")

# Zeros become NaN again so that features absent from every sample can be dropped
cols = Matrix.columns
Matrix = Matrix.replace({0: np.nan}).dropna(how="all").reset_index()
Matrix.to_csv(os.path.join(path, "Matrix_Clean.csv"), sep="\t", index=None)