In [3]:
import numpy as np
import pandas as pd

import pylab as plt
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os
import itertools

In [6]:
# Normalized intensity tables, one per matrix. They are read with header=None
# so that the rows of each file stay inside the frame as ordinary data rows
# and the columns get integer labels.
csf_data = pd.read_csv(
    "./0. Original Files/CSF_normalized.csv",
    header=None,
)
feci_data = pd.read_csv(
    "./0. Original Files/feci_normalized.csv",
    header=None,
)
pls_data = pd.read_csv(
    "./0. Original Files/PLS_normalized.csv",
    header=None,
)

# Matching "Filtered Metabolomics" sheets, also loaded without a header row
# so their columns can be addressed by integer position.
filtered_pls = pd.read_csv(
    "./0. Original Files/Filtered Metabolomics - Plasma.csv",
    header=None,
)
filtered_feci = pd.read_csv(
    "./0. Original Files/Filtered Metabolomics - Feci.csv",
    header=None,
)
filtered_csf = pd.read_csv(
    "./0. Original Files/Filtered Metabolomics - CSF.csv",
    header=None,
)

In [8]:
def filter_metabolomics(data, filtered_qc, out_file, label):
    """Keep only the QC-approved feature columns of a metabolomics table.

    Layout expected by this function (both frames are addressed by position,
    i.e. as loaded with ``header=None``):

    * ``data``: row 0 holds the feature IDs, row 1 holds the class of each
      feature, and column 0 is always retained (sample IDs).
    * ``filtered_qc``: column 0 is the feature ID, column 3 is the QC
      decision (only rows equal to ``"Keep"`` are used) and column 5 is the
      class that should replace the one in row 1 of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Metabolomics table to filter. It is not modified.
    filtered_qc : pandas.DataFrame
        QC sheet describing which features to keep and their new class.
    out_file : str
        Destination CSV path. Missing parent directories are created; a bare
        file name (no directory part) is written to the working directory.
    label : str
        Name used in the progress messages.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` restricted to column 0 plus the kept features, with
        row 1 updated from the QC sheet. The same frame is written to
        ``out_file`` without index or header.

    Raises
    ------
    ValueError
        If ``data`` has fewer than two rows (no feature-ID / class rows).
    """
    if data.shape[0] < 2:
        raise ValueError(
            f"{label}: expected at least 2 rows (feature IDs and classes), "
            f"got {data.shape[0]}"
        )

    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # First row of original data = feature_id
    first_row = data.iloc[0, :]
    print(f"{label} Data")
    print("Original # of columns:", len(first_row))

    # QC data - keep only rows marked "Keep"
    filtered_keep = filtered_qc[filtered_qc[3] == "Keep"]
    print("QC KEEP rows:", len(filtered_keep))

    # Map feature_id -> new class (a later duplicate ID overrides an earlier one)
    class_to_replace = dict(zip(filtered_keep[0], filtered_keep[5]))

    # Set lookup instead of scanning the whole ID array for every column.
    # Missing IDs are dropped so that an empty cell never matches anything.
    keep_ids = set(filtered_keep[0].dropna())
    matched = [pos for pos, val in enumerate(first_row.values) if val in keep_ids]

    # Always keep column 0 (sample IDs), followed by the matched features
    columns_to_keep = sorted({0, *matched})
    data_filtered = data.iloc[:, columns_to_keep].copy()

    # Replace the class in row 1 when the QC sheet provides one. Rows and
    # columns are addressed by position so the result does not depend on the
    # index/column labels of the incoming frame.
    # NOTE(review): an empty new-class cell in the QC sheet overwrites the
    # existing class with NaN, as before - confirm this is intended.
    replacements = 0
    for pos in range(data_filtered.shape[1]):
        feature_name = data_filtered.iloc[0, pos]
        old_val = data_filtered.iloc[1, pos]
        new_val = class_to_replace.get(feature_name, old_val)

        # NaN != NaN is True, so treat "both missing" as unchanged
        both_missing = pd.isna(new_val) and pd.isna(old_val)
        if not both_missing and new_val != old_val:
            data_filtered.iloc[1, pos] = new_val
            replacements += 1
    print("Class labels replaced:", replacements)

    # Save result
    data_filtered.to_csv(out_file, index=False, header=False)
    print(f"Saved {out_file}")

    return data_filtered

# Apply the QC filter to each matrix (PLS, CSF, Feci); every call writes its
# result into the "1. Filtered Data" folder and returns the filtered frame.
pls_data_filtered = filter_metabolomics(
    data=pls_data,
    filtered_qc=filtered_pls,
    out_file=os.path.join("1. Filtered Data", "pls_filtered.csv"),
    label="PLS",
)
csf_data_filtered = filter_metabolomics(
    data=csf_data,
    filtered_qc=filtered_csf,
    out_file=os.path.join("1. Filtered Data", "csf_filtered.csv"),
    label="CSF",
)
feci_data_filtered = filter_metabolomics(
    data=feci_data,
    filtered_qc=filtered_feci,
    out_file=os.path.join("1. Filtered Data", "feci_filtered.csv"),
    label="Feci",
)



--- PLS Data ---
Original # of columns: 142
QC KEEP rows: 118
Saved 1. Filtered Data/pls_filtered.csv

--- CSF Data ---
Original # of columns: 83
QC KEEP rows: 50
Saved 1. Filtered Data/csf_filtered.csv

--- Feci Data ---
Original # of columns: 124
QC KEEP rows: 113
Saved 1. Filtered Data/feci_filtered.csv
