In [1]:
from pathlib import Path

import pandas as pd

In [2]:
THIS_DIR = Path(".").resolve()
ROOT = THIS_DIR.parent.parent.resolve()
DATA_DIR = ROOT / "src" / "data"
LIPIDOMICS_DIR = DATA_DIR / "lipidomics"
BLD_DATA = ROOT / "bld" / "data"
BLD_DATA.mkdir(parents=True, exist_ok=True)

In [None]:
def clean_data(sample_description, prs, lipid_intensities, cluster_labels):
    """Cleans and merges the sample description, Prs, lipidomic and clustering data.


    Args:
    PRS_df: pd.DataFrame with the PRS data
    lipidomics_df: pd.DataFrame with the lipidomics data
    cluster_df: pd.DataFrame with the clustering data

    Returns:
    clean_data: pd.DataFrame
    """

    clean_cluster_labels = _clean_cluster_labels(cluster_labels)
    clean_prs = _clean_prs(prs)
    clean_sample_description = _clean_sample_description(sample_description)
    clean_lipid_intensities = _clean_lipid_intensities(lipid_intensities)

    clean_dfs = [clean_prs, clean_lipid_intensities]
    # clean_data = pd.concat(clean_dfs, axis=1, join="outer")

    return {
        "clean_cluster_labels": clean_cluster_labels,
        "clean_prs": clean_prs,
        "clean_sample_description": clean_sample_description,
        "clean_lipid_intensities": clean_lipid_intensities,
        # "clean_data": clean_data,
    }


def _clean_sample_description(sample_description):
    sample_description = sample_description.set_index("Patient_ID")

    clean_sample_description = pd.DataFrame(index=sample_description.index)

    clean_sample_description["sex"] = sample_description["sex"].astype("category")
    clean_sample_description["age"] = sample_description["age"].astype("int")
    clean_sample_description["bmi"] = sample_description["bmi"]
    clean_sample_description["clinic"] = sample_description["clinic"].astype("category")
    clean_sample_description["year"] = sample_description["year"].astype("int")
    clean_sample_description["diagnosis"] = sample_description["diagnosis"].astype(
        "category"
    )

    return clean_sample_description


def _clean_cluster_labels(cluster_labels):
    cluster_labels = cluster_labels.set_index("cases")
    cluster_labels.index.name = "Patient_ID"

    clean_cluster_labels = pd.DataFrame(index=cluster_labels.index)

    clean_cluster_labels["cluster_label"] = cluster_labels["cluster_label"].astype(
        "category"
    )

    return clean_cluster_labels


def _clean_prs(prs):
    prs = prs.set_index("ID")
    prs.index.name = "Patient_ID"

    clean_prs = pd.DataFrame(index=prs.index)
    clean_prs["cluster_label"] = prs["group"].str.strip("subtype_").astype("category")
    for col in prs.filter(like="PRS_").columns:
        clean_prs[col] = prs[col].astype("float")

    return clean_prs


def _clean_lipid_intensities(lipid_intensities):
    clean_lipid_intensities = lipid_intensities.copy()
    clean_lipid_intensities = lipid_intensities.set_index("ID")
    # clean_lipid_intensities.index.name = "Patient_ID"
    # clean_lipid_intensities["cluster_label"] = clean_lipid_intensities["group"].str.strip("subtype_").astype("category")

    return clean_lipid_intensities

In [4]:
sample_description = pd.read_csv(
    LIPIDOMICS_DIR / "sample_description.csv", delimiter=";"
)

lipid_intensities = pd.read_csv(DATA_DIR / "lipidomics.csv", delimiter=",")
cluster_labels = pd.read_csv(DATA_DIR / "ClusterLabels.csv", delimiter=",")
prs = pd.read_csv(DATA_DIR / "PRS.csv", delimiter=",")
cleaned_dfs = clean_data(sample_description, prs, lipid_intensities, cluster_labels)

In [5]:
lipid_intensities = cleaned_dfs["clean_lipid_intensities"]
lipid_intensities

Unnamed: 0_level_0,group,gpeakneg1173,gpeakneg1346,gpeakneg1472,gpeakneg1488,gpeakneg1516,gpeakneg1533,gpeakneg1541,gpeakneg1580,gpeakneg163,...,gpeakpos9000,gpeakpos9264,gpeakpos9328,gpeakpos9390,gpeakpos9446,gpeakpos9505,gpeakpos9794,gpeakpos9863,gpeakpos9918,cluster_label
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aagd330,subtype_1,23.953115,19.032590,21.711034,23.276074,20.894306,18.521350,20.943489,23.375640,21.975547,...,23.499906,23.016226,25.307012,27.685522,28.564764,27.554664,23.705398,25.921821,26.790723,1
achw003,subtype_1,23.091465,18.295016,20.528966,22.914697,20.317376,18.448708,20.905166,23.536030,28.696845,...,22.698507,22.530926,24.676387,27.177574,28.059297,26.544728,23.621415,25.469170,26.191333,1
acok454,subtype_4,23.594061,17.441830,20.658724,22.545079,20.434966,18.026518,20.865525,23.276562,22.068363,...,21.685893,22.316555,24.635970,27.440255,28.090679,25.835989,21.775457,24.021762,24.714306,4
aefl766,subtype_3,23.669306,19.401374,21.811560,22.959660,21.096145,18.794862,21.053931,23.614547,21.968921,...,24.743807,24.402238,26.859925,29.417981,30.209562,28.531459,25.146466,27.543357,28.564865,3
afsj906,subtype_4,22.261613,17.850074,20.963390,21.751444,20.307047,18.205165,20.497534,23.402497,21.990139,...,25.338601,25.622456,28.246384,30.376965,31.456635,29.315817,25.311258,27.434843,28.508252,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zqqj674,subtype_1,24.164228,17.870878,19.858343,21.400172,20.143621,17.314938,19.873363,23.173253,21.934429,...,21.229557,22.246238,23.751704,25.681428,26.153762,24.600574,21.909442,24.055257,24.663974,1
ztdb717,subtype_5,23.587809,18.840581,20.317283,21.368215,21.392466,18.983043,20.896459,23.420478,22.007388,...,25.676515,25.099090,27.703098,30.221437,31.247346,29.473689,25.939504,28.422602,29.707527,5
zups091,subtype_4,21.869536,17.284922,19.742000,20.903232,20.534037,18.714675,21.012971,23.597155,21.899827,...,23.744796,23.086588,26.062156,28.811957,29.718069,27.667099,24.163964,26.878870,27.668983,4
zxkt256,subtype_3,24.807661,19.817501,21.396378,22.858777,21.521678,17.998458,21.327181,24.488474,22.023006,...,22.206517,21.645851,23.822696,26.170481,27.187378,26.174255,22.628255,24.639160,25.839792,3


In [6]:
for key, value in cleaned_dfs.items():
    value.to_csv(BLD_DATA / f"{key}.csv", sep=";", index=True)
    value.to_pickle(BLD_DATA / f"{key}.pkl")