In [1]:
from pathlib import Path

import pandas as pd
from scipy import stats

In [2]:
# Project layout, resolved relative to the directory the notebook is run from.
THIS_DIR = Path(".").resolve()
ROOT = THIS_DIR.parent.parent.resolve()  # two levels above the working directory

# Input locations under <ROOT>/src/data.
DATA_DIR = ROOT.joinpath("src", "data")
LIPIDOMICS_DIR = DATA_DIR.joinpath("lipidomics")

# Build-output location; created up front (including parents) if it does not exist.
BLD_DATA = ROOT.joinpath("bld", "data")
BLD_DATA.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the three pickled inputs from BLD_DATA.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load files
# produced by this project's own build steps.
data = pd.read_pickle(BLD_DATA.joinpath("clean_data.pkl"))
lipidomics = pd.read_pickle(BLD_DATA.joinpath("clean_lipid_intensities.pkl"))
prs = pd.read_pickle(BLD_DATA.joinpath("clean_prs.pkl"))

# Which variables to focus on?

We use the two-sample Kolmogorov-Smirnov test to compare the distribution of each variable between clusters.

*note to self: why distributions and not means*

In [4]:
def _get_interesting_cluster_variable_relationship(
    data, p_value_threshold, baseline_cluster="5"
):
    """Find (variable, cluster) pairs whose distribution differs from the baseline cluster.

    For every non-baseline cluster and every column of ``data`` except the first,
    a two-sample Kolmogorov-Smirnov test (``scipy.stats.ks_2samp``) compares the
    column's values in that cluster with its values in the baseline cluster.
    No multiple-testing correction is applied; this is for exploratory purposes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"cluster_label"`` column. Every column from position 1
        onwards is tested (assumes ``"cluster_label"`` is the first column --
        TODO confirm against the cleaning step).
    p_value_threshold : float
        Only pairs with a p-value strictly below this threshold are returned.
    baseline_cluster : str, default "5"
        Label of the reference cluster every other cluster is compared against.

    Returns
    -------
    dict
        Maps ``(column, cluster)`` to the KS-test p-value, restricted to pairs
        with ``p_value < p_value_threshold``.

    Raises
    ------
    KeyError
        Propagated from ``get_group`` if ``baseline_cluster`` is not a group
        in ``data["cluster_label"]``.
    """
    cluster_labels = data["cluster_label"].unique()
    other_clusters = cluster_labels[cluster_labels != baseline_cluster]
    variables = data.columns[1:]

    # Group once instead of rebuilding the groupby for every (cluster, column)
    # pair; the baseline rows are the same for every comparison.
    grouped = data.groupby("cluster_label", observed=True)
    baseline_data = grouped.get_group(baseline_cluster)

    p_values = {}
    for cluster in other_clusters:
        cluster_data = grouped.get_group(cluster)
        for col in variables:
            _, p_value = stats.ks_2samp(baseline_data[col], cluster_data[col])
            p_values[(col, cluster)] = p_value

    # A NaN p-value compares False against the threshold and is dropped here.
    return {
        pair: p_value
        for pair, p_value in p_values.items()
        if p_value < p_value_threshold
    }

In [5]:
# Exploratory screen at the 5% level (no multiple-testing correction).
results = _get_interesting_cluster_variable_relationship(data, 0.05)
# Only the last expression of a cell is displayed, so a bare `len(results)`
# here would be evaluated and discarded; print the count explicitly.
print(f"Number of significant (variable, cluster) pairs: {len(results)}")
results

{('gpeakneg1472', '1'): 0.01706811240026365,
 ('gpeakneg1488', '1'): 0.045035301376136976,
 ('gpeakneg1652', '1'): 0.02684754439852551,
 ('gpeakneg1661', '1'): 0.014000975657107001,
 ('gpeakneg3514', '1'): 0.026219184521782117,
 ('gpeakneg3534', '1'): 0.004064998583606416,
 ('gpeakneg3819', '1'): 0.03372617664173577,
 ('gpeakneg3836', '1'): 0.007169885241369066,
 ('gpeakneg3852', '1'): 0.03173486498697656,
 ('gpeakneg3944', '1'): 0.000871694783480486,
 ('gpeakneg3952', '1'): 0.03579436596647986,
 ('gpeakneg3953', '1'): 0.03652203126206991,
 ('gpeakneg4047', '1'): 0.01571520414627248,
 ('gpeakneg4062', '1'): 0.02581603238092867,
 ('gpeakneg4069', '1'): 0.00394030819445574,
 ('gpeakneg4081', '1'): 0.04791867796914643,
 ('gpeakneg4172', '1'): 0.030437615598867995,
 ('gpeakneg4323', '1'): 0.005731819069909711,
 ('gpeakneg4474', '1'): 0.02684754439852551,
 ('gpeakneg4501', '1'): 0.0444897284603011,
 ('gpeakneg4581', '1'): 0.02581603238092867,
 ('gpeakneg4632', '1'): 0.038002252069715164,
 (

# Focus on Cluster 5