MODS Phenotypes: Step 6. Grady Matrix Factorization
===

In [1]:
import os
import sys
from pathlib import Path
import warnings
# NOTE: this suppresses every warning category for the whole kernel session,
# so warnings raised by libraries in later cells will not be displayed.
warnings.filterwarnings("ignore")

import pandas as pd
import multiprocessing
import dill as pickle

# CPU count reported by multiprocessing. Not referenced again in the cells
# visible in this part of the notebook.
n_cpus = multiprocessing.cpu_count()

In [2]:
from sklearn.decomposition import NMF

In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="white", context="notebook")

In [4]:
from collections import ChainMap

In [5]:
import seaborn as sns

In [6]:
from sklearn.decomposition import NMF

In [7]:
# Put the gSpan checkout at the front of the import path; presumably this is
# what lets the `src.gspan_mining` imports in this cell resolve.
sys.path.insert(0, "/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/gSpan")
import shlex
from src.gspan_mining.main import parse_args, main
from src.gspan_mining.data_processing import process_graph_data

In [8]:
# Run configuration: both values are interpolated into the data file paths
# built by the cells that load and export artifacts.
site_name, run_id = "grady", "2023_07_29"

---

In [11]:
# Parquet file for this site/run (named as the SOFA scores table), read into
# a DataFrame.
data_fp = Path(
    f"/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/{run_id}/{site_name}_{run_id}_sofa_scores_df.parquet"
)
score_df = pd.read_parquet(path=data_fp)

In [12]:
%%time
# TODO: Find a way to speed up loading the pickle! it's 4GB+
# investigate multiprocessing https://gist.github.com/jonzarecki/0853bbaf1142adc9d79ab73d9b427f46
with open(f"/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/{run_id}/{site_name}_{run_id}_graph_miner.pkl", "rb") as dill_file:
    graph_miner = pickle.load(dill_file)

CPU times: user 15.6 s, sys: 1.33 s, total: 16.9 s
Wall time: 19.4 s


In [13]:
# Take a deep copy of the miner's `nnm_df` so this notebook works on its own
# object rather than the one held by `graph_miner`.
nnm_df = graph_miner.nnm_df.copy(deep=True)

# Disabled cohort filter, kept for reference only:
# # only include data for patients with MODS within 28 days
# # this shouldn't be done in the future -- when I did the 11/11 v1 results I accidentally left them in
# cohort_df = pd.read_parquet('/home/gmatlin/tmp/cohort_df_12112022.parquet')
# nnm_df = nnm_df.loc[:,nnm_df.columns.isin(list(cohort_df.index))]

In [14]:
# Number of NMF components to extract; each one is labelled as a phenotype
# in the cells that build F and C.
N_PHENOTYPES = 4

# NMF configuration: NNDSVD initialisation, an iteration cap of 10,000 and a
# fixed random_state.
model = NMF(n_components=N_PHENOTYPES, init="nndsvd", max_iter=10_000, random_state=0)

In [15]:
# F: the matrix returned by NMF.fit_transform, with one row per row of nnm_df
# and one column per component, labelled "phenotype_<k>".
# NOTE(review): the earlier comment described these axes as
# "graph_id (rows) by subgraph_patterns (columns)"; confirm what the rows of
# nnm_df actually represent.
factor_values = model.fit_transform(nnm_df)
F = pd.DataFrame(
    factor_values,
    index=nnm_df.index,
    columns=[f"phenotype_{i}" for i in range(factor_values.shape[1])],
)

In [16]:
# C: the fitted component matrix (model.components_), with one row per
# component, labelled "phenotype_<k>", and one column per column of nnm_df.
# THIS IS THE COMPONENT MATRIX WE WANT TO ANALYZE
component_values = model.components_
C = pd.DataFrame(
    component_values,
    index=[f"phenotype_{i}" for i in range(len(component_values))],
    columns=nnm_df.columns,
)

In [17]:
# Transpose C so that each column of nnm_df becomes a row and each phenotype
# becomes a column.
phenotype_df = C.transpose()

## Export

In [18]:
# Write the phenotype table to the data directory for this run.
phenotypes_fp = f"/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/{run_id}/{site_name}_{run_id}_phenotypes.parquet"
phenotype_df.to_parquet(phenotypes_fp)

In [19]:
# Sanity check on the exported table: (number of rows, number of phenotype columns).
phenotype_df.shape

(19591, 4)