# Table of Content

<a name="outline"></a>

## Setup

- [A](#seca) External Imports
- [B](#secb) Internal Imports
- [C](#secc) Configurations and Paths 
- [D](#secd) JAX Interface
- [E](#sece) General Utility Functions


## Clustering

- [1](#sec2) Disease Embeddings Clustering
- [2](#sec3) Subject Embeddings Clustering

<a name="seca"></a>

### A External Imports [^](#outline)

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import glob
import random
from collections import defaultdict
from pathlib import Path

from IPython.display import display

import pandas as pd

from tqdm import tqdm
import jax
jax.config.update('jax_platform_name', 'cpu')

<a name="secb"></a>

### B Internal Imports [^](#outline)

In [2]:


sys.path.append("..")

from lib import utils as U
from lib.ehr.dataset import load_dataset

# Assign the folder of the dataset to `DATA_FILE`.

HOME = os.environ.get('HOME')
DATA_STORE = f'{HOME}/Documents/DS211/users/tb1009/DATA'
DATA_FILE = os.path.join(DATA_STORE, 'ICE_TEST_1000.csv')
SOURCE_DIR = os.path.abspath("..")

<a name="secc"></a>

### C Configurations and Paths [^](#outline)

In [3]:
output_dir = 'cprd_artefacts'
Path(output_dir).mkdir(parents=True, exist_ok=True)


In [4]:
with U.modified_environ(DATA_FILE=DATA_FILE):
    cprd_dataset = load_dataset('CPRD')
   

In [5]:
output_dir = 'cprd_clustering_artefacts'
Path(output_dir).mkdir(parents=True, exist_ok=True)


<a name="secd"></a>

### D JAX Interface [^](#outline)

### Configuration should match the same configuration used in training in `cprd2_dx_training.ipynb`

In [6]:

%load_ext autoreload
%autoreload 2

from lib.ehr.coding_scheme import DxLTC212FlatCodes, DxLTC9809FlatMedcodes, EthCPRD5, EthCPRD16
from lib.ehr import OutcomeExtractor, FirstOccurrenceOutcomeExtractor
from lib.ehr import Subject_JAX
from lib.ehr import StaticInfoFlags

%load_ext autoreload
%autoreload 2

code_scheme = {
    #'dx': DxLTC9809FlatMedcodes(), # other options 
    'dx': DxLTC212FlatCodes(),
    #'outcome': OutcomeExtractor('dx_cprd_ltc9809'),
    'outcome': FirstOccurrenceOutcomeExtractor('dx_cprd_ltc212'),
    # Comment above^, and uncomment below, to consider only the first occurrence of codes per subject.
    # 'outcome': FirstOccurrenceOutcomeExtractor('dx_cprd_ltc9809'),
    'eth': EthCPRD5()
}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:

static_info_flags = StaticInfoFlags(
 gender=True,
 age=True,
 idx_deprivation=True,
 ethnicity=EthCPRD5(), # <- include it by the category of interest, not just 'True'.
)
cprd_interface = Subject_JAX.from_dataset(cprd_dataset, code_scheme=code_scheme, static_info_flags=static_info_flags)
cprd_all_subjects = list(cprd_interface.keys())
cprd_splits = cprd_interface.random_splits(split1=0.7, split2=0.85, random_seed=42)

<a name="sec1"></a>

## 1 Disease Embeddings Clustering on CPRD [^](#outline)

In [8]:


# Should be the same one used in JAX interface in the training notebook.
#dx_scheme = DxLTC9809FlatMedcodes()
dx_scheme = DxLTC212FlatCodes()


In [9]:
# scheme indices (textual code -> integer index)
dx_scheme.index

# reverse index (integer index -> textual code)
idx2code = {idx: code for code, idx in dx_scheme.index.items()}

### 1.A GloVe Based Disease Embeddings

Get the coocurrence matrix

In [10]:
# Time-window context coocurrence
cprd_cooc_timewin = cprd_interface.dx_coocurrence(cprd_all_subjects, window_size_days=365)

# Sequence context coocurrence
cprd_cooc_seqwin = cprd_interface.dx_coocurrence(cprd_all_subjects, context_size=20)

from lib.embeddings import train_glove

cprd_glove_timewin = train_glove(cprd_cooc_timewin, embeddings_size=100, iterations=50, prng_seed=0)
cprd_glove_seqwin = train_glove(cprd_cooc_seqwin, embeddings_size=100, iterations=50, prng_seed=0)

cprd_glove_timewin

array([[ 2.41131637e-01,  1.32867842e-01,  2.35407707e-01, ...,
        -7.18932031e-02,  2.57600973e-01, -1.59377858e-01],
       [-1.21855618e-02,  5.28275339e-02, -8.16538798e-03, ...,
        -9.27247390e-02, -2.62883764e-03, -2.21714140e-02],
       [ 2.63196072e-01, -3.44299220e-01,  4.78473886e-01, ...,
        -1.31982048e-01,  4.97720112e-01, -2.24525480e-01],
       ...,
       [-5.50115351e-03,  8.36920526e-04, -3.44570876e-03, ...,
        -6.41512597e-04,  6.54301361e-03, -1.60312106e-03],
       [ 1.92158283e-01,  2.36316220e-01,  1.84600157e-01, ...,
        -5.29295801e-01,  1.72118487e-01, -2.16294515e-01],
       [ 8.04683300e-03, -2.62595175e-03, -1.74527848e-03, ...,
        -5.35091290e-03,  1.77568208e-05, -4.10782401e-03]])

In [11]:
pd.DataFrame(cprd_cooc_timewin)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,202,203,204,205,206,207,208,209,210,211
0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,19870.0,0.0,2.0,0.0,14.0,264.0,0.0,0.0,...,350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1574.0,0.0,0.0,0.0,654.0,0.0,0.0,...,74.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,2.0,0.0,1030.0,0.0,102.0,504.0,0.0,0.0,...,176.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,402.0,0.0,0.0,0.0
209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0


In [22]:
df=pd.DataFrame(cprd_cooc_seqwin)
df.index = list(map(dx_scheme.desc.get, dx_scheme.codes))
df.columns = list(map(dx_scheme.desc.get, dx_scheme.codes))
df.to_csv('crpd_cooc_seqwin.csv')
df

Unnamed: 0,Abdominal Aortic Aneurysm,Ankylosing spondylitis,Myocardial Infarction,Neuropathic Bladder,Non-Hodgkin Lymphoma,Nonrheumatic aortic valve disorders,Nonrheumatic mitral valve disorders,Obesity,Obsessive-compulsive disorder,Obstructive and reflux uropathy,...,Low HDL-C,Lupus Erythematosus,Macular degeneration,Meniere's Disease,Migraine,Motor neurone disease,Multiple sclerosis,Multiple valve disorder,Myasthenia gravis,Myelodysplastic Syndrome
Abdominal Aortic Aneurysm,208.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ankylosing spondylitis,0.0,612.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Myocardial Infarction,0.0,0.0,130774.0,60.0,44.0,0.0,74.0,3576.0,0.0,0.0,...,4076.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Neuropathic Bladder,0.0,0.0,60.0,12290.0,0.0,0.0,58.0,8536.0,0.0,0.0,...,758.0,0.0,0.0,0.0,160.0,0.0,0.0,0.0,0.0,0.0
Non-Hodgkin Lymphoma,0.0,0.0,44.0,0.0,6364.0,0.0,614.0,4476.0,0.0,0.0,...,1618.0,0.0,0.0,0.0,292.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Motor neurone disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Multiple sclerosis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4284.0,0.0,0.0,0.0
Multiple valve disorder,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Myasthenia gravis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,430.0,0.0


In [13]:
pd.DataFrame(cprd_cooc_seqwin, index=dx_scheme.codes)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,202,203,204,205,206,207,208,209,210,211
1,208.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,612.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,130774.0,60.0,44.0,0.0,74.0,3576.0,0.0,0.0,...,4076.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,0.0,0.0,60.0,12290.0,0.0,0.0,58.0,8536.0,0.0,0.0,...,758.0,0.0,0.0,0.0,160.0,0.0,0.0,0.0,0.0,0.0
102,0.0,0.0,44.0,0.0,6364.0,0.0,614.0,4476.0,0.0,0.0,...,1618.0,0.0,0.0,0.0,292.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4284.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,430.0,0.0


In [14]:
cprd_cooc_seqwin.shape

(212, 212)

In [15]:
cprd_glove_seqwin

array([[ 2.84833024e-01, -3.53228159e-01,  4.26388449e-02, ...,
        -2.96691735e-01, -3.31314378e-01,  3.48006081e-01],
       [ 4.40280176e-01, -4.09134863e-01,  7.11565475e-01, ...,
        -3.85159188e-01, -3.78043422e-01,  4.13431604e-01],
       [ 4.26620761e-01, -3.44591664e-01,  7.48666236e-01, ...,
        -4.23595087e-01, -5.17868585e-01,  5.61151193e-01],
       ...,
       [-4.86058618e-03,  2.55938824e-03, -3.09311979e-03, ...,
         1.58750234e-03, -4.70330287e-03,  6.04875977e-04],
       [ 3.50063750e-01, -3.34807706e-01,  1.94905066e-01, ...,
        -3.07402896e-01, -3.48481299e-01,  3.84545134e-01],
       [ 8.75142058e-05,  2.00905442e-03, -6.36480866e-04, ...,
         6.05566796e-03,  1.97579479e-03,  1.06183224e-04]])

In [17]:
dx_scheme.desc

{'1': 'Abdominal Aortic Aneurysm',
 '10': 'Ankylosing spondylitis',
 '100': 'Myocardial Infarction',
 '101': 'Neuropathic Bladder',
 '102': 'Non-Hodgkin Lymphoma',
 '103': 'Nonrheumatic aortic valve disorders',
 '104': 'Nonrheumatic mitral valve disorders',
 '105': 'Obesity',
 '106': 'Obsessive-compulsive disorder',
 '107': 'Obstructive and reflux uropathy',
 '108': 'Oesophageal varices',
 '109': 'Osteoarthritis (excl spine)',
 '11': 'Anterior and Intermediate Uveitis',
 '110': 'Osteoporosis',
 '111': 'Other haemolytic anaemias',
 '112': 'Pancreatitis',
 '113': "Parkinson's disease",
 '114': 'Pericardial Effusion',
 '115': 'Peripheral Vascular Disease',
 '116': 'Peripheral Neuropathy',
 '117': 'Personality disorders',
 '118': 'Plasma Cell Malignancy',
 '119': 'Pleural effusion',
 '12': 'Anxiety disorders',
 '120': 'Pleural plaque',
 '121': 'Polycystic ovarian syndrome',
 '122': 'Polycythaemia vera',
 '123': 'Polymyalgia Rheumatica',
 '124': 'Portal hypertension',
 '125': 'Posterior Uve

### 1.B Predictor Based Disease Embeddings

TODO

### 1.C Predictor Based Subject Embeddings

TODO

In [16]:


# def embeddings_dictionary(clf):
#     model, state = cprd_predictors[clf]
#     params = model.get_params(state)
#     # Embeddings Mat
#     dx_G = model.dx_emb.compute_embeddings_mat(params['dx_emb'])

#     embeddings_dict = {}
#     for code, idx in dx_scheme.index.items():
#         in_vec = np.zeros((cprd_interface.dx_dim, ))
#         in_vec[idx] = 1.
#         out_vec = model.dx_emb.encode(dx_G, in_vec)
#         embeddings_dict[code] = out_vec
#     return embeddings_dict

# icenode_emb = embeddings_dictionary('ICE-NODE')
# icenode_uni_emb = embeddings_dictionary('ICE-NODE_UNIFORM')
# retain_emb = embeddings_dictionary('RETAIN')
# gru_emb = embeddings_dictionary('GRU')


# def subject_embeddings_dictionary(clf):
#     model, state = cprd_predictors[clf]
#     # All subjects in the study are passed
#     return model.subject_embeddings(state, cprd_interface.subjects)

# icenode_subj_emb = subject_embeddings_dictionary('ICE-NODE')
# icenode_subj_uni_emb = subject_embeddings_dictionary('ICE-NODE_UNIFORM')
# retain_subj_emb = subject_embeddings_dictionary('RETAIN')
# gru_subj_emb = subject_embeddings_dictionary('GRU')