In [None]:
# Installing the library via pip

!pip install --user bed-reader ukb_loader

In [1]:
import os

# Directory with the dataset splits; it is passed to every loader below.
split_dir = '/gpfs/gpfs0/ukb_data/processed_data/all/splits'
# Location of the 'white_british' split inside split_dir.
split_path = f'{split_dir}/white_british'

# Make sure the parent data directory ('.../processed_data/all') exists.
# makedirs(exist_ok=True) creates missing ancestors as well and does not fail
# when the directory is already there, so no separate (racy) exists() check is
# needed and the path literal is not repeated.
os.makedirs(os.path.dirname(split_dir), exist_ok=True)

## Real-valued phenotypes

Real-valued phenotypes are loaded with the `UKBDataLoader` class. For example, to load the phenotype 'standing height' (data field 50) together with the additional features age and sex (data fields 21003 and 31):

In [2]:
from ukb_loader import UKBDataLoader

In [3]:
# Target phenotype: standing height (field 50); extra features: age (21003) and sex (31).
loader = UKBDataLoader(split_dir, 'white_british', '50', ['21003', '31'])

# Load the three parts of the split one by one, then preview the training part.
train = loader.load_train()
val = loader.load_val()
test = loader.load_test()
train.head()

Unnamed: 0,21003,31,50
1000011,60.0,0.0,164.0
1000026,45.0,0.0,164.0
1000044,42.0,0.0,165.0
1000058,57.0,0.0,167.5
1000060,67.0,0.0,156.0


## ICD-10 phenotypes

Phenotypes based on [ICD-10 disease codes](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=41270) are loaded with `BinaryICDLoader`. For example, to load non-insulin dependent diabetes mellitus without complications (ICD-10 code E11.9):

In [4]:
from ukb_loader import BinaryICDLoader

In [5]:
split_name = 'white_british'
icd_field_code = '41270'  # ICD-10 field; do not change this value for the ICD loader!
icd_disease_code = 'E119'  # E11.9: non-insulin dependent diabetes mellitus without complications
features = ['31', '50', '21002']  # sex, height, weight

loader = BinaryICDLoader(
    split_dir,
    split_name,
    icd_field_code,
    features,
    icd_disease_code,
)

# Load the three parts of the split one by one, then preview the training part.
train = loader.load_train()
val = loader.load_val()
test = loader.load_test()
train.head()

Unnamed: 0,31,50,21002,41270
1000044,0.0,165.0,60.599998,0.0
1000058,0.0,167.5,56.200001,0.0
1000060,0.0,156.0,71.5,0.0
1000075,0.0,157.199997,58.400002,0.0
1000097,0.0,162.5,79.599998,0.0


## Self-reported phenotypes

Self-reported diseases are loaded through `BinarySDLoader`, with the appropriate field code - 20002 for non-cancer illnesses, and 20001 for cancerous ones.

* [List of non-cancer disease codes](https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=6)
* [List of cancerous disease codes](https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=3)

For example, for (non-cancer) diabetes:

In [6]:
from ukb_loader import BinarySDLoader

In [7]:
split_name = 'white_british'
sd_field_code = '20002'  # self-reported non-cancer illnesses
sd_disease_code = 1220  # diabetes: umbrella code, includes type I and type II diabetes
features = ['31', '50', '21002']  # sex, height, weight

loader = BinarySDLoader(
    split_dir,
    split_name,
    sd_field_code,
    features,
    sd_disease_code,
    na_as_false=True,
)

# Load the three parts of the split one by one, then preview the training part.
train = loader.load_train()
val = loader.load_val()
test = loader.load_test()
train.head()

Unnamed: 0,31,50,21002,20002
1000011,0.0,164.0,77.699997,0.0
1000026,0.0,164.0,61.0,0.0
1000044,0.0,165.0,60.599998,0.0
1000058,0.0,167.5,56.200001,0.0
1000060,0.0,156.0,71.5,0.0
