# UKB Repo Demo

--> The UKB dataset is large: it has over 5K data fields for over 500K subjects



--> Currently the UKB dataset resides on the OpenMind cluster


--> Purpose of creating this library


--> Currently three modules are in development, providing access to the genetic, bulk and scalar data types of the UKB dataset.



In [1]:
# import the main repo and relevant modules you might need
import UKBRepo.UKBRepo as ukbr
from UKBRepo.UKBRepo import module_scalar_data_handler as scalar_module
from UKBRepo.UKBRepo import module_bulk_data_handler as bulk_module
from UKBRepo.UKBRepo import module_genetic_data_handler as genetic_module

# Scalar Data Access

Example Scalar Categories
- Smoking/Diet related field ids
- Education and Diet
- Ethnicity

In [3]:
# initialize the module object
scalar_handler_object=scalar_module.scalar_data_handler()

Init signature: scalar_module.scalar_data_handler()
Docstring:     
A class to represent family of methods for handling and fetching scalar data of UKB dataset

...

Attributes
----------
columns_to_read_for_field_id : list
    list of relevant columns to read from metadata file


Methods
-------
display_all_ukb_categories():
    displays all the categories

get_field_ids_for_category(category_name):
    retrieves filed ids for a category

get_subject_list_field_ids(category_name):
    retrieves list of subject relevant to that field id

get_data_scalar(field_id,subject_id):
    fetches scalar data from metadata file
File:           ~/PycharmProjects/UKBRepo/UKBRepo/module_scalar_data_handler.py
Type:           type

In [4]:
scalar_handler_object.display_all_ukb_categories()

['Abdominal composition',
 'ECG at rest, 12-lead',
 'Infectious Disease Antigens',
 'NMR metabolomics',
 'NMR metabolomics QC indicators',
 'Autorefraction',
 'Task functional brain MRI',
 'Abdominal MRI',
 'Social support',
 'ECG during exercise',
 'MET Scores',
 'Arterial stiffness',
 'Spirometry',
 'Brain MRI',
 'Telomeres',
 'Early life factors',
 'Family history',
 'Medical conditions',
 'Female-specific factors',
 'Eyesight',
 'Education',
 'Sexual factors',
 'Smoking',
 'Reception',
 'Diet',
 'Procedural metrics',
 'Blood biochemistry',
 'Blood biochemistry processing',
 'Alcohol',
 'T1 structural brain MRI',
 'Body composition by DXA',
 'Bone-densitometry of heel',
 'Medications',
 'Heart MRI',
 'Intraocular pressure',
 'Freesurfer BA exvivo',
 'Freesurfer a2009s',
 'Freesurfer desikan white',
 'Freesurfer desikan pial',
 'Freesurfer DKT',
 'Bone size, mineral and density by DXA',
 'Body composition by impedance',
 'Arterial spin labelling brain MRI',
 'Cardiac and aortic funct

In [5]:
scalar_handler_object.get_field_ids_for_category("Ethnicity")

Unnamed: 0.1,Unnamed: 0,Field ID,Description,Category,field_id_category
1101,1101,21000,Ethnic background,Ethnicity,Misc data type
4593,4593,3659,Year immigrated to UK (United Kingdom),Ethnicity,Misc data type


In [6]:
list_of_subjects=scalar_handler_object.get_subject_list_field_ids(21000)

print(len(list_of_subjects),type(list_of_subjects))

502417 <class 'numpy.ndarray'>


In [7]:
#fetch the scalar data
scalar_handler_object.get_data_scalar(21000,1000012)

Unnamed: 0,eid,21000-0.0,21000-1.0,21000-2.0
0,1000012,1001.0,,


In [8]:
# either use the encoding dictionary to evaluate a single value, or use the dictionary to apply the encoding to your
# pandas object
scalar_module.age_encoding_dict['1001.0']

'British'

# Genetic Data Access

In [9]:
# Initialize the genetic module objects
genetic_handler_object=genetic_module.genetic_data_handler()

In [10]:
#fetch the location of all genetic data stored on ocean filesystem
genetic_module.genetic_data_location

'/ocean/projects/asc170022p/tighu/UKB_Genetic_Data'

In [11]:
# set the chromosomes you require
genetic_handler_object.chromosome_number_list=["A","B"]

In [12]:
genetic_handler_object.get_genetic_data_batch(100)

Chunk(nsamples=1024, nvariants=1024)

# Bulk Data Access

In [13]:
# Initialize the bulk module objects
bulk_handler_object=bulk_module.bulk_data_handler()

In [17]:
bulk_handler_object.display_all_ukb_categories()[:10]

['Abdominal composition',
 'ECG at rest, 12-lead',
 'Infectious Disease Antigens',
 'NMR metabolomics',
 'NMR metabolomics QC indicators',
 'Autorefraction',
 'Task functional brain MRI',
 'Abdominal MRI',
 'Social support',
 'ECG during exercise']

In [18]:
bulk_handler_object.get_field_ids_for_category("T1 structural brain MRI")

Unnamed: 0.1,Unnamed: 0,Field ID,Description,Category,field_id_category
162,162,25733,Amount of warping applied to non-linearly alig...,T1 structural brain MRI,Misc data type
1003,1003,25731,Discrepancy between T1 brain image and standar...,T1 structural brain MRI,Misc data type
1004,1004,25732,Discrepancy between T1 brain image and standar...,T1 structural brain MRI,Misc data type
1612,1612,25925,Intensity scaling for T1,T1 structural brain MRI,Misc data type
1634,1634,25735,Inverted contrast-to-noise ratio in T1,T1 structural brain MRI,Misc data type
1635,1635,25734,Inverted signal-to-noise ratio in T1,T1 structural brain MRI,Misc data type
3218,3218,25756,Scanner lateral (X) brain position,T1 structural brain MRI,Misc data type
3219,3219,25758,Scanner longitudinal (Z) brain position,T1 structural brain MRI,Misc data type
3220,3220,25759,Scanner table position,T1 structural brain MRI,Misc data type
3221,3221,25757,Scanner transverse (Y) brain position,T1 structural brain MRI,Misc data type


In [19]:
# Field ID 20252: "T1 structural brain images - NIFTI" (category: T1 structural brain MRI, type: blob/bulk data)
bulk_handler_object.get_subject_list_field_ids(20252)

  bulk_handler_object.get_subject_list_field_ids(20252)


array([1000012, 1000029, 1000031, ..., 6025904, 6025916, 6025927])

In [21]:
#4670217,4150484,2986833
bulk_handler_object.get_data_bulk(20252,4670217)

4670217/4670217_20252_2_0.zip 4670217/4670217_20252_3_0.zip 


'/home/tighu/PycharmProjects/UKBRepo/ukb/inputs/4670217'

In [20]:
bulk_handler_object.get_data_bulk(20252,1118270)

1118270/1118270_20252_2_0.zip 1118270/1118270_20252_3_0.zip 


'/home/tighu/PycharmProjects/UKBRepo/ukb/inputs/1118270'

In [8]:
#bulk_handler_object.display_all_ukb_categories()

In [9]:
#scalar_handler_object.display_all_ukb_categories()

In [9]:
# Field ID 21000 --> Ethnic background (category: Ethnicity)
scalar_handler_object.get_subject_list_field_ids(21000)

['21000-0.0', '21000-1.0', '21000-2.0']


array([1000012, 1000029, 1000031, ..., 6025904, 6025916, 6025927])

In [10]:
scalar_handler_object.get_data_scalar(21000,1000012)

Unnamed: 0,eid,21000-0.0,21000-1.0,21000-2.0
0,1000012,1001.0,,


In [8]:
bulk_handler_object.get_field_ids_for_category("T1 structural brain MRI")

Unnamed: 0,Field ID,Description,Category,Type
162,25733,Amount of warping applied to non-linearly alig...,T1 structural brain MRI,
1003,25731,Discrepancy between T1 brain image and standar...,T1 structural brain MRI,
1004,25732,Discrepancy between T1 brain image and standar...,T1 structural brain MRI,
1612,25925,Intensity scaling for T1,T1 structural brain MRI,
1634,25735,Inverted contrast-to-noise ratio in T1,T1 structural brain MRI,
1635,25734,Inverted signal-to-noise ratio in T1,T1 structural brain MRI,
3218,25756,Scanner lateral (X) brain position,T1 structural brain MRI,
3219,25758,Scanner longitudinal (Z) brain position,T1 structural brain MRI,
3220,25759,Scanner table position,T1 structural brain MRI,
3221,25757,Scanner transverse (Y) brain position,T1 structural brain MRI,


In [7]:
bulk_handler_object.get_subject_list_field_ids(20251)

  bulk_handler_object.get_subject_list_field_ids(20251)


array([1000012, 1000029, 1000031, ..., 6025904, 6025916, 6025927])

In [6]:
bulk_handler_object.get_data_bulk(12187,1067273)

/home/tighu/PycharmProjects/UKBRepo/ukb/inputs


'/home/tighu/PycharmProjects/UKBRepo/ukb/inputs/1067273'

In [4]:
!pwd

/home/tighu/PycharmProjects/UKBRepo/ukb/inputs


In [8]:
import datalad

In [16]:
%%bash
datalad get 2635065

get(error): inputs/2635065/2635065_20217_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20218_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20219_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20225_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20227_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20249_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20250_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20251_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20252_2_0.zip (file) [Transfer failed
Transfer failed
Transfer failed]
get(error): inputs/2635065/2635065_20253_2_0.z

CalledProcessError: Command 'b'datalad get 2635065\n'' returned non-zero exit status 1.

In [8]:
import os

In [9]:
os.system("datalad install -r tighu@openmind7.mit.edu:/om4/project/biobank/h_ukb/ukb")

256

In [12]:
!datalad --version

datalad 0.15.3


git-annex version: 8.20211118-g23ee48898
build flags: Assistant Webapp Pairing Inotify DBus DesktopNotify TorrentParser MagicMime Feeds Testsuite S3 WebDAV
dependency versions: aws-0.22 bloomfilter-2.0.1.0 cryptonite-0.26 DAV-1.3.4 feed-1.3.0.1 ghc-8.8.4 http-client-0.6.4.1 persistent-sqlite-2.10.6.2 torrent-10000.1.1 uuid-1.3.13 yesod-1.6.1.0
key/value backends: SHA256E SHA256 SHA512E SHA512 SHA224E SHA224 SHA384E SHA384 SHA3_256E SHA3_256 SHA3_512E SHA3_512 SHA3_224E SHA3_224 SHA3_384E SHA3_384 SKEIN256E SKEIN256 SKEIN512E SKEIN512 BLAKE2B256E BLAKE2B256 BLAKE2B512E BLAKE2B512 BLAKE2B160E BLAKE2B160 BLAKE2B224E BLAKE2B224 BLAKE2B384E BLAKE2B384 BLAKE2BP512E BLAKE2BP512 BLAKE2S256E BLAKE2S256 BLAKE2S160E BLAKE2S160 BLAKE2S224E BLAKE2S224 BLAKE2SP256E BLAKE2SP256 BLAKE2SP224E BLAKE2SP224 SHA1E SHA1 MD5E MD5 WORM URL X*
remote types: git gcrypt p2p S3 bup directory rsync web bittorrent webdav adb tahoe glacier ddar git-lfs httpalso borg hook external
operating system: linux x86_64

In [15]:
! conda env list

# conda environments:
#
base                  *  /home/tighu/anaconda3
UKB                      /home/tighu/anaconda3/envs/UKB
mne                      /home/tighu/anaconda3/envs/mne



In [7]:
scalar_handler_object.display_all_ukb_categories()

['Abdominal composition',
 'ECG at rest, 12-lead',
 'Infectious Disease Antigens',
 'NMR metabolomics',
 'NMR metabolomics QC indicators',
 'Autorefraction',
 'Task functional brain MRI',
 'Abdominal MRI',
 'Social support',
 'ECG during exercise',
 'MET Scores',
 'Arterial stiffness',
 'Spirometry',
 'Brain MRI',
 'Telomeres',
 'Early life factors',
 'Family history',
 'Medical conditions',
 'Female-specific factors',
 'Eyesight',
 'Education',
 'Sexual factors',
 'Smoking',
 'Reception',
 'Diet',
 'Procedural metrics',
 'Blood biochemistry',
 'Blood biochemistry processing',
 'Alcohol',
 'T1 structural brain MRI',
 'Body composition by DXA',
 'Bone-densitometry of heel',
 'Medications',
 'Heart MRI',
 'Intraocular pressure',
 'Freesurfer BA exvivo',
 'Freesurfer a2009s',
 'Freesurfer desikan white',
 'Freesurfer desikan pial',
 'Freesurfer DKT',
 'Bone size, mineral and density by DXA',
 'Body composition by impedance',
 'Arterial spin labelling brain MRI',
 'Cardiac and aortic funct