# Retrieve UK Biobank lipid phenotypes and covariates

This notebook retrieves lipid phenotypes and covariates from the database using Spark SQL and stores that extract as a CSV for further downstream use.

# Setup

<div class="alert alert-block alert-warning">
This notebook will only run correctly on the UK Biobank Research Analysis Platform: it relies on the <code>dxdata</code> package and the <code>dx</code> command-line tool.
</div>

In [None]:
import re
import time

import dxdata
import pandas as pd

In [None]:
# Connect dxdata using its "hive+pyspark" dialect, then issue a Spark SQL SET
# statement that fixes the shuffle partition count at 50.
engine = dxdata.connect(dialect="hive+pyspark")
set_partitions_sql = "SET spark.sql.shuffle.partitions=50"
# The statement's result is materialized as a pandas object and kept in `pt`.
pt = engine.execute(set_partitions_sql).to_pandas()

In [None]:
# Load the dataset by its record name.
# NOTE(review): the name below looks specific to one application/dispensal —
# replace it with the dataset name from your own project before running.
dataset = dxdata.load_dataset("app7089_202103231620.dataset")

In [None]:
# Look up the 'participant' item of the dataset; field discovery and retrieval
# below both go through this object.
participant = dataset['participant']

## Discover the fields of interest 

In [None]:
# Title pattern for the fields of interest. The leading (?i) makes every
# alternative case-insensitive: the lipid terms plus the age-at-assessment title.
title_pattern = "(?i)cholesterol|hdl|ldl|triglycerides|Age when attended assessment centre"
fields_cholesterol = list(participant.find_fields(title_regex=title_pattern))
# Show how many fields matched.
len(fields_cholesterol)

In [None]:
# Print a short summary of every matched field, ordered by field name:
# a "column: title" header, then units, type and coding on tab-indented lines.
fields_by_name = sorted(fields_cholesterol, key=lambda fld: fld.name)
for field in fields_by_name:
    header = f'\n{field.column_name}: {field.title}'
    print(header)
    for attr in ('units', 'type', 'coding'):
        print(f'\t{getattr(field, attr)}')
    # The code table is only printed for fields that actually have a coding.
    coding = field.coding
    if coding is not None:
        print(f'\t{coding.codes}')

## Retrieve the data 

In [None]:
# Retrieve the selected fields into a pandas DataFrame and report how long it took.
# `time` is imported in the notebook's import cell. perf_counter() is used instead of
# time.time() because it is monotonic and meant for measuring elapsed intervals.
# NOTE(review): coding_values="replace" presumably substitutes coded values with their
# labels — confirm against the dxdata documentation.
start = time.perf_counter()
pheno_data = participant.retrieve_fields(engine=engine, fields=fields_cholesterol, coding_values="replace").toPandas()
end = time.perf_counter()
# Elapsed seconds for the retrieval.
print(end - start)

In [None]:
# Display the (rows, columns) dimensions of the retrieved extract.
pheno_data.shape

In [None]:
# Uncomment to see row level data.
# NOTE(review): presumably left disabled so participant-level values are not
# rendered into the saved notebook output — confirm before enabling.
#pheno_data.head()

In [None]:
# Display the column names as returned by the retrieval, before any renaming.
pheno_data.columns

### Construct improved column names 

In [None]:
# Build a mapping from each raw column name to a more descriptive one of the form
#   <column_name>_<Title_with_underscores>[_<units>]
# The ' | Instance N' suffix is stripped from the title because the raw column
# name is kept as the prefix of the new name.
# The pattern is a raw string so that '\|' and '\d' reach the regex engine
# without triggering Python's invalid-escape-sequence warning; '\d+' also
# handles instance numbers with more than one digit.
instance_suffix = re.compile(r' \| Instance \d+')

# 'eid' maps to itself, so that column keeps its name.
col_names = {'eid': 'eid'}
for field in sorted(fields_cholesterol, key=lambda fld: fld.name):
    title = instance_suffix.sub('', field.title).replace(' ', '_')
    name = f'{field.column_name}_{title}'
    # Units are appended verbatim when the field declares any.
    if field.units is not None:
        name += f'_{field.units}'
    print(name)
    col_names[field.column_name] = name

In [None]:
# Apply the descriptive names built in col_names to the extract's columns.
pheno_data = pheno_data.rename(col_names, axis="columns")

In [None]:
# Display the column names again to check the result of the rename.
pheno_data.columns

## Write out the data extract to a CSV 

In [None]:
# Write the extract to 'lipids.csv' in the current working directory.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# presumably written as an unnamed first column — confirm downstream readers
# expect that before changing it.
pheno_data.to_csv('lipids.csv')

In [None]:
%%bash

# Upload the CSV extract with the dx command-line tool.
# NOTE(review): re-running this cell likely uploads another copy rather than
# replacing the earlier one — confirm against the dx upload documentation.
dx upload lipids.csv