# Compute principal components

In this notebook, we compute principal components using the variants for the AoU and UKB participants.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Route Bokeh output to the notebook so `show(...)` renders plots inline.
output_notebook()

## Define constants

In [None]:
# BED file of exome capture regions, stored under the workspace bucket
# (the bucket is taken from the WORKSPACE_BUCKET environment variable).
_workspace_bucket = os.getenv("WORKSPACE_BUCKET")
EXOME_REGIONS = f'{_workspace_bucket}/data/ukb/exomes/xgen_plus_spikein.GRCh38.bed'

<div class="alert alert-block alert-info">
<b>Note:</b> The AoU matrix table for the alpha1 release was created via notebook 'Hail Demo' and then moved to a better place within the workspace bucket. It contains all samples and variants for the alpha1 release.
</div>

In [None]:
# AoU alpha1 cohort matrix table; this is the table read with
# hl.read_matrix_table() and used as input to the PCA.
AOU_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/aou/alpha1/cohort.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> The UKB matrix table was created via notebook 'create_matrix_tables'. It contains data for all samples within <kbd>chr21</kbd>.
</div>

In [None]:
# UKB exome matrix table for chr21. In the cells shown here it is only
# listed with gsutil to confirm access; it is not read by Hail.
UKB_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/ukb/exomes/chr21.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> The merged matrix table was created via notebook 'merge_variants'.
</div>

In [None]:
# Merged chr21 matrix table (dated 20210601 in its path). In the cells shown
# here it is only listed with gsutil to confirm access; it is not read by Hail.
MERGED_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210601/merged-chr21.mt'

In [None]:
# Preview today's date in the YYYYMMDD form used in dated output paths.
time.strftime('%Y%m%d')

In [None]:
RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
WORK_DIR = !pwd

# Outputs
SCORES_CSV = f'{os.getenv("WORKSPACE_BUCKET")}/data/aou/alpha1/{DATESTAMP}/scores.csv'
HAIL_LOG = f'{WORK_DIR[0]}/hail-compute-pcs-variants-{TIMESTAMP}.log'

## Check access

In [None]:
# Access check: list the merged matrix table path; an error here means the
# path is missing or not readable from this environment.
!gsutil ls {MERGED_MT}

In [None]:
# Access check: list the AoU matrix table path before Hail tries to read it.
!gsutil ls {AOU_MT}

In [None]:
# Access check: list the UKB chr21 matrix table path.
!gsutil ls {UKB_MT}

## Start Hail 

In [None]:
# Spark settings passed to hl.init(spark_conf=...). All values are strings,
# as Spark configuration expects. Built from ordered pairs so related
# settings can be annotated individually.
_JVM_STACK_OPTION = '-Xss4M'  # same JVM option string for driver and executors

EXTRA_SPARK_CONFIG = dict([
    # Driver-side settings.
    ('spark.driver.extraJavaOptions', _JVM_STACK_OPTION),
    ('spark.driver.maxResultSize', '50G'),
    ('spark.driver.memory', '90G'),
    # Executor-side settings.
    ('spark.executor.extraJavaOptions', _JVM_STACK_OPTION),
    # Serialization, memory and network tuning.
    ('spark.kryoserializer.buffer.max', '1G'),
    ('spark.memory.fraction', '0.33'),
    ('spark.network.timeout', '300'),
    # Speculative execution and task retry policy.
    ('spark.speculation', 'true'),
    ('spark.speculation.quantile', '0.95'),
    ('spark.task.maxFailures', '20'),
])

In [None]:
# Start Hail with the custom Spark settings, GRCh38 as the default reference
# genome, and a timestamped log file in the working directory.
_hail_init_options = dict(
    spark_conf=EXTRA_SPARK_CONFIG,
    min_block_size=50,  # presumably in MB -- confirm against the Hail docs
    default_reference='GRCh38',
    log=HAIL_LOG,
)
hl.init(**_hail_init_options)

# Load exome capture regions

In [None]:
# Load the exome capture BED file as a Hail table. It is indexed by locus
# further down to keep only variants inside the capture regions.
ukb_exome_capture_regions = hl.import_bed(EXOME_REGIONS)

# Read the matrix table

Right now we compute PCs on the AoU data only. No QC filtering has been applied to the data; the only restriction is the exonic-region filter in the next section.

In [None]:
# Open the AoU matrix table from the workspace bucket.
aou_wgs = hl.read_matrix_table(AOU_MT)

## Limit to exonic regions

In [None]:
# Keep a variant only when its locus falls inside one of the exome capture
# intervals: indexing the interval table by locus is defined only on a match.
_in_capture_region = hl.is_defined(ukb_exome_capture_regions[aou_wgs.locus])
aou_wgs = aou_wgs.filter_rows(_in_capture_region)

# Compute principal components

We use Hail's `hwe_normalized_pca` method: https://hail.is/docs/0.2/methods/genetics.html#hail.methods.hwe_normalized_pca

In [None]:
# Record and display the wall-clock start time of the PCA step.
start = datetime.now()
print(start)

In [None]:
# Run PCA on the genotype calls of the exonic AoU variants.
# The arguments below spell out the documented defaults of
# hl.hwe_normalized_pca, so the number of PCs and the absence of loadings
# are visible at the call site.
eigenvalues, scores, loadings = hl.hwe_normalized_pca(
    aou_wgs.GT,
    k=10,
    compute_loadings=False,
)

In [None]:
# Record the end time and report how long the PCA step took.
end = datetime.now()
print(end)
print(end - start)  # elapsed wall-clock time as a timedelta

In [None]:
# Display the eigenvalues returned by the PCA.
eigenvalues

In [None]:
# Persist the PCA scores table at SCORES_TAB.
# NOTE(review): as far as I recall, Table.write refuses to replace an
# existing path unless overwrite=True is passed -- confirm before re-running
# against the same destination.
scores.write(SCORES_TAB)

In [None]:
# Print the schema of the scores table.
scores.describe()

In [None]:
# Preview the first 10 rows of the scores table.
scores.show(10)

In [None]:
# Export the scores table as comma-delimited text to SCORES_CSV.
# NOTE(review): the per-sample scores are presumably a single array field;
# if it is serialized with embedded commas the file will not parse as plain
# CSV -- verify the output, or split the array into one column per PC.
scores.export(SCORES_CSV, delimiter = ',')

In [None]:
# Inspect what the PCA returned for loadings.
# NOTE(review): loadings are presumably None unless compute_loadings=True is
# passed to hwe_normalized_pca -- confirm.
type(loadings)

# Appendix

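Extra cells that are useful, e.g. for timing a long-running step: run the first cell before the step and the second one after it.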

In [None]:
# Reusable timing snippet (start): copy above a long-running cell.
# Running it here rebinds `start`.
start = datetime.now()
print(start)

In [None]:
# Reusable timing snippet (end): copy below a long-running cell to print the
# end time and the elapsed time since `start`.
end = datetime.now()
print(end)
print(end - start)

# Provenance

In [None]:
# Record when this notebook run finished.
print(datetime.now())

In [None]:
# Record the installed Python package versions for reproducibility.
!pip3 freeze