# Compute principal components

In this notebook, we compute principal components using the variants for the AoU and UKB participants.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Configure Bokeh to render its plots inline in the notebook's output cells.
output_notebook()

## Define constants

In [None]:
# BED file of exome capture regions, located under the bucket named by the WORKSPACE_BUCKET
# environment variable.
EXOME_REGIONS = f'{os.getenv("WORKSPACE_BUCKET")}/data/ukb/exomes/xgen_plus_spikein.GRCh38.bed'

<div class="alert alert-block alert-info">
<b>Note:</b> The AoU matrix table for the alpha1 release was created via notebook 'Hail Demo' and then moved to a better place within the workspace bucket. It contains all samples and variants for the alpha1 release.
</div>

In [None]:
# Path of the AoU alpha1 cohort matrix table.
# NOTE(review): the bucket name is hard-coded here, whereas EXOME_REGIONS takes it from
# WORKSPACE_BUCKET — confirm this is intentional.
AOU_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/aou/alpha1/cohort.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> The UKB matrix table was created via notebook 'create_matrix_tables'. It contains data for all samples within <kbd>chr21</kbd>.
</div>

In [None]:
# Path of the UKB exome matrix table for chr21.
# NOTE(review): the bucket name is hard-coded here, whereas EXOME_REGIONS takes it from
# WORKSPACE_BUCKET — confirm this is intentional.
UKB_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/ukb/exomes/chr21.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> These matrix tables were created via notebook 'merge_variants'.
</div>

In [None]:
# Path of the merged chr21 matrix table (dated 20210601 in the path).
# NOTE(review): the bucket name is hard-coded here, whereas EXOME_REGIONS takes it from
# WORKSPACE_BUCKET — confirm this is intentional.
MERGED_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210601/merged-chr21.mt'

In [None]:
# Preview today's date in the YYYYMMDD format used for DATESTAMP.
time.strftime('%Y%m%d')

In [None]:
# Read the clock once so DATESTAMP and TIMESTAMP always describe the same instant
# (two independent strftime calls could straddle midnight and disagree on the date).
_NOW = time.localtime()

# Bucket that receives this notebook's results.
RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")
DATESTAMP = time.strftime('%Y%m%d', _NOW)
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S', _NOW)
# Working directory of the notebook, kept as a one-element list so that the
# existing `WORK_DIR[0]` access pattern keeps working.
WORK_DIR = [os.getcwd()]

# Outputs
# Hail table holding the PCA scores (destination of `scores.write`).
SCORES_TAB = f'{RESULT_BUCKET}/data/aou/alpha1/{DATESTAMP}/scores.ht'
# Delimited text export of the same scores.
SCORES_CSV = f'{RESULT_BUCKET}/data/aou/alpha1/{DATESTAMP}/scores.csv'
# Local Hail log file for this run.
HAIL_LOG = f'{WORK_DIR[0]}/hail-compute-pcs-variants-{TIMESTAMP}.log'

## Check access

In [None]:
# List the merged matrix table path to confirm this environment can read it.
!gsutil ls {MERGED_MT}

In [None]:
# List the AoU matrix table path to confirm this environment can read it.
!gsutil ls {AOU_MT}

In [None]:
# List the UKB matrix table path to confirm this environment can read it.
!gsutil ls {UKB_MT}

## Start Hail 

In [None]:
# Spark overrides applied when Hail starts. The per-setting notes paraphrase
# https://spark.apache.org/docs/2.4.7/configuration.html; for background on shuffle fetch failures see
# https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c

EXTRA_SPARK_CONFIG = {
    # --- Serialization ---
    # Ceiling for the Kryo serialization buffer. It has to exceed the largest object being
    # serialized and stay under 2048m; raise it when Kryo reports "buffer limit exceeded".
    # Spark default: 64m.
    'spark.kryoserializer.buffer.max': '1G',

    # --- Memory ---
    # Share of (heap space - 300MB) handed to execution and storage. The rest is held back for
    # internal metadata, user data structures and imprecise size estimates of sparse, unusually
    # large records. Smaller values mean more frequent spills and cache eviction.
    # Spark default (and Spark's recommendation): 0.6.
    'spark.memory.fraction': '0.33',

    # --- Speculative execution ---
    # When "true", tasks that run slowly within a stage are re-launched. Spark default: false.
    'spark.speculation': 'true',

    # Fraction of a stage's tasks that must have finished before speculation kicks in for
    # that stage. Spark default: 0.75.
    'spark.speculation.quantile': '0.95',

    # --- Network and shuffle resilience ---
    # Default timeout for all network interactions. It stands in for
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs,
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout and spark.rpc.lookupTimeout
    # whenever those are not set explicitly. Spark default: 120s.
    'spark.network.timeout': '180s',

    # (Netty only) Number of automatic retries for fetches that fail with IO-related exceptions.
    # Retrying steadies large shuffles through long GC pauses or transient network problems.
    # Spark default: 3.
    'spark.shuffle.io.maxRetries': '10',

    # (Netty only) Pause between fetch retries. The worst-case delay from retrying is
    # maxRetries * retryWait (15 seconds with Spark's defaults). Spark default: 5s.
    'spark.shuffle.io.retryWait': '15s',

    # --- Failure tolerance ---
    # How many times one particular task may fail before the whole job is abandoned (failures
    # spread across different tasks do not count together). Must be at least 1; the number of
    # retries allowed is this value minus 1. Spark default: 4.
    'spark.task.maxFailures': '10',

    # Consecutive attempts permitted for a stage before it is aborted. Spark default: 4.
    'spark.stage.maxConsecutiveAttempts': '10',
}

In [None]:
# Start Hail with the extra Spark settings, GRCh38 as the default reference genome,
# and the session log written to HAIL_LOG.
hl.init(
    log=HAIL_LOG,
    default_reference='GRCh38',
    min_block_size=50,
    spark_conf=EXTRA_SPARK_CONFIG,
)

Check the configuration.

In [None]:
# Show the Spark settings of the running context as sorted (key, value) pairs.
# The public getConf() accessor is used instead of the private `_conf` attribute.
sc = hl.spark_context()
config = sorted(sc.getConf().getAll())
config

# Load exome capture regions

In [None]:
# Import the exome capture intervals from the BED file at EXOME_REGIONS into a Hail table.
ukb_exome_capture_regions = hl.import_bed(EXOME_REGIONS)

# Read the matrix table

Right now we are only computing PCs on the AoU data. No filtering has been applied to it yet; the only restriction, applied in the next section, is to the exome capture regions.

In [None]:
# Open the AoU matrix table stored at AOU_MT.
aou_wgs = hl.read_matrix_table(AOU_MT)

## Limit to exonic regions

In [None]:
# Keep only the variants (rows) whose locus has a match in the exome capture regions table.
in_capture_region = hl.is_defined(ukb_exome_capture_regions[aou_wgs.locus])
aou_wgs = aou_wgs.filter_rows(in_capture_region)

# Compute principal components

https://hail.is/docs/0.2/methods/genetics.html#hail.methods.hwe_normalized_pca

In [None]:
# Record and display the wall-clock time at which the PCA step begins.
start = datetime.now()
print(start)

In [None]:
# Run Hail's HWE-normalized PCA on the genotype calls of the exome-restricted AoU data.
# No `k` or `compute_loadings` is passed, so Hail's defaults for both apply.
eigenvalues, scores, loadings = hl.hwe_normalized_pca(aou_wgs.GT)

In [None]:
# Record the finish time, then print it followed by the elapsed duration since `start`.
end = datetime.now()
print(end, end - start, sep='\n')

In [None]:
# Display the eigenvalues returned by the PCA.
eigenvalues

In [None]:
# Persist the PCA scores table at the path held in SCORES_TAB.
# NOTE(review): no overwrite flag is passed, so this presumably fails if the path already exists — confirm.
scores.write(SCORES_TAB)

In [None]:
# Print the schema of the scores table.
scores.describe()

In [None]:
# Preview the first 10 rows of the scores table.
scores.show(10)

**TODO(deflaux)**: next time we run this, emit a TSV, not a CSV. Also, flatten the list of PCs for easier reading into R.

In [None]:
# Export the scores table as comma-delimited text to SCORES_CSV.
# NOTE(review): the PCs are presumably exported as a single array-valued column containing
# commas, which would explain the TODO about switching to TSV and flattening — confirm.
scores.export(SCORES_CSV, delimiter = ',')

In [None]:
# Inspect what the PCA returned for loadings.
# NOTE(review): presumably NoneType, since compute_loadings was not requested in the PCA call — confirm.
type(loadings)

In [None]:
# Shut down the Hail session now that all outputs have been written.
hl.stop()

# Appendix

Extra cells that are useful for timing a step: run the first cell before the step and the second cell after it.

In [None]:
# Spare timing cell: record and display the time at which a step begins.
start = datetime.now()
print(start)

In [None]:
# Spare timing cell: record the finish time, then print it followed by the elapsed
# duration since `start`.
end = datetime.now()
print(end, end - start, sep='\n')

# Provenance

In [None]:
# Record when this notebook run finished.
print(datetime.now())

In [None]:
!pip3 freeze