# Explore results from the genome-wide association study

In this notebook, we use qqman and Hail to explore the results of the genome-wide association study (GWAS) run on the merged variants of the All of Us (AoU) and UK Biobank (UKB) participants.

# Setup 

In [None]:
!pip3 install qqman

In [None]:
from qqman import qqman

# If this import statement fails, restart the kernel.

In [None]:
from datetime import datetime
import hail as hl
from hail.plot import show
import os
import pandas as pd
from pprint import pprint
import time

In [None]:
# Set up Hail's plotting output so that figures passed to show() render inline in notebook cells.
hl.plot.output_notebook()

## Define constants

<div class="alert alert-block alert-info">
The GWAS results were computed via <a href="https://hail.is/docs/0.2/methods/stats.html#hail.methods.linear_regression_rows">hail.methods.linear_regression_rows</a> in notebook <kbd>hail_gwas.ipynb</kbd>.
</div>

In [None]:
# Cloud Storage location of the GWAS results; it is loaded later with hl.read_table().
# NOTE(review): the file name suggests LDL_norm results restricted to chr21 — confirm this is the intended input.
HAIL_GWAS_TAB = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210727/gwas-LDL_norm-chr21.tab'

## Check access

In [None]:
# List the results path. An error message in the output means the path is missing or is not
# readable from this environment.
!gsutil ls {HAIL_GWAS_TAB}

## Start Hail 

In [None]:
# Spark overrides aimed at tolerating slow tasks and transient fetch/network failures.
# Every value is given to Spark as a string.
#
# References:
#   https://spark.apache.org/docs/2.4.7/configuration.html
#   https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c
EXTRA_SPARK_CONFIG = {
    # --- Speculative execution ---
    # Re-launch tasks that are running slowly within a stage. (Spark default: false)
    'spark.speculation': 'true',
    # Fraction of a stage's tasks that must have completed before speculation kicks in.
    # (Spark default: 0.75)
    'spark.speculation.quantile': '0.95',

    # --- Network ---
    # Fallback timeout for all network interactions. It is used wherever
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs,
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout
    # has not been configured explicitly. (Spark default: 120s)
    'spark.network.timeout': '180s',

    # --- Shuffle fetch retries (Netty only) ---
    # Number of automatic retries for fetches that fail with IO-related exceptions; a non-zero
    # value helps large shuffles survive long GC pauses and transient connectivity problems.
    # (Spark default: 3)
    'spark.shuffle.io.maxRetries': '10',
    # Wait between fetch retries. The maximum delay added by retrying is
    # maxRetries * retryWait: 15 seconds with Spark's defaults, 150 seconds with these values.
    # (Spark default: 5s)
    'spark.shuffle.io.retryWait': '15s',

    # --- Failure tolerance ---
    # Failures allowed for any single task before the job is abandoned. Failures spread across
    # different tasks do not count together; one particular task has to fail this many times.
    # Must be >= 1; allowed retries = this value - 1. (Spark default: 4)
    'spark.task.maxFailures': '10',
    # Consecutive attempts allowed for a stage before it is aborted. (Spark default: 4)
    'spark.stage.maxConsecutiveAttempts': '10',
}

In [None]:
# Start Hail, handing the Spark overrides in EXTRA_SPARK_CONFIG to the underlying Spark session.
hl.init(
    spark_conf=EXTRA_SPARK_CONFIG,
    min_block_size=50,  # NOTE(review): presumably megabytes, per the hl.init docs — confirm
    default_reference='GRCh38',  # reference genome Hail assumes when none is given explicitly
)

Check the configuration.

In [None]:
# Display the effective Spark configuration as a sorted list of (key, value) pairs so the
# overrides passed to hl.init() can be verified by eye.
sc = hl.spark_context()
# Use the public SparkContext.getConf() accessor rather than the private `_conf` attribute,
# and sorted() so the list is built and ordered in a single expression.
config = sorted(sc.getConf().getAll())
config

# Load the GWAS results

In [None]:
# Open the Hail Table of GWAS results stored at HAIL_GWAS_TAB.
gwas_results = hl.read_table(HAIL_GWAS_TAB)

In [None]:
# Print the schema of the results table.
gwas_results.describe()

In [None]:
# Number of rows in the results table.
gwas_results.count()

# Manhattan plot

## Interactive plot

In [None]:
# Build the interactive Manhattan plot from the p_value field of the results table.
p = hl.plot.manhattan(gwas_results.p_value)

In [None]:
# Render the interactive Manhattan plot in the notebook.
show(p)

## Static plot

TODO(deflaux) paste plot image here

In [None]:
# Convert the Hail Table to a pandas DataFrame, which is what the static qqman plots take as input.
# NOTE(review): presumably nested fields arrive as dotted column names (e.g. `locus.contig`),
# which is why the columns are renamed a few cells further down — confirm.
gwas_df = gwas_results.to_pandas()

In [None]:
# Preview the first rows of the pandas copy of the results.
gwas_df.head()

In [None]:
# Column names as produced by to_pandas(), before the renaming in the next cell.
gwas_df.columns

In [None]:
# Replace the dots in the column names with underscores so the columns can be referred to by
# simple names (e.g. `locus_contig`, `p_value`) in the plotting calls and via attribute access.
#
# regex=False makes the '.' a literal dot. pandas versions differ on whether str.replace
# treats the pattern as a regular expression by default, and as a regex '.' matches every
# character, so the intent is stated explicitly rather than left to the installed version.
# Re-running this cell is harmless: once the dots are gone the replacement is a no-op.
gwas_df.columns = gwas_df.columns.str.replace('.', '_', regex=False)

gwas_df.columns

In [None]:
# Static Manhattan plot drawn by qqman from the pandas copy of the results.
qqman.manhattan(
    gwas_df,
    col_chr='locus_contig',    # column holding the chromosome / contig
    col_bp='locus_position',   # column holding the base-pair position
    col_p='p_value',           # column holding the association p-value
    gap=100,                   # NOTE(review): presumably spacing between chromosomes — confirm in the qqman docs
    show=True,
    title=os.path.basename(HAIL_GWAS_TAB),  # label the figure with the results file name
)

# Q-Q plot

## Interactive plot

In [None]:
# Build the interactive Q-Q plot from the p_value field of the results table.
p2 = hl.plot.qq(gwas_results.p_value)

In [None]:
# Render the interactive Q-Q plot in the notebook.
show(p2)

## Static plot

TODO(deflaux) paste plot image here

In [None]:
# Static Q-Q plot drawn by qqman from the p_value column, titled with the results file name.
qqman.qqplot(gwas_df.p_value, show=True, title=os.path.basename(HAIL_GWAS_TAB))

# Provenance

In [None]:
# Record when this notebook was run. datetime.now() is the local time of the machine running
# the kernel and carries no timezone information.
print(datetime.now())

In [None]:
!pip3 freeze