# Perform the genome-wide association study

In this notebook, we use Hail to perform a genome-wide association study on the merged variants of the AoU and UKB participants.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

## Define constants

<div class='alert alert-block alert-info'>
    <b>Note:</b> These matrix tables were created via notebook <kbd>merge_variants.ipynb</kbd>.
</div>

```
1500 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210712/merged-filtered-chr1.mt
1000 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210712/merged-filtered-chr2.mt
879 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210622/merged-filtered-chr3.mt
1152 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chr4.mt
1284 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chr5.mt
14286 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210628/merged-filtered-chr6_chr7_chr8_chr9_chr10_chr11_chr12_chr13_chr14_chr15_chr16_chr17_chr18_chr19_chr20.mt
1775 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210603/merged-filtered-chr21.mt
1812 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chr22.mt
576 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210628/merged-filtered-chrX.mt
9 gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chrY.mt
```

In [None]:
# Inputs: the merged AoU + UKB matrix tables, one per chromosome (or chromosome group),
# listed as chr1..chr22, chrX, chrY. Each pair below is (date-stamped folder, chromosome label).
MERGED_MT = [
    f'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/{datestamp}/merged-filtered-{chroms}.mt'
    for datestamp, chroms in [
        ('20210712', 'chr1'),
        ('20210712', 'chr2'),
        ('20210622', 'chr3'),
        ('20210624', 'chr4'),
        ('20210624', 'chr5'),
        # chr6 through chr20 live together in a single matrix table.
        ('20210628', '_'.join(f'chr{n}' for n in range(6, 21))),
        ('20210603', 'chr21'),
        ('20210624', 'chr22'),
        ('20210628', 'chrX'),
        ('20210624', 'chrY'),
    ]
]

<div class='alert alert-block alert-info'>
The variant quality control metrics were computed via <a href='https://hail.is/docs/0.2/methods/genetics.html#hail.methods.variant_qc'>hail.methods.variant_qc</a> in notebook <kbd>compute_variant_qcs.ipynb</kbd>.
</div>

In [None]:
# Input: Hail table holding the precomputed per-variant QC metrics (read with hl.read_table).
VARIANT_QCS_TAB = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210712/variant_qcs.tab'

<div class='alert alert-block alert-info'>
The phenotypes and covariates were wrangled via notebooks <kbd>AOU_UKB_phenotypes.ipynb</kbd> and <kbd>compute_pcs.ipynb</kbd>.
</div>

In [None]:
# Input: CSV with the phenotypes and covariates used by the regression.
PHENOTYPES_CSV = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/FULL_Data_Iteration1_ForGWAS.csv'

<div class='alert alert-block alert-info'>
We have several lipid phenotypes for which we will perform a GWAS, but we run them one at a time, controlled by the constant <kbd>TARGET_PHENOTYPE</kbd>.
</div>

In [None]:
# Name of the phenotype column used as the regression response; it is also embedded in the
# output table path, so each phenotype writes to its own location.
TARGET_PHENOTYPE = 'LDL_norm'

<div class='alert alert-block alert-success'>
    Limit to a small region for testing purposes. Limit to the <b>autosomes</b> for the full analysis.
</div>

In [None]:
# Intervals to analyze, as strings that hl.parse_locus_interval can parse.
# range(21, 22) selects chr21 only (small test region); widen the range for the full analysis.
INTERVALS_TO_EXAMINE = list(map('chr{}'.format, range(21, 22)))
# Label used in output file names; ':' (as in 'chr1:1-1000') is replaced by 'range'.
INTERVALS_TO_EXAMINE_NAME = '_'.join(INTERVALS_TO_EXAMINE).replace(':', 'range')

In [None]:
RESULT_BUCKET = os.getenv('WORKSPACE_BUCKET')
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
WORK_DIR = !pwd

# Outputs
GWAS_TAB = f'{os.getenv("WORKSPACE_BUCKET")}/data/merged/{DATESTAMP}/gwas-{TARGET_PHENOTYPE}-{INTERVALS_TO_EXAMINE_NAME}.tab'
HAIL_LOG = f'{WORK_DIR[0]}/hail-gwas-{TIMESTAMP}.log'
HAIL_LOG_DIR_FOR_PROVENANCE = f'{os.getenv("WORKSPACE_BUCKET")}/hail-logs/{DATESTAMP}/'

## Check access

In [None]:
# List each merged matrix table to confirm it exists and is readable with the current
# credentials before starting any long-running work.
for mt in MERGED_MT:
    !gsutil ls {mt}
    print('\n')

In [None]:
# Confirm that the variant QC table exists and is readable.
!gsutil ls {VARIANT_QCS_TAB}

In [None]:
# Confirm that the phenotypes CSV exists and is readable.
!gsutil ls {PHENOTYPES_CSV}

## Start Hail 

In [None]:
# Spark overrides that make long jobs more tolerant of slow tasks and transient failures.
# Setting reference: https://spark.apache.org/docs/2.4.7/configuration.html
# Background on fetch failures:
# https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c

EXTRA_SPARK_CONFIG = {
    # --- Speculative execution ---
    # Re-launch tasks that are running slowly within a stage. (Spark default: false)
    'spark.speculation': 'true',
    # Fraction of a stage's tasks that must finish before speculation starts. (Spark default: 0.75)
    'spark.speculation.quantile': '0.95',

    # --- Network ---
    # Fallback timeout for all network interactions; it applies wherever
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs,
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout
    # is not set explicitly. (Spark default: 120s)
    'spark.network.timeout': '180s',

    # --- Shuffle fetch retries (Netty only) ---
    # Number of automatic retries for fetches that fail with IO-related exceptions, which
    # helps large shuffles survive long GC pauses or brief network problems. (Spark default: 3)
    'spark.shuffle.io.maxRetries': '10',
    # Pause between fetch retries; the worst-case added delay is maxRetries * retryWait,
    # i.e. 10 * 15s = 150s with the values chosen here. (Spark default: 5s)
    'spark.shuffle.io.retryWait': '15s',

    # --- Task and stage retries ---
    # Failures of one particular task tolerated before the job is abandoned; failures spread
    # over different tasks do not count together. Allowed retries = value - 1. (Spark default: 4)
    'spark.task.maxFailures': '10',
    # Consecutive attempts allowed for a stage before it is aborted. (Spark default: 4)
    'spark.stage.maxConsecutiveAttempts': '10',
}

In [None]:
# Start Hail with the Spark overrides from EXTRA_SPARK_CONFIG, GRCh38 as the default
# reference genome, and the session log written to the timestamped local file HAIL_LOG.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

Check the configuration.

In [None]:
# Show the effective Spark configuration, sorted by key, so the overrides passed to
# hl.init can be verified by eye.
sc = hl.spark_context()
# getConf() is the public accessor for the context's configuration; the original reached
# into the private `_conf` attribute.
config = sorted(sc.getConf().getAll())
config

# Load input data

## Read the matrix table

In [None]:
# Report the partition count of every input matrix table, one "<count> <path>" line each.
for mt_path in MERGED_MT:
    mt = hl.read_matrix_table(mt_path)
    print(mt.n_partitions(), mt_path)

In [None]:
# Stack the rows (variants) of all per-chromosome matrix tables into one matrix table.
# A single n-ary union_rows call replaces the chain of pairwise unions, which nested one
# union inside another for every additional input. All inputs must share the same columns.
merged = hl.MatrixTable.union_rows(
    *[hl.read_matrix_table(mt_path) for mt_path in MERGED_MT])

In [None]:
# Print the schema of the unioned matrix table.
merged.describe()

### TEMPORARY: fix the sample id key

In [None]:
# Build a composite sample identifier of the form '<s>_<cohort>'.
# NOTE(review): presumably needed because sample ids alone are not unique across the two
# cohorts — confirm.
merged = merged.annotate_cols(cohort_key = merged.s + '_' + merged.cohort)

In [None]:
# Make the composite identifier the column key, so that phenotypes (keyed the same way
# later in this notebook) can be looked up per sample.
merged = merged.key_cols_by(merged.cohort_key)

In [None]:
# Preview the first few column (sample) records to check the new key by eye.
merged.cols().show()

In [None]:
# Print the schema again to confirm that cohort_key is now the column key.
merged.describe()

## Filter to our intervals of interest

In [None]:
# Restrict the variants to the configured intervals; an empty list leaves the data untouched.
if INTERVALS_TO_EXAMINE:
    merged = hl.filter_intervals(
        merged,
        list(map(hl.parse_locus_interval, INTERVALS_TO_EXAMINE)))

In [None]:
# (number of variants, number of samples) before any QC filtering; kept so that the number
# of variants removed by filtering can be reported later.
merged_count = merged.count()

merged_count

## Read the variant QCs

In [None]:
# Load the precomputed per-variant QC metrics.
combined_variant_qc = hl.read_table(VARIANT_QCS_TAB)

In [None]:
# Print the schema of the variant QC table.
combined_variant_qc.describe()

In [None]:
# Number of variants that have QC metrics.
combined_variant_qc.count()

## Read the phenotypes and covariates

In [None]:
# Load the phenotypes/covariates CSV, letting Hail infer column types and treating empty
# cells as missing. The first header is read as '\ufeffid' (the file starts with a byte-order
# mark), so it is renamed to 's'; 'gender' is renamed to 'sex'.
# NOTE(review): `types` names column 's', which only exists after the rename below, so the
# override presumably does not apply to the id column and its type is imputed — confirm
# before changing, because the later join against the id bridge table relies on the key types
# matching.
phenotypes = (hl.import_table(PHENOTYPES_CSV,
                              impute=True,
                              delimiter=',',
                              missing='',
                              types={'s':hl.tstr})
              .rename({'\ufeffid': 's', 'gender': 'sex'}))

In [None]:
# Print the schema of the imported phenotype table to check the imputed column types.
phenotypes.describe()

In [None]:
# Number of phenotype rows as imported.
phenotypes.count()

### TEMPORARY: fix the sample id key

In [None]:
# Key by the sample id so the table can be joined with the id bridge table below.
phenotypes = phenotypes.key_by(phenotypes.s)

In [None]:
# Bridge table used to translate sample ids; keyed by eid_7089 so it lines up with the
# key of the phenotype table in the join that follows.
id_map = hl.import_table(
    'gs://uk-biobank-sek-data-us-east1/sample-info/bridge_7089_31063.tsv',
    impute=True)
id_map = id_map.key_by('eid_7089')

In [None]:
# Left join: every phenotype row is kept; rows without a match in id_map get missing values
# for the id_map fields (e.g. eid_31063, which is used below to build the cohort key).
phenotypes = phenotypes.join(id_map, how='left')

In [None]:
# Re-count after the join. NOTE(review): this should equal the count before the join,
# assuming eid_7089 is unique in id_map — confirm.
phenotypes.count()

In [None]:
# Composite key '<id>_<lower-cased cohort name>', matching the column key built on the matrix
# table. The bridged id (eid_31063) is used when the join found one; otherwise the original
# sample id is used.
cohort_suffix = '_' + phenotypes.CohortName.lower()
phenotypes = phenotypes.annotate(
    cohort_key=hl.if_else(
        hl.is_defined(phenotypes.eid_31063),
        hl.str(phenotypes.eid_31063) + cohort_suffix,
        hl.str(phenotypes.s) + cohort_suffix))

In [None]:
# Re-key by the composite identifier so lookups by the matrix table's column key work.
phenotypes = phenotypes.key_by(phenotypes.cohort_key)

In [None]:
# Print the schema to confirm the new key and the fields added by the join.
phenotypes.describe()

### Create indicator variables for the categorical variables

In [None]:
# Tally the rows per cohort name to see which values have to be encoded.
phenotypes.aggregate(hl.agg.counter(phenotypes.CohortName))

In [None]:
# Indicator covariate for the cohort: 1 for 'AOU', 0 for 'UKB', missing for anything else.
phenotypes = phenotypes.annotate(
    is_aou_cohort=(hl.switch(phenotypes.CohortName)
                   .when('AOU', 1)
                   .when('UKB', 0)
                   .or_missing()))

In [None]:
# Check the encoding: tally of the indicator values, including any missing ones.
phenotypes.aggregate(hl.agg.counter(phenotypes.is_aou_cohort))

In [None]:
# Tally the raw sex values to see which codings are present.
phenotypes.aggregate(hl.agg.counter(phenotypes.sex))

In [None]:
# Indicator covariate for sex: 1 = male, 0 = female, missing for any other value.
# Both the numeric coding and the text labels are accepted; the numeric coding is described
# at https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=9
phenotypes = phenotypes.annotate(
    is_male=(hl.switch(phenotypes.sex)
             .when('1', 1)
             .when('0', 0)
             .when('Male', 1)
             .when('Female', 0)
             .or_missing()))

In [None]:
# Check the encoding: tally of the indicator values, including any missing ones.
phenotypes.aggregate(hl.agg.counter(phenotypes.is_male))

In [None]:
#phenotypes.show()

In [None]:
# Print the final phenotype schema, including the two indicator fields.
phenotypes.describe()

In [None]:
# Number of phenotype rows; compared against the number of samples that remain in the
# matrix table after sample filtering.
num_samples_expected = phenotypes.count()

num_samples_expected

### TEMPORARY: write out fixed phenotypes for use with regenie

In [None]:
# Destination, in the workspace bucket, for a TSV copy of the re-keyed phenotype table.
FIXED_PHENOTYPES_TSV = f'{RESULT_BUCKET}/data/merged/{DATESTAMP}/phenotypes.tsv'

# Write the table, including the cohort_key and indicator fields added above.
phenotypes.export(FIXED_PHENOTYPES_TSV)

## TODO AoU WGS sample QC results?

## TODO UKB WES sample QC results?

# Annotate and filter to high quality input data

## Filter variants

In [None]:
# Look up each variant's precomputed QC record once and copy over the four metrics used for
# filtering and passed through to the regression output. The original repeated the same
# table lookup in four separate annotate_rows calls; the resulting fields are identical.
qc_for_variant = combined_variant_qc[merged.row_key].variant_qc
merged = merged.annotate_rows(
    callRate=qc_for_variant.call_rate,
    AF=qc_for_variant.AF,
    AC=qc_for_variant.AC,
    pHWE=qc_for_variant.p_value_hwe)

In [None]:
# Print the schema to confirm the new row fields callRate, AF, AC and pHWE.
merged.describe()

Per Margaret: include a 'high-quality independent autosomal variants' subset with
* MAF > 0.1%, missingness < 1%,
* HWE P > 10<sup>-6</sup>, and two rounds of pruning using `--indep-pairwise 200 100 0.1` and `--indep-pairwise 200 100 0.05` in PLINK4.

In [None]:
# TODO(deflaux) reconcile these cutoffs from the albuminuria GWAS with Margaret's recommendation
merged = merged.filter_rows(merged.pHWE > 1e-20, keep=True)       # HWE P > 10-20 ---> less strict
merged = merged.filter_rows(merged.callRate > 0.95, keep=True)    # missingness < 5% ---> less strict
merged = merged.filter_rows(hl.min(merged.AF) > 0.001, keep=True) # MAF > 0.1%
merged = merged.filter_rows(hl.min(merged.AF) < 0.999, keep=True)

## Filter samples

In [None]:
# Keep only the samples that have a row in the phenotype table (matched on cohort_key).
merged = merged.filter_cols(hl.is_defined(phenotypes[merged.col_key]), keep=True)

In [None]:
# Copy every non-key phenotype field onto the matching sample as a column field; a column
# field that already has the same name is overwritten.
merged = merged.annotate_cols(**phenotypes[merged.col_key])

In [None]:
# Print the schema to confirm the phenotype and covariate column fields.
merged.describe()

## Check the result of filtering

In [None]:
# Wall-clock start time for the count that follows.
start = datetime.now()
print(start)

In [None]:
# (number of variants, number of samples) after variant and sample filtering.
filtered_merged_count = merged.count()

filtered_merged_count

In [None]:
# Wall-clock end time and the elapsed duration of the count.
end = datetime.now()
print(end)
print(end - start)

In [None]:
# Sanity check: every phenotype row should correspond to one sample left in the matrix table.
# A mismatch is reported but does not stop the notebook.
if num_samples_expected != filtered_merged_count[1]:
    print(f'''
        Our data does not have the same number of samples as those from the
        derived phenotype.
            Expected: {num_samples_expected}
            Actual: {filtered_merged_count[1]}
        ''')
else:
    print(f'Number of samples: {filtered_merged_count[1]}')

In [None]:
# Summarize the effect of variant filtering; index 0 of each count is the number of variants.
print(f'''
    Number of variants prior to filtering: {merged_count[0]}
    Number of variants after filtering: {filtered_merged_count[0]}
    Number of variants removed by filtering: {merged_count[0] - filtered_merged_count[0]}
    ''')

# Perform the GWAS 

In [None]:
# Give the chosen phenotype a fixed field name so the regression cell does not depend on
# which phenotype is selected.
# NOTE(review): not idempotent — after the first run the field TARGET_PHENOTYPE no longer
# exists on `merged`, so re-running this cell presumably fails; confirm.
merged = merged.rename({TARGET_PHENOTYPE: 'target_phenotype'})

# Echo the phenotype being analyzed.
TARGET_PHENOTYPE

In [None]:
# Regression covariates: the constant 1.0 (intercept term), sex, age, age2 (presumably age
# squared — confirm), the cohort indicator, and then pc1 through pc10.
covar_cols = [1.0, merged.is_male, merged.age, merged.age2, merged.is_aou_cohort]
covar_cols.extend(merged[f'pc{k}'] for k in range(1, 11))

In [None]:
# Per-variant linear regression of the target phenotype on the number of alternate alleles,
# adjusted for the covariates. The QC row fields named in pass_through are carried into the
# result table alongside the regression statistics.
merged_linassoc = hl.linear_regression_rows(
    y=merged.target_phenotype,
    x=merged.GT.n_alt_alleles(),
    covariates=covar_cols,
    pass_through=['callRate', 'AF', 'AC', 'pHWE'],
)

In [None]:
# Print the schema of the regression result table.
merged_linassoc.describe()

In [None]:
# Wall-clock start time for writing the regression results.
start = datetime.now()
print(start)

In [None]:
# Write the association results to GWAS_TAB.
# NOTE(review): no overwrite flag is passed, so this presumably fails if GWAS_TAB already
# exists (e.g. a second run on the same day for the same phenotype and intervals) — confirm.
print('Starting linear regression:')
merged_linassoc.write(GWAS_TAB)

In [None]:
# Wall-clock end time and the elapsed duration of the write.
end = datetime.now()
print(end)
print(end - start)

# Provenance

In [None]:
# Copy the Hail log to the workspace bucket so that we can retain it.
!gzip --keep {HAIL_LOG}
!gsutil cp {HAIL_LOG}.gz {HAIL_LOG_DIR_FOR_PROVENANCE}

In [None]:
# Record when the notebook finished.
print(datetime.now())

In [None]:
# Record the installed Python package versions for provenance.
!pip3 freeze