# GWAS workflow
This notebook walks through a typical workflow for running a genome-wide association study (GWAS).
In this case, the phenotypes under study are a set of morphological and functional parameters of the heart ventricles, extracted from shape models derived from cardiovascular magnetic resonance (CMR).

In [1]:
import os, shlex
from subprocess import call, check_output

# Ask git for the top-level directory of the repository and make it the
# working directory, so the relative paths used in the rest of the notebook
# (config_files/, data/, output/, src/) resolve from there.
# os.fsdecode decodes with the filesystem encoding rather than ASCII, so a
# checkout path containing non-ASCII characters no longer raises
# UnicodeDecodeError.
repo_rootdir = os.fsdecode(
    check_output(shlex.split("git rev-parse --show-toplevel")).strip()
)
os.chdir(repo_rootdir)

In [2]:
import src.auxiliary

In [3]:
# Import modules
import ipywidgets as widgets
import pandas as pd
import rpy2
import yaml
from copy import deepcopy
import re

In [4]:
import sys
import src.run_gwas as gwas 
from pprint import pprint

In [5]:
from src.run_gwas import GWAS_Run

### Define GWAS configuration

In [22]:
# Let the user pick one of the YAML files found in the configuration directory.
# os.listdir returns entries in arbitrary order, so the options are sorted to
# keep the dropdown deterministic across machines and runs.
# NOTE(review): every later cell depends on the value selected here; for a
# reproducible "Restart & Run All", consider assigning the file name directly.
config_dir = "config_files"
w = widgets.Dropdown(
    options=sorted(x for x in os.listdir(config_dir) if x.endswith("yaml")),
    value="ref_config.yaml",
)
display(w)

Dropdown(index=1, options=('config_coma.yaml', 'ref_config.yaml'), value='ref_config.yaml')

In [23]:
# Load the configuration file selected in the dropdown above.
config_file = os.path.join(config_dir, w.value)

# yaml.load() without an explicit Loader is deprecated (and an error in
# PyYAML >= 6). The configuration printed below only contains plain strings,
# lists, dicts and nulls, so safe_load is sufficient. The `with` block makes
# sure the file handle is closed.
with open(config_file) as config_fh:
    config = yaml.safe_load(config_fh)
pprint(config)

{'chromosomes': '1-22',
 'covariates': 'config_files/covariates/std_covariates.yaml',
 'exec': {'plink': 'plink'},
 'filename_patterns': {'genotype': 'config_files/genotype_patterns/genotype_patterns.yaml',
                       'gwas': 'output/traditional_indices/{suffix}/GWAS__{{phenotype}}__{suffix}',
                       'phenotype': 'data/cardiac_indices/CMR_info_LVRVLARA_11350.csv',
                       'phenotype_intermediate': 'data/cardiac_indices/adjusted_for_covariates/CMR_info_LVRVLARA_11350.csv',
                       'tmpdir': 'data/tmp/traditional_indices/{suffix}'},
 'quality_control': 'config_files/quality_control/quality_control.yaml',
 'sample_black_lists': None,
 'sample_white_lists': ['data/ids_list/cmr_british_ids.txt',
                        'data/ids_list/women.txt']}


  


##### Define file name rules

In [24]:
from main import adjust_for_covariates, generate_summary_and_figures, extract_formatter_tokens

In [25]:
# Load the rules that map each option value (a file path, or a tuple of file
# paths) to the short label used when building output file names.
name_rules_file = os.path.join(config_dir, "filename_rules/filename_rules.yaml")

# yaml.load() without an explicit Loader is deprecated (and an error in
# PyYAML >= 6), so the loader is named explicitly and the file is closed
# via `with`.
# NOTE(review): FullLoader is used instead of safe_load because the loaded
# rules contain tuple keys, which presumably come from python/tuple tags in
# the YAML file; safe_load rejects those. Confirm against the file.
with open(name_rules_file) as name_rules_fh:
    name_rules = yaml.load(name_rules_fh, Loader=yaml.FullLoader)
pprint(name_rules)

{'covariates': {'config_files/covariates/std_covariates.yaml': 'std_covariates',
                'config_files/covariates/std_covariates_and_z5.yaml': 'std_covariates_adj_by_z5',
                'config_files/covariates/std_covariates_no_SBP.yaml': 'std_covariates_no_SBP'},
 'quality_control': {'config_files/quality_control/no_quality_control.yaml': 'no_qc',
                     'config_files/quality_control/quality_control.yaml': 'qc'},
 'sample_white_lists': {('data/ids_list/cmr_british_ids.txt',): 'GBR',
                        ('data/ids_list/cmr_british_ids.txt', 'data/ids_list/men.txt'): 'GBR_MEN',
                        ('data/ids_list/cmr_british_ids.txt', 'data/ids_list/women.txt'): 'GBR_WOMEN',
                        ('data/ids_list/cmr_ids.txt',): 'ALL_ETHNICITIES'}}


  """Entry point for launching an IPython kernel.


In [26]:
# Template for the run suffix. Each {placeholder} names a configuration option
# and is later filled with the short label that name_rules assigns to that
# option's value.
suffix_pattern = "{covariates}__{sample_white_lists}__{quality_control}"

In [27]:
# Collect the placeholder names that appear in the suffix pattern.
# NOTE(review): `tokens` is indexed by name and unpacked into str.format in
# the next cell, so it is presumably a dict keyed by placeholder name —
# confirm in main.extract_formatter_tokens.
tokens = extract_formatter_tokens(suffix_pattern)

In [28]:
# Swap each configured option for its short label from `name_rules`, then
# assemble the suffix and record it in the configuration.
for token in tokens:
    if token not in config:
        # placeholders with no matching option keep their current value
        continue
    configured = config[token]
    # lists are unhashable, so list-valued options are looked up as tuples
    option_value = tuple(configured) if isinstance(configured, list) else configured
    tokens[token] = name_rules[token][option_value]

suffix = suffix_pattern.format(**tokens)
config["suffix"] = suffix

In [29]:
# Show the configuration again, now including the "suffix" entry.
pprint(config)

{'chromosomes': '1-22',
 'covariates': 'config_files/covariates/std_covariates.yaml',
 'exec': {'plink': 'plink'},
 'filename_patterns': {'genotype': 'config_files/genotype_patterns/genotype_patterns.yaml',
                       'gwas': 'output/traditional_indices/{suffix}/GWAS__{{phenotype}}__{suffix}',
                       'phenotype': 'data/cardiac_indices/CMR_info_LVRVLARA_11350.csv',
                       'phenotype_intermediate': 'data/cardiac_indices/adjusted_for_covariates/CMR_info_LVRVLARA_11350.csv',
                       'tmpdir': 'data/tmp/traditional_indices/{suffix}'},
 'quality_control': 'config_files/quality_control/quality_control.yaml',
 'sample_black_lists': None,
 'sample_white_lists': ['data/ids_list/cmr_british_ids.txt',
                        'data/ids_list/women.txt'],
 'suffix': 'std_covariates__GBR_WOMEN__qc'}


In [30]:
# Fill in the placeholders of each filename pattern with values from the
# configuration. Fully resolved paths are moved to config["filenames"]; the
# "gwas" entry stays under config["filename_patterns"], since it is written
# back there after formatting rather than moved.
config["filenames"] = {}
for _fp in ("phenotype", "phenotype_intermediate", "tmpdir", "gwas"):
    fp = config["filename_patterns"][_fp]
    tokens = extract_formatter_tokens(fp)
    substitutions = {token: config.get(token) for token in tokens}
    filename = fp.format(**substitutions)

    if _fp == "gwas":
        config["filename_patterns"][_fp] = filename
        continue

    config["filename_patterns"].pop(_fp, None)
    config["filenames"][_fp] = filename

In [31]:
# Override the chromosome setting loaded from the configuration file ('1-22'):
# this run only processes chromosome 22.
config["chromosomes"] = 22

In [32]:
# Inspect the configuration after resolving the file names and overriding the
# chromosome setting.
config

{'chromosomes': 22,
 'sample_white_lists': ['data/ids_list/cmr_british_ids.txt',
  'data/ids_list/women.txt'],
 'sample_black_lists': None,
 'covariates': 'config_files/covariates/std_covariates.yaml',
 'quality_control': 'config_files/quality_control/quality_control.yaml',
 'filename_patterns': {'genotype': 'config_files/genotype_patterns/genotype_patterns.yaml',
  'gwas': 'output/traditional_indices/std_covariates__GBR_WOMEN__qc/GWAS__{phenotype}__std_covariates__GBR_WOMEN__qc'},
 'exec': {'plink': 'plink'},
 'suffix': 'std_covariates__GBR_WOMEN__qc',
 'filenames': {'phenotype': 'data/cardiac_indices/CMR_info_LVRVLARA_11350.csv',
  'phenotype_intermediate': 'data/cardiac_indices/adjusted_for_covariates/CMR_info_LVRVLARA_11350.csv',
  'tmpdir': 'data/tmp/traditional_indices/std_covariates__GBR_WOMEN__qc'}}

### Adjust for covariates and inverse-normalise
The studied phenotypes were found to be strongly associated with variables such as gender, height, BMI, age and blood pressure.
To standardize the phenotypes, they are therefore adjusted for all of these covariates.

In [33]:
# Adjust the phenotypes for the configured covariates. As the output below
# shows, this runs the R script src/adjust_for_covariates.R on the phenotype
# file and writes the result to the "phenotype_intermediate" path.
adjust_for_covariates(config)

Rscript src/adjust_for_covariates.R -i data/cardiac_indices/CMR_info_LVRVLARA_11350.csv -o data/cardiac_indices/adjusted_for_covariates/CMR_info_LVRVLARA_11350.csv --samples_white_list data/ids_list/cmr_british_ids.txt data/ids_list/women.txt --covariates_file config_files/covariates/std_covariates.yaml --phenotypes_black_list ID subset


### Run GWAS

In [34]:
# Build the GWAS run from the configuration and execute it; progress is
# printed per phenotype and per chromosome.
# NOTE(review): this rebinds `gwas`, which was previously the alias of the
# src.run_gwas module (`import src.run_gwas as gwas`); after this cell the
# module is no longer reachable under that name.
# NOTE(review): `config` appears to be modified in place by this point — the
# configuration printed in the next cell has the covariates, quality-control
# and genotype entries expanded from file paths into their contents — so
# re-running earlier cells without reloading the config may fail.
gwas = GWAS_Run(config)
gwas.run()

Processing LVEDV...
  Processing chromosome 22...
Processing LVESV...
  Processing chromosome 22...
Processing LVSV...
  Processing chromosome 22...
Processing LVEF...
  Processing chromosome 22...
Processing LVM...
  Processing chromosome 22...
Processing RVEDV...
  Processing chromosome 22...
Processing RVESV...
  Processing chromosome 22...
Processing RVSV...
  Processing chromosome 22...
Processing RVEF...
  Processing chromosome 22...
Processing LAEDV...
  Processing chromosome 22...
Processing LAESV...
  Processing chromosome 22...
Processing LASV...
  Processing chromosome 22...
Processing LAEF...
  Processing chromosome 22...
Processing RAEDV...
  Processing chromosome 22...
Processing RAESV...
  Processing chromosome 22...
Processing RASV...
  Processing chromosome 22...
Processing RAEF...
  Processing chromosome 22...


In [35]:
# Inspect the configuration after the run: the covariates, quality-control and
# genotype entries now hold the loaded contents instead of file paths.
config

{'chromosomes': 22,
 'sample_white_lists': ['data/ids_list/cmr_british_ids.txt',
  'data/ids_list/women.txt'],
 'sample_black_lists': None,
 'covariates': {'data/covariates.csv': [{'id': 'eid'},
   'X50',
   'X4079',
   'X4080',
   'X21001',
   'X21003',
   'X31']},
 'quality_control': {'hwe_pval_thres': '1e-6',
  'snp_missing_rate_thres': 0.1,
  'sample_missing_rate_thres': 0.1,
  'maf_thres': 0.01},
 'filename_patterns': {'genotype': {'bed': 'data/genotypes/calls/ukb_cal_chr{chromosome}_v2_31803_indiv.bed',
   'bim': 'data/genotypes/calls/ukb_cal_chr{chromosome}_v2_31803_indiv.bim',
   'fam': 'data/genotypes/calls/ukb_cal_chr{chromosome}_v2_31803_indiv.fam'},
  'gwas': 'output/traditional_indices/std_covariates__GBR_WOMEN__qc/GWAS__{phenotype}__std_covariates__GBR_WOMEN__qc'},
 'exec': {'plink': 'plink'},
 'suffix': 'std_covariates__GBR_WOMEN__qc',
 'filenames': {'phenotype': 'data/cardiac_indices/CMR_info_LVRVLARA_11350.csv',
  'phenotype_intermediate': 'data/cardiac_indices/adjuste

### Generate figures