# GWAS workflow
This notebook contains a typical workflow for running a GWAS.
In this case, we study a set of cardiac morphological and functional parameters of the heart ventricles extracted from shape models derived from cardiovascular magnetic resonance (CMR).

In [1]:
import os, shlex
from subprocess import call, check_output
repo_rootdir = check_output(shlex.split("git rev-parse --show-toplevel")).strip().decode('ascii')
os.chdir(repo_rootdir)

In [2]:
import src.auxiliary

In [3]:
# Import modules
import ipywidgets as widgets
import pandas as pd
import rpy2
import yaml
from copy import deepcopy
import re

In [4]:
import sys
import src.run_gwas as gwas 
from pprint import pprint

## Select GWAS input values

In [63]:
selection = widgets.SelectMultiple(
  options=sorted(os.listdir("data/coma_output")), 
  layout=widgets.Layout(height="300px")
)
display(selection)

SelectMultiple(layout=Layout(height='300px'), options=('2020-09-11_02-13-41', '2020-09-30_10-51-51', '2020-09-…

In [132]:
# PARAMETERS

qc_files = {
  "qc": "quality_control/quality_control.yaml", 
  "no_qc": "quality_control/no_quality_control.yaml"
}

covariates = {
  "adj_sbp":  ["X50", "X4079", "X4080", "X21001", "X21003", "X31"],
  "adj_no_sbp":  ["X50", "X4079", "X21001", "X21003", "X31"]
}

sample_white_lists = {
  "GBR_ALL": ["data/ids_list/cmr_british_ids.txt"],
  "GBR_MEN": ["data/ids_list/cmr_british_ids.txt", "data/ids_list/men.txt"],
  "GBR_WOMEN": ["data/ids_list/cmr_british_ids.txt", "data/ids_list/women.txt"]
}

In [138]:
coma_yml_dir = "config_files/coma"
ref_config = yaml.load(open(os.path.join(coma_yml_dir, "config_coma.yaml")))

pprint(selection.value)

latent_variables = {
  "2020-09-11_02-13-41": ["z5", "z5_adj"],
  "2020-09-30_12-36-48": ["z0", "z0_adj"]
}

for run_id in latent_variables:
  for sample_white_list in sample_white_lists:
    for cov in covariates:            
        for qc in qc_files:
          print(run_id, sample_white_list, cov,qc )
          # suffix = hash(qc_file, covariates_, sample_white_list)
          
          suffix = "{}__{}__{}".format(sample_white_list, cov, qc)
          config = deepcopy(ref_config)
          config['sample_white_lists'] = sample_white_lists[sample_white_list]
          config['covariates'] = covariates[cov]
          config['quality_control'] = qc_files[qc]
        
          config['filename_patterns']['phenotype'] = {
              'phenotype_file': 'coma_output/%s/latent_space__%s.csv' % (run_id, suffix),
              'phenotype_file_tmp': 'tmp/%s__latent_space__%s.csv.tmp' % (run_id, suffix),
              'covariates': ""
          }
          config['filename_patterns']['gwas'] = "%s__%s/GWAS__{{phenotype}}" % (run_id, suffix)
          # config['phenotypes'] = latent_variables[run_id]
            
          config["phenotype_list"] = latent_variables[run_id]
          # config["phenotype_list"] = "phenotypes%s.txt" % suffix
          
          yaml_file = os.path.join(coma_yml_dir, "config__{}__{}.yaml".format(run_id, suffix))
          yaml.dump(config, open(yaml_file, "w"))
          # pprint(config)

('2020-09-11_02-13-41', '2020-09-30_12-36-48')
2020-09-11_02-13-41 GBR_ALL adj_sbp qc
2020-09-11_02-13-41 GBR_ALL adj_sbp no_qc
2020-09-11_02-13-41 GBR_ALL adj_no_sbp qc
2020-09-11_02-13-41 GBR_ALL adj_no_sbp no_qc
2020-09-11_02-13-41 GBR_MEN adj_sbp qc
2020-09-11_02-13-41 GBR_MEN adj_sbp no_qc
2020-09-11_02-13-41 GBR_MEN adj_no_sbp qc
2020-09-11_02-13-41 GBR_MEN adj_no_sbp no_qc
2020-09-11_02-13-41 GBR_WOMEN adj_sbp qc
2020-09-11_02-13-41 GBR_WOMEN adj_sbp no_qc
2020-09-11_02-13-41 GBR_WOMEN adj_no_sbp qc
2020-09-11_02-13-41 GBR_WOMEN adj_no_sbp no_qc
2020-09-30_12-36-48 GBR_ALL adj_sbp qc
2020-09-30_12-36-48 GBR_ALL adj_sbp no_qc
2020-09-30_12-36-48 GBR_ALL adj_no_sbp qc
2020-09-30_12-36-48 GBR_ALL adj_no_sbp no_qc
2020-09-30_12-36-48 GBR_MEN adj_sbp qc
2020-09-30_12-36-48 GBR_MEN adj_sbp no_qc
2020-09-30_12-36-48 GBR_MEN adj_no_sbp qc
2020-09-30_12-36-48 GBR_MEN adj_no_sbp no_qc
2020-09-30_12-36-48 GBR_WOMEN adj_sbp qc
2020-09-30_12-36-48 GBR_WOMEN adj_sbp no_qc
2020-09-30_12-36-48 

  


In [134]:
# yaml_files = [x for x in os.listdir(coma_yml_dir) if x.endswith(".yaml") and x.startswith("config")] 
# yaml_files = sorted([x for x in os.listdir(coma_yml_dir) if x.endswith("adjusted.yaml") and x.startswith("config")])
yaml_files = sorted([x for x in os.listdir(coma_yml_dir) if x.startswith("config")])

w_yaml = widgets.SelectMultiple(
    options=yaml_files,
    description='Yaml File',
    disabled=False,
    layout=widgets.Layout(width='500px', height='300px')
)

display(w_yaml)

SelectMultiple(description='Yaml File', layout=Layout(height='300px', width='500px'), options=('config__2020-0…

In [99]:
yaml.load(open(os.path.join("config_files/coma", w_yaml.value[0])))

  """Entry point for launching an IPython kernel.


{'chromosomes': '1-22',
 'covariates': ('X50', 'X4079', 'X21001', 'X21003', 'X31'),
 'covariates_file': 'data/covariates.csv',
 'data_dir': 'data',
 'exec': {'plink': 'plink'},
 'filename_patterns': {'genotype': 'genotype_patterns/genotype_patterns.yaml',
  'gwas': '2020-09-11_02-13-41__GBR_WOMEN__adj_no_sbp__qc/GWAS__{{phenotype}}',
  'phenotype': {'covariates': '',
   'phenotype_file': 'coma_output/2020-09-11_02-13-41/latent_space__GBR_WOMEN__adj_no_sbp__qc.csv',
   'phenotype_file_tmp': 'tmp/2020-09-11_02-13-41__latent_space__GBR_WOMEN__adj_no_sbp__qc.csv.tmp'}},
 'output_dir': 'output/coma',
 'quality_control': 'quality_control/quality_control.yaml',
 'sample_black_lists': None,
 'sample_white_lists': ('data/ids_list/cmr_british_ids.txt',
  'data/ids_list/women.txt')}

## Preprocess data (not yet implemented here, this is done separately in analysis/adjust_for_covariates.R)


### Filter by ethnicity (or control for population stratification)
Taking care of population stratification is essential to avoid spurious associations in the GWAS.
In this case we filter for the major ethnic group in UK Biobank (British).

### Adjust for covariates and inverse-normalize
The studied phenotypes were found to be strongly associated with variables such as gender, height, BMI, age and blood pressure.
In order to standardize the phenotypes, they are adjusted for all of these covariates.

## Run GWAS with Plink

In [123]:
for config in w_yaml.value:
  config = yaml.load(open(os.path.join("config_files/coma", config)))
  experiment = config["filename_patterns"]["gwas"].split("__")[0]
  print(experiment)
  adj_command = [
      "Rscript", "src/adjust_for_covariates.R", 
      "-i", "data/coma_output/{}/latent_space.csv".format(experiment), 
      "-c", "data/covariates.csv",
      "-o", "data/{}".format(config["filename_patterns"]["phenotype"]["phenotype_file"]),
      "--samples_white_list"] + list(config["sample_white_lists"]) + \
      ["--phenotypes"] + [x for x in config["phenotypes"] if not x.endswith("adj")] + \
      [ "--covariates"] + list(config["covariates"]) + [
      "--phenotypes_black_list", "ID", "subset",
      "--keep_non_cov_adj"]
  pprint(adj_command)
  call(adj_command)

  


2020-09-11_02-13-41


  


2020-09-11_02-13-41


  


2020-09-11_02-13-41


  


2020-09-11_02-13-41
2020-09-11_02-13-41


  


KeyError: 'phenotypes'

In [140]:
for yaml_file in [os.path.join(coma_yml_dir, x) for x in w_yaml.value]:
    print(yaml_file)
    # pprint(yaml.safe_load(open(yaml_file)))
    gwas_run = gwas.GWAS_Run(yaml_file)
    gwas_run.run()

config_files/coma/config__2020-09-11_02-13-41__GBR_ALL__adj_no_sbp__qc.yaml
Processing z5...
  Processing chromosome 1...
  Processing chromosome 2...
  Processing chromosome 3...
  Processing chromosome 4...
  Processing chromosome 5...
  Processing chromosome 6...
  Processing chromosome 7...
  Processing chromosome 8...
  Processing chromosome 9...
  Processing chromosome 10...
  Processing chromosome 11...
  Processing chromosome 12...
  Processing chromosome 13...
  Processing chromosome 14...
  Processing chromosome 15...
  Processing chromosome 16...
  Processing chromosome 17...
  Processing chromosome 18...
  Processing chromosome 19...
  Processing chromosome 20...
  Processing chromosome 21...
  Processing chromosome 22...
Processing z5_adj...
  Processing chromosome 1...
  Processing chromosome 2...
  Processing chromosome 3...
  Processing chromosome 4...
  Processing chromosome 5...
  Processing chromosome 6...
  Processing chromosome 7...
  Processing chromosome 8...
  P

## Select phenotypes and chromosomes

In [20]:
w_phenos = widgets.SelectMultiple(
    options=gwas_run.phenotypes,
    value=gwas_run.phenotypes,
    description='Phenotypes:',
    disabled=False
)
display(w_phenos)

w_chr = widgets.SelectMultiple(
    options=["all"] + [i for i in range(1,23)],
    value=['all'],
    description='Chromosomes',
    disabled=False
)

display(w_chr)


AttributeError: 'GWAS_Run' object has no attribute 'phenotypes'

In [None]:
gwas_run.chromosomes = w_chr.value if 'all' not in w_chr.value else [i for i in range(1,23)]
gwas_run.phenotypes = w_phenos.value
print(gwas_run.chromosomes)
print(gwas_run.phenotypes)

## Generate Manhattan plots

For this, we use the R qqman package. In order to run R code within this Python notebook, I load the rpy2 module.
Then I can run R commands by writing %R in front of the command

In [None]:
# To plot in R
%load_ext rpy2.ipython
%R suppressMessages(require(qqman))
%R suppressMessages(require(tidyverse))

### Control panel

In [None]:
w_pheno = widgets.Dropdown(
    options=gwas_run.phenotypes,
    description='Phenotype',
    disabled=False,
)

display(w_pheno)

w_chr = widgets.Dropdown(
    options=['all'] + list(gwas_run.chromosomes),
    value='all',
    rows=10,
    description='chromosomes:',
)

display(w_chr)

### Plot

In [None]:
gwas_filename = gwas_run.gwas_fp.format(phenotype=w_pheno.value, suffix=gwas_run.output_suffix)
img_filename = gwas_run.gwas_fp.format(phenotype=w_pheno.value, suffix=gwas_run.output_suffix) + ".png"

chromosome = int(w_chr.value) if w_chr.value != "all" else "all"
chromosome = w_chr.value

%R -i img_filename
%R -i gwas_filename

if chromosome == "all":
  if not os.path.exists(img_filename):
    %R gwas_df <- suppressMessages(read_delim(gwas_filename, delim="\t")  )
    %R png(img_filename, width=800, height=400); manhattan(gwas_df %>% filter(!is.na(P))); dev.off()
  image = open(img_filename, "rb").read()
  w_img = widgets.Image(
    value=image,
    format='png',
    width=800,
    height=400,
  )
  display(w_img)

else:
    %R gwas_df <-  suppressMessages(read_delim(gwas_filename, delim="\t"))
    %R -i chromosome    
    %R manhattan(gwas_df %>% filter(!is.na(P) & CHR == chromosome), main= glue::glue("{chromosome}")
    