# GWAS workflow
This notebook contains a typical workflow for running a GWAS.
In this case, we study a set of cardiac morphological and functional parameters of the heart ventricles extracted from shape models derived from cardiovascular magnetic resonance (CMR).

In [1]:
import os, shlex
from subprocess import call, check_output
# Ask git for the top-level directory of the repository and make it the working
# directory, so that the relative paths used by the rest of the notebook are
# resolved from the repository root regardless of where the kernel was started.
# os.fsdecode is used rather than .decode('ascii') so that a checkout located
# under a path containing non-ASCII characters does not raise UnicodeDecodeError.
repo_rootdir = os.fsdecode(check_output(["git", "rev-parse", "--show-toplevel"]).strip())
os.chdir(repo_rootdir)

In [2]:
import src.auxiliary

In [3]:
# Import modules
import ipywidgets as widgets
import pandas as pd
import rpy2
import yaml
from copy import deepcopy

In [4]:
## Import my modules
import sys
import src.run_gwas as gwas 
from pprint import pprint

## Select GWAS input values

In [15]:
# Offer every entry of data/coma_output (sorted by name) in a multi-select box.
# Later cells read the chosen entries from `selection.value`.
available_runs = sorted(os.listdir("data/coma_output"))
selection = widgets.SelectMultiple(
    options=available_runs,
    layout=widgets.Layout(height="300px"),
)
display(selection)

SelectMultiple(layout=Layout(height='300px'), options=('2020-09-11_02-13-41', '2020-09-30_10-51-51', '2020-09-…

## Preprocess data (not yet implemented here, this is done separately in analysis/adjust_for_covariates.R)


### Filter by ethnicity (or control for population stratification)
Taking care of population stratification is essential to avoid spurious associations in the GWAS.
In this case we filter for the major ethnic group in UK Biobank (British).

### Adjust for covariates and inverse-normalize
The studied phenotypes were found to be strongly associated with variables such as gender, height, BMI, age and blood pressure.
In order to standardize the phenotypes, they are adjusted for all of these covariates.

In [44]:
# Names passed to the adjustment script through its --covariates option.
# NOTE(review): these look like UK Biobank field IDs prefixed with "X"
# (presumably height, blood pressure, BMI, age and sex) — confirm against
# data/covariates.csv.
covariates = [
    "X50",
    "X4079",
    "X4080",
    "X21001",
    "X21003",
    "X31",
]
# Variant without X4080, kept for reference:
# covariates = ["X50", "X4079", "X21001", "X21003", "X31"]

In [46]:
# Run the covariate-adjustment R script once for every selected run.
# For each run it reads <run>/latent_space.csv and writes
# <run>/latent_space_adjusted.csv, which the GWAS configuration created further
# down points to.
for experiment in selection.value:
    print(experiment)
    run_dir = "data/coma_output/{}".format(experiment)
    adj_command = [
        "Rscript", "src/adjust_for_covariates_2.R",
        "-i", "{}/latent_space.csv".format(run_dir),
        "-c", "data/covariates.csv",
        "-o", "{}/latent_space_adjusted.csv".format(run_dir),
        "--covariates"] + covariates + [
        "--phenotypes_black_list", "ID", "subset",
        "--keep_non_cov_adj"]
    # `call` only reports the exit status; check it so that a failed adjustment
    # stops here instead of letting later cells use a stale or missing output.
    exit_status = call(adj_command)
    if exit_status != 0:
        raise RuntimeError(
            "Covariate adjustment failed for {} (exit status {})".format(experiment, exit_status)
        )

2020-09-11_02-13-41
2020-09-30_12-36-48


## Run GWAS with Plink

### Create new YAML files based on the reference one

In [47]:
import re
# Write one GWAS configuration file per selected run, derived from the reference
# configuration config_coma.yaml in the same directory.
coma_yml_dir = "config_files/coma"

# yaml.load() without an explicit Loader warns on PyYAML 5 and fails on PyYAML 6;
# safe_load only builds plain Python objects. The context manager closes the file.
# NOTE(review): assumes the reference config uses no Python-specific YAML tags.
with open(os.path.join(coma_yml_dir, "config_coma.yaml")) as ref_file:
    ref_config = yaml.safe_load(ref_file)

pprint(selection.value)

for suffix in ["_adjusted"]:
    for run_id in selection.value:
        # Work on a copy so the reference configuration is never modified.
        config = deepcopy(ref_config)
        config['chromosomes'] = '1-22'
        config['filename_patterns']['phenotype'] = {
            'phenotype_file': 'coma_output/%s/latent_space%s.csv' % (run_id, suffix),
            'phenotype_file_tmp': 'tmp/%s__latent_space%s.csv.tmp' % (run_id, suffix),
            'covariates': ""
        }
        # %-formatting leaves the doubled braces around "phenotype" untouched,
        # so they are written to the YAML file verbatim.
        config['filename_patterns']['gwas'] = "%s%s/GWAS__{{phenotype}}" % (run_id, suffix)
        yaml_file = os.path.join(coma_yml_dir, "config__{}{}.yaml".format(run_id, suffix))
        # Close the output file explicitly so the dump is flushed to disk before
        # any later cell reads it.
        with open(yaml_file, "w") as out_file:
            yaml.dump(config, out_file)

('2020-09-11_02-13-41', '2020-09-30_12-36-48')


  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
# Collect the config files in coma_yml_dir whose name starts with "config" and
# ends with "adjusted.yaml", sorted by name, and offer them in a multi-select box.
yaml_files = sorted(
    fname
    for fname in os.listdir(coma_yml_dir)
    if fname.startswith("config") and fname.endswith("adjusted.yaml")
)

yaml_box_layout = widgets.Layout(width='500px', height='300px')
w_yaml = widgets.SelectMultiple(
    options=yaml_files,
    description='Yaml File',
    disabled=False,
    layout=yaml_box_layout,
)

display(w_yaml)

SelectMultiple(description='Yaml File', layout=Layout(height='300px', width='500px'), options=('config__2020-0…

In [39]:
# Run one GWAS per selected configuration file. After the loop, `gwas_run`
# refers to the run built from the last selected file; later cells use it.
for config_name in w_yaml.value:
    yaml_file = os.path.join(coma_yml_dir, config_name)
    gwas_run = gwas.GWAS_Run(yaml_file)
    gwas_run.run()

['IID', 'z0_adj', 'z1_adj', 'z2_adj', 'z3_adj', 'z4_adj', 'z5_adj', 'z6_adj', 'z7_adj']
Processing z0_adj...
  Processing chromosome 1...
  Processing chromosome 2...
  Processing chromosome 3...
  Processing chromosome 4...
  Processing chromosome 5...
  Processing chromosome 6...
  Processing chromosome 7...
  Processing chromosome 8...
  Processing chromosome 9...
  Processing chromosome 10...
  Processing chromosome 11...
  Processing chromosome 12...
  Processing chromosome 13...
  Processing chromosome 14...
  Processing chromosome 15...
  Processing chromosome 16...
  Processing chromosome 17...
  Processing chromosome 18...
  Processing chromosome 19...
  Processing chromosome 20...
  Processing chromosome 21...
  Processing chromosome 22...
Processing z1_adj...
  Processing chromosome 1...
  Processing chromosome 2...
  Processing chromosome 3...
  Processing chromosome 4...
  Processing chromosome 5...
  Processing chromosome 6...
  Processing chromosome 7...
  Processing chr

KeyboardInterrupt: 

## Select phenotypes and chromosomes

In [20]:
# Multi-select of phenotypes, with all of them preselected.
# NOTE(review): the saved output of this cell is an AttributeError saying that
# GWAS_Run has no attribute 'phenotypes' — confirm where that attribute is set.
phenotype_names = gwas_run.phenotypes
w_phenos = widgets.SelectMultiple(
    options=phenotype_names,
    value=phenotype_names,
    description='Phenotypes:',
    disabled=False,
)
display(w_phenos)

# Multi-select of chromosomes: "all" (preselected) followed by 1..22.
chromosome_choices = ["all"] + list(range(1, 23))
w_chr = widgets.SelectMultiple(
    options=chromosome_choices,
    value=['all'],
    description='Chromosomes',
    disabled=False,
)

display(w_chr)


AttributeError: 'GWAS_Run' object has no attribute 'phenotypes'

In [None]:
# Copy the widget selections onto the run object. Selecting "all" (alone or
# together with other entries) expands to chromosomes 1..22.
if 'all' in w_chr.value:
    gwas_run.chromosomes = list(range(1, 23))
else:
    gwas_run.chromosomes = w_chr.value
gwas_run.phenotypes = w_phenos.value
print(gwas_run.chromosomes)
print(gwas_run.phenotypes)

## Generate Manhattan plots

For this, we use the R `qqman` package. To run R code within this Python notebook, we load the `rpy2` IPython extension.
R commands can then be run by writing `%R` in front of the command.

In [None]:
# To plot in R
# Load the rpy2 IPython extension, which provides the %R magic used in the
# cells below, then attach the qqman and tidyverse packages in the R session.
# suppressMessages() hides the messages the packages print while loading.
%load_ext rpy2.ipython
%R suppressMessages(require(qqman))
%R suppressMessages(require(tidyverse))

### Control panel

In [None]:
# Dropdown choosing the single phenotype to plot.
w_pheno = widgets.Dropdown(options=gwas_run.phenotypes, description='Phenotype', disabled=False)

display(w_pheno)

# Dropdown choosing the chromosome to plot: "all" (default) or one of the
# chromosomes configured on the run. This rebinds `w_chr`, which an earlier
# cell used for the multi-select of chromosomes.
chromosome_options = ['all']
chromosome_options.extend(gwas_run.chromosomes)
w_chr = widgets.Dropdown(options=chromosome_options, value='all', rows=10, description='chromosomes:')

display(w_chr)

### Plot

In [None]:
gwas_filename = gwas_run.gwas_fp.format(phenotype=w_pheno.value, suffix=gwas_run.output_suffix)
img_filename = gwas_run.gwas_fp.format(phenotype=w_pheno.value, suffix=gwas_run.output_suffix) + ".png"

chromosome = int(w_chr.value) if w_chr.value != "all" else "all"
chromosome = w_chr.value

%R -i img_filename
%R -i gwas_filename

if chromosome == "all":
  if not os.path.exists(img_filename):
    %R gwas_df <- suppressMessages(read_delim(gwas_filename, delim="\t")  )
    %R png(img_filename, width=800, height=400); manhattan(gwas_df %>% filter(!is.na(P))); dev.off()
  image = open(img_filename, "rb").read()
  w_img = widgets.Image(
    value=image,
    format='png',
    width=800,
    height=400,
  )
  display(w_img)

else:
    %R gwas_df <-  suppressMessages(read_delim(gwas_filename, delim="\t"))
    %R -i chromosome    
    %R manhattan(gwas_df %>% filter(!is.na(P) & CHR == chromosome), main= glue::glue("{chromosome}")
    