# GWAS workflow
This notebook contains a typical workflow for running a GWAS.
In this case, we study a set of cardiac morphological and functional parameters of the heart ventricles extracted from shape models derived from cardiovascular magnetic resonance (CMR).

In [1]:
# Import standard modules
import ipywidgets as widgets
import os
import pandas as pd

# Import my modules
import GWAS_pipeline.code.run_gwas as gwas

In [2]:
# Filter individuals according to input with a widget.
# Adjust for a set of covariates selected with a widget
# Use pybiomart
# Use some gene ontology?

## Preprocess data (not yet implemented, this is done separately)


In [3]:
# Widget "adjusted by covariates"
## if yes, which covariates

# Widget "Gender": all, male, female

# Widget "Ethnicity"

# Widget "InvNorm"

# Select number of individuals?

### Filter by ethnicity (or control for population stratification)
Taking care of population stratification is essential to avoid fake associations in the GWAS.
In this case we filter for the major ethnic group in UK Biobank (British).

### Inverse-normalization
This a usual procedure where the phenotype values are mapped to new values following a standard normal distribution. This step is necessary if the usual hypothesis test for GWAS is applied.


## Run GWAS with Plink

In [4]:
# Add widgets that control the GWAS run

In [5]:
# Validator for YAML file
yaml_config_file = "GWAS_pipeline/code/yaml_files/config_test.yaml"
gwas_run = gwas.GWAS_Run(yaml_config_file)

### Print configuration file

In [6]:
from json import dumps; print(dumps(gwas_run.config, indent=4, sort_keys=True))

{
    "chromosomes": "20-22",
    "data_dir": "/home/rodrigo/Leeds/doctorado/repos/GWAS_pipeline/data",
    "exec": {
        "plink": "plink"
    },
    "filename_patterns": {
        "genotype": {
            "bed": "{data_dir}/calls_filtered_31802/ukb_cal_chr{{chromosome}}_v2_31802_indiv.bed",
            "bim": "{data_dir}/calls_filtered_31802/ukb_cal_chr{{chromosome}}_v2_31802_indiv.bim",
            "fam": "{data_dir}/calls_filtered_31802/ukb_cal_chr{{chromosome}}_v2_31802_indiv.fam"
        },
        "gwas": "gwas/{phenotype}/gwas__{phenotype}{suffix}",
        "phenotype": {
            "covariates": "",
            "phenotype_file": "Cardiac_Function_Indexes_11350.tsv",
            "phenotype_file_tmp": "tmp/Cardiac_Function_Indexes_11350_tmp.tsv"
        }
    },
    "individuals": "{data_dir}/ids_list/cmr_british_ids.txt",
    "options": {
        "covariate_adjustment": true,
        "ethnicity": "white",
        "inverse_normalization": true,
        "segmentation_method"

In [7]:
gwas_run.run()

Processing LVEDV_manual...
Output file already exists, if a new run is desired please delete the previous file.


## Generate Manhattan plots

In order to run R code within this Python notebook, I load the rpy2 module.
Then I can run R commands by writing %R in front of the command

In [8]:
# To plot in R
%load_ext rpy2.ipython
%R suppressMessages(require(qqman))
%R suppressMessages(require(tidyverse))

array([1], dtype=int32)

### Control panel

In [9]:
w_pheno = widgets.Dropdown(
    options=gwas_run.phenotypes,
    # value='pineapple', # Defaults to 'pineapple'
    # layout={'width': 'max-content'}, # If the items' names are long
    description='Phenotype',
    disabled=False,
)

display(w_pheno)

w_chr = widgets.Dropdown(
    options=["all"] + [i for i in range(1,23)] + ["X", "Y"],
    value='all', # rows=10,
    description='chromosome:',
)

display(w_chr)

Dropdown(description='Phenotype', options=('LVEDV_manual', 'LVEDV_automatic', 'LVESV_manual', 'LVESV_automatic…

Dropdown(description='chromosome:', options=('all', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…

### Plot

In [10]:
gwas_filename = gwas_run.gwas_fp.format(phenotype=w_pheno.value, suffix=gwas_run.output_suffix)
img_filename = gwas_run.gwas_fp.format(phenotype=w_pheno.value, suffix=gwas_run.output_suffix) + ".png"

chromosome = int(w_chr.value) if w_chr.value != "all" else "all"
chromosome = w_chr.value

%R -i img_filename
%R -i gwas_filename

if chromosome == "all":
  if not os.path.exists(img_filename):
    %R gwas_df <- suppressMessages(read_delim(gwas_filename, delim="\t")  )
    %R png(img_filename, width=800, height=400); manhattan(gwas_df %>% filter(!is.na(P))); dev.off()
  image = open(img_filename, "rb").read()
  w_img = widgets.Image(
    value=image,
    format='png',
    width=800,
    height=400,
  )
  display(w_img)

else:
    %R gwas_df <-  suppressMessages(read_delim(gwas_filename, delim="\t"))
    %R -i chromosome
    %R manhattan(gwas_df %>% filter(!is.na(P) & CHR == chromosome))
    

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\xe0\x00\x00\x01\xe0\x08\x02\x00\x00\x00\xf2\xb6)…

In [11]:
#To plot in Python
#import fastlmm.util.util as flutil
#from fastlmm.util.stats import plotp
#flutil.manhattan_plot(
#  gwas_df[["CHR", "BP", "P"]].values, 
#  pvalue_line=1e-8, 
#  xaxis_unit_bp=False
#)

# Downstream analysis

In [21]:
# List top GWAS hits as a table.
w_pvalthr = widgets.FloatLogSlider(
    value=-7,
    base=10,
    min=-10, # max exponent of base
    max=-1.5, # min exponent of base
    step=0.5, # exponent step
    description='p-value threshold'
)
display(w_pvalthr)

FloatLogSlider(value=1e-10, description='p-value threshold', max=-1.5, min=-10.0, step=0.5)

## Integration with Biomart databases


In [13]:
from pybiomart import Server

server = Server(host='http://www.ensembl.org')

dataset = (server.marts['ENSEMBL_MART_ENSEMBL']
                 .datasets['hsapiens_gene_ensembl'])

dataset.query(attributes=['ensembl_gene_id', 'external_gene_name', 'chromosome_name', 'transcription_start_site'],
              filters={'chromosome_name': ['1','2'], 'biotype': "protein_coding"})

Unnamed: 0,Gene stable ID,Gene name,Chromosome/scaffold name,Transcription start site (TSS)
0,ENSG00000116996,ZP4,1,237890794
1,ENSG00000116996,ZP4,1,237890922
2,ENSG00000270188,MTRNR2L11,1,237945275
3,ENSG00000203685,STUM,1,226548764
4,ENSG00000203685,STUM,1,226600589
...,...,...,...,...
21877,ENSG00000144036,EXOC6B,2,72515754
21878,ENSG00000171551,ECEL1,2,232482458
21879,ENSG00000171551,ECEL1,2,232487821
21880,ENSG00000171551,ECEL1,2,232487834
