# Phenome-wide association analysis (PheWAS)

Most prevalent diseases in Belgium:

- Diabetes: type 2 diabetes 
- Cancer: prostate and breast cancer 
- Cardiovascular disease: ischemic heart disease and hypertension 
- Musculoskeletal: low back pain, neck pain, osteoarthritis, dorsopathies, and arthropathies
- Mental and neurological disorders: anxiety and depression
- Infectious diseases: influenza
- Immune disorders: rheumatoid arthritis 
- Respiratory disease: chronic obstructive pulmonary disease (COPD)
- Obesity

In [1]:
library(data.table)
library(dplyr)
library(stringr)
library(reshape2)
library(PheWAS)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘reshape2’


The following objects are masked from ‘package:data.table’:

    dcast, melt


Loading required package: tidyr


Attaching package: ‘tidyr’


The following object is masked from ‘package:reshape2’:

    smiths


Loading required package: ggplot2

Loading required package: parallel

Welcome to the new version of PheWAS. This version has many updates; please see https://github.com/PheWAS/PheWAS/tree/legacy for the legacy release if needed. Check ?PheWAS for more documentation



In [2]:
# Test dataset: read the panel table and the list of sample ids to keep
panel.test <- read.table(file = 'panel_test.tab', header = TRUE)
ids <- read.table(file = 'phewas.ids', header = FALSE)

# Keep only panel rows whose FID appears in the first column of phewas.ids
panel.test <- filter(panel.test, FID %in% ids$V1)

# FID/IID pairs of every retained sample
target.ids <- select(panel.test, FID, IID)

# Split the retained FIDs on the sex column: rows with sex == 1 go to `males`,
# rows with sex == 2 go to `females`
gender <- select(panel.test, FID, sex)
males <- select(filter(gender, sex == 1), FID)
females <- select(filter(gender, sex == 2), FID)

In [4]:
# Load ICD diagnoses from the saved R data files.
# Per the original notes these provide `ukb.icd10` and `ukb.icd9`
# (the ICD9 object may have been saved from a variable called `ukb.ic9` — TODO confirm).
load(file = 'ukb_icd10.rda')
load(file = 'ukb_icd9.rda')

In [5]:
# Load phecode databases (reference datasets; presumably shipped with the
# PheWAS package attached above — TODO confirm)
data(sex_restriction)
data(phemap)
data(phecode_map_icd10)
data(phecode_exclude)
data(phecode_rollup_map)
data(pheinfo)

# Short aliases used throughout the notebook.
# `pheinfo` keeps its own name: data(pheinfo) already creates it, so the former
# self-assignment `pheinfo = pheinfo` was a no-op and has been dropped.
phecode.sex <- sex_restriction
phemap.icd9 <- phemap
phemap.icd10 <- phecode_map_icd10
phecode.excl <- phecode_exclude
phecode.ont <- phecode_rollup_map

# Rename the ICD10 map's `code` column to `icd10`, mirroring the `icd9` column
# that the ICD9 map is accessed through further down
phemap.icd10 <- phemap.icd10 %>% rename(icd10 = code)
#phecodes.main = pheinfo %>% filter(!str_detect(phecode, "\\."))

In [6]:
# Subset diagnoses data to target ids: keep only diagnosis rows whose `sample`
# is one of the FIDs in target.ids
target.icd9 <- filter(ukb.icd9, sample %in% target.ids$FID)
target.icd10 <- filter(ukb.icd10, sample %in% target.ids$FID)

In [7]:
# Stack the (still unmapped) ICD9 and ICD10 diagnosis rows into one table
target.full <- target.icd9 %>% bind_rows(target.icd10)

In [8]:
# Map ICD9 data to phecodes: build a join key `code` (ICD9 code with its first
# dot removed), attach the pheinfo annotations and keep the columns of interest
phemap.icd9 <- phemap.icd9 %>%
  mutate(code = sub("\\.", "", icd9)) %>%
  inner_join(pheinfo, by = 'phecode') %>%
  select(icd9, phecode, code, description, group)

# Map ICD10 data to phecodes: same procedure on the ICD10 map
phemap.icd10 <- phemap.icd10 %>%
  mutate(code = sub("\\.", "", icd10)) %>%
  inner_join(pheinfo, by = 'phecode') %>%
  select(icd10, phecode, code, description, group)

# Attach phecodes to the target diagnoses via the dot-free `code` key and
# harmonise the ICD column name to `icd` in both tables
target.icd9 <- target.icd9 %>%
  inner_join(phemap.icd9, by = 'code') %>%
  rename(icd = icd9)
target.icd10 <- target.icd10 %>%
  inner_join(phemap.icd10, by = 'code') %>%
  rename(icd = icd10)

# Full diagnoses: ICD9 and ICD10 rows stacked, written out tab-separated
target.icd <- bind_rows(target.icd9, target.icd10)

write.table(
  target.icd,
  'target_phecode_qc.tab',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE,
  sep = '\t'
)

### Endocrine/metabolic disorders

#### Type 2 diabetes

In [11]:
# Phecodes listed as exclusion criteria for phecode 250.2
dis.excl.codes <- phecode.excl %>%
  filter(code == '250.2') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- filter(target.icd, phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 250.2 (one entry per row)
dis.cases <- target.icd %>%
  filter(phecode == '250.2') %>%
  pull(sample)

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, times = length(dis.cases))
length(cases)
controls <- rep(0, times = length(dis.controls))
length(controls)
T2D <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
# distinct() collapses repeated (FID, IID, status) rows before writing
T2D.df <- data.frame(FID, IID, T2D) %>% distinct()
write.table(T2D.df, 'disease_table/t2d.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Obesity

In [22]:
# Phecodes listed as exclusion criteria for phecode 278.1
dis.excl.codes <- phecode.excl %>%
  filter(code == '278.1') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- filter(target.icd, phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 278.1 (one entry per row)
dis.cases <- target.icd %>%
  filter(phecode == '278.1') %>%
  pull(sample)

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, times = length(dis.cases))
length(cases)
controls <- rep(0, times = length(dis.controls))
length(controls)
OBS <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
# distinct() collapses repeated (FID, IID, status) rows before writing
OBS.df <- data.frame(FID, IID, OBS) %>% distinct()
write.table(OBS.df, 'disease_table/obs.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

### Cardiovascular diseases

#### Ischemic heart disease

In [24]:
# Diagnosis rows whose phecode matches the pattern '411', and their sample ids
# (one entry per row)
dis.cases.p <- filter(target.icd, str_detect(phecode, '411'))
dis.cases <- dis.cases.p$sample

# Controls: every target sample that is not a case (no exclusion list is applied here)
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, times = length(dis.cases))
length(cases)
controls <- rep(0, times = length(dis.controls))
length(controls)
ISC <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
# distinct() collapses repeated (FID, IID, status) rows before writing
ISC.df <- distinct(data.frame(FID, IID, ISC))
write.table(ISC.df, 'disease_table/isc.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

In [25]:
# Dimensions (rows, columns) of the ISC table restricted to rows with status 1
dim(ISC.df[ISC.df$ISC == 1, , drop = FALSE])

#### Angina pectoris

In [26]:
# Phecodes listed as exclusion criteria for phecode 411.3
dis.excl.codes <- phecode.excl %>%
  filter(code == '411.3') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 411.3; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '411.3')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
ANG <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
ANG.df <- data.frame(FID, IID, ANG)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
ANG.df <- distinct(ANG.df)
write.table(ANG.df, 'disease_table/ang.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Myocardial infarction

In [28]:
# Phecodes listed as exclusion criteria for phecode 411.2
dis.excl.codes <- phecode.excl %>%
  filter(code == '411.2') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 411.2; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '411.2')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
MYO <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
MYO.df <- data.frame(FID, IID, MYO)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
MYO.df <- distinct(MYO.df)
write.table(MYO.df, 'disease_table/myo.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Coronary atherosclerosis

In [29]:
# Phecodes listed as exclusion criteria for phecode 411.4
dis.excl.codes <- phecode.excl %>%
  filter(code == '411.4') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 411.4; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '411.4')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
CAD <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
CAD.df <- data.frame(FID, IID, CAD)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
CAD.df <- distinct(CAD.df)
write.table(CAD.df, 'disease_table/cad.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Hypertension

In [31]:
# Phecodes listed as exclusion criteria for phecode 401.1, with 571.81 removed
# from that list.
# NOTE(review): the original second filter tested `code`, which can never equal
# '571.81' once rows are restricted to code == '401.1', so it removed nothing.
# It is applied to `exclusion_criteria` here, which appears to be the intent
# (do not exclude 571.81 carriers from the controls) — please confirm.
dis.excl.codes <- phecode.excl %>%
  filter(code == '401.1', !exclusion_criteria %in% '571.81') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 401.1; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '401.1')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
HYP <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
HYP.df <- data.frame(FID, IID, HYP)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
HYP.df <- distinct(HYP.df)
write.table(HYP.df, 'disease_table/hyp.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

### Musculoskeletal disorders

#### Low back pain (Dorsopathy)

In [33]:
# Phecodes listed as exclusion criteria for phecode 721
dis.excl.codes <- phecode.excl %>%
  filter(code == '721') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 721; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '721')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
BPAIN <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
BPAIN.df <- data.frame(FID, IID, BPAIN)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
BPAIN.df <- distinct(BPAIN.df)
write.table(BPAIN.df, 'disease_table/bpain.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Neck pain (Cervicalgia)

In [35]:
# Phecodes listed as exclusion criteria for phecode 761
dis.excl.codes <- phecode.excl %>%
  filter(code == '761') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 761; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '761')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
NPAIN <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
NPAIN.df <- data.frame(FID, IID, NPAIN)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
NPAIN.df <- distinct(NPAIN.df)
write.table(NPAIN.df, 'disease_table/npain.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Osteoarthritis

In [36]:
# Diagnosis rows whose phecode matches the pattern '740', and their sample ids
# (one entry per row)
dis.cases.p <- filter(target.icd, str_detect(phecode, '740'))
dis.cases <- dis.cases.p$sample

# Controls: every target sample that is not a case (no exclusion list is applied here)
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, times = length(dis.cases))
length(cases)
controls <- rep(0, times = length(dis.controls))
length(controls)
OART <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
# distinct() collapses repeated (FID, IID, status) rows before writing
OART.df <- distinct(data.frame(FID, IID, OART))
write.table(OART.df, 'disease_table/oart.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

### Immune disorders

#### Rheumatoid arthritis

In [38]:
# Phecodes listed as exclusion criteria for phecode 714.1
dis.excl.codes <- phecode.excl %>%
  filter(code == '714.1') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 714.1; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '714.1')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
RA <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
RA.df <- data.frame(FID, IID, RA)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
RA.df <- distinct(RA.df)
write.table(RA.df, 'disease_table/ra.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

### Respiratory diseases

#### Chronic obstructive pulmonary disease (COPD)

In [40]:
# Phecodes listed as exclusion criteria for any of 496, 496.1, 496.2, 496.21
dis.excl.codes <- phecode.excl %>%
  filter(code %in% c('496', '496.1', '496.2', '496.21')) %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to one of the four phecodes; a sample
# with several matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode %in% c('496', '496.1', '496.2', '496.21'))
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
COPD <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
COPD.df <- data.frame(FID, IID, COPD)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
COPD.df <- distinct(COPD.df)
write.table(COPD.df, 'disease_table/copd.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

### Mental illnesses

#### Anxiety

In [41]:
# Phecodes listed as exclusion criteria for either of 300.1, 300.11
dis.excl.codes <- phecode.excl %>%
  filter(code %in% c('300.1', '300.11')) %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 300.1 or 300.11; a sample with
# several matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode %in% c('300.1', '300.11'))
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
ANX <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
ANX.df <- data.frame(FID, IID, ANX)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
ANX.df <- distinct(ANX.df)
write.table(ANX.df, 'disease_table/anx.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Depression

In [42]:
# Phecodes listed as exclusion criteria for phecode 296.2
dis.excl.codes <- phecode.excl %>%
  filter(code == '296.2') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- filter(target.icd, phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 296.2 (one entry per row)
dis.cases <- target.icd %>%
  filter(phecode == '296.2') %>%
  pull(sample)

# Controls: target samples that are neither cases nor carry an exclusion phecode.
# No phenotype table is built or written in this cell.
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

In [43]:
# Display every diagnosis row mapped to phecode 296.2 (full rows, not just sample ids)
dis.cases <- filter(target.icd, phecode == '296.2')
dis.cases

sample,code,meaning,icd,phecode,description,group
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>


### Infectious diseases

#### Influenza and pneumonia

In [45]:
# Phecodes listed as exclusion criteria for phecode 480
dis.excl.codes <- phecode.excl %>%
  filter(code == '480') %>%
  pull(exclusion_criteria)

# Diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.icd %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every diagnosis row mapped to 480; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.icd %>% filter(phecode == '480')
dis.cases <- dis.cases$sample

# Controls: target samples that are neither cases nor carry an exclusion phecode
dis.controls <- target.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
FLU <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
FLU.df <- data.frame(FID, IID, FLU)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# ischemic heart disease tables do) so the .pheno file has no duplicated ids
FLU.df <- distinct(FLU.df)
write.table(FLU.df, 'disease_table/flu.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

### Cancer

#### Prostate cancer (males)

In [46]:
# Restrict the mapped diagnoses to samples listed in `males`, and keep the
# male FID table under the name used for control selection
target.males <- filter(target.icd, sample %in% males$FID)
males.ids <- males

In [47]:
# Phecodes listed as exclusion criteria for phecode 185
dis.excl.codes <- phecode.excl %>%
  filter(code == '185') %>%
  pull(exclusion_criteria)

# Male diagnosis rows carrying any exclusion phecode, and the samples they belong to
dis.excl.ind <- target.males %>% filter(phecode %in% dis.excl.codes)
dis.excl.samples <- dis.excl.ind$sample

# Sample ids of every male diagnosis row mapped to 185; a sample with several
# matching rows appears several times in this vector
dis.cases <- target.males %>% filter(phecode == '185')
dis.cases <- dis.cases$sample

# Controls: samples in males.ids that are neither cases nor carry an exclusion phecode
dis.controls <- males.ids %>%
  filter(!FID %in% dis.cases, !FID %in% dis.excl.samples) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
PROST <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
PROST.df <- data.frame(FID, IID, PROST)
# Collapse repeated case rows to one row per sample (as the T2D, obesity and
# breast cancer tables do) so the .pheno file has no duplicated ids
PROST.df <- distinct(PROST.df)
write.table(PROST.df, 'disease_table/prost.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)

#### Breast cancer (females)

In [48]:
# Restrict the mapped diagnoses to samples listed in `females`, and keep the
# female FID table under the name used for control selection
target.females <- filter(target.icd, sample %in% females$FID)
females.ids <- females

In [49]:
# Female diagnosis rows whose phecode is 174.1 or one of its sub-codes
# (e.g. 174.11). The pattern is anchored at the start and the dot is escaped:
# the former pattern '174.1' was unanchored and its '.' matched any character,
# so it could also match unrelated codes containing that shape.
dis.cases.p <- target.females %>% filter(str_detect(phecode, '^174\\.1'))
dis.cases <- dis.cases.p$sample

# Controls: every sample in females.ids that is not a case
# (no exclusion list is applied here)
dis.controls <- females.ids %>%
  filter(!FID %in% dis.cases) %>%
  pull(FID)

# Cases and controls: status 1 for cases, 0 for controls
cases <- rep(1, length(dis.cases))
length(cases)
controls <- rep(0, length(dis.controls))
length(controls)
BREAST <- c(cases, controls)
FID <- c(dis.cases, dis.controls)
IID <- FID
BREAST.df <- data.frame(FID, IID, BREAST)
# One row per sample: a case with several matching diagnosis rows is kept once
BREAST.df <- BREAST.df %>% distinct()
write.table(BREAST.df, 'disease_table/breast.pheno', col.names = TRUE, row.names = FALSE, quote = FALSE)