# Format GWAS summary-level data

In [6]:
library(MRAPSS)
library(readr)

In [4]:
## 1. AD
AD_raw <- readr::read_delim("./GWAS_26and5_raw/AD", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)
AD = format_data(AD_raw,
                 snp_col = "MarkerName",
                 b_col = "Beta",
                 se_col = "SE",
                 A1_col = "Effect_allele",
                 A2_col = "Non_Effect_allele",
                 p_col = "Pvalue",
                 n = 54162)

write.table(AD, "./GWAS_26and5_formatted/AD", sep="\t", quote = F, row.names = F, col.names = T)

rm(list=c("AD","AD_raw"))

[1m[1mRows: [1m[22m[34m[34m7055881[34m[39m [1m[1mColumns: [1m[22m[34m[34m8[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): MarkerName, Effect_allele, Non_Effect_allele
[32mdbl[39m (5): Chromosome, Position, Beta, SE, Pvalue


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 7055881 dat lines

Remove ambiguous SNPs ..., remaining 6004428 SNPs.

Remove SNPs in MHC region ..., remaining 5993962 SNPs.

Remove duplicated SNPs ..., remaining 5993414 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1148529 SNPs.

Remove SNPs with alleles not matched wi

In [5]:
## 2. Alcohol
Alcohol_raw <- readr::read_delim("./GWAS_26and5_raw/Alcohol", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

Alcohol = format_data(Alcohol_raw,
                      snp_col = "MarkerName",
                      b_col = "Beta",
                      se_col = "SE",
                      freq_col = "EAF_A1",
                      A1_col = "A1",
                      A2_col = "A2",
                      p_col = "Pval",
                      n = 414343)

write.table(Alcohol, "./GWAS_26and5_formatted/Alcohol", sep="\t", quote = F, row.names = F, col.names = T)

rm(list=c("Alcohol_raw","Alcohol"))

[1m[1mRows: [1m[22m[34m[34m11514936[34m[39m [1m[1mColumns: [1m[22m[34m[34m9[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): MarkerName, A1, A2
[32mdbl[39m (6): CHR, POS, EAF_A1, Beta, SE, Pval


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 11514936 dat lines

Remove ambiguous SNPs ..., remaining 9791370 SNPs.

Remove SNPs in MHC region ..., remaining 9742990 SNPs.

Remove duplicated SNPs ..., remaining 9661810 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1198814 SNPs.

Remove SNPs with alleles not matched with the hapmap3 snplist, remain

In [6]:
## 3. Angina
Angina_raw <- readr::read_delim("./GWAS_26and5_raw/Angina", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

Angina = format_data(Angina_raw,
                     snp_col = "SNPID_UKB",
                     or_col = "OR",
                     se_col = "SE",
                     freq_col = "MAF_UKB",
                     A1_col = "A1",
                     A2_col = "A2",
                     p_col = "P",
                     n_col = "NMISS",
                     info_col = "INFO_UKB")

write.table(Angina, "./GWAS_26and5_formatted/Angina", sep="\t", quote = F, row.names = F, col.names = T)

rm(list=c("Angina_raw","Angina"))

[1m[1mRows: [1m[22m[34m[34m10321705[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, OR, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UKB,...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10321705 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10321675 SNPs.

Remove ambiguous SNPs ..., remaining 8761299 SNPs.

Remove SNPs in MHC region ..., remaining 8711402 SNPs.

Remove duplicated SNPs ..., remaining 871140

In [7]:
## 4. Anorexia
Anorexia_raw <- readr::read_delim("./GWAS_26and5_raw/Anorexia", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

Anorexia = format_data(Anorexia_raw,
                       snp_col = "ID",
                       b_col = "BETA",
                       se_col = "SE",
                       A1_col = "ALT",
                       A2_col = "REF",
                       p_col = "PVAL",
                       ncase_col  = "NCAS",
                       ncontrol_col = "NCON",
                       info_col = "IMPINFO")

write.table(Anorexia, "./GWAS_26and5_formatted/Anorexia", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("Anorexia_raw","Anorexia"))


[1m[1mRows: [1m[22m[34m[34m8219102[34m[39m [1m[1mColumns: [1m[22m[34m[34m14[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (4): ID, REF, ALT, DIRE
[32mdbl[39m (10): CHROM, POS, BETA, SE, PVAL, NGT, IMPINFO, NEFFDIV2, NCAS, NCON


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 8219102 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 5880021 SNPs.

effect allele column has some values that are not A/C/T/G. Remove these SNPs..., remaining 5880020 SNPs.

the other allele column has some values that are not A/C/T/G. Remove these SNPs...

In [8]:
## 5. ASD
ASD_raw <- readr::read_delim("./GWAS_26and5_raw/ASD", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

ASD = format_data(ASD_raw,
                  snp_col = "snp",
                  b_col = "b",
                  se_col = "StdErr",
                  freq_col = "freq_A1",
                  A1_col = "A1",
                  A2_col = "A2",
                  p_col = "p",
                  n_col = "N")

write.table(ASD, "./GWAS_26and5_formatted/ASD", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("ASD_raw","ASD"))

[1m[1mRows: [1m[22m[34m[34m6517324[34m[39m [1m[1mColumns: [1m[22m[34m[34m15[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (4): snp, A1, A2, direction
[32mdbl[39m (11): chr, bp_hg19, or, tor_l95, or_u95, b, se, p, freq_A1, info, N


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 6517324 dat lines

effect allele column has some values that are not A/C/T/G. Remove these SNPs..., remaining 6517323 SNPs.

the other allele column has some values that are not A/C/T/G. Remove these SNPs..., remaining 6517322 SNPs.

Remove ambiguous SNPs ..., remaining 5593524 SN

In [9]:
## 6. BMI
BMI_raw <- readr::read_delim("./GWAS_26and5_raw/BMI", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

BMI = format_data(BMI_raw,
                  snp_col = "SNPID_UKB",
                  b_col = "BETA",
                  se_col = "SE",
                  freq_col = "MAF_UKB",
                  A1_col = "A1",
                  A2_col = "A2",
                  p_col = "P",
                  n_col = "NMISS",
                  info_col = "INFO_UKB")

write.table(BMI, "./GWAS_26and5_formatted/BMI", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("BMI_raw","BMI"))

[1m[1mRows: [1m[22m[34m[34m10599054[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, BETA, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UK...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10599054 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10599018 SNPs.

Remove ambiguous SNPs ..., remaining 8995068 SNPs.

Remove SNPs in MHC region ..., remaining 8944044 SNPs.

Remove duplicated SNPs ..., remaining 894404

In [10]:
## 7. CAD
CAD_raw <- readr::read_delim("./GWAS_26and5_raw/CAD", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

CAD = format_data(CAD_raw,
                  snp_col = "markername",
                  b_col = "beta",
                  se_col = "se",
                  freq_col = "effect_allele_freq",
                  A1_col = "effect_allele",
                  A2_col = "noneffect_allele",
                  p_col = "p_dgc",
                  n = 184305)

write.table(CAD, "./GWAS_26and5_formatted/CAD", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("CAD_raw","CAD"))

[1m[1mRows: [1m[22m[34m[34m9455778[34m[39m [1m[1mColumns: [1m[22m[34m[34m13[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): markername, effect_allele, noneffect_allele, model
[32mdbl[39m (9): chr, bp_hg19, effect_allele_freq, median_info, beta, se_dgc, p_dgc,...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 9455778 dat lines

effect allele column has some values that are not A/C/T/G. Remove these SNPs..., remaining 9455777 SNPs.

the other allele column has some values that are not A/C/T/G. Remove these SNPs..., remaining 9455776 SNPs.

Remove ambig

In [7]:
## 8. CD (Crohn Disease)
# CD_rsid: CD annotated with rs-number and freq

CD_raw <- readr::read_delim("./GWAS_26and5_raw/CD_rsid", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

CD = format_data(CD_raw,
                 snp_col = "rsid",
                 b_col = "Effect",
                 se_col = "StdErr",
                 freq_col = "freq",
                 A1_col = "Allele1",
                 A2_col = "Allele2",
                 p_col = "P.value",
                 n = 40266)

write.table(CD, "./GWAS_26and5_formatted/CD", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("CD_raw","CD"))

[1m[1mRows: [1m[22m[34m[34m7055380[34m[39m [1m[1mColumns: [1m[22m[34m[34m7[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): rsid, Allele1, Allele2
[32mdbl[39m (4): freq, Effect, StdErr, P.value


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 7055380 dat lines

Remove ambiguous SNPs ..., remaining 6012676 SNPs.

Remove SNPs in MHC region ..., remaining 5983174 SNPs.

Remove duplicated SNPs ..., remaining 5979640 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1108128 SNPs.

Remove SNPs with alleles not matched with the hapmap3 snplist, remaini

In [12]:
## 9. DaytimeSleepiness
dat_raw <- readr::read_delim("./GWAS_26and5_raw/DaytimeSleepiness", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

dat = format_data(dat_raw,
                  snp_col = "SNP",
                  b_col = "BETA",
                  se_col = "SE",
                  freq_col = "A1FREQ",
                  A1_col = "ALLELE1",
                  A2_col = "ALLELE0",
                  info_col = "INFO",
                  p_col = "P",
                  n = 452071)

write.table(dat, "./GWAS_26and5_formatted/Daytime_Sleepiness",  sep="\t", quote = F, row.names = F, col.names = T)

rm(list=c("dat_raw","dat"))

[1m[1mRows: [1m[22m[34m[34m14661601[34m[39m [1m[1mColumns: [1m[22m[34m[34m10[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): SNP, ALLELE1, ALLELE0
[32mdbl[39m (7): CHR, BP, A1FREQ, INFO, BETA, SE, P


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 14661601 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10903657 SNPs.

Remove ambiguous SNPs ..., remaining 9250294 SNPs.

Remove SNPs in MHC region ..., remaining 9202639 SNPs.

Remove duplicated SNPs ..., remaining 9196675 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 

In [13]:
## 10. Depression
dat_raw <- readr::read_delim("./GWAS_26and5_raw/Depression", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

dat = MRAPSS::format_data(dat_raw,
                          snp_col = "RSID",
                          z_col = "Z",
                          freq_col = "MAF_UKB",
                          A1_col = "A1",
                          A2_col = "A2",
                          p_col = "P",
                          n_col = "N",
                          info_col = "INFO_UKB")

write.table(dat, "./GWAS_26and5_formatted/Depression", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("dat_raw","dat"))

[1m[1mRows: [1m[22m[34m[34m10886529[34m[39m [1m[1mColumns: [1m[22m[34m[34m12[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): RSID, SNP, A1, A2
[32mdbl[39m (8): CHR, POS, EAF_UKB, MAF_UKB, Z, P, N, INFO_UKB


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10886529 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10847393 SNPs.

Remove ambiguous SNPs ..., remaining 9204753 SNPs.

Remove SNPs in MHC region ..., remaining 9152908 SNPs.

Remove duplicated SNPs ..., remaining 9152468 SNPs.

Merge SNPs with the hapmap3 snplist ..., rem

In [14]:
## 11-14. Hair_Black, Hair_Blonde, Hair_Dark_Brown, Hair_Light_Brown
for (trait in c("Hair_Black", "Hair_Blonde", "Hair_Dark_Brown", "Hair_Light_Brown")){
  
  dat_raw <- readr::read_delim(paste0("./GWAS_26and5_raw/", trait), delim = "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)
  
  dat = format_data(dat_raw,
                    snp_col = "SNPID_UKB",
                    or_col = "OR",
                    se_col = "SE",
                    freq_col = "MAF_UKB",
                    A1_col = "A1",
                    A2_col = "A2",
                    p_col = "P",
                    n_col = "NMISS",
                    info_col = "INFO_UKB")
  
  write.table(dat, paste0("./GWAS_26and5_formatted/", trait), sep="\t", quote = F, row.names = F, col.names = T)
  
  rm(list=c("dat_raw", "dat"))
  
}


[1m[1mRows: [1m[22m[34m[34m10599054[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, OR, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UKB,...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10599054 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10599018 SNPs.

Remove ambiguous SNPs ..., remaining 8995068 SNPs.

Remove SNPs in MHC region ..., remaining 8944044 SNPs.

Remove duplicated SNPs ..., remaining 894404

In [15]:
## 15. HBP (High Blood Pressure)

HBP_raw <- readr::read_delim("./GWAS_26and5_raw/HBP", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

HBP = MRAPSS::format_data(HBP_raw,
                          snp_col = "SNPID_UKB",
                          or_col = "OR",
                          se_col = "SE",
                          freq_col = "MAF_UKB",
                          A1_col = "A1",
                          A2_col = "A2",
                          p_col = "P",
                          n_col = "NMISS",
                          info_col = "INFO_UKB")

write.table(HBP, "./GWAS_26and5_formatted/HBP", sep="\t", quote = F, row.names = F, col.names = T)

rm(list=c("HBP_raw", "HBP"))

[1m[1mRows: [1m[22m[34m[34m10599054[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, OR, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UKB,...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10599054 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10599018 SNPs.

Remove ambiguous SNPs ..., remaining 8995068 SNPs.

Remove SNPs in MHC region ..., remaining 8944044 SNPs.

Remove duplicated SNPs ..., remaining 894404

In [8]:
## 16. Height (GIANT)

Height_raw <- readr::read_delim("./GWAS_26and5_raw/Height_GIANT", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)
Height_raw = Height_raw[!is.na(Height_raw$Freq.Allele1.HapMapCEU), ]
Height = format_data(Height_raw,
                     snp_col = "MarkerName",
                     b_col = "b",
                     se_col = "SE",
                     freq_col = "Freq.Allele1.HapMapCEU",
                     A1_col = "Allele1",
                     A2_col = "Allele2",
                     p_col = "p",
                     n_col = "N")

write.table(Height, "./GWAS_26and5_formatted/Height_GIANT", sep="\t", quote = F, row.names = F, col.names = T)

rm(list=c("Height_raw","Height"))

[1m[1mRows: [1m[22m[34m[34m2550858[34m[39m [1m[1mColumns: [1m[22m[34m[34m8[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): MarkerName, Allele1, Allele2
[32mdbl[39m (5): Freq.Allele1.HapMapCEU, b, SE, p, N


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 2547281 dat lines

Remove ambiguous SNPs ..., remaining 2154206 SNPs.

Remove SNPs in MHC region ..., remaining 2143731 SNPs.

Remove duplicated SNPs ..., remaining 2143731 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1091189 SNPs.

Remove SNPs with alleles not matched with the hapmap3 snpl

In [17]:
## 17. Height (UKBB)

Height_UKB_raw <- readr::read_delim("./GWAS_26and5_raw/Height_UKB", "\t", escape_double = FALSE,
                                    trim_ws = TRUE, progress = F)

Height_UKB = format_data(Height_UKB_raw,
                         snp_col = "SNPID_UKB",
                         b_col = "BETA",
                         se_col = "SE",
                         freq_col = "MAF_UKB",
                         A1_col = "A1",
                         A2_col = "A2",
                         p_col = "P",
                         n_col = "NMISS",
                         info_col = "INFO_UKB")

write.table(Height_UKB, "./GWAS_26and5_formatted/Height_UKB", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("Height_UKB_raw","Height_UKB"))

[1m[1mRows: [1m[22m[34m[34m10599054[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, BETA, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UK...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10599054 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10599018 SNPs.

Remove ambiguous SNPs ..., remaining 8995068 SNPs.

Remove SNPs in MHC region ..., remaining 8944044 SNPs.

Remove duplicated SNPs ..., remaining 894404

In [9]:
# 18. IBD (Inflammatory Bowel Disease)

IBD_raw <- readr::read_delim("./GWAS_26and5_raw/IBD_rsid", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

IBD = format_data(IBD_raw,
                  snp_col = "rsid",
                  b_col = "Effect",
                  se_col = "StdErr",
                  freq_col = "freq",
                  A1_col = "Allele1",
                  A2_col = "Allele2",
                  p_col = "P.value",
                  n = 59957)

write.table(IBD, "./GWAS_26and5_formatted/IBD", sep="\t", quote = F, row.names = F, col.names = T)

rm(list=c("IBD_raw","IBD"))

[1m[1mRows: [1m[22m[34m[34m7059162[34m[39m [1m[1mColumns: [1m[22m[34m[34m7[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): rsid, Allele1, Allele2
[32mdbl[39m (4): freq, Effect, StdErr, P.value


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 7059162 dat lines

Remove ambiguous SNPs ..., remaining 6015921 SNPs.

Remove SNPs in MHC region ..., remaining 5986496 SNPs.

Remove duplicated SNPs ..., remaining 5982838 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1108191 SNPs.

Remove SNPs with alleles not matched with the hapmap3 snplist, remaini

In [19]:
## 19. Income

Income_raw <- readr::read_delim("./GWAS_26and5_raw/Income", " ", escape_double = FALSE, trim_ws = TRUE, progress = F)

Income = format_data(Income_raw,
                     snp_col = "SNP",
                     b_col = "Beta",
                     se_col = "Standard_Error_of_Beta",
                     A1_col = "Effect_Allele",
                     A2_col = "Non_effect_Allele",
                     p_col = "P",
                     n=286301)

write.table(Income, "./GWAS_26and5_formatted/Income", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("Income_raw","Income"))

[1m[1mRows: [1m[22m[34m[34m18485882[34m[39m [1m[1mColumns: [1m[22m[34m[34m8[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m " "
[31mchr[39m (3): SNP, Non_effect_Allele, Effect_Allele
[32mdbl[39m (5): Chr, BPos, Beta, Standard_Error_of_Beta, P


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 18485882 dat lines

Remove ambiguous SNPs ..., remaining 15709024 SNPs.

Remove SNPs in MHC region ..., remaining 15648261 SNPs.

Remove duplicated SNPs ..., remaining 15648261 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1208817 SNPs.

Remove SNPs with alleles not matched w

In [20]:
## 20. Insomnia

Insomnia_raw <- readr::read_delim("./GWAS_26and5_raw/Insomnia", " ", escape_double = FALSE, trim_ws = TRUE, progress = F)

Insomnia = format_data(Insomnia_raw,
                       snp_col = "SNP",
                       b_col = "BETA_INSOMNIA",
                       se_col = "SE_INSOMNIA",
                       freq_col = "A1FREQ",
                       A1_col = "ALLELE1",
                       A2_col = "ALLELE0",
                       info_col = "INFO",
                       p_col = "P_INSOMNIA",
                       n = 453379)

write.table(Insomnia, "./GWAS_26and5_formatted/Insomnia", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("Insomnia_raw","Insomnia"))

[1m[1mRows: [1m[22m[34m[34m14661601[34m[39m [1m[1mColumns: [1m[22m[34m[34m10[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m " "
[31mchr[39m (3): SNP, ALLELE1, ALLELE0
[32mdbl[39m (7): CHR, BP, A1FREQ, INFO, BETA_INSOMNIA, SE_INSOMNIA, P_INSOMNIA


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 14661601 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10903657 SNPs.

Remove ambiguous SNPs ..., remaining 9250294 SNPs.

Remove SNPs in MHC region ..., remaining 9202639 SNPs.

Remove duplicated SNPs ..., remaining 9196675 SNPs.

Merge SNPs with the hapma

In [21]:
## 21. Intelligence

Intelligence_raw <- readr::read_delim("./GWAS_26and5_raw/Intelligence", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

Intelligence = format_data(Intelligence_raw,
                           snp_col = "SNPID_UKB",
                           b_col = "BETA",
                           se_col = "SE",
                           freq_col = "MAF_UKB",
                           A1_col = "A1",
                           A2_col = "A2",
                           p_col = "P",
                           n_col = "NMISS",
                           info_col = "INFO_UKB")

write.table(Intelligence, "./GWAS_26and5_formatted/Intelligence", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("Intelligence_raw","Intelligence"))

[1m[1mRows: [1m[22m[34m[34m9561902[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, BETA, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UK...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 9561902 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 9561873 SNPs.

Remove ambiguous SNPs ..., remaining 8119324 SNPs.

Remove SNPs in MHC region ..., remaining 8072530 SNPs.

Remove duplicated SNPs ..., remaining 8072530 S

In [22]:
## 22. MDD
MDD_raw <- readr::read_delim("./GWAS_26and5_raw/MDD", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

MDD = MRAPSS::format_data(MDD_raw,
                          snp_col = "SNPID_UKB",
                          or_col = "OR",
                          se_col = "SE",
                          freq_col = "MAF_UKB",
                          A1_col = "A1",
                          A2_col = "A2",
                          p_col = "P",
                          n_col = "NMISS",
                          info_col = "INFO_UKB")

write.table(MDD, "./GWAS_26and5_formatted/MDD", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("MDD_raw","MDD"))

[1m[1mRows: [1m[22m[34m[34m10154467[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, OR, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UKB,...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10154467 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10154437 SNPs.

Remove ambiguous SNPs ..., remaining 8620374 SNPs.

Remove SNPs in MHC region ..., remaining 8571258 SNPs.

Remove duplicated SNPs ..., remaining 857125

In [23]:
## 23. NEB
NEB_raw <- readr::read_delim("./GWAS_26and5_raw/NEB", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

NEB = MRAPSS::format_data(NEB_raw,
                          snp_col = "SNPID",
                          freq_col = "Freq_HapMap",
                          A1_col = "A1",
                          A2_col = "A2",
                          z_col = "Zscore",
                          p_col = "Pvalue",
                          n = 343072)

write.table(NEB, file = "./GWAS_26and5_formatted/NEB", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("NEB_raw","NEB"))

[1m[1mRows: [1m[22m[34m[34m2474037[34m[39m [1m[1mColumns: [1m[22m[34m[34m8[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): SNPID, A1, A2
[32mdbl[39m (5): CHR, POS, Freq_HapMap, Zscore, Pvalue


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 2474037 dat lines

Remove ambiguous SNPs ..., remaining 2092463 SNPs.

Remove SNPs in MHC region ..., remaining 2089374 SNPs.

Remove duplicated SNPs ..., remaining 2089374 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1078165 SNPs.

Generating sample size from specified sample size

Remove SNPs with p-v

In [24]:
## 24. Neuroticism
dat_raw <- readr::read_delim("./GWAS_26and5_raw/Neuroticism", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

dat = MRAPSS::format_data(dat_raw,
                          snp_col = "SNPID_UKB",
                          b_col = "BETA",
                          se_col = "SE",
                          freq_col = "MAF_UKB",
                          A1_col = "A1",
                          A2_col = "A2",
                          p_col = "P",
                          n_col = "NMISS",
                          info_col = "INFO_UKB")

write.table(dat, "./GWAS_26and5_formatted/Neuroticism", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("dat_raw","dat"))

[1m[1mRows: [1m[22m[34m[34m10399545[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, BETA, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UK...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10399545 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10399514 SNPs.

Remove ambiguous SNPs ..., remaining 8826898 SNPs.

Remove SNPs in MHC region ..., remaining 8776612 SNPs.

Remove duplicated SNPs ..., remaining 877661

In [14]:
## 25. RA
# RA_freq: RA annotated with a column of freq
# Note that the freq_col is not a required column. If freq_col is not available, it will skip the step of QC with freq.

RA_raw <- readr::read_delim("./GWAS_26and5_raw/RA_freq", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

RA = format_data(RA_raw,
                  snp_col = "SNPID",
                  or_col = "OR(A1)",
                  A1_col = "A1",
                  A2_col = "A2",
                  freq_col = "freq",
                  p_col = "P-val",
                  n = 58284)

write.table(RA, "./GWAS_26and5_formatted/RA", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("RA_raw","RA"))


[1m[1mRows: [1m[22m[34m[34m7022577[34m[39m [1m[1mColumns: [1m[22m[34m[34m10[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): SNPID, A1, A2
[32mdbl[39m (7): freq, Chr, Position(hg19), OR(A1), OR_95%CIlow, OR_95%CIup, P-val


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 7022577 dat lines

Remove ambiguous SNPs ..., remaining 5963060 SNPs.

Remove SNPs in MHC region ..., remaining 5935712 SNPs.

Remove duplicated SNPs ..., remaining 5935712 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1131678 SNPs.

infer b column from log(or)...

Generating

In [26]:
## 26. SCZ

SCZ_raw <- readr::read_delim("./GWAS_26and5_raw/SCZ_rsid", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

SCZ = format_data(SCZ_raw,
                  snp_col = "rsid",
                  or_col = "OR",
                  se_col = "SE",
                  freq_col = "Freq.A1",
                  A1_col = "A1",
                  A2_col = "A2",
                  p_col = "P",
                  n=105318)

write.table(SCZ, "./GWAS_26and5_formatted/SCZ", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("SCZ_raw","SCZ"))

[1m[1mRows: [1m[22m[34m[34m7480977[34m[39m [1m[1mColumns: [1m[22m[34m[34m10[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): rsid, SNP, A1, A2
[32mdbl[39m (6): Freq.A1, CHR, BP, OR, SE, P


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 7480977 dat lines

Remove ambiguous SNPs ..., remaining 6341312 SNPs.

Remove SNPs in MHC region ..., remaining 6315985 SNPs.

Remove duplicated SNPs ..., remaining 6315867 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1145148 SNPs.

infer b column from log(or)...

Generating sample size from specified sample

In [11]:
## 27. Smoking
Smoking_raw <- readr::read_delim("./GWAS_26and5_raw/Smoking", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)
Smoking_raw = Smoking_raw[!is.na(Smoking_raw$AF), ]
Smoking = format_data(Smoking_raw,
                      snp_col = "RSID",
                      b_col = "BETA",
                      se_col = "SE",
                      freq_col = "AF",
                      A1_col = "ALT",
                      A2_col = "REF",
                      p_col = "PVALUE",
                      n_col = "N")

write.table(Smoking , "./GWAS_26and5_formatted/Smoking", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("Smoking_raw","Smoking"))


[1m[1mRows: [1m[22m[34m[34m13933175[34m[39m [1m[1mColumns: [1m[22m[34m[34m13[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (3): RSID, REF, ALT
[32mdbl[39m (10): CHROM, POS, AF, STAT, PVALUE, BETA, SE, N, EFFECTIVE_N, Number_of_...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 12168248 dat lines

Remove ambiguous SNPs ..., remaining 10357018 SNPs.

Remove SNPs in MHC region ..., remaining 10306304 SNPs.

Remove duplicated SNPs ..., remaining 10298519 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1189942 SNPs.

Remove SNPs with p-value < 0 o

In [28]:
## 28. SWB

SWB_raw <- readr::read_delim("./GWAS_26and5_raw/SWB", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

SWB = MRAPSS::format_data(SWB_raw,
                          snp_col = "MarkerName",
                          freq_col = "EAF",
                          A1_col = "A1",
                          A2_col = "A2",
                          b_col = "Beta",
                          se_col = "SE",
                          p_col = "Pval",
                          n = 298420)

write.table(SWB, file = "./GWAS_26and5_formatted/SWB", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("SWB_raw","SWB"))

[1m[1mRows: [1m[22m[34m[34m2268674[34m[39m [1m[1mColumns: [1m[22m[34m[34m9[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): MarkerName, A1, A2
[32mdbl[39m (6): CHR, POS, EAF, Beta, SE, Pval


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 2268674 dat lines

Remove ambiguous SNPs ..., remaining 1918560 SNPs.

Remove SNPs in MHC region ..., remaining 1915787 SNPs.

Remove duplicated SNPs ..., remaining 1915787 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1018811 SNPs.

Generating sample size from specified sample size

Remove SNPs with p-valu

In [29]:
## 29. T2D

T2D_raw <- readr::read_delim("./GWAS_26and5_raw/T2D_rsid", "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

T2D = MRAPSS::format_data(T2D_raw,
                          snp_col = "rsid",
                          b_col = "Beta",
                          se_col = "SE",
                          freq_col = "EAF",
                          A1_col = "EA",
                          A2_col = "NEA",
                          p_col = "Pvalue",
                          n_col = "Neff")

write.table(T2D, "./GWAS_26and5_formatted/T2D", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("T2D_raw","T2D"))

[1m[1mRows: [1m[22m[34m[34m9073559[34m[39m [1m[1mColumns: [1m[22m[34m[34m11[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (3): rsid, EA, NEA
[32mdbl[39m  (7): Chr, Pos, EAF, Beta, SE, Pvalue, Neff
[34mtime[39m (1): SNP


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 9073559 dat lines

Remove ambiguous SNPs ..., remaining 7713357 SNPs.

Remove SNPs in MHC region ..., remaining 7671570 SNPs.

Remove duplicated SNPs ..., remaining 7663228 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1186472 SNPs.

Remove SNPs with p-value < 0 or p-value > 1,

In [30]:
## 30. Tanning

dat_raw <- readr::read_delim("./GWAS_26and5_raw/Tanning", delim = "\t", escape_double = FALSE, trim_ws = TRUE, progress = F)

dat = MRAPSS::format_data(dat_raw,
                          snp_col = "SNPID_UKB",
                          b_col = "BETA",
                          se_col = "SE",
                          freq_col = "MAF_UKB",
                          A1_col = "A1",
                          A2_col = "A2",
                          p_col = "P",
                          n_col = "NMISS",
                          info_col = "INFO_UKB")

write.table(dat, "./GWAS_26and5_formatted/Tanning", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("dat_raw", "dat"))

[1m[1mRows: [1m[22m[34m[34m10579925[34m[39m [1m[1mColumns: [1m[22m[34m[34m20[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (7): SNP, A1, TEST, A2, SNPID_UKB, A1_UKB, A2_UKB
[32mdbl[39m (13): CHR, BP, NMISS, BETA, SE, L95, U95, STAT, P, MAF, NCHROBS, INFO_UK...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 10579925 dat lines

Remove SNPs with imputation info less than 0.9 ..., remaining 10579890 SNPs.

Remove ambiguous SNPs ..., remaining 8978911 SNPs.

Remove SNPs in MHC region ..., remaining 8927942 SNPs.

Remove duplicated SNPs ..., remaining 892794

In [31]:
## 31. Urate

Urate_raw <- readr::read_delim("./GWAS_26and5_raw/Urate", ",", escape_double = FALSE, trim_ws = TRUE, progress = F)

Urate = format_data(Urate_raw,
                    snp_col = "MarkerName",
                    b_col = "beta",
                    se_col = "se",
                    A1_col = "A1",
                    A2_col = "A2",
                    p_col = "p_gc",
                    n_col = "n_total")

write.table(Urate, "./GWAS_26and5_formatted/Urate", sep = "\t", quote = F, row.names = F, col.names = T)

rm(list=c("Urate_raw","Urate"))

[1m[1mRows: [1m[22m[34m[34m2450547[34m[39m [1m[1mColumns: [1m[22m[34m[34m7[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): MarkerName, A1, A2
[32mdbl[39m (4): n_total, beta, se, p_gc


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

Begin formatting .... 

The raw data set has 2450547 dat lines

Remove ambiguous SNPs ..., remaining 2072342 SNPs.

Remove SNPs in MHC region ..., remaining 2062309 SNPs.

Remove duplicated SNPs ..., remaining 2062309 SNPs.

Merge SNPs with the hapmap3 snplist ..., remaining 1064399 SNPs.

Remove SNPs with alleles not matched with the hapmap3 snplist, remaining 1064363 