# Processing of UK Biobank data

In [1]:
using DataFrames
using CSV
using SnpArrays
using ProgressMeter

## Phenotypes

+ Master phenotype file: `/oak/stanford/groups/candes/ukbiobank/phenotypes/ukb25261.csv`
+ For description of column names, see `/oak/stanford/groups/candes/ukbiobank/phenotypes/ukb25261.html`

In [1]:
# Stage the master phenotype file on scratch; skipped when the copy is already there.
phenotype_file = "/scratch/groups/sabatti/ukb_phenotypes/ukb25261.csv"
isfile(phenotype_file) || let scratch_dir = dirname(phenotype_file)
    # the destination directory is the folder that holds `phenotype_file`
    mkpath(scratch_dir)
    run(`cp /oak/stanford/groups/candes/ukbiobank/phenotypes/ukb25261.csv $scratch_dir`)
end

"""
    build_y_for_disease_diagnosis(df, code_header)

Build a 0/1 response vector for a family of diagnosis codes.

Entry `i` of the returned `Vector{Float64}` is `1.0` when row `i` of column
`"41202-0.0"` of `df` holds a code that starts with `code_header`, and `0.0`
otherwise (including when the entry is `missing`).

# Arguments
- `df`: table indexable as `df[!, "41202-0.0"]`.
- `code_header`: prefix selecting the codes of interest, e.g. `"C"`.

# Returns
A `Vector{Float64}` with one entry per row of the diagnosis column.
"""
function build_y_for_disease_diagnosis(df, code_header)
    diagnoses = df[!, "41202-0.0"]
    # One pass over the column: a row is a case as soon as its code carries the
    # requested prefix. Matches from every code are kept (the indicator is never
    # reset between codes), so the result is the union over all matching codes.
    return Float64[
        !ismissing(code) && startswith(code, code_header) for code in diagnoses
    ]
end

# Read the raw phenotype table staged at `phenotype_file` into a DataFrame.
# Columns keep the names found in the file (e.g. `eid`, `21-0.0`, `84-0.1`; see the
# header printed below) and are looked up by those names in the cells that follow.
df = CSV.read(phenotype_file, DataFrame)

Row,eid,21-0.0,21-1.0,21-2.0,31-0.0,34-0.0,35-0.0,35-1.0,35-2.0,48-0.0,48-1.0,48-2.0,49-0.0,49-1.0,49-2.0,50-0.0,50-1.0,50-2.0,51-0.0,51-1.0,51-2.0,52-0.0,53-0.0,53-1.0,53-2.0,54-0.0,54-1.0,54-2.0,68-0.0,68-1.0,68-2.0,74-0.0,74-1.0,74-2.0,84-0.0,84-0.1,84-0.2,84-0.3,84-0.4,84-0.5,84-1.0,84-1.1,84-1.2,84-1.3,84-1.4,84-1.5,84-2.0,84-2.1,84-2.2,84-2.3,84-2.4,84-2.5,87-0.0,87-0.1,87-0.2,87-0.3,87-0.4,87-0.5,87-0.6,87-0.7,87-0.8,87-0.9,87-0.10,87-0.11,87-0.12,87-0.13,87-0.14,87-0.15,87-0.16,87-0.17,87-0.18,87-0.19,87-0.20,87-0.21,87-0.22,87-0.23,87-0.24,87-0.25,87-0.26,87-0.27,87-0.28,87-0.29,87-0.30,87-0.31,87-0.32,87-1.0,87-1.1,87-1.2,87-1.3,87-1.4,87-1.5,87-1.6,87-1.7,87-1.8,87-1.9,87-1.10,87-1.11,87-1.12,87-1.13,87-1.14,⋯
Unnamed: 0_level_1,Int64,Int64?,Int64?,Int64?,Int64,Int64,Int64?,Int64?,Int64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Int64,Date,Date,Date,Int64,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Missing,Missing,Int64?,Int64?,Int64?,Int64?,Int64?,Missing,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Missing,Missing,Missing,Missing,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,⋯
1,3831650,1,missing,missing,0,1944,1,missing,missing,89.0,missing,missing,101.0,missing,missing,152.0,missing,missing,131.0,missing,missing,9,2008-04-02,missing,missing,11004,missing,missing,6,missing,missing,4,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,40,2006,2004,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
2,1585880,1,missing,missing,1,1946,1,missing,missing,104.0,missing,missing,110.0,missing,missing,186.5,missing,missing,148.0,missing,missing,3,2010-02-04,missing,missing,11021,missing,missing,7,missing,missing,4,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,60,2007,2008,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
3,1636662,1,missing,1,1,1948,1,missing,1,87.0,missing,87.0,95.0,missing,95.0,173.0,missing,172.0,141.0,missing,140.0,1,2010-02-15,missing,2015-09-09,11016,missing,11025,7,missing,7,2,missing,3,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
4,2167238,1,missing,missing,1,1946,1,missing,missing,107.0,missing,missing,110.0,missing,missing,179.0,missing,missing,141.0,missing,missing,2,2009-01-10,missing,missing,11007,missing,missing,6,missing,missing,4,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,24,62,2008,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
5,4462151,1,missing,missing,0,1967,1,missing,missing,65.0,missing,missing,92.0,missing,missing,171.0,missing,missing,140.0,missing,missing,4,2009-10-10,missing,missing,11017,missing,missing,6,missing,missing,8,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,17,2008,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
6,2268936,1,missing,missing,1,1967,1,missing,missing,91.0,missing,missing,106.0,missing,missing,189.0,missing,missing,150.0,missing,missing,12,2008-09-06,missing,missing,11007,missing,missing,6,missing,missing,3,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,11,2005,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
7,3462504,1,missing,missing,0,1964,1,missing,missing,70.0,missing,missing,98.0,missing,missing,163.0,missing,missing,134.0,missing,missing,1,2008-11-08,missing,missing,11011,missing,missing,6,missing,missing,3,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
8,4256945,1,missing,missing,1,1959,1,missing,missing,108.0,missing,missing,116.0,missing,missing,168.0,missing,missing,140.0,missing,missing,1,2008-08-19,missing,missing,11007,missing,missing,6,missing,missing,2,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
9,3327189,1,missing,missing,1,1943,1,missing,missing,105.0,missing,missing,108.0,missing,missing,171.0,missing,missing,139.0,missing,missing,8,2010-07-05,missing,missing,11014,missing,missing,7,missing,missing,5,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,55,8,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯
10,1952586,1,missing,missing,0,1953,1,missing,missing,66.0,missing,missing,98.0,missing,missing,166.0,missing,missing,141.0,missing,missing,10,2010-02-05,missing,missing,11018,missing,missing,7,missing,missing,3,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,missing,⋯


Process raw phenotype data

In [2]:
# Start the processed table from the participant ids.
df_new = DataFrame(eid = df.eid)

# Ethnic background: numeric code => label, grouped by top-level category.
population_dict = Dict(
    # White
    1 => "White",
    1001 => "British",
    1002 => "Irish",
    1003 => "Any other white background",
    # Mixed
    2 => "Mixed",
    2001 => "White and Black Caribbean",
    2002 => "White and Black African",
    2003 => "White and Asian",
    2004 => "Any other mixed background",
    # Asian
    3 => "Asian or Asian British",
    3001 => "Indian",
    3002 => "Pakistani",
    3003 => "Bangladeshi",
    3004 => "Any other Asian background",
    # Black
    4 => "Black or Black British",
    4001 => "Caribbean",
    4002 => "African",
    4003 => "Any other Black background",
    # remaining codes
    5 => "Chinese",
    6 => "Other ethnic group",
    -1 => "Do not know",
    -3 => "Prefer not to answer",
)

# Translate every code in column "21000-0.0"; codes absent from the dictionary
# (including `missing` entries) become `missing`.
labels = df[!, "21000-0.0"]
population = Union{String, Missing}[get(population_dict, label, missing) for label in labels]
df_new[!, "ethnicity"] = population

# Every entry below is `new column name => raw column name`; columns are appended
# to `df_new` in the order listed, so the output layout follows this listing.

# basic measurements (age_squared is derived, so it is inserted right after age)
for (name, field) in ("sex" => "22001-0.0", "age" => "21003-0.0")
    df_new[!, name] = df[!, field]
end
df_new[!, "age_squared"] = df[!, "21003-0.0"] .^ 2
for (name, field) in (
    "height" => "50-0.0",
    "weight" => "21002-0.0",
    "waist_circ" => "48-0.0",
    "bmi" => "21001-0.0",
)
    df_new[!, name] = df[!, field]
end

# continuous phenotypes: blood pressure
for (name, field) in ("dbp" => "4079-0.0", "sbp" => "4080-0.0")
    df_new[!, name] = df[!, field]
end

# blood measurements
for (name, field) in (
    "whitecell" => "30000-0.0",
    "redcell" => "30010-0.0",
    "haemoglobin" => "30020-0.0",
    "haematocrit" => "30030-0.0",
    "corpuscular_volume" => "30040-0.0",
    "corpuscular_haemoglobin" => "30050-0.0",
    "redcell_width" => "30070-0.0",
    "platelet" => "30080-0.0",
    "platelet_volume" => "30100-0.0",
    "platelet_width" => "30110-0.0",
    "lymphocyte" => "30120-0.0",
    "monocyte" => "30130-0.0",
    "neutrophill" => "30140-0.0",
    "eosinophill" => "30150-0.0",
    "basophill" => "30160-0.0",
    "nucleated_redcell" => "30170-0.0",
    "reticulocyte" => "30250-0.0",
    "reticulocyte_volume" => "30260-0.0",
    "spheredcell_volume" => "30270-0.0",
)
    df_new[!, name] = df[!, field]
end

# body compositions
for (name, field) in (
    "body_fat_mass" => "23100-0.0",
    "right_leg_fat_mass" => "23112-0.0",
    "left_leg_fat_mass" => "23116-0.0",
    "right_arm_fat_mass" => "23120-0.0",
    "left_arm_fat_mass" => "23124-0.0",
    "trunk_fat_mass" => "23128-0.0",
)
    df_new[!, name] = df[!, field]
end

# Self-reported cancer: one 0/1 (Float64) column per code listed below.
# NOTE(review): only column "20001-0.0" is inspected; if the raw file carries
# further entries per participant (e.g. "20001-0.1"), they are ignored here --
# confirm this is intended.
cancer_dict = Dict{Int, String}()
cancer_dict[1002] = "breast_cancer"
cancer_dict[1044] = "prostate_cancer"
cancer_dict[1059] = "malignant_melanoma"
cancer_dict[1061] = "basal_cell_carcinoma"
cancer_col = df[!, "20001-0.0"]
for (code, name) in cancer_dict
    # `===` never matches `missing`, so missing entries count as 0
    df_new[!, name] = Float64.(cancer_col .=== code)
end

# Self-reported non-cancer illness, encoded the same way from column "20002-0.0".
# NOTE(review): as above, only the "-0.0" column is inspected -- confirm.
noncancer_dict = Dict{Int, String}()
noncancer_dict[1065] = "hypertension"
noncancer_dict[1111] = "asthma"
noncancer_dict[1226] = "hypothyroidism"
noncancer_dict[1094] = "thrombosis"
noncancer_dict[1074] = "angina"
noncancer_dict[1387] = "hayfever"
noncancer_dict[1075] = "myocardial_infarction"
noncancer_dict[1465] = "osteoarthritis"
noncancer_dict[1220] = "diabetes"
noncancer_dict[1081] = "stroke"
noncancer_dict[1286] = "depression"
noncancer_dict[1473] = "high_cholesterol"
noncancer_col = df[!, "20002-0.0"]
for (code, name) in noncancer_dict
    df_new[!, name] = Float64.(noncancer_col .=== code)
end

# Disease diagnosis: one 0/1 column per group of diagnosis codes, selected by the
# leading letter of the code in column "41202-0.0".
# Letters follow the ICD-10 chapter layout: C = malignant neoplasms, K = digestive
# system, M = musculoskeletal system, N = genitourinary system, G = nervous system.
df_new[!, "malignant_neoplasms"] = build_y_for_disease_diagnosis(df, "C")
df_new[!, "digestive_disease"] = build_y_for_disease_diagnosis(df, "K")
df_new[!, "musculoskeletal_disease"] = build_y_for_disease_diagnosis(df, "M")
# genitourinary codes start with "N"; "M" here would duplicate the musculoskeletal column
df_new[!, "genitourinary_disease"] = build_y_for_disease_diagnosis(df, "N")
df_new[!, "nerv_disease"] = build_y_for_disease_diagnosis(df, "G")

# Principal components: raw columns "22009-0.1" ... "22009-0.40" become PC1 ... PC40.
for pc in 1:40
    df_new[!, "PC$pc"] = df[!, "22009-0.$pc"]
end

# QC-related flags, kept as the last two columns of the table.
for (name, field) in ("recommend_exclude" => "22010-0.0", "related_pairing" => "22011-0.0")
    df_new[!, name] = df[!, field]
end

# show the assembled table as the cell output
df_new

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


Row,eid,ethnicity,sex,age,age_squared,height,weight,waist_circ,bmi,dbp,sbp,whitecell,redcell,haemoglobin,haematocrit,corpuscular_volume,corpuscular_haemoglobin,redcell_width,platelet,platelet_volume,platelet_width,lymphocyte,monocyte,neutrophill,eosinophill,basophill,nucleated_redcell,reticulocyte,reticulocyte_volume,spheredcell_volume,body_fat_mass,right_leg_fat_mass,left_leg_fat_mass,right_arm_fat_mass,left_arm_fat_mass,trunk_fat_mass,malignant_melanoma,breast_cancer,basal_cell_carcinoma,prostate_cancer,hayfever,hypertension,stroke,angina,depression,high_cholesterol,osteoarthritis,thrombosis,hypothyroidism,myocardial_infarction,asthma,diabetes,malignant_neoplasms,digestive_disease,musculoskeletal_disease,genitourinary_disease,nerv_disease,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40,recommend_exclude,related_pairing
Unnamed: 0_level_1,Int64,String?,Int64?,Int64,Int64,Float64?,Float64?,Float64?,Float64?,Int64?,Int64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Int64?,Int64?
1,3831650,Any other white background,0,63,3969,152.0,66.5,89.0,28.7829,missing,missing,5.4,4.22,14.2,38.1,90.2,33.6,11.7,191.0,9.1,16.7,1.8,0.3,3.1,0.2,0.0,0.0,0.067,104.0,81.4,25.3,5.4,5.3,1.3,1.4,12.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.79679,-5.72467,-0.889775,1.64259,-9.09482,26.8107,-34.9859,41.5491,-7.80189,-5.00808,9.8858,-2.83252,1.35181,2.79104,-0.798463,2.16172,1.42887,3.19611,-5.91596,-0.838235,-1.06657,3.94081,1.44346,-2.30757,-1.39602,1.78703,-2.03021,4.73473,-1.45981,1.79864,0.93178,1.37083,1.61619,-2.10561,2.69647,0.296454,-0.738116,-1.74635,0.0633084,-2.47506,missing,missing
2,1585880,British,1,63,3969,186.5,100.3,104.0,28.8365,87,137,5.66,4.582,15.59,44.29,96.67,34.02,12.65,169.8,10.28,17.52,1.75,0.5,3.06,0.33,0.02,0.0,0.06,117.49,86.95,27.5,3.5,3.6,1.3,1.4,17.8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.9888,4.52334,-0.583159,4.23136,8.52121,-3.07393,3.56391,-0.698801,-21.8176,4.5205,9.90654,-6.06418,0.887828,9.57377,-8.26223,-15.82,-0.950277,8.59583,4.38431,-3.0436,-4.53414,0.875016,-5.55015,-0.856686,-4.27533,-3.04527,1.20411,-2.68732,0.147301,-5.6981,1.85351,-1.73147,1.08301,-1.00784,4.6133,1.82114,3.37137,5.08446,3.18535,-0.954864,missing,missing
3,1636662,British,1,62,3844,173.0,64.6,87.0,21.5844,77,129,7.39,5.218,15.16,44.94,86.12,29.06,13.34,282.5,8.08,16.39,1.57,0.75,4.72,0.31,0.04,0.0,0.042,100.35,79.06,14.9,2.0,1.9,0.6,0.6,9.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.9668,4.37347,-0.0836815,3.78297,3.65852,0.136707,2.19733,0.388501,-0.259216,0.528051,1.77582,-1.3516,0.148245,2.48234,-2.67543,-4.72521,0.621704,-0.153505,1.04465,0.391674,-3.43899,-1.74519,-6.84637,1.71267,-1.50315,-0.801507,1.20345,-4.67765,0.468793,-3.05507,0.375,2.14657,4.55789,1.34671,-2.46234,0.840909,-3.44592,-1.22095,-0.149903,-0.95351,missing,missing
4,2167238,British,1,62,3844,179.0,97.0,107.0,30.2737,85,131,5.3,4.645,16.43,45.54,98.05,35.38,12.91,182.8,9.48,16.18,1.22,0.68,3.2,0.18,0.02,0.0,0.069,104.61,83.08,27.3,4.3,4.2,1.2,1.3,16.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.7763,3.36748,-0.568386,0.327065,-7.54269,0.110288,-4.00561,-2.4736,5.08877,0.397343,3.47388,0.736705,1.57226,1.0083,0.0449918,2.30324,0.203928,-3.29895,0.625211,1.50564,-2.84879,-1.57275,-3.48241,-2.01081,-3.24113,-9.69671,-0.233665,-7.78702,-6.98811,-1.08937,3.20191,3.10126,-0.78327,0.982087,1.8995,-1.27111,1.6369,-1.22407,-1.75864,3.28782,missing,missing
5,4462151,British,0,42,1764,171.0,59.1,65.0,20.2113,78,122,7.81,4.631,13.45,39.27,84.8,29.04,12.5,294.9,8.72,17.42,2.67,0.42,4.55,0.13,0.06,0.0,0.048,94.46,73.96,15.5,3.2,3.3,0.6,0.6,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.19,4.4652,-0.557933,-0.0283174,0.944134,2.01466,-0.164368,0.615322,3.34736,-2.11681,-1.44999,0.499021,-4.09461,4.00707,-0.27513,1.58425,-0.77843,2.62138,3.01469,0.785397,2.79935,-3.48454,-0.73385,-2.14894,0.68748,1.66703,-7.06949,1.07955,-0.602806,3.43454,-0.981911,0.557907,-2.18862,-3.27583,-0.431231,-1.05678,1.31972,-7.98701,-1.94961,2.15982,missing,missing
6,2268936,British,1,40,1600,189.0,88.3,91.0,24.7194,82,135,5.99,5.114,15.33,44.82,87.65,29.97,13.28,274.4,8.54,16.07,1.61,0.35,3.6,0.42,0.01,0.0,0.036,95.15,71.87,20.7,2.5,2.4,1.1,1.1,13.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.7951,4.46365,-0.539141,1.65227,-3.84824,1.82587,3.27178,-0.926173,1.04391,2.58283,0.151791,1.39674,1.78159,-1.82798,-1.12349,2.46677,-0.152994,1.91847,-0.321016,-1.99655,-1.04928,6.55065,2.53598,-0.606389,0.0843075,-3.06135,2.67409,-1.76689,-2.81477,1.15938,-1.36451,4.06323,-4.22977,0.9526,0.573767,-4.48155,-0.900771,2.05907,1.39232,1.56734,missing,missing
7,3462504,British,0,44,1936,163.0,59.4,70.0,22.3569,83,121,6.49,4.393,13.83,39.37,89.63,31.48,12.65,325.2,9.19,15.64,1.14,0.44,4.82,0.05,0.04,0.0,0.041,108.94,83.97,19.5,3.6,3.6,0.9,0.9,10.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.1992,7.70482,-2.08296,-2.32233,-6.46273,0.590275,0.809516,-1.81278,-2.93386,-4.52605,-5.2486,2.57004,-2.55518,-1.01777,3.19746,0.797547,2.60144,2.65496,5.08125,0.710969,-1.35203,-1.37647,0.265208,0.56842,2.55269,0.857405,-3.2495,3.94658,0.948104,-1.24147,1.90839,3.34213,2.26913,6.33912,-2.92098,1.05135,0.243418,-1.4562,-2.41878,0.611315,missing,missing
8,4256945,British,1,49,2401,168.0,105.0,108.0,37.2024,87,143,7.3,5.0,15.6,45.5,90.9,31.3,13.6,287.0,9.3,16.1,1.4,0.5,5.1,0.2,0.0,0.0,0.085,97.3,77.5,36.6,5.2,5.4,1.8,2.1,22.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.2422,3.29665,-0.309951,-2.35527,-3.67374,-0.659152,-0.197812,0.683547,-0.598722,0.995909,-4.58489,1.52566,-2.45697,-1.34202,-1.62694,-1.72875,3.59038,-1.39138,-0.579133,0.0931258,-1.18129,-1.45914,2.40011,0.312422,-2.06964,1.23983,5.05882,3.23408,-7.20318,2.07819,2.3897,-7.15351,2.13024,2.42399,-0.972033,2.07103,-0.517231,-0.862144,0.229905,-0.961889,missing,missing
9,3327189,British,1,66,4356,171.0,86.0,105.0,29.4108,83,138,11.25,4.785,14.55,44.37,92.74,30.42,14.54,221.5,9.66,16.41,2.34,0.74,7.97,0.13,0.07,0.0,0.099,108.89,86.14,28.9,3.8,3.2,1.4,1.6,18.9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.3485,4.32198,-3.21435,2.94151,-2.58228,-1.62825,-1.92052,-1.89329,1.10736,-0.825762,3.30947,-3.56905,-2.08075,0.686668,1.05573,-2.59693,4.10707,1.72762,3.08509,-1.03592,1.0022,-5.16344,-2.93246,-2.2833,-1.70268,-0.392667,-2.673,2.85081,-0.437277,-2.26508,3.8922,-1.63084,1.72223,5.53536,-4.79882,-5.7231,1.24274,1.30451,1.17841,-6.65708,missing,missing
10,1952586,British,0,56,3136,166.0,61.2,66.0,22.2093,71,125,6.84,3.951,13.08,38.0,96.19,33.1,14.15,221.8,10.89,16.41,2.03,0.36,4.39,0.05,0.01,0.0,0.055,106.98,89.61,11.7,2.9,2.8,0.5,0.6,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.2479,2.56216,-2.42317,-0.270878,-6.42852,-0.582263,-1.50898,-5.20213,4.85428,2.77621,2.93994,-1.74416,0.761961,1.94522,1.09227,5.76399,-3.18235,4.16352,0.151795,-2.37888,-2.76326,-1.64167,1.71861,2.47868,0.607908,-1.91046,-4.1848,-0.0847944,3.54914,-3.5979,3.62144,2.43786,-1.45472,0.0410614,2.79074,2.82709,-2.8883,-3.13834,-1.7785,-0.672912,missing,missing


In [3]:
# Report the number of cases (sum of the 0/1 column) for each non-cancer illness.
for illness in values(noncancer_dict)
    cases = sum(df_new[!, illness])
    println("$illness sample size = ", cases)
end

hayfever sample size = 8949.0
hypertension sample size = 114036.0
stroke sample size = 5206.0
angina sample size = 10908.0
depression sample size = 8123.0
high_cholesterol sample size = 9271.0
osteoarthritis sample size = 11342.0
thrombosis sample size = 5849.0
hypothyroidism sample size = 9373.0
myocardial_infarction sample size = 11208.0
asthma sample size = 37724.0
diabetes sample size = 5874.0


In [4]:
# Save the processed phenotype table; the cell output is the path that was written.
processed_phenotype_file = "/scratch/groups/sabatti/ukb_phenotypes/phenotypes.csv"
CSV.write(processed_phenotype_file, df_new)

"/scratch/groups/sabatti/ukb_phenotypes/phenotypes.csv"

# Quality control on phenotype file

Here we match phenotypes to the genotype data, remove samples with missing values, and perform basic quality control.

+ Copied genotype files (original + SHAPEIT knockoff SNPs): `/scratch/groups/sabatti/ukb_genotypes`
+ Copied population label: `/scratch/groups/sabatti/ukb_populations`
+ Original SHAPEIT knockoffs are located at: `/oak/stanford/groups/candes/popstruct/analysis/knockoffs/`
+ Original population labels: `/oak/stanford/groups/candes/prs/data/populations`

To copy the genotypes, I used this script (runs for ~4h):
```
#!/bin/bash
#
# SLURM batch job: copy the genotype files (original + SHAPEIT knockoffs) and the
# population labels from oak to scratch.
#
#SBATCH --job-name=copy
#
#SBATCH --time=48:00:00
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=10G
#SBATCH --partition=normal,candes

# save job info on joblog (SLURM exposes the job id as SLURM_JOB_ID):
echo "Job $SLURM_JOB_ID started on:   " "$(hostname -s)"
echo "Job $SLURM_JOB_ID started on:   " "$(date)"

# make sure the destination folders exist before copying into them
mkdir -p /scratch/groups/sabatti/ukb_genotypes /scratch/groups/sabatti/ukb_populations

# run code
cp /oak/stanford/groups/candes/popstruct/analysis/knockoffs/* /scratch/groups/sabatti/ukb_genotypes
cp /oak/stanford/groups/candes/prs/data/populations/* /scratch/groups/sabatti/ukb_populations

# echo job info on joblog:
echo "Job $SLURM_JOB_ID ended on:   " "$(hostname -s)"
echo "Job $SLURM_JOB_ID ended on:   " "$(date)"
```

In [8]:
using DataFrames
using CSV
using SnpArrays
using ProgressMeter

# Load the processed phenotypes. The last two columns act as exclusion flags (a
# non-missing entry marks the sample for removal); the remaining columns are kept.
df_complex = CSV.read("/scratch/groups/sabatti/ukb_phenotypes/phenotypes.csv", DataFrame)
include_idx = trues(nrow(df_complex))

# samples recommended for exclusion by UKB (flag in the second-to-last column)
ukb_recommend_exclude = findall(!ismissing, df_complex[!, end-1])
# related samples (flag in the last column)
ukb_related = findall(!ismissing, df_complex[!, end])
# rows without a missing entry in any of the retained columns
no_missing = completecases(df_complex[!, 1:end-2])

# a sample is dropped as soon as any one of the three criteria applies
for flagged in (ukb_recommend_exclude, ukb_related, findall(!, no_missing))
    include_idx[flagged] .= false
end

df_qc = df_complex[include_idx, 1:end-2]

Row,eid,ethnicity,sex,age,age_squared,height,weight,waist_circ,bmi,dbp,sbp,whitecell,redcell,haemoglobin,haematocrit,corpuscular_volume,corpuscular_haemoglobin,redcell_width,platelet,platelet_volume,platelet_width,lymphocyte,monocyte,neutrophill,eosinophill,basophill,nucleated_redcell,reticulocyte,reticulocyte_volume,spheredcell_volume,body_fat_mass,right_leg_fat_mass,left_leg_fat_mass,right_arm_fat_mass,left_arm_fat_mass,trunk_fat_mass,malignant_melanoma,breast_cancer,basal_cell_carcinoma,prostate_cancer,hayfever,hypertension,stroke,angina,depression,high_cholesterol,osteoarthritis,thrombosis,hypothyroidism,myocardial_infarction,asthma,diabetes,malignant_neoplasms,digestive_disease,musculoskeletal_disease,genitourinary_disease,nerv_disease,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40
Unnamed: 0_level_1,Int64,String31?,Int64?,Int64,Int64,Float64?,Float64?,Float64?,Float64?,Int64?,Int64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?
1,1585880,British,1,63,3969,186.5,100.3,104.0,28.8365,87,137,5.66,4.582,15.59,44.29,96.67,34.02,12.65,169.8,10.28,17.52,1.75,0.5,3.06,0.33,0.02,0.0,0.06,117.49,86.95,27.5,3.5,3.6,1.3,1.4,17.8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.9888,4.52334,-0.583159,4.23136,8.52121,-3.07393,3.56391,-0.698801,-21.8176,4.5205,9.90654,-6.06418,0.887828,9.57377,-8.26223,-15.82,-0.950277,8.59583,4.38431,-3.0436,-4.53414,0.875016,-5.55015,-0.856686,-4.27533,-3.04527,1.20411,-2.68732,0.147301,-5.6981,1.85351,-1.73147,1.08301,-1.00784,4.6133,1.82114,3.37137,5.08446,3.18535,-0.954864
2,1636662,British,1,62,3844,173.0,64.6,87.0,21.5844,77,129,7.39,5.218,15.16,44.94,86.12,29.06,13.34,282.5,8.08,16.39,1.57,0.75,4.72,0.31,0.04,0.0,0.042,100.35,79.06,14.9,2.0,1.9,0.6,0.6,9.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.9668,4.37347,-0.0836815,3.78297,3.65852,0.136707,2.19733,0.388501,-0.259216,0.528051,1.77582,-1.3516,0.148245,2.48234,-2.67543,-4.72521,0.621704,-0.153505,1.04465,0.391674,-3.43899,-1.74519,-6.84637,1.71267,-1.50315,-0.801507,1.20345,-4.67765,0.468793,-3.05507,0.375,2.14657,4.55789,1.34671,-2.46234,0.840909,-3.44592,-1.22095,-0.149903,-0.95351
3,2167238,British,1,62,3844,179.0,97.0,107.0,30.2737,85,131,5.3,4.645,16.43,45.54,98.05,35.38,12.91,182.8,9.48,16.18,1.22,0.68,3.2,0.18,0.02,0.0,0.069,104.61,83.08,27.3,4.3,4.2,1.2,1.3,16.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.7763,3.36748,-0.568386,0.327065,-7.54269,0.110288,-4.00561,-2.4736,5.08877,0.397343,3.47388,0.736705,1.57226,1.0083,0.0449918,2.30324,0.203928,-3.29895,0.625211,1.50564,-2.84879,-1.57275,-3.48241,-2.01081,-3.24113,-9.69671,-0.233665,-7.78702,-6.98811,-1.08937,3.20191,3.10126,-0.78327,0.982087,1.8995,-1.27111,1.6369,-1.22407,-1.75864,3.28782
4,4462151,British,0,42,1764,171.0,59.1,65.0,20.2113,78,122,7.81,4.631,13.45,39.27,84.8,29.04,12.5,294.9,8.72,17.42,2.67,0.42,4.55,0.13,0.06,0.0,0.048,94.46,73.96,15.5,3.2,3.3,0.6,0.6,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.19,4.4652,-0.557933,-0.0283174,0.944134,2.01466,-0.164368,0.615322,3.34736,-2.11681,-1.44999,0.499021,-4.09461,4.00707,-0.27513,1.58425,-0.77843,2.62138,3.01469,0.785397,2.79935,-3.48454,-0.73385,-2.14894,0.68748,1.66703,-7.06949,1.07955,-0.602806,3.43454,-0.981911,0.557907,-2.18862,-3.27583,-0.431231,-1.05678,1.31972,-7.98701,-1.94961,2.15982
5,2268936,British,1,40,1600,189.0,88.3,91.0,24.7194,82,135,5.99,5.114,15.33,44.82,87.65,29.97,13.28,274.4,8.54,16.07,1.61,0.35,3.6,0.42,0.01,0.0,0.036,95.15,71.87,20.7,2.5,2.4,1.1,1.1,13.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.7951,4.46365,-0.539141,1.65227,-3.84824,1.82587,3.27178,-0.926173,1.04391,2.58283,0.151791,1.39674,1.78159,-1.82798,-1.12349,2.46677,-0.152994,1.91847,-0.321016,-1.99655,-1.04928,6.55065,2.53598,-0.606389,0.0843075,-3.06135,2.67409,-1.76689,-2.81477,1.15938,-1.36451,4.06323,-4.22977,0.9526,0.573767,-4.48155,-0.900771,2.05907,1.39232,1.56734
6,3462504,British,0,44,1936,163.0,59.4,70.0,22.3569,83,121,6.49,4.393,13.83,39.37,89.63,31.48,12.65,325.2,9.19,15.64,1.14,0.44,4.82,0.05,0.04,0.0,0.041,108.94,83.97,19.5,3.6,3.6,0.9,0.9,10.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.1992,7.70482,-2.08296,-2.32233,-6.46273,0.590275,0.809516,-1.81278,-2.93386,-4.52605,-5.2486,2.57004,-2.55518,-1.01777,3.19746,0.797547,2.60144,2.65496,5.08125,0.710969,-1.35203,-1.37647,0.265208,0.56842,2.55269,0.857405,-3.2495,3.94658,0.948104,-1.24147,1.90839,3.34213,2.26913,6.33912,-2.92098,1.05135,0.243418,-1.4562,-2.41878,0.611315
7,4256945,British,1,49,2401,168.0,105.0,108.0,37.2024,87,143,7.3,5.0,15.6,45.5,90.9,31.3,13.6,287.0,9.3,16.1,1.4,0.5,5.1,0.2,0.0,0.0,0.085,97.3,77.5,36.6,5.2,5.4,1.8,2.1,22.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.2422,3.29665,-0.309951,-2.35527,-3.67374,-0.659152,-0.197812,0.683547,-0.598722,0.995909,-4.58489,1.52566,-2.45697,-1.34202,-1.62694,-1.72875,3.59038,-1.39138,-0.579133,0.0931258,-1.18129,-1.45914,2.40011,0.312422,-2.06964,1.23983,5.05882,3.23408,-7.20318,2.07819,2.3897,-7.15351,2.13024,2.42399,-0.972033,2.07103,-0.517231,-0.862144,0.229905,-0.961889
8,3327189,British,1,66,4356,171.0,86.0,105.0,29.4108,83,138,11.25,4.785,14.55,44.37,92.74,30.42,14.54,221.5,9.66,16.41,2.34,0.74,7.97,0.13,0.07,0.0,0.099,108.89,86.14,28.9,3.8,3.2,1.4,1.6,18.9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.3485,4.32198,-3.21435,2.94151,-2.58228,-1.62825,-1.92052,-1.89329,1.10736,-0.825762,3.30947,-3.56905,-2.08075,0.686668,1.05573,-2.59693,4.10707,1.72762,3.08509,-1.03592,1.0022,-5.16344,-2.93246,-2.2833,-1.70268,-0.392667,-2.673,2.85081,-0.437277,-2.26508,3.8922,-1.63084,1.72223,5.53536,-4.79882,-5.7231,1.24274,1.30451,1.17841,-6.65708
9,1952586,British,0,56,3136,166.0,61.2,66.0,22.2093,71,125,6.84,3.951,13.08,38.0,96.19,33.1,14.15,221.8,10.89,16.41,2.03,0.36,4.39,0.05,0.01,0.0,0.055,106.98,89.61,11.7,2.9,2.8,0.5,0.6,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.2479,2.56216,-2.42317,-0.270878,-6.42852,-0.582263,-1.50898,-5.20213,4.85428,2.77621,2.93994,-1.74416,0.761961,1.94522,1.09227,5.76399,-3.18235,4.16352,0.151795,-2.37888,-2.76326,-1.64167,1.71861,2.47868,0.607908,-1.91046,-4.1848,-0.0847944,3.54914,-3.5979,3.62144,2.43786,-1.45472,0.0410614,2.79074,2.82709,-2.8883,-3.13834,-1.7785,-0.672912
10,2639147,British,0,60,3600,164.0,66.9,72.0,24.8736,77,171,5.61,3.692,12.54,37.12,100.6,33.98,13.35,304.4,8.6,15.96,1.58,0.41,3.53,0.05,0.03,0.0,0.043,117.78,92.85,24.7,4.2,4.2,1.0,1.0,14.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.9362,4.68724,0.684852,-0.13159,-4.02345,0.86179,1.28984,0.33128,2.19196,-2.52924,-2.77132,2.95061,0.999735,-0.737206,2.49384,1.06396,2.52269,2.96527,-2.02215,2.65405,5.9808,-0.28999,2.15804,-6.73998,-1.41488,-0.154476,-2.586,0.971471,3.57244,-0.125054,-2.54739,-2.07834,-3.17918,2.08337,-1.45751,-2.16331,-4.25566,-0.511716,-2.0082,-5.12359


Check whether the final dataframe has any missing values

In [9]:
# total number of `missing` entries in df_qc, accumulated column by column
nmissing = sum(col -> count(ismissing, col), eachcol(df_qc); init=0)
nmissing

0

In [10]:
# Save the quality-controlled phenotype table df_qc as a CSV file on scratch.
CSV.write("/scratch/groups/sabatti/ukb_phenotypes/phenotypes.QC.csv", df_qc)

"/scratch/groups/sabatti/ukb_phenotypes/phenotypes.QC.csv"

# Process SHAPEIT knockoffs 

In the copied PLINK files, the original genotypes and the knockoffs are shuffled together and separated by chromosomes. We will combine the chromosomes into 1 file

+ SHAPEIT knockoffs: `/oak/stanford/groups/candes/popstruct/analysis/knockoffs`
+ These files have SNPs separated by chromosome, but the original genotypes and their knockoffs have already been shuffled together, so we can simply combine the chromosomes into 1 large PLINK file

We will merge all chromosomes into a single file, for files with different resolutions.

In [None]:
#
# This script merges chromosomes 1-22 into a single PLINK file for a given resolution `res`
# It also merges the groups (original separated by chromosomes) into a single file
# The code below should be saved in a file e.g. merge.jl and can be executed via `julia merge.jl 0`
# where the `0` indicates res = 0
#
using SnpArrays, CSV, DataFrames
"""
    merge(res::Int)

Combine the per-chromosome PLINK files (chromosomes 1-22) of resolution `res` into one
PLINK file, and stack the per-chromosome group files into one group file. Before
stacking, the group labels (second column) of each chromosome are shifted by the number
of distinct groups seen on the preceding chromosomes.
"""
function merge(res::Int)
    geno_dir = "/scratch/groups/sabatti/ukb_genotypes"
    chr_prefix(chr) = joinpath(geno_dir, "ukb_gen_chr$(chr)_ibd1_res$(res)")
    merged_prefix = joinpath(geno_dir, "ukb_gen_merged_res$res")

    # merge chromosome genotypes
    chr_data = Dict{AbstractString, SnpData}(
        string(chr) => SnpData(chr_prefix(chr)) for chr in 1:22
    )
    merge_plink(merged_prefix, chr_data)

    # also, read group info for each chromosome and output a single file
    n_groups_so_far = 0
    groups = DataFrame(SNP=String[], Group=Int[])
    for chr in 1:22
        chr_groups = CSV.read(chr_prefix(chr) * "_grp.txt", DataFrame)
        chr_groups[!, 2] .+= n_groups_so_far
        n_groups_so_far += length(unique(chr_groups[!, 2]))
        groups = vcat(groups, chr_groups)
    end
    return CSV.write(merged_prefix * "_grp.txt", groups)
end
# Script entry point: the first command-line argument is the resolution to merge,
# e.g. `julia merge.jl 0`. Fail with a usage message rather than a bare BoundsError
# when the argument is missing.
isempty(ARGS) && error("usage: julia merge.jl <res>")
res = parse(Int, ARGS[1])
merge(res)

In [3]:
# execute this script in the Julia REPL to submit merge jobs simultaneously
# Submit one SLURM job per resolution (res = 0,...,6); each job runs
# `julia /scratch/groups/sabatti/merge.jl <res>`. The job script is written to a
# temporary `submit.sh` in the current directory, handed to `sbatch`, and removed again.
function submit()
    for res in 0:6
        # contents of the .sh file; the job ID is logged through SLURM's own
        # SLURM_JOB_ID variable (JOB_ID is not set by SLURM)
        script_lines = [
            "#!/bin/bash",
            "#",
            "#SBATCH --job-name=res$res",
            "#",
            "#SBATCH --time=24:00:00",
            "#SBATCH --cpus-per-task=1",
            "#SBATCH --mem-per-cpu=150G",
            "#SBATCH --partition=normal,candes",
            "#SBATCH --output=/scratch/groups/sabatti/slurm-%j.out",
            "",
            "#save job info on joblog:",
            "echo \"Job \$SLURM_JOB_ID started on:   \" `hostname -s`",
            "echo \"Job \$SLURM_JOB_ID started on:   \" `date `",
            "",
            "# load the job environment:",
            "module load julia/1.8.4",
            "",
            "# run code",
            "echo 'julia /scratch/groups/sabatti/merge.jl $res'",
            "julia /scratch/groups/sabatti/merge.jl $res",
            "",
            "#echo job info on joblog:",
            "echo \"Job \$SLURM_JOB_ID ended on:   \" `hostname -s`",
            "echo \"Job \$SLURM_JOB_ID ended on:   \" `date `",
            "#echo \" \"",
        ]

        filename = "submit.sh"
        try
            open(filename, "w") do io
                foreach(line -> println(io, line), script_lines)
            end
            # submit job
            run(`sbatch $filename`)
            println("submitted res $res")
        finally
            # remove the script even when writing or submission fails
            rm(filename, force=true)
        end
    end
end
submit()

Submitted batch job 51300763
submitted res 0
Submitted batch job 51300764
submitted res 1
Submitted batch job 51300765
submitted res 2
Submitted batch job 51300766
submitted res 3
Submitted batch job 51300767
submitted res 4
Submitted batch job 51300768
submitted res 5
Submitted batch job 51300769
submitted res 6


# Create UKB file with only original genotypes

Note we filter the data located at
+ `/oak/stanford/groups/candes/popstruct/analysis/knockoffs_merged/ukb_gen_merged_res0.bed`


since these sample genotypes seemed to have undergone QC steps already, with the number of SNPs being $p=591513$. 

In [4]:
#
# This filters the merged data for original genotypes
#
using SnpArrays, CSV, DataFrames
"""
    filter()

Write a PLINK file with prefix `ukb_gen` that keeps every sample of the merged
resolution-0 file but only the SNPs whose ID (second `.bim` column) does not end in
".k". The merged file is first copied to scratch if its `.bed` file is not there yet.
"""
function filter()
    geno_dir = "/scratch/groups/sabatti/ukb_genotypes"
    merged_prefix = joinpath(geno_dir, "ukb_gen_merged_res0")
    isdir(geno_dir) || mkpath(geno_dir)

    # the .bed file is copied last and is the one whose presence is tested
    if !isfile(merged_prefix * ".bed")
        original_file = "/oak/stanford/groups/candes/popstruct/analysis/knockoffs_merged/ukb_gen_merged_res0"
        for ext in (".bim", ".fam", ".bed")
            run(`cp $(original_file * ext) $geno_dir`)
        end
    end

    # SNP IDs are the second column of the .bim file
    snp_ids = CSV.read(merged_prefix * ".bim", DataFrame, header=false)[!, 2]
    columns_to_keep = findall(!endswith(".k"), snp_ids)
    # one line per sample in the .fam file
    n = countlines(merged_prefix * ".fam")

    return SnpArrays.filter(
        merged_prefix,           # input PLINK file
        1:n,                     # row indices to save
        columns_to_keep;         # column indices to save
        des = joinpath(geno_dir, "ukb_gen"), # output file
    )
end
@time filter()

3041.032004 seconds (5.11 M allocations: 378.913 MiB, 0.01% gc time, 0.12% compilation time: 41% of which was recompilation)


486975×591513 SnpArray:
 0x03  0x00  0x00  0x00  0x03  0x00  …  0x03  0x03  0x02  0x02  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00     0x03  0x03  0x03  0x03  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00     0x03  0x03  0x03  0x02  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00     0x03  0x03  0x02  0x02  0x03  0x03
 0x03  0x02  0x00  0x00  0x02  0x02     0x02  0x03  0x03  0x02  0x03  0x03
 0x03  0x02  0x00  0x00  0x02  0x02  …  0x03  0x03  0x03  0x02  0x02  0x03
 0x03  0x02  0x00  0x00  0x02  0x02     0x03  0x03  0x03  0x02  0x02  0x03
 0x03  0x00  0x00  0x00  0x03  0x00     0x02  0x03  0x03  0x03  0x03  0x03
 0x03  0x00  0x00  0x00  0x03  0x00     0x03  0x03  0x03  0x03  0x03  0x03
 0x03  0x00  0x00  0x00  0x03  0x00     0x02  0x03  0x03  0x02  0x03  0x02
 0x03  0x02  0x00  0x00  0x02  0x02  …  0x03  0x03  0x03  0x03  0x03  0x02
 0x02  0x00  0x02  0x02  0x02  0x00     0x03  0x03  0x03  0x02  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00     0x03  0x02  0x03  0x00  0x03  0x03
 

# Find samples that are British, have phenotypes, and have genotypes

Here we get the sample IDs that
1. Are British
2. Are present in the phenotype file (which excluded many samples, mostly related samples)
3. Are present in the genotype file (which did not exclude related samples)

In [3]:
# 1. sample IDs of the QC'ed phenotype table (column "eid")
phenotype_file = CSV.read("/scratch/groups/sabatti/ukb_phenotypes/phenotypes.QC.csv", DataFrame)
phenotype_ids = phenotype_file.eid

# 2. British sample IDs: first column of the population label file, which is copied
#    to scratch first when it is not there yet
population_file = "/scratch/groups/sabatti/ukb_populations/samples_british.txt"
isfile(population_file) || begin
    run(`rm -rf /scratch/groups/sabatti/ukb_populations/ukb_populations`)
    run(`cp -r /oak/stanford/groups/candes/prs_zhimei/data/populations /scratch/groups/sabatti/ukb_populations`)
end
british_ids = CSV.read(population_file, DataFrame, header=false)[!, 1]

# 3. genotype sample IDs: first column of the .fam file
genotype_ids = CSV.read("/scratch/groups/sabatti/ukb_genotypes/ukb_gen.fam", DataFrame, header=false)[!, 1]

# IDs that are British and exist in both phenotype/genotype files; `intersect` keeps
# the order of its first argument, so keep_ids follows the genotype file order
keep_ids = intersect(genotype_ids, british_ids, phenotype_ids)

306629-element Vector{Int64}:
 5393090
 1532732
 3186275
 1277047
 5282298
 1782425
 3359165
 5262420
 2168079
 4529749
 4442813
 2127187
 2198797
       ⋮
 2782014
 3339049
 2386613
 1725223
 5830003
 2797984
 1927131
 1174114
 1328060
 4760220
 5306973
 2305544

There are 306629 samples that are British and have both genotypes and phenotypes. Now create the phenotype and genotype files that include only these samples

In [4]:
# filter genotypes
xdata = SnpData("/scratch/groups/sabatti/ukb_genotypes/ukb_gen")
n, p = size(xdata)

# positions of the retained samples within the genotype file, in increasing order
matched_rows = filter!(!isnothing, indexin(keep_ids, genotype_ids))
rows_to_keep = sort!(Vector{Int}(matched_rows))

# check rows_to_keep indeed keeps 306629 samples
famfile = CSV.read("/scratch/groups/sabatti/ukb_genotypes/ukb_gen.fam", DataFrame, header=false)
nrow(famfile[rows_to_keep, :]) == 306629

true

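Filter the knockoff genotype files (one per resolution) down to the retained samples and save them under a separate file prefix (`ukb_gen_merged_filtered_res*`)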

In [5]:
# Subset every merged (original + knockoff) PLINK file to the retained samples.
# note: each res takes ~40min
for res in 0:6
    merged_prefix = "/scratch/groups/sabatti/ukb_genotypes/ukb_gen_merged_res$(res)"
    # The merged files contain the knockoff SNPs in addition to the original ones, so
    # they have more columns than `ukb_gen` (whose SNP count is `p`). Take the number
    # of SNPs from the merged .bim file so that no column is dropped.
    n_snps = countlines(merged_prefix * ".bim")
    SnpArrays.filter(
        merged_prefix,  # input file
        rows_to_keep,   # row indices to save
        1:n_snps,       # column indices to save (all SNPs, originals and knockoffs)
        des = "/scratch/groups/sabatti/ukb_genotypes/ukb_gen_merged_filtered_res$(res)" # output file
    )
    # release the memory held by the previous resolution before loading the next one
    GC.gc();GC.gc();GC.gc()
end

Filter the UKB data containing only the original genotypes down to the retained samples and save it under a separate file prefix (`ukb_gen_british`)

In [6]:
# keep only the selected samples (rows) and all p SNPs (columns) of the original genotypes
british_src = "/scratch/groups/sabatti/ukb_genotypes/ukb_gen"
british_des = "/scratch/groups/sabatti/ukb_genotypes/ukb_gen_british"
@time SnpArrays.filter(british_src, rows_to_keep, 1:p; des = british_des)

1209.750191 seconds (2.16 M allocations: 129.620 MiB)


306629×591513 SnpArray:
 0x03  0x00  0x00  0x00  0x03  0x00  …  0x03  0x03  0x02  0x02  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00     0x03  0x03  0x03  0x03  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00     0x03  0x03  0x03  0x02  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00     0x03  0x03  0x02  0x02  0x03  0x03
 0x03  0x02  0x00  0x00  0x02  0x02     0x02  0x03  0x03  0x02  0x03  0x03
 0x03  0x02  0x00  0x00  0x02  0x02  …  0x03  0x03  0x03  0x02  0x02  0x03
 0x03  0x00  0x00  0x00  0x03  0x00     0x03  0x03  0x03  0x03  0x03  0x03
 0x03  0x00  0x00  0x00  0x03  0x00     0x02  0x03  0x03  0x02  0x03  0x02
 0x03  0x02  0x00  0x00  0x02  0x02     0x03  0x03  0x03  0x03  0x03  0x02
 0x02  0x00  0x02  0x02  0x02  0x00     0x03  0x03  0x03  0x02  0x03  0x03
 0x02  0x00  0x02  0x00  0x02  0x00  …  0x03  0x02  0x03  0x00  0x03  0x03
 0x03  0x00  0x00  0x00  0x03  0x00     0x03  0x03  0x03  0x02  0x03  0x03
 0x02  0x00  0x02  0x00  0x00  0x00     0x03  0x03  0x03  0x03  0x03  0x03
 

Filter phenotype file, reorder the samples so samples come in same order as genotype file, then save as final phenotype file

In [7]:
# Row of the phenotype table holding each retained sample, listed in keep_ids order,
# so that the rows of the saved phenotypes line up with the rows of the PLINK files.
reorder_idx = Vector{Int}(indexin(keep_ids, phenotype_ids))
phenotypes_reordered = phenotype_file[reorder_idx, :]
CSV.write("/scratch/groups/sabatti/ukb_phenotypes/phenotypes.QC.britishonly.csv", phenotypes_reordered)

"/scratch/groups/sabatti/ukb_phenotypes/phenotypes.QC.britishonly.csv"

Check that the genotype samples come in the same order as the phenotypes

In [8]:
# show phenotype sample IDs next to the family IDs of the filtered genotype file
xdata = SnpData("/scratch/groups/sabatti/ukb_genotypes/ukb_gen_british")
hcat(phenotypes_reordered.eid, xdata.person_info.fid)

306629×2 Matrix{Any}:
 5393090  "5393090"
 1532732  "1532732"
 3186275  "3186275"
 1277047  "1277047"
 5282298  "5282298"
 1782425  "1782425"
 3359165  "3359165"
 5262420  "5262420"
 2168079  "2168079"
 4529749  "4529749"
 4442813  "4442813"
 2127187  "2127187"
 2198797  "2198797"
       ⋮  
 2782014  "2782014"
 3339049  "3339049"
 2386613  "2386613"
 1725223  "1725223"
 5830003  "5830003"
 2797984  "2797984"
 1927131  "1927131"
 1174114  "1174114"
 1328060  "1328060"
 4760220  "4760220"
 5306973  "5306973"
 2305544  "2305544"

# Training and testing data

For PRS, one needs to create training/testing datasets. 

In [3]:
using SnpArrays, StatsBase, Random

# Fix the RNG seed; it only affects the (currently commented-out) random split below.
Random.seed!(2025)
xdata = SnpData("/scratch/groups/candes/for_parth/ukb_gen_british")
n, p = size(xdata)
# NOTE(review): the 90/10 train/test split is commented out, yet a later cell writes
# `train_idx` and `test_idx` to disk -- these two lines must be re-enabled before
# that cell can run in a fresh session.
# train_idx = sample(1:n, round(Int, 0.9n), replace=false) |> sort!
# test_idx = setdiff(1:n, train_idx)

# training (~40 min)
# SnpArrays.filter(
#     "/scratch/groups/sabatti/ukb_genotypes/ukb_gen_british",  # input file
#     train_idx, # row indices to save
#     1:p,       # column indices to save
#     des = "/scratch/groups/sabatti/ukb_genotypes/ukb_gen_british_train" # output file
# )

# Small subset made of the first 10000 samples and the first 10000 SNPs.
# NOTE(review): this call sits where the "testing (~3 min)" step used to be; no
# `ukb_gen_british_test` file is produced by this cell.
SnpArrays.filter(
    "/scratch/groups/candes/for_parth/ukb_gen_british",  # input file
    1:10000,  # row indices to save
    1:10000,  # column indices to save
    des = "/scratch/groups/candes/for_parth/ukb_gen_british_10ksubset" # output file
)

10000×10000 SnpArray:
 0x03  0x00  0x00  0x00  0x03  0x00  …  0x00  0x02  0x00  0x00  0x02  0x00
 0x02  0x00  0x02  0x00  0x02  0x00     0x00  0x03  0x00  0x00  0x00  0x00
 0x02  0x00  0x02  0x00  0x02  0x00     0x00  0x03  0x00  0x00  0x00  0x00
 0x02  0x00  0x02  0x00  0x02  0x00     0x00  0x00  0x00  0x00  0x00  0x00
 0x03  0x02  0x00  0x00  0x02  0x02     0x00  0x00  0x02  0x00  0x00  0x00
 0x03  0x02  0x00  0x00  0x02  0x02  …  0x00  0x02  0x00  0x00  0x03  0x02
 0x03  0x00  0x00  0x00  0x03  0x00     0x00  0x03  0x00  0x00  0x02  0x00
 0x03  0x00  0x00  0x00  0x03  0x00     0x00  0x02  0x00  0x00  0x02  0x00
 0x03  0x02  0x00  0x00  0x02  0x02     0x00  0x02  0x02  0x02  0x03  0x00
 0x02  0x00  0x02  0x02  0x02  0x00     0x00  0x03  0x00  0x00  0x00  0x00
 0x02  0x00  0x02  0x00  0x02  0x00  …  0x00  0x03  0x00  0x00  0x00  0x00
 0x03  0x00  0x00  0x00  0x03  0x00     0x00  0x03  0x00  0x02  0x03  0x00
 0x02  0x00  0x02  0x00  0x00  0x00     0x02  0x03  0x00  0x00  0x02  0x02
   

In [10]:
using DelimitedFiles
# save the row indices of the testing and training samples next to the genotype files
for (name, idx) in (("test_idx", test_idx), ("train_idx", train_idx))
    writedlm(joinpath("/scratch/groups/sabatti/ukb_genotypes", name), idx)
end

## File backed matrices

+ Because bigstatsr and bigsnpr require the FBM (file-backed matrix) format, we create those files here.
+ ~2000 sec for all 3 files

Note: this was run in the terminal

In [None]:
using RCall
R"library(bigsnpr)"

# Convert a PLINK file into bigsnpr's file-backed format by calling the R function
# `snp_readBed2` through RCall.
# bedfile = path to the PLINK file (with .bed extension)
# outfile = output filename (do not include .bk extension)
# Returns `nothing`; the result of the R call is discarded.
function make_fbm(bedfile::String, outfile::String)
    # copy both Julia variables into the R session under the same names,
    # which is how the R snippet below refers to them
    @rput bedfile outfile
    R"""
    snp_readBed2(bedfile, backingfile = outfile)
    """
    return nothing
end

# build (and time) the file-backed matrix for the full, training, and testing genotypes;
# each PLINK file in geno_dir gets a backing file of the same name inside geno_dir/fbm
geno_dir = "/scratch/groups/sabatti/ukb_genotypes"
fbm_dir = joinpath(geno_dir, "fbm")
for prefix in ("ukb_gen_british", "ukb_gen_british_train", "ukb_gen_british_test")
    @time make_fbm(joinpath(geno_dir, prefix * ".bed"), joinpath(fbm_dir, prefix))
end