In [None]:
##################################################
# Title:    T2D MVMR PGRS Analysis  
#
# Desc:    T2D GWAS (trans-ethnic GWAS results, non-overlapping UKBB)
#
# Date:    October 6, 2021
#
# Adapted from:  rsalem
#
##################################################

library(stringr)
library(tidyverse)
library(R.utils)
library(data.table)


In [None]:
# Load the GWAS reference SNP list: one row per SNP with its alleles and effect size
# (EA: effect allele, NEA: non-effect allele).
# File column order: snpid, alt, ref, beta
snpinfo <- read.table("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/T2D_MVMR_20SEP21.txt",
                      na.strings = c("", ".", "NA"), header = TRUE)

# Relabel the four columns so the alleles read as Effect (EA) / Non-Effect (NEA)
colnames(snpinfo) <- c("rsid", "EA", "NEA", "Beta")
head(snpinfo)
dim(snpinfo)

# NEGATIVE betas (OR < 1): every beta is made positive and the alleles are swapped to match.
# The untouched input values are kept alongside as BETAX / EAX / NEAX for reference.
snpinfo$rsid  <- as.character(snpinfo$rsid)
snpinfo$BETAX <- snpinfo$Beta
snpinfo$EAX   <- as.character(snpinfo$EA)
snpinfo$NEAX  <- as.character(snpinfo$NEA)

# One mask drives all three columns so beta sign and allele labels cannot get out of step
negative_beta <- snpinfo$BETAX < 0
snpinfo$Beta <- ifelse(negative_beta, -1 * snpinfo$BETAX, snpinfo$BETAX)
snpinfo$NEA  <- ifelse(negative_beta, snpinfo$EAX,  snpinfo$NEAX)
snpinfo$EA   <- ifelse(negative_beta, snpinfo$NEAX, snpinfo$EAX)

# Review: range(BETAX) shows the original betas, range(Beta) should now be non-negative
dim(snpinfo)
head(snpinfo)
range(snpinfo$BETAX)
range(snpinfo$Beta)

str(snpinfo)

# Replacing problematic rsid: rs74995800 (problematic) is relabelled as rs2244020 (correct)
snpinfo$rsid[which(snpinfo$rsid == "rs74995800")] <- "rs2244020"


In [None]:
# Reading in imputed genetic data from UKBB - Dealing with Autosomal Chrs (1-22) SNPs
# Reads the dosage file and the matching sample file, then builds:
#   Dose_SNPINFO_auto : per-SNP annotation (first 6 columns of the dosage file)
#   DOSAGE_auto       : dosages transposed to one row per subject, one column per SNP
#   sample_UKBBX_auto : FID / IID of each subject
#   geno_UKBB_auto    : FID, IID followed by the dosage columns

    # Dosage
    dose_UKBB2_auto=fread("/cellar/users/rsalem/AG_T2D_ipynb/UKBB_Alexis_T2D_subset.dosage.gz", header=FALSE)
    dose_UKBB_auto=as.data.frame(dose_UKBB2_auto)
    rm(dose_UKBB2_auto)
    dim(dose_UKBB_auto)

    # Reviewing File
    dose_UKBB_auto[1:10, 1:10]

    # NOTE: The 1st 6 columns are structured as follow: "CHR SNPID(1) SNPID(2) POSITION Allele1(ref) Allele2(alt)", followed by SNP dosages (scaled from 0 - 2)
    # as counts of allele2 (note: dosages range from 0-2 to account for genotype imputation uncertainty), for each subject in file

    # All rows are kept: the file is read with header=FALSE, so removing the first row would lose the first SNP
    Dose_SNPINFO_auto <- dose_UKBB_auto[,1:6]
    # Allele coding is kept as ref/alt (not EA/NEA) to keep track of which allele the dosage counts
    names(Dose_SNPINFO_auto)=c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt")

    # Transpose so subjects are rows; columns are labelled with the rsid (3rd annotation column)
    DOSAGE_auto  <- as.data.frame(t(dose_UKBB_auto[,-c(1:6)]))
    colnames(DOSAGE_auto)<- as.character(Dose_SNPINFO_auto[,3])

    head(Dose_SNPINFO_auto)
    head(DOSAGE_auto)

    # Sample
    sample_UKBB2_auto=read.table("/cellar/users/rsalem/AG_T2D_ipynb/UKBB_Alexis_T2D_subset.sample.gz", header=TRUE)
    # First data row is discarded and only the first two columns (IDs) are kept
    # NOTE(review): presumably the dropped row is the column-type line of the .sample format - confirm
    sample_UKBBX_auto=sample_UKBB2_auto[-1,1:2]
    names(sample_UKBBX_auto)=c("FID", "IID")

    head(sample_UKBBX_auto)

# Generating Combined file Dosage w/ Sample ID File
    dim(sample_UKBBX_auto)
    dim(DOSAGE_auto)

    # IDs and dosages are paired purely by row position, so the row counts must agree.
    # cbind() on data frames can recycle the shorter one when its row count divides the
    # longer one, which would silently attach IDs to the wrong dosages.
    stopifnot(nrow(sample_UKBBX_auto) == nrow(DOSAGE_auto))

    geno_UKBB_auto=cbind(sample_UKBBX_auto, DOSAGE_auto)
    dim(geno_UKBB_auto)
    head(geno_UKBB_auto)

    # Column-name frequency table: any count above 1 flags a repeated rsid
    table(names(geno_UKBB_auto))

    #rm(DOSAGE)  # removing large files



In [None]:
# Reading in imputed genetic data from UKBB - Dealing with X Chr SNPs
# Reads the chrX dosage file and the matching sample file, then builds:
#   Dose_SNPINFO_x : per-SNP annotation (first 6 columns of the dosage file)
#   DOSAGE_x       : dosages transposed to one row per subject, one column per SNP
#   sample_UKBBX_x : FID / IID of each subject
#   geno_UKBB_x    : FID, IID followed by the dosage columns

    # Dosage
    dose_UKBB2_x=fread("/cellar/users/rsalem/AG_T2D_ipynb/UKBB_Alexis_T2D_chrX_subset.dosage.gz", header=FALSE)
    dose_UKBB_x=as.data.frame(dose_UKBB2_x)
    rm(dose_UKBB2_x)
    dim(dose_UKBB_x)

    # Reviewing File
    dose_UKBB_x[1, 1:10]

    # NOTE: The 1st 6 columns are structured as follow: "CHR SNPID(1) SNPID(2) POSITION Allele1(ref) Allele2(alt)", followed by SNP dosages (scaled from 0 - 2)
    # as counts of allele2 (note: dosages range from 0-2 to account for genotype imputation uncertainty), for each subject in file

    # All rows are kept: the file is read with header=FALSE, so removing the first row would lose the first SNP
    Dose_SNPINFO_x <- dose_UKBB_x[,1:6]
    # Allele coding is kept as ref/alt (not EA/NEA) to keep track of which allele the dosage counts
    names(Dose_SNPINFO_x)=c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt")

    # Transpose so subjects are rows; columns are labelled with the rsid (3rd annotation column)
    DOSAGE_x  <- as.data.frame(t(dose_UKBB_x[,-c(1:6)]))
    colnames(DOSAGE_x)<- as.character(Dose_SNPINFO_x[,3])

    head(Dose_SNPINFO_x)
    head(DOSAGE_x)

    # Sample
    sample_UKBB2_x=read.table("/cellar/users/rsalem/AG_T2D_ipynb/UKBB_Alexis_T2D_chrX_subset.sample.gz", header=TRUE)
    # First data row is discarded and only the first two columns (IDs) are kept
    # NOTE(review): presumably the dropped row is the column-type line of the .sample format - confirm
    sample_UKBBX_x=sample_UKBB2_x[-1,1:2]
    names(sample_UKBBX_x)=c("FID", "IID")

    head(sample_UKBBX_x)

# Generating Combined file Dosage w/ Sample ID File
    dim(sample_UKBBX_x)
    dim(DOSAGE_x)

    # IDs and dosages are paired purely by row position, so the row counts must agree.
    # cbind() on data frames can recycle the shorter one when its row count divides the
    # longer one, which would silently attach IDs to the wrong dosages.
    stopifnot(nrow(sample_UKBBX_x) == nrow(DOSAGE_x))

    geno_UKBB_x=cbind(sample_UKBBX_x, DOSAGE_x)
    dim(geno_UKBB_x)
    head(geno_UKBB_x)

    # Column-name frequency table: any count above 1 flags a repeated rsid
    table(names(geno_UKBB_x))

    #rm(DOSAGE)  # removing large files



In [None]:
# Merging Reference SNP list with SNP info from the Dosage files
# Produces:
#   SNPINFO1 : outer join of dosage annotation and GWAS SNP list (by rsid)
#   SNPINFO2 : SNPINFO1 restricted to rows with no missing values
#   SNPINFOZ : SNPINFO2 without the 15:38822905_T_G record of rs7403531

    # Combining autosomal and x-chr SNPINFO files
    Dose_SNPINFO=rbind(Dose_SNPINFO_auto, Dose_SNPINFO_x)

    dim(Dose_SNPINFO)
    dim(snpinfo)

    # all=TRUE keeps SNPs present in only one of the two sources so they can be inspected below
    SNPINFO1=merge(Dose_SNPINFO, snpinfo, by="rsid", all=TRUE)
    dim(SNPINFO1)

# Checking for Problematic SNPs
    # SNPs with incomplete data (indicating not in imputation panel or missing from snpinfo file)
    SNPINFO1[!complete.cases(SNPINFO1),]
    ## Note: need to remove rs186838848 (not in UKBB imputation file)

    # Removing SNPs with incomplete data (this drops rs186838848)
    SNPINFO2=SNPINFO1[complete.cases(SNPINFO1),]

    # Checking for Duplicate SNPs using rsids
    subset(as.data.frame(table(SNPINFO2$rsid)), Freq>1)
    SNPINFO2[SNPINFO2$rsid=="rs7403531",]

    #rsid	CHR	SNPID	POSITION	ref	alt	EA	NEA	Beta	BETAX	EAX	NEAX
    #102	rs7403531 	15 	rs7403531 	38822905 	T 	C 	T 	C 	0.0295588 	0.0295588 	T 	C
    #103	rs7403531 	15 	15:38822905_T_G	38822905 	T 	G 	T 	C 	0.0295588 	0.0295588 	T 	C

    # Checking for Duplicates using SNPID (any SNPID appearing more than once)
    subset(as.data.frame(table(SNPINFO2$SNPID)), Freq>1)

    # Note: rs7403531 is tri-allelic (T/C/G alleles, G is very rare) - SEE: https://www.ncbi.nlm.nih.gov/snp/rs7403531
    # Tri-allelic SNPs are split into 2 bi-allelic SNPs and both are retrieved, since the retrieval script uses chr/pos (or rsID)
    # The PGS script fails because of tri-allelic SNPs which are split into two bi-allelic variants
    # (with same rsID), which can cause the PRS calculator fxn to fail (not designed for this)

    # Removing the rare-allele record of rs7403531 (SNPID 15:38822905_T_G), which does not match the GWAS SNP list
    SNPINFOZ=SNPINFO2[SNPINFO2$SNPID != "15:38822905_T_G",]

    # Checking file size to confirm removal of problematic SNPs
    dim(SNPINFO1)
    dim(SNPINFO2)
    dim(SNPINFOZ)


In [None]:
# Next: removing the rare-allele record of rs7403531 (SNPID 15:38822905_T_G, which does not
# match the GWAS SNP list) from the autosomal dosage file, located via Dose_SNPINFO_auto

    # Checking SNPs in geno file labeled as rs7403531 (two columns carry this name)
    grep("rs7403531", names(geno_UKBB_auto))

    grep("rs7403531", Dose_SNPINFO_auto$rsid)
    Dose_SNPINFO_auto[grep("rs7403531", Dose_SNPINFO_auto$rsid),]

    # geno_UKBB_auto is FID, IID, then one dosage column per Dose_SNPINFO_auto row in the
    # same order, so annotation row k corresponds to column k + 2. Confirm that before
    # translating a row position into a column position.
    stopifnot(identical(names(geno_UKBB_auto)[-c(1:2)], as.character(Dose_SNPINFO_auto$rsid)))

    # -> Locating the column by SNPID instead of a fixed position, so the right copy is
    #    removed even if the order of SNPs in the dosage file changes
    drop_cols=which(Dose_SNPINFO_auto$SNPID == "15:38822905_T_G") + 2
    drop_cols

    # Logical keep-mask: if no column matches, nothing is removed
    # (a negative index built from an empty vector would instead select zero columns)
    geno_UKBB_auto2=geno_UKBB_auto[, !(seq_along(names(geno_UKBB_auto)) %in% drop_cols)]

    # checking for multiple copies of the problematic SNP in file post removal step
    grep("rs7403531", names(geno_UKBB_auto2))



In [None]:
# Merging Dosage files: Autosomal + X-chr SNPs into a single geno_UKBB table

names(geno_UKBB_x)

names(geno_UKBB_auto2)

# Note: the x-chr genetic file uses Salem Lab IDs (which match the IDs in the phenotype files),
# whereas the autosomal (chrs 1-22) file uses central IDs (which do not match the phenotype IDs).
# A linker file is therefore needed to put both on the same IDs before merging.
xlinker <- read.table("/nrnb/ukb-salem/GenoInfo/UKBiobank_genoQC_allancestry_linker.txt", header = TRUE)

names(xlinker)

# Relabel the x-chr columns so the ID columns carry the linker's Salem ID names
colnames(geno_UKBB_x) <- c("FID_Salem", "IID_Salem", "rs5945326")

# Inner join: only x-chr subjects that appear in the linker are retained
salem_keys <- c("FID_Salem", "IID_Salem")
geno_UKBB_x2 <- merge(geno_UKBB_x, xlinker, by = salem_keys, all = FALSE)


print("Dims of: geno_UKBB_x xlinker geno_UKBB_x2 geno_UKBB_auto2")
dim(geno_UKBB_x)
dim(xlinker)
dim(geno_UKBB_x2)
dim(geno_UKBB_auto2)

# The two merge keys lead the merged table; discard them and keep the remaining columns
geno_UKBB_x3 <- geno_UKBB_x2[, -seq_len(2)]
names(geno_UKBB_x2)
names(geno_UKBB_x3)

# Merging Autosomal and X-chr variant files (outer join: subjects from either file are kept)
central_keys <- c("FID", "IID")
geno_UKBB <- merge(geno_UKBB_auto2, geno_UKBB_x3, by = central_keys, all = TRUE)

names(geno_UKBB)
dim(geno_UKBB)



In [None]:
# Checking Allele Coding - does each SNP need flipping or not
# Adds three columns to SNPINFOZ:
#   OK    : effect allele (EA) equals the dosage file's alt allele
#   FLIP  : effect allele (EA) equals the dosage file's ref allele
#   CHECK : "OK" when EA/NEA are the same allele pair as ref/alt (either order), else "BAD_Alleles"

# SNP orientation check (EA compared against each dosage allele once, then reused)
ea_is_alt <- SNPINFOZ$EA == SNPINFOZ$alt
ea_is_ref <- SNPINFOZ$EA == SNPINFOZ$ref
SNPINFOZ$OK   <- ea_is_alt
SNPINFOZ$FLIP <- ea_is_ref

# Both alleles must line up, in either orientation; anything else needs manual review
same_orientation <- ea_is_alt & (SNPINFOZ$NEA == SNPINFOZ$ref)
swapped          <- ea_is_ref & (SNPINFOZ$NEA == SNPINFOZ$alt)
SNPINFOZ$CHECK <- "BAD_Alleles"
SNPINFOZ$CHECK[which(same_orientation | swapped)] <- "OK"


table(SNPINFOZ$OK)
table(SNPINFOZ$FLIP)
table(SNPINFOZ$CHECK) # expected: ZERO bad alleles

# Listing any SNP whose alleles do not match the dosage file
subset(SNPINFOZ, CHECK == "BAD_Alleles")

#	SNPINFOX[(SNPINFOX$rsid=="rs9411425"),]


In [None]:
# SNP Orientation Checker Function and GRS Calc Fxn
#
# SNP_Align_GRS(XDOSE, XINFO)
#   Orients every SNP dosage so that it counts the effect allele, then sums the
#   dosages into a raw and a beta-weighted genetic risk score.
#
#   XDOSE : data frame with columns FID, IID and one dosage column per SNP,
#           named by rsid (extra columns are ignored)
#   XINFO : data frame with one row per SNP and columns
#             rsid : SNP name, matching a column of XDOSE
#             Beta : weight applied to the aligned dosage
#             OK   : TRUE when the dosage is used as is
#             FLIP : TRUE when the dosage is replaced by abs(dosage - 2)
#
#   Returns FID, IID, the aligned dosages (in XINFO order), GRS_RAW (sum of aligned
#   dosages) and GRS_WT (sum of Beta * aligned dosage). A subject with a missing
#   dosage for any SNP gets NA for both scores.
#
#   Stops with an error when an rsid is repeated in XINFO, when a required column is
#   missing or repeated in XDOSE, or when a SNP is not flagged as exactly one of
#   OK / FLIP (such a SNP would otherwise enter GRS_WT as an unweighted raw dosage).

SNP_Align_GRS=function(XDOSE, XINFO)
        {
        id_cols=c("FID", "IID")
        snp_ids=as.character(XINFO$rsid)

        # --- Input checks ---------------------------------------------------------
        # Each SNP needs exactly one annotation row (tri-allelic SNPs split into two
        # records with the same rsid have to be resolved before calling)
        if(anyDuplicated(snp_ids) > 0)
                {
                stop("Duplicated rsid(s) in SNPINFO file: ",
                     paste(unique(snp_ids[duplicated(snp_ids)]), collapse=", "))
                }

        needed_cols=c(id_cols, snp_ids)
        absent_cols=setdiff(needed_cols, names(XDOSE))
        if(length(absent_cols) > 0)
                {
                stop("Column(s) missing from Dosage file: ", paste(absent_cols, collapse=", "))
                }

        # Selecting by name only ever picks the first of several same-named columns,
        # so a repeated dosage column would be used silently and possibly wrongly
        dose_names=names(XDOSE)[names(XDOSE) %in% needed_cols]
        if(anyDuplicated(dose_names) > 0)
                {
                stop("Column name(s) repeated in Dosage file: ",
                     paste(unique(dose_names[duplicated(dose_names)]), collapse=", "))
                }

        # Flags are compared as text so logical TRUE and the string "TRUE" both count;
        # %in% turns a missing flag into FALSE instead of NA
        is_ok  =as.character(XINFO$OK)   %in% "TRUE"
        is_flip=as.character(XINFO$FLIP) %in% "TRUE"
        unresolved=which(is_ok == is_flip)   # neither flag set, or both set
        if(length(unresolved) > 0)
                {
                stop("SNP(s) not flagged as exactly one of OK/FLIP (check alleles): ",
                     paste(snp_ids[unresolved], collapse=", "))
                }

        # --- Alignment ------------------------------------------------------------
        # Working copies: aligned dosages (DOSE_POS) and beta-weighted dosages (DOSE_WT)
        DOSE_POS=XDOSE[, needed_cols, drop=FALSE]
        DOSE_WT =DOSE_POS

        # Sets alleles so that all alleles are positively associated with outcome
        # based on the snpinfo file (positive beta)
        for(i in seq_along(snp_ids))
                {
                xsnp=snp_ids[i]
                raw_dose=XDOSE[,xsnp]

                # abs(dose - 2) is the count of the other allele (2 - dose) for dosages in [0, 2]
                aligned=if(is_flip[i]) abs(raw_dose - 2) else raw_dose

                DOSE_POS[,xsnp]=aligned
                DOSE_WT[,xsnp] =XINFO$Beta[i]*aligned
                }

        # --- Scores ---------------------------------------------------------------
        # drop=FALSE keeps a data frame even for a single-SNP score, which rowSums requires
        XDOSE_POS=DOSE_POS
        XDOSE_POS$GRS_RAW=rowSums(DOSE_POS[, snp_ids, drop=FALSE], na.rm = FALSE, dims = 1)
        XDOSE_POS$GRS_WT =rowSums(DOSE_WT[ , snp_ids, drop=FALSE], na.rm = FALSE, dims = 1)

        # Checking that all SNPs are used in PRS (counts are of non-ID columns)
        NSNPi=nrow(XINFO)
        NSNPd=sum(!(names(XDOSE) %in% id_cols))
        NSNPp=sum(!(names(DOSE_POS) %in% id_cols))

        print("####################################")
        print(paste("Num NSNP in SNPINFO File  : ", NSNPi, sep=""))
        print(paste("Num NSNP in Dosage File   : ", NSNPd, sep=""))
        print(paste("Num NSNP used in PRS Calc : ", NSNPp, sep=""))
        print("####################################")

        return(XDOSE_POS)
        }

# Running the alignment / GRS calculator on the merged genotype table
# (the two score columns, GRS_RAW and GRS_WT, are appended after the SNP columns)
UKBB_T2D_GRS <- SNP_Align_GRS(XINFO = SNPINFOZ, XDOSE = geno_UKBB)


# Saving the aligned dosages and scores as a tab-delimited text file (no quoting, no row names)
write.table(UKBB_T2D_GRS,
            file = "~/jupyter/IRKD_SNP/UKBB_T2D_MVMR_GRS_REV_20OCT21.txt",
            sep = "\t", row.names = FALSE, quote = FALSE)
