# In [None]:
##################################################
# Insulin Resistance PG RS Analysis  
#
#	Lagou 2019, 5e-8 Genome-Wide Significant
#
# Date:         Nov 29, 2020
#
# Adapted from:           rsalem
#
##################################################


# Loading required packages
library(stringr)
library(tidyverse)
library(R.utils)
library(data.table)

# Load the SNP reference list: one row per SNP, with betas and allele details
# (EA: effect allele, NEA: non-effect allele, EAF: effect allele frequency)
snpinfo1 <- read.table(
  file = "/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Lagou_2019_clumped_and_merged.5e8_snplist.txt",
  header = TRUE
)
# Quick look at the first rows to confirm the file was parsed as expected
head(snpinfo1)

	# Original variable header
	# a. z – Z-score of association for FG or FI;
	# b. source – SSIMP for imputed and GWAS for SNPs present in the HapMap density;
	# c. a1 – reference allele;
	# d. a2 – effect allele;
	# e. r2.pred – SS-imp imputation quality;
	# f. p-value – P-value of association;
	# g. n – sample size;
	# h. maf – minor allele frequency;
	# i. beta – effect size for FG or FI;
	# j. se - standard error of the effect estimate for FG or FI.


	# Keep the six columns used downstream (selected by position) and rename them.
	# NOTE(review): positions 2, 5, 6, 12, 8, 9 are assumed to be rsid, a1, a2, beta,
	# r2.pred and p-value in the reference file - confirm against head(snpinfo1).
	snpinfo <- snpinfo1[, c(2, 5, 6, 12, 8, 9)]
	colnames(snpinfo) <- c("rsid", "NEA2", "EA2", "Beta", "r2.pred", "pval")

	# Allele columns may have been read as factors; work on character copies
	snpinfo$NEA <- as.character(snpinfo$NEA2)
	snpinfo$EA  <- as.character(snpinfo$EA2)

	# Untouched copies of Beta/EA/NEA, kept for reference (BETAX, EAX, NEAX)
	snpinfo$BETAX <- snpinfo$Beta
	snpinfo$EAX   <- snpinfo$EA
	snpinfo$NEAX  <- snpinfo$NEA

	# Make every beta positive: where the original beta is negative, negate it and
	# swap EA/NEA so the effect allele is always the trait-increasing allele.
	# which() turns the logical test into row positions and drops NA, so a missing
	# beta is left as-is instead of aborting the subscripted assignment.
	snpinfo$Beta[which(snpinfo$BETAX < 0)] <- -snpinfo$BETAX[which(snpinfo$BETAX < 0)]
	snpinfo$NEA[which(snpinfo$BETAX < 0)]  <- snpinfo$EAX[which(snpinfo$BETAX < 0)]
	snpinfo$EA[which(snpinfo$BETAX < 0)]   <- snpinfo$NEAX[which(snpinfo$BETAX < 0)]



# Load the UKBB EA dosage file.
#   Layout: six annotation columns ("chr ID SNPID POSITION Allele1(ref) Allele2(alt)")
#   followed by one dosage column per subject. Dosages are counts of allele2 on a
#   0 - 2 scale (fractional values reflect genotype imputation uncertainty).
geno_UKBB_EA2 <- fread(
  "/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_Lagou_2019_SNP_Retrieval_5e8_subset.dosage.gz",
  header = FALSE
)

# Convert to a plain data.frame and release the fread copy straight away
geno_UKBB_EA <- as.data.frame(geno_UKBB_EA2)
rm(geno_UKBB_EA2)

# SNP annotation: the first six columns
Dose_SNPINFO <- geno_UKBB_EA[, seq_len(6)]
names(Dose_SNPINFO) <- c("chr", "SNPID", "rsid", "POSITION", "ref", "alt")

# Dosages: everything after the annotation columns, transposed so that
# rows are subjects and columns are SNPs
DOSAGE <- as.data.frame(t(geno_UKBB_EA[, -seq_len(6)]))

# The full dosage table is no longer needed (reduces memory overhead)
rm(geno_UKBB_EA)

# Label each dosage column with the SNP identifier from the third annotation column
names(DOSAGE) <- as.character(Dose_SNPINFO$rsid)

head(Dose_SNPINFO)
head(DOSAGE)


        # Sample file: keep the first two columns as subject identifiers (FID, IID).
        # The first data row is discarded - presumably the second header line of the
        # .sample format; confirm against the file.
        sample_UKBB_EAX <- read.table(
                file = "/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_Lagou_2019_SNP_Retrieval_5e8_subset.sample.gz",
                header = TRUE
        )
        sample_UKBB_EA <- sample_UKBB_EAX[-1, c(1, 2)]
        colnames(sample_UKBB_EA) <- c("FID", "IID")

# Attach subject IDs to the dosages (rows are paired by position, not by key)
geno_UKBB_EAX <- cbind(sample_UKBB_EA, DOSAGE)
# The stand-alone dosage table is large and no longer needed
rm(DOSAGE)

# Inner join of the dosage-file SNP annotation with the reference SNP list:
# only SNPs present in both (matched on rsid) are kept
SNPINFOX <- merge(x = Dose_SNPINFO, y = snpinfo, by = "rsid", all = FALSE)


# Checking Allele Coding - Flip or not
#   OK   : the effect allele is the dosage file's 'alt' allele (dosage usable as-is)
#   FLIP : the effect allele is the dosage file's 'ref' allele (dosage must be inverted)
SNPINFOX$OK   <- SNPINFOX$EA == SNPINFOX$alt
SNPINFOX$FLIP <- SNPINFOX$EA == SNPINFOX$ref


# Check that the allele pair matches the dosage file in either orientation
# (if bad, likely a strand flip or otherwise problematic alleles - review by hand).
# The comparisons are combined with the element-wise `&` so every SNP is tested on
# its own row; the scalar `||` cannot build a per-row mask (it is an error for
# length > 1 input since R 4.3 and only looked at the first element before that).
# Both alleles must agree for a SNP to be labelled "OK".
SNPINFOX$CHECK <- rep("BAD_Alleles", nrow(SNPINFOX))
SNPINFOX$CHECK[which(SNPINFOX$EA == SNPINFOX$alt & SNPINFOX$NEA == SNPINFOX$ref)] <- "OK"
SNPINFOX$CHECK[which(SNPINFOX$NEA == SNPINFOX$alt & SNPINFOX$EA == SNPINFOX$ref)] <- "OK"


	# Tabulate the allele check result.
	# NOTE: if any 'BAD_Alleles' are reported, STOP and double-check
	# (a likely strand flip issue has to be resolved first)
	table(SNPINFOX[["CHECK"]])

	head(SNPINFOX)

	# Counts of SNPs usable as-is (OK), needing inversion (FLIP), and the check label again
	table(SNPINFOX[["OK"]])
	table(SNPINFOX[["FLIP"]])
	table(SNPINFOX[["CHECK"]])


# SNP Orientation Checker Function
#   Aligns each SNP dosage so that the counted allele is the effect allele
#   (positive beta in XINFO), then builds a raw and a beta-weighted genetic
#   risk score (GRS) per subject.
#
#   Arguments:
#     XDOSE  data.frame with columns "FID" and "IID" plus one numeric dosage
#            column per SNP, named by rsid (0 - 2 scale).
#     XINFO  data.frame with one row per SNP and the columns
#              rsid  SNP identifier, must name a column of XDOSE
#              Beta  weight applied to the aligned dosage
#              OK    TRUE when the dosage already counts the effect allele
#              FLIP  TRUE when the dosage counts the other allele
#
#   Returns:
#     data.frame with FID, IID, the aligned dosage of every SNP that could be
#     oriented, GRS_RAW (sum of aligned dosages) and GRS_WT (sum of
#     Beta * aligned dosage). Missing dosages propagate to NA scores.
#
#   SNPs whose OK and FLIP flags are both not TRUE (including NA) cannot be
#   oriented; they are left out of the output and of both scores, and a
#   warning lists them. Duplicated rsids in XINFO, or rsids/ID columns absent
#   from XDOSE, stop with an error.
SNP_Align_GRS <- function(XDOSE, XINFO) {
  id_cols <- c("FID", "IID")
  snp_ids <- as.character(XINFO$rsid)

  # Each SNP must appear once in XINFO, otherwise its weight is ambiguous
  dup_ids <- unique(snp_ids[duplicated(snp_ids)])
  if (length(dup_ids) > 0) {
    stop("Duplicated rsid(s) in XINFO: ", paste(dup_ids, collapse = ", "),
         call. = FALSE)
  }

  # Every requested column must exist in the dosage table (exact name match)
  absent <- setdiff(c(id_cols, snp_ids), names(XDOSE))
  if (length(absent) > 0) {
    stop("Column(s) missing from XDOSE: ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # Flags may arrive as logical or as the strings "TRUE"/"FALSE"; %in% never
  # returns NA, so a missing flag counts as 'not set'
  keep_as_is <- as.character(XINFO$OK) %in% "TRUE"
  needs_flip <- as.character(XINFO$FLIP) %in% "TRUE"
  alignable  <- keep_as_is | needs_flip

  if (!all(alignable)) {
    warning("Excluding ", sum(!alignable),
            " SNP(s) that are neither OK nor FLIP (alleles could not be aligned): ",
            paste(snp_ids[!alignable], collapse = ", "),
            call. = FALSE)
  }

  used_ids  <- snp_ids[alignable]
  used_flip <- needs_flip[alignable]   # FLIP takes precedence if both flags are set
  used_beta <- XINFO$Beta[alignable]

  # Aligned dosages (DOSE_POS) and their beta-weighted counterparts (DOSE_WT)
  DOSE_POS <- XDOSE[, c(id_cols, used_ids), drop = FALSE]
  DOSE_WT  <- DOSE_POS

  for (j in seq_along(used_ids)) {
    snp  <- used_ids[j]
    dose <- XDOSE[[snp]]
    if (used_flip[j]) {
      # Invert the 0 - 2 dosage so that it counts the effect allele
      dose <- abs(dose - 2)
    }
    DOSE_POS[[snp]] <- dose
    DOSE_WT[[snp]]  <- used_beta[j] * dose
  }

  # GRS: count of positively associated alleles (raw) and weighted (WT) versions.
  # drop = FALSE keeps a data.frame even when a single SNP is used.
  XDOSE_POS <- DOSE_POS
  XDOSE_POS$GRS_RAW <- rowSums(DOSE_POS[, used_ids, drop = FALSE], na.rm = FALSE)
  XDOSE_POS$GRS_WT  <- rowSums(DOSE_WT[, used_ids, drop = FALSE], na.rm = FALSE)

  # Report how many SNPs were supplied, available and actually used
  NSNPi <- nrow(XINFO)
  NSNPd <- sum(!(names(XDOSE) %in% id_cols))
  NSNPp <- length(used_ids)

  print("####################################")
  print(paste0("Num NSNP in SNPINFO File  : ", NSNPi))
  print(paste0("Num NSNP in Dosage File   : ", NSNPd))
  print(paste0("Num NSNP used in PRS Calc : ", NSNPp))
  print("####################################")

  XDOSE_POS
}


# Align the SNPs and compute the GRS for every subject
# (the GRS_RAW / GRS_WT columns are appended after the dosage columns)
UKBB_IR_GRS <- SNP_Align_GRS(XDOSE = geno_UKBB_EAX, XINFO = SNPINFOX)


# Save the aligned dosages and scores as a tab-delimited text file
write.table(
  x = UKBB_IR_GRS,
  file = "~/jupyter/Lagou_2019_5e8_28Nov20.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)



library(dplyr)
# Number of SNPs (rows) and annotation columns read from the dosage file,
# for comparison with the OK/FLIP counts tabulated from SNPINFOX
dim(Dose_SNPINFO)
# Randomly pull up to 14 rows to eyeball the flip assignment; the size is capped
# at the number of available SNPs so a short list does not raise an error
sample_n(SNPINFOX, min(14, nrow(SNPINFOX)))


