In [None]:
##################################################
# Insulin Resistance PGRS Analysis  
#
#	LOTTA 2017 Insulin Resistance GWAS
#
# Date:        Nov 20, 20
# Revised:     Aug 23, 21 (Replaced lotta values with scott and manning beta coefficients)
#
# Adapted from: rsalem
#
##################################################

library(stringr)
library(tidyverse)
library(R.utils)
library(data.table)

# Reading in SNP Reference list w/ betas and SNP allele details (EA: effect allele, NEA: non-effect allele, MAF: allele frequency column)

# LOTTA SNPS
# File is assumed to hold exactly these 8 columns, in this order (they are renamed positionally below):
#   line, rsid, Beta, SE, Pvalue, MAF, EA, NEA
	# stringsAsFactors=FALSE keeps the allele columns as character. Under R < 4.0 read.csv() returns
	# factors by default, and swapping EA/NEA between two factors with different level sets
	# (e.g. a 'T' that only ever appears as EA) silently produces NA alleles.
	snpinfo=read.csv("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/MAGIC_beta/MAGIC_LottaSnps_ScottMann_09Sep2021.csv",header=TRUE,sep = ',',stringsAsFactors=FALSE)

# renaming alleles to Effect (EA) & Non-Effect (NEA)
	names(snpinfo)=c("line","rsid","Beta","SE","Pvalue","MAF","EA", "NEA")
    snpinfo        


# CHECKING & DEALING WITH NEGATIVE Betas (OR < 1): SETTING All BETAS to Positive & Flipping alleles as needed
	# Creating copy of Beta/EA/NEA variables (for reference, saved as BETAX, EAX & NEAX)
		snpinfo$BETAX =snpinfo$Beta
		snpinfo$EAX   =snpinfo$EA
		snpinfo$NEAX  =snpinfo$NEA

	# Flipping if beta is negative: Betas and EA/NEA
	# which() yields integer row positions and drops NA comparisons, so a missing beta leaves
	# that row untouched instead of aborting the sub-assignment with an NA-subscript error.
	neg_beta_rows=which(snpinfo$BETAX < 0)
	snpinfo$Beta[neg_beta_rows]=-snpinfo$BETAX[neg_beta_rows]
	# The swap reads from the untouched EAX/NEAX copies, so the order of these two lines does not matter
	snpinfo$NEA[neg_beta_rows] =snpinfo$EAX[neg_beta_rows]
	snpinfo$EA[neg_beta_rows]  =snpinfo$NEAX[neg_beta_rows]
      


# Reading in UKBB EA dosage and sample files
        # Dosage
	# data.table=FALSE makes fread() hand back a plain data.frame directly, so the (large) table
	# is never held twice in memory while being converted.
	dose_UKBB_Lipid=fread("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_IR_SNP_Retrieval_subset.dosage.gz", header=FALSE, data.table=FALSE)
	dim(dose_UKBB_Lipid)

	dose_UKBB_Lipid[1:10, 1:10]

                # NOTE: The 1st 6 columns are structured as follow: "CHR SNPID(1) SNPID(2) POSITION Allele1(ref) Allele2(alt)", followed by SNP dosages (scaled from 0 - 2)
                # as counts of allele2 (note: dosages range from 0-2 to account for genotype imputation uncertainty), for each subject in file

                # Keep ALL rows here: the file is read with header=FALSE, so row 1 is the first SNP, not a header.
                Dose_SNPINFO <- dose_UKBB_Lipid[,1:6]
                # Allele columns stay labelled ref/alt (not EA/NEA) to keep track of which allele the dosage counts
                names(Dose_SNPINFO)=c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt")

                # Dosage columns are later addressed by rsid, so a repeated rsid would make that lookup ambiguous
                if(anyDuplicated(Dose_SNPINFO$rsid) > 0)
                        {
                        warning(paste("Duplicate rsid(s) in dosage file: ", paste(unique(as.character(Dose_SNPINFO$rsid[duplicated(Dose_SNPINFO$rsid)])), collapse=", "), sep=""))
                        }

                # Transpose so that rows = subjects and columns = SNPs (named by rsid)
                DOSAGE  <- as.data.frame(t(dose_UKBB_Lipid[,-c(1:6)])) 
                colnames(DOSAGE)<- as.character(Dose_SNPINFO[,3])

        head(Dose_SNPINFO)
        head(DOSAGE)

        # Sample
	sample_UKBB_Lipid=read.table("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_IR_SNP_Retrieval_subset.sample.gz", header=TRUE)
                # Drops the first data row and keeps the two ID columns.
                # NOTE(review): presumably row 1 is the column-type line of an Oxford-format .sample file - confirm.
                sample_UKBB_LipidX=sample_UKBB_Lipid[-1,1:2]
                names(sample_UKBB_LipidX)=c("FID", "IID")

# Generating Combined file Dosage w/ Sample ID File
        # Subjects are matched to dosage rows purely by position. cbind() on data frames silently
        # recycles the shorter one when the row counts divide evenly, so verify the counts first.
        if(nrow(sample_UKBB_LipidX) != nrow(DOSAGE))
                {
                stop(paste("Sample file has ", nrow(sample_UKBB_LipidX), " subjects but dosage file has ", nrow(DOSAGE), " dosage columns", sep=""))
                }
        geno_UKBB_Lipid=cbind(sample_UKBB_LipidX, DOSAGE)
	rm(DOSAGE)  # removing large files


# Combining Reference SNPList with SNP info from Dosage File
        # NOTE: inner join on rsid - a SNP whose rsid is absent from either table is dropped without
        # any message; compare dim(SNPINFOX) with dim(snpinfo) to see whether any were lost.
        SNPINFOX=merge(Dose_SNPINFO, snpinfo, by="rsid", all=FALSE)

# Checking Allele Coding - Flip or not
        #  SNP orientation check
        #  OK   : effect allele is the dosage file's 'alt' allele
        #  FLIP : effect allele is the dosage file's 'ref' allele
        SNPINFOX$OK  =(SNPINFOX$EA == SNPINFOX$alt)
        SNPINFOX$FLIP=(SNPINFOX$EA == SNPINFOX$ref)

	# Check if alleles match/overlap (if bad, need to review/double check)
	# Element-wise `|` is required: scalar `||` inspects only the first SNP (so every row received
	# the same label) and is an error for length > 1 operands in R >= 4.3.
	# NOTE(review): this flags a SNP as OK when ANY one allele matches; a stricter check would
	# require both alleles to match, i.e. (EA==alt & NEA==ref) | (EA==ref & NEA==alt).
	SNPINFOX$CHECK="BAD_Alleles"
        allele_overlap=(SNPINFOX$EA == SNPINFOX$alt) | (SNPINFOX$NEA == SNPINFOX$ref) |
                       (SNPINFOX$NEA == SNPINFOX$alt) | (SNPINFOX$EA == SNPINFOX$ref)
        # which() drops NA comparisons so rows with a missing allele stay "BAD_Alleles"
        SNPINFOX$CHECK[which(allele_overlap)]="OK"

	head(SNPINFOX)

	table(SNPINFOX$OK)
	table(SNPINFOX$FLIP)
	table(SNPINFOX$CHECK)



# SNP Orientation Checker Function
    # Note: Checking and Aligning SNPs (Strand Orientation check/correction)

        SNP_Align_GRS=function(XDOSE, XINFO)
                {
                # Aligns SNP dosages to the effect allele and builds raw and beta-weighted genetic risk scores.
                #
                # Args:
                #   XDOSE: data.frame with columns FID, IID and one dosage column per SNP (column name = rsid).
                #   XINFO: data.frame with one row per SNP and columns rsid, Beta, OK, FLIP.
                #          OK   = TRUE -> dosage already counts the effect allele (used as is)
                #          FLIP = TRUE -> dosage counts the other allele (converted with abs(dose - 2))
                #
                # Returns:
                #   data.frame with FID, IID, the aligned dosage of every SNP that was scored,
                #   then GRS_RAW (sum of aligned dosages) and GRS_WT (sum of Beta * aligned dosage).
                #   A subject with a missing dosage for any scored SNP gets NA scores (na.rm = FALSE).
                #
                # SNPs that cannot be oriented (OK/FLIP both unset, both set, or NA) or that are missing
                # from XDOSE are excluded from the output and from both scores, with a warning. They must
                # not be left in place: their raw, unweighted dosage would otherwise be added to GRS_WT.

                rsid_all=as.character(XINFO$rsid)
                if(anyDuplicated(rsid_all) > 0)
                        {
                        stop(paste("Duplicate rsid(s) in XINFO: ", paste(unique(rsid_all[duplicated(rsid_all)]), collapse=", "), sep=""))
                        }

                # Flags are compared as text so that logical TRUE and the string "TRUE" are both accepted;
                # %in% turns NA into FALSE instead of propagating it into an if() condition.
                is_ok  =as.character(XINFO$OK)   %in% "TRUE"
                is_flip=as.character(XINFO$FLIP) %in% "TRUE"
                in_dose=rsid_all %in% names(XDOSE)
                oriented=xor(is_ok, is_flip)

                if(any(!in_dose))
                        {
                        warning(paste("SNP(s) not found in dosage file, excluded from GRS: ", paste(rsid_all[!in_dose], collapse=", "), sep=""))
                        }
                if(any(in_dose & !oriented))
                        {
                        warning(paste("SNP(s) with unresolved allele orientation, excluded from GRS: ", paste(rsid_all[in_dose & !oriented], collapse=", "), sep=""))
                        }

                # Generating list of SNPs that can actually be scored
                XSNPLIST=rsid_all[in_dose & oriented]

                # drop=FALSE keeps these as data.frames even when zero or one SNP is left
                DOSE_POS=XDOSE[,c("FID", "IID", XSNPLIST), drop=FALSE]
                DOSE_WT=DOSE_POS

                # Sets alleles so that all alleles are positively associated with outcome based on snpinfo file (positive beta)
                for(xsnp in XSNPLIST)
                        {
                        # Exact-name lookup (no regex, so no partial-match issues)
                        k=match(xsnp, rsid_all)
                        aligned=XDOSE[,xsnp]
                        if(is_flip[k])
                                {
                                aligned=abs(aligned -2)
                                }
                        DOSE_POS[,xsnp]=aligned
                        DOSE_WT[,xsnp]=XINFO$Beta[k]*aligned
                        }

                # Creating GRS Variables (set to 'NA'; stays NA when no SNP could be scored)
                XDOSE_POS=DOSE_POS
                XDOSE_POS$GRS_RAW=NA
                XDOSE_POS$GRS_WT=NA

                # Creating GRS (count of positive associated allele (raw) and weighted (WT) versions)
                if(length(XSNPLIST) > 0)
                        {
                        XDOSE_POS$GRS_RAW=rowSums(DOSE_POS[,XSNPLIST, drop=FALSE], na.rm = FALSE, dims = 1)
                        XDOSE_POS$GRS_WT =rowSums(DOSE_WT[ ,XSNPLIST, drop=FALSE], na.rm = FALSE, dims = 1)
                        }

		# Checking that all SNPS used in PRS (counts are taken from names, so a single SNP is counted correctly)
		NSNPi=nrow(XINFO)
		NSNPd=sum(!(names(XDOSE) %in% c("FID", "IID")))
		NSNPp=length(XSNPLIST)

		print("####################################")
		print(paste("Num NSNP in SNPINFO File  : ", NSNPi, sep=""))
		print(paste("Num NSNP in Dosage File   : ", NSNPd, sep=""))
		print(paste("Num NSNP used in PRS Calc : ", NSNPp, sep=""))
		print("####################################")

                return(XDOSE_POS)
                }




# Score every subject: SNP_Align_GRS returns the aligned dosages with the GRS columns appended at the end
        UKBB_IR_GRS <- SNP_Align_GRS(XINFO=SNPINFOX, XDOSE=geno_UKBB_Lipid)

# Export the scored table as tab-delimited text (no quoting, no row names)
write.table(UKBB_IR_GRS, "~/UKBB_IRLOTTA_GRS_09Sep21.txt", sep="\t", row.names=FALSE, quote=FALSE)

# Show the merged SNP table, then the dimensions of the merged, dosage-side and reference-side SNP tables
SNPINFOX
dim(SNPINFOX)
dim(Dose_SNPINFO)
dim(snpinfo)




In [None]:
# Inspection only: display the dosage-file SNP table and the reference SNP list in full,
# then the dimensions of their merge, to compare what went into the join with what came out.
Dose_SNPINFO
snpinfo
dim(SNPINFOX)