In [None]:
##################################################
# Insulin Resistance PGRS Analysis  
#
#	T2D GWAS (trans-ethnic GWAS results, non-overlapping UKBB)
#
# Date:         May 24, 2021
#
# Adapted from:           rsalem
#
# 7/23/21: Updated to include one proxy SNP and one X-chromosome SNP
#
#
##################################################

In [None]:
###Pre-Assessment: Analytic sample minus those missing X-chromosome data
##DON'T NEED TO RUN

##Libraries
library(stringr)
library(tidyverse)
library(R.utils)
library(data.table)

# Linker file (genotype-QC linker across ancestries); read here but not used further in this cell
link2 <- read.table("/nrnb/ukb-salem/GenoInfo/UKBiobank_genoQC_allancestry_linker.txt", header = TRUE, na.strings=c("",".","NA"))
link <- as.data.frame(link2)

# Analytic sample: read, report its size, and expose the participant id (f.eid) as FID for merging
UKBB_AG2 <- as.data.frame(fread("~/jupyter/UKBB_AG2_12Jan21.txt", header = TRUE, na.strings=c("",".","NA")))
dim(UKBB_AG2)
UKBB_AG2$FID <- UKBB_AG2$f.eid

# Sample file that accompanies the X-chromosome dosage (rs5945326)
# The first data row is dropped and only the first two columns are kept
# (presumably the column-type row of an Oxford-style .sample file -- TODO confirm)
sample_X <- read.table("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_X_snps_subset.sample.gz", header=TRUE)
sample_X <- setNames(sample_X[-1,1:2], c("FID", "IID"))

# Size of the analytic sample restricted to participants who also have X-chromosome data
UKBB_AG3 <- merge(UKBB_AG2, sample_X, by="FID", all=FALSE)
dim(UKBB_AG3)

# Large intermediates are not needed after the size check
rm(UKBB_AG2, UKBB_AG3)

In [None]:
#Main Script
##DON'T NEED TO RUN


# Reading in the SNP reference list with betas and allele details
# (EA: effect allele, NEA: non-effect allele)
# The v2 list replaces the missing SNP (rs62023387) with a proxy
snpinfo <- read.csv("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/T2D_MVMR_Scott_Snp_v2.csv")

# Standardising the column names BEFORE appending rows: rbind() on data frames stops when
# column names differ, so a rename placed after the rbind() can never take effect for a
# CSV whose header is spelled differently (e.g. alt/ref instead of EA/NEA)
names(snpinfo) <- c("rsid", "EA", "NEA", "Beta")

# Two SNPs added by hand (from T2D_MVMR_Scott_Snp_CHECKED_format)
rs1 <- data.frame(rsid="rs74995800",EA="G",NEA="A",Beta=0.019802627)
rs2 <- data.frame(rsid="rs76895963",EA="T",NEA="G",Beta=0.207014169)
snpinfo <- rbind(snpinfo, rs1, rs2)

head(snpinfo)
dim(snpinfo)

# NEGATIVE betas (OR < 1): every beta is made positive and the alleles are swapped to match
# Untouched copies are kept for reference as BETAX, EAX and NEAX
snpinfo$BETAX <- snpinfo$Beta
snpinfo$EAX <- as.character(snpinfo$EA)
snpinfo$NEAX <- as.character(snpinfo$NEA)

# All three columns are derived from the saved copies, so the swap cannot read a value
# that has already been overwritten
snpinfo$Beta <- ifelse(snpinfo$BETAX < 0, -snpinfo$BETAX, snpinfo$BETAX)
snpinfo$NEA <- ifelse(snpinfo$BETAX < 0, snpinfo$EAX, snpinfo$NEAX)
snpinfo$EA <- ifelse(snpinfo$BETAX < 0, snpinfo$NEAX, snpinfo$EAX)


# Reading in the dosage and sample files
# Dosage files carry no header row, so fread names the columns V1, V2, ...

# Autosomal dosage
dose_UKBB_Lipid <- as.data.frame(fread("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_T2D_ScottMVMR_SNPs_subset.dosage.gz", header=FALSE))

# Spot checks on the rsid column (V3); the X-chromosome SNP is not in the autosomal file
dose_UKBB_Lipid[dose_UKBB_Lipid$V3 %in% 'rs11257659', 1:10] #present (test)
dose_UKBB_Lipid[dose_UKBB_Lipid$V3 %in% 'rs62023386', 1:10] #proxy, confirmed absent
dose_UKBB_Lipid[dose_UKBB_Lipid$V3 %in% 'rs5945326', 1:10] #X-chromosome, added by UKBB_X

# Proxy SNP: only the first row of the chr15 proxy file is used
dose_UKBB_proxy <- as.data.frame(fread("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_T2D_chr15_proxy_subset.dosage.gz", header=FALSE))[1,]

# X-chromosome dosage (rs5945326; confirmed no overlap with the other SNPs)
dose_UKBB_X <- as.data.frame(fread("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_X_snps_subset.dosage.gz", header=FALSE))

# The proxy row is appended to the autosomal table
dose_UKBB_Lipid <- rbind(dose_UKBB_Lipid, dose_UKBB_proxy)

dim(dose_UKBB_Lipid)
dose_UKBB_Lipid[1:10, 1:10]

# Autosomal SNPs: split annotation from dosages
# NOTE: the first 6 columns are "CHR SNPID(1) SNPID(2) POSITION Allele1(ref) Allele2(alt)";
# the remaining columns are per-subject dosages (0 - 2) counting allele2
Dose_SNPINFO <- setNames(dose_UKBB_Lipid[,1:6], c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt"))
DOSAGE <- as.data.frame(t(dose_UKBB_Lipid[,-(1:6)]))   # transposed: one row per subject, one column per SNP
colnames(DOSAGE) <- as.character(Dose_SNPINFO$rsid)

# X-chromosome SNPs: same split
Dose_SNPINFO_X <- setNames(dose_UKBB_X[,1:6], c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt"))
DOSAGE_X <- as.data.frame(t(dose_UKBB_X[,-(1:6)]))
colnames(DOSAGE_X) <- as.character(Dose_SNPINFO_X$rsid)

## Autosomal: subject ids + dosages
# Sample file (ids only); the first data row is dropped and two id columns kept
sample_UKBB_Lipid <- read.table("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_T2D_ScottMVMR_SNPs_subset.sample.gz", header=TRUE)
sample_UKBB_LipidX <- setNames(sample_UKBB_Lipid[-1,1:2], c("FID", "IID"))
geno_UKBB_Lipid <- cbind(sample_UKBB_LipidX, DOSAGE)
head(geno_UKBB_Lipid)
rm(DOSAGE)  # large object, no longer needed

## X chromosome: subject ids + dosages
sample_UKBB_X <- read.table("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_X_snps_subset.sample.gz", header=TRUE)
sample_UKBB_X <- setNames(sample_UKBB_X[-1,1:2], c("FID","IID"))
geno_UKBB_X <- cbind(sample_UKBB_X, DOSAGE_X)
rm(DOSAGE_X)

## Annotation (ref/alt alleles) of autosomal and X SNPs stacked into one table
Dose_SNPINFO <- rbind(Dose_SNPINFO, Dose_SNPINFO_X)

# Joining the reference SNP list to the dosage-file annotation (autosomal + X)
# all=TRUE keeps SNPs that appear in only one of the two tables
SNPINFOX <- merge(Dose_SNPINFO, snpinfo, by="rsid", all=TRUE)

# Sizes of the joined table and of its two inputs
# NOTE(review): the recorded outputs in this section (340 / 338 rows, rs1215468, rs4930011)
# presumably come from the script this notebook was adapted from -- TODO confirm
dim(SNPINFOX)        # recorded output: 340 9

print("SNPINFO")
dim(Dose_SNPINFO)    # recorded output: 340 6
print("SNPs")
dim(snpinfo)         # recorded output: 338 4

# rsids occurring twice in the joined table
subset(as.data.frame(table(SNPINFOX$rsid)), Freq == 2)
# recorded output: rs1215468 (2), rs4930011 (2)

# rsids occurring twice in the dosage annotation
subset(as.data.frame(table(Dose_SNPINFO$rsid)), Freq == 2)
# recorded output: rs1215468 (2), rs4930011 (2)

# SNPIDs occurring twice in the dosage annotation
subset(as.data.frame(table(Dose_SNPINFO$SNPID)), Freq == 2)
# recorded output: <0 rows> -> SNPID is unique while rsid has two duplicates

# Detail rows for the duplicated rsids
SNPINFOX[SNPINFOX$rsid == "rs1215468", ]
# recorded output:
#        rsid CHR           SNPID POSITION ref alt EA NEA       Beta
#59 rs1215468  13 13:80707429_A_C 80707429   A   C  A   G 0.03342375
#60 rs1215468  13 13:80707429_A_G 80707429   A   G  A   G 0.03342375

SNPINFOX[SNPINFOX$rsid == "rs4930011", ]
# recorded output:
#         rsid CHR          SNPID POSITION ref alt EA NEA       Beta
#213 rs4930011  11 11:2856658_C_G  2856658   C   G  G   C 0.0128372
#214 rs4930011  11 11:2856658_C_T  2856658   C   T  G   C 0.01283723

# Why duplicates matter: a tri-allelic SNP (e.g. rs1215468 with A/C/G) is stored as two
# bi-allelic rows sharing one rsid, and both rows come back when retrieving by chr/pos or
# rsid. The PRS calculator function is not designed for two rows per rsid and fails.

# Allele orientation relative to the dosage-counted allele (alt)
SNPINFOX$OK <- with(SNPINFOX, EA == alt)     # effect allele is the counted allele
SNPINFOX$FLIP <- with(SNPINFOX, EA == ref)   # effect allele is the reference allele

# Allele-pair check: both alleles must match the dosage file, in either orientation
SNPINFOX$CHECK <- "BAD_Alleles"
SNPINFOX$CHECK[with(SNPINFOX, EA == alt & NEA == ref)] <- "OK"
SNPINFOX$CHECK[with(SNPINFOX, NEA == alt & EA == ref)] <- "OK"

table(SNPINFOX$OK)
table(SNPINFOX$FLIP)
table(SNPINFOX$CHECK) #one bad allele

SNPINFOX[SNPINFOX$rsid == "rs7403531", ]
SNPINFOX[which(SNPINFOX$CHECK == "BAD_Alleles"), ]

In [None]:
#Main Script
library(stringr)
library(tidyverse)
library(R.utils)
library(data.table)

# Reading in the SNP reference list with betas and allele details
# (EA: effect allele, NEA: non-effect allele)
# The v2 list replaces the missing SNP (rs62023387) with a proxy
snpinfo <- read.csv("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/T2D_MVMR_Scott_Snp_v2.csv")

# Standardising the column names BEFORE appending rows: rbind() on data frames stops when
# column names differ, so a rename placed after the rbind() can never take effect for a
# CSV whose header is spelled differently (e.g. alt/ref instead of EA/NEA)
names(snpinfo) <- c("rsid", "EA", "NEA", "Beta")

# Two SNPs added by hand (from T2D_MVMR_Scott_Snp_CHECKED_format)
rs1 <- data.frame(rsid="rs74995800",EA="G",NEA="A",Beta=0.019802627)
rs2 <- data.frame(rsid="rs76895963",EA="T",NEA="G",Beta=0.207014169)
snpinfo <- rbind(snpinfo, rs1, rs2)

head(snpinfo)
dim(snpinfo)

# NEGATIVE betas (OR < 1): every beta is made positive and the alleles are swapped to match
# Untouched copies are kept for reference as BETAX, EAX and NEAX
snpinfo$BETAX <- snpinfo$Beta
snpinfo$EAX <- as.character(snpinfo$EA)
snpinfo$NEAX <- as.character(snpinfo$NEA)

# All three columns are derived from the saved copies, so the swap cannot read a value
# that has already been overwritten
snpinfo$Beta <- ifelse(snpinfo$BETAX < 0, -snpinfo$BETAX, snpinfo$BETAX)
snpinfo$NEA <- ifelse(snpinfo$BETAX < 0, snpinfo$EAX, snpinfo$NEAX)
snpinfo$EA <- ifelse(snpinfo$BETAX < 0, snpinfo$NEAX, snpinfo$EAX)


# Reading in the dosage and sample files
# Dosage files carry no header row, so fread names the columns V1, V2, ...

# Autosomal dosage
dose_UKBB_Lipid <- as.data.frame(fread("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_T2D_ScottMVMR_SNPs_subset.dosage.gz", header=FALSE))

# Spot checks on the rsid column (V3); the X-chromosome SNP is not in the autosomal file
dose_UKBB_Lipid[dose_UKBB_Lipid$V3 %in% 'rs11257659', 1:10] #present (test)
dose_UKBB_Lipid[dose_UKBB_Lipid$V3 %in% 'rs62023386', 1:10] #proxy, confirmed absent
dose_UKBB_Lipid[dose_UKBB_Lipid$V3 %in% 'rs5945326', 1:10] #X-chromosome, added by UKBB_X

# Proxy SNP: only the first row of the chr15 proxy file is used
dose_UKBB_proxy <- as.data.frame(fread("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_T2D_chr15_proxy_subset.dosage.gz", header=FALSE))[1,]

# X-chromosome dosage (rs5945326; confirmed no overlap with the other SNPs)
dose_UKBB_X <- as.data.frame(fread("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_X_snps_subset.dosage.gz", header=FALSE))

# The proxy row is appended to the autosomal table; the X table cannot be stacked here
# because it has a different number of columns (subjects)
dose_UKBB_Lipid <- rbind(dose_UKBB_Lipid, dose_UKBB_proxy)

dim(dose_UKBB_Lipid)
dose_UKBB_Lipid[1:10, 1:10]

# Autosomal SNPs: split annotation from dosages
# NOTE: the first 6 columns are "CHR SNPID(1) SNPID(2) POSITION Allele1(ref) Allele2(alt)";
# the remaining columns are per-subject dosages (0 - 2) counting allele2
Dose_SNPINFO <- setNames(dose_UKBB_Lipid[,1:6], c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt"))
DOSAGE <- as.data.frame(t(dose_UKBB_Lipid[,-(1:6)]))   # transposed: one row per subject, one column per SNP
colnames(DOSAGE) <- as.character(Dose_SNPINFO$rsid)

# X-chromosome SNPs: same split
Dose_SNPINFO_X <- setNames(dose_UKBB_X[,1:6], c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt"))
DOSAGE_X <- as.data.frame(t(dose_UKBB_X[,-(1:6)]))
colnames(DOSAGE_X) <- as.character(Dose_SNPINFO_X$rsid)

## Autosomal: subject ids + dosages
# Sample file (ids only); the first data row is dropped and two id columns kept
sample_UKBB_Lipid <- read.table("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_T2D_ScottMVMR_SNPs_subset.sample.gz", header=TRUE)
sample_UKBB_LipidX <- setNames(sample_UKBB_Lipid[-1,1:2], c("FID", "IID"))
geno_UKBB_Lipid <- cbind(sample_UKBB_LipidX, DOSAGE)
head(geno_UKBB_Lipid)
rm(DOSAGE)  # large object, no longer needed

## X chromosome: subject ids + dosages
sample_UKBB_X <- read.table("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/Alexis_X_snps_subset.sample.gz", header=TRUE)
sample_UKBB_X <- setNames(sample_UKBB_X[-1,1:2], c("FID","IID"))
geno_UKBB_X <- cbind(sample_UKBB_X, DOSAGE_X)
rm(DOSAGE_X)

## Annotation (ref/alt alleles) of autosomal and X SNPs stacked into one table
Dose_SNPINFO <- rbind(Dose_SNPINFO, Dose_SNPINFO_X)
print("Dose_SNPINFO (line 92)")
head(Dose_SNPINFO)

# Joining the reference SNP list to the dosage-file annotation (autosomal + X)
# all=TRUE keeps SNPs that appear in only one of the two tables
SNPINFOX <- merge(Dose_SNPINFO, snpinfo, by="rsid", all=TRUE)

# Sizes of the joined table and of its two inputs
# NOTE(review): the recorded outputs in this section (340 / 338 rows, rs1215468, rs4930011)
# presumably come from the script this notebook was adapted from -- TODO confirm
dim(SNPINFOX)        # recorded output: 340 9

print("SNPINFO")
dim(Dose_SNPINFO)    # recorded output: 340 6
print("SNPs")
dim(snpinfo)         # recorded output: 338 4

# rsids occurring twice in the joined table
subset(as.data.frame(table(SNPINFOX$rsid)), Freq == 2)
# recorded output: rs1215468 (2), rs4930011 (2)

# rsids occurring twice in the dosage annotation
subset(as.data.frame(table(Dose_SNPINFO$rsid)), Freq == 2)
# recorded output: rs1215468 (2), rs4930011 (2)

# SNPIDs occurring twice in the dosage annotation
subset(as.data.frame(table(Dose_SNPINFO$SNPID)), Freq == 2)
# recorded output: <0 rows> -> SNPID is unique while rsid has two duplicates

# Detail rows for the duplicated rsids
SNPINFOX[SNPINFOX$rsid == "rs1215468", ]
# recorded output:
#        rsid CHR           SNPID POSITION ref alt EA NEA       Beta
#59 rs1215468  13 13:80707429_A_C 80707429   A   C  A   G 0.03342375
#60 rs1215468  13 13:80707429_A_G 80707429   A   G  A   G 0.03342375

SNPINFOX[SNPINFOX$rsid == "rs4930011", ]
# recorded output:
#         rsid CHR          SNPID POSITION ref alt EA NEA       Beta
#213 rs4930011  11 11:2856658_C_G  2856658   C   G  G   C 0.0128372
#214 rs4930011  11 11:2856658_C_T  2856658   C   T  G   C 0.01283723

# Row for the X-chromosome SNP
SNPINFOX[SNPINFOX$rsid == "rs5945326", ]

# Why duplicates matter: a tri-allelic SNP (e.g. rs1215468 with A/C/G) is stored as two
# bi-allelic rows sharing one rsid, and both rows come back when retrieving by chr/pos or
# rsid. The PRS calculator function is not designed for two rows per rsid and fails.

# Allele orientation relative to the dosage-counted allele (alt)
SNPINFOX$OK <- with(SNPINFOX, EA == alt)     # effect allele is the counted allele
SNPINFOX$FLIP <- with(SNPINFOX, EA == ref)   # effect allele is the reference allele

# Allele-pair check: both alleles must match the dosage file, in either orientation
SNPINFOX$CHECK <- "BAD_Alleles"
SNPINFOX$CHECK[with(SNPINFOX, EA == alt & NEA == ref)] <- "OK"
SNPINFOX$CHECK[with(SNPINFOX, NEA == alt & EA == ref)] <- "OK"

table(SNPINFOX$OK)
table(SNPINFOX$FLIP)
table(SNPINFOX$CHECK) #one bad allele
SNPINFOX[which(SNPINFOX$CHECK == "BAD_Alleles"), ]

SNPINFOX[SNPINFOX$rsid == "rs7403531", ]

##################################################################################################
# The GRS script does not run when two dosage rows share one rsid -> the problematic row has to
# be removed from dose_UKBB_Lipid, identified by its SNPID (column 2 of the dosage table)
##################################################################################################
# The row is located by SNPID instead of a hard-coded row number, so the step stays correct
# if the dosage file is re-extracted or its rows are re-ordered.
# NOTE(review): presumably the extra allele pair of rs7403531 -- TODO confirm
PROBLEM_SNPID <- "15:38822905_T_G"
which(dose_UKBB_Lipid[,2] == PROBLEM_SNPID) #row position of the problematic SNP (36 when recorded)

#DOSAGE:Removing problematic SNPs
# Logical filtering rather than negative indexing: when nothing matches, nothing is dropped,
# whereas x[-integer(0), ] would silently return a table with zero rows
dose_UKBB_LipidX=dose_UKBB_Lipid[!(dose_UKBB_Lipid[,2] %in% PROBLEM_SNPID),]
Dose_SNPINFOX <- dose_UKBB_LipidX[,1:6]
names(Dose_SNPINFOX)=c("CHR", "SNPID", "rsid", "POSITION", "ref", "alt")
# Transposed so that rows are subjects and columns are SNPs, named by rsid
DOSAGEX  <- as.data.frame(t(dose_UKBB_LipidX[,-c(1:6)]))
colnames(DOSAGEX)<- as.character(Dose_SNPINFOX[,3])
# Generating Combined file Dosage w/ Sample ID File
geno_UKBB_LipidX=cbind(sample_UKBB_LipidX, DOSAGEX) #included larger sample file to combine

#DOSAGE:Add chromosome X information
# geno_UKBB_X holds the X-chromosome dosages joined to their sample ids

# Combining all dosage and sample files
# all=TRUE keeps every participant; both inputs carry an IID column, so the merged table
# has IID.x (autosomal file) and IID.y (X file)
geno_UKBB_allX=merge(geno_UKBB_LipidX, geno_UKBB_X, by="FID", all=TRUE) #keep all people
print('sample size - x and auto')
dim(geno_UKBB_allX) #130 - x chromo is here

# Combining Reference SNPList with SNP info from Dosage File
# The X-chromosome annotation (Dose_SNPINFO_X) is stacked onto the autosomal annotation
# before the join. Joining the autosomal annotation alone left the X SNP (rs5945326) out of
# SNPINFOX, and therefore out of the score, even though its dosage column is present in
# geno_UKBB_allX.
# NOTE(review): participants without X-chromosome data have NA for that SNP, so a score
# computed with na.rm = FALSE is NA for them -- confirm this matches the intended sample
SNPINFOX=merge(rbind(Dose_SNPINFOX, Dose_SNPINFO_X), snpinfo, by="rsid", all=FALSE)
dim(SNPINFOX)

# Checking Allele Coding - Flip or not
#  SNP orientation check
SNPINFOX$OK=(SNPINFOX$EA == SNPINFOX$alt)    # effect allele is the allele counted by the dosage
SNPINFOX$FLIP=(SNPINFOX$EA == SNPINFOX$ref)  # effect allele is the reference allele

# Check if alleles match/overlap (if bad, need to review/double check)
SNPINFOX$CHECK="BAD_Alleles"
SNPINFOX$CHECK[SNPINFOX$EA == SNPINFOX$alt  & SNPINFOX$NEA == SNPINFOX$ref ]="OK"
SNPINFOX$CHECK[SNPINFOX$NEA == SNPINFOX$alt & SNPINFOX$EA == SNPINFOX$ref ]="OK"
# Listed only after classification; printing before the two assignments above showed
# every row, because all rows start out as "BAD_Alleles"
SNPINFOX[which(SNPINFOX$CHECK=="BAD_Alleles"),]

# Number of SNPs retained and their orientation
head(SNPINFOX)
table(SNPINFOX$OK)
table(SNPINFOX$FLIP)
table(SNPINFOX$CHECK)

# SNP Orientation Checker Function
# Aligns each SNP dosage to the effect allele and builds raw and weighted genetic risk scores.
#
# Arguments:
#   XDOSE : data frame with columns FID, IID and one dosage column per SNP, named by rsid
#   XINFO : data frame with one row per SNP and columns rsid, Beta, OK, FLIP
#           (OK   = effect allele is the allele counted by the dosage,
#            FLIP = effect allele is the other allele, so the dosage is reflected around 2)
#
# Returns:
#   data frame with FID, IID, the aligned dosage of every SNP used, GRS_RAW (sum of aligned
#   dosages) and GRS_WT (sum of Beta * aligned dosage). A missing dosage makes both scores
#   NA for that subject (na.rm = FALSE).
#
# SNPs that are neither OK nor FLIP (including NA flags) cannot be aligned. They are left
# out of the scores and of the returned table with a warning; leaving them in would add
# their raw, unweighted dosage to GRS_WT.
# Stops with an explicit message when XINFO lists an rsid twice or when a needed column is
# absent from XDOSE.

        SNP_Align_GRS=function(XDOSE, XINFO)
                {
                ID_COLS=c("FID", "IID")
                XSNPLIST=as.character(XINFO$rsid)

                # Input checks: both situations would otherwise fail further down with an obscure error
                if(anyDuplicated(XSNPLIST) > 0)
                        {
                        stop(paste("Duplicate rsid in SNP info: ",
                                   paste(unique(XSNPLIST[duplicated(XSNPLIST)]), collapse=", "), sep=""))
                        }
                XMISSING=setdiff(c(ID_COLS, XSNPLIST), names(XDOSE))
                if(length(XMISSING) > 0)
                        {
                        stop(paste("Columns missing from dosage file: ", paste(XMISSING, collapse=", "), sep=""))
                        }

                # Orientation flags; accepts logical TRUE or the string "TRUE", NA counts as not set
                IS_OK=as.character(XINFO$OK) %in% "TRUE"
                IS_FLIP=as.character(XINFO$FLIP) %in% "TRUE"
                USABLE=IS_OK | IS_FLIP
                if(any(!USABLE))
                        {
                        warning(paste("SNPs with neither OK nor FLIP set were excluded from the GRS: ",
                                      paste(XSNPLIST[!USABLE], collapse=", "), sep=""), call.=FALSE)
                        }
                XKEEP=XSNPLIST[USABLE]

                # Working copies: aligned dosage and Beta-weighted aligned dosage
                DOSE_POS=XDOSE[,c(ID_COLS, XKEEP), drop=FALSE]
                DOSE_WT=DOSE_POS

                # Sets alleles so that all alleles are positively associated with outcome (positive beta)
                for(ZZ in which(USABLE))
                        {
                        xsnp=XSNPLIST[ZZ]
                        xdose=XDOSE[,xsnp]
                        # FLIP wins if both flags are set, as it was applied last before
                        if(IS_FLIP[ZZ])
                                {
                                xdose=abs(xdose - 2)
                                }
                        DOSE_POS[,xsnp]=xdose
                        DOSE_WT[,xsnp]=XINFO$Beta[ZZ]*xdose
                        }

                # Creating GRS (count of positive associated allele (raw) and weighted (WT) versions)
                # drop=FALSE keeps a data frame when only one SNP is used, which rowSums requires
                XDOSE_POS=DOSE_POS
                XDOSE_POS$GRS_RAW=rowSums(DOSE_POS[,-c(1:2), drop=FALSE], na.rm = FALSE, dims = 1)
                XDOSE_POS$GRS_WT =rowSums(DOSE_WT[ ,-c(1:2), drop=FALSE], na.rm = FALSE, dims = 1)

                # Checking that all SNPS used in PRS (counted from names, so one column still gives 1)
                NSNPi=nrow(XINFO)
                NSNPd=sum(!(names(XDOSE) %in% ID_COLS))
                NSNPp=length(XKEEP)

                print("####################################")
                print(paste("Num NSNP in SNPINFO File  : ", NSNPi, sep=""))
                print(paste("Num NSNP in Dosage File   : ", NSNPd, sep=""))
                print(paste("Num NSNP used in PRS Calc : ", NSNPp, sep=""))
                print("####################################")

                return(XDOSE_POS)
                }



# Score calculation and export (the two GRS columns are appended at the end of the table)
# The merged table carries IID.x / IID.y; the function expects a plain IID column,
# taken here from the autosomal side (IID.x)
geno_UKBB_allX[["IID"]] <- geno_UKBB_allX[["IID.x"]]
UKBB_IR_GRS <- SNP_Align_GRS(XDOSE = geno_UKBB_allX, XINFO = SNPINFOX)
# Tab-separated, unquoted, without row names
write.table(UKBB_IR_GRS, file = "~/UKBB_T2D_MVMR_GRS_REV_23Jul21.txt", sep = "\t", quote = FALSE, row.names = FALSE)

In [None]:
# Dimensions of the three dosage tables: autosomal (with the proxy row appended),
# the single proxy row, and the X-chromosome table
dim(dose_UKBB_Lipid)
dim(dose_UKBB_proxy)
dim(dose_UKBB_X)

In [None]:
#check for rsid
# Re-reads the exported GRS table to look for individual SNP columns.
# NOTE(review): the main script writes the table to "~/UKBB_T2D_MVMR_GRS_REV_23Jul21.txt",
# while this cell reads from "~/jupyter/..." -- confirm both point at the same, current file
getwd()
check <- read.table("~/jupyter/UKBB_T2D_MVMR_GRS_REV_23Jul21.txt", header = TRUE, na.strings=c("",".","NA"))
head(check)

In [None]:
# Prints a label, then lists every column name of the re-read GRS table
# (presumably to look for rs5945326 by eye -- TODO confirm)
print("rs5945326")
names(check)

In [None]:
# Looks up individual SNP columns in the re-read GRS table.
# For each rsid: prints the rsid, its column position (integer(0) when absent) and, when
# present, only the first values of the column instead of the whole column.
#missing from exp
SNPS_MISSING_EXPOSURE <- c("rs5945326", "rs74995800")
#outcome file missing
SNPS_MISSING_OUTCOME <- c("rs2244020", "rs12941263", "rs79349575", "rs115321690", "rs3132535", "rs62023386")

for(snp in c(SNPS_MISSING_EXPOSURE, SNPS_MISSING_OUTCOME))
        {
        snp_col <- which(names(check) == snp)
        print(snp)
        print(snp_col)
        if(length(snp_col) > 0)
                {
                print(head(check[, snp_col]))
                }
        }


In [None]:
#x chromosome on list
# snpinfo=read.csv("/data/nrnb03/users/agarduno/jupyter/IRKD_SNP/T2D_MVMR_Scott_Snp_v2.csv") 
#added from T2D_MVMR_Scott_Snp_CHECKED_format
# rs1 <- data.frame(rsid="rs74995800",EA="G",NEA="A",Beta=0.019802627)
# rs2 <- data.frame(rsid="rs76895963",EA="T",NEA="G",Beta=0.207014169)
# snpinfo=rbind(snpinfo,rs1,rs2)
# print(snpinfo)

In [None]:
#126 instead of 128
# Row positions in the autosomal dosage annotation whose rsid is not in the reference SNP list
print('batch1')
which(!(Dose_SNPINFOX$rsid %in% snpinfo$rsid))
# Row positions in the reference SNP list whose rsid is not in the autosomal dosage annotation
print('batch2')
which(!(snpinfo$rsid %in% Dose_SNPINFOX$rsid))
#which missing from csv list
# Taken from the comparison itself rather than from fixed row numbers, so the answer
# stays right when the reference list changes
as.character(snpinfo$rsid[!(snpinfo$rsid %in% Dose_SNPINFOX$rsid)])
#recorded output: rs186838848, rs5945326