In [None]:
# Start from a clean workspace: remove every object in the current environment.
rm( list=ls() ) 

In [None]:
# data.table provides fread() and the table type used throughout this notebook.
# library() stops with an error when the package is missing; require() would
# only return FALSE and let later cells fail with a less obvious message.
library( data.table )

In [None]:
# Show the working directory; the relative ./data paths below are resolved against it.
getwd()

In [None]:
# List the contents of the working directory.
list.files()

## Load test GWAS "summary statistics"

In [None]:
# If computer hates you, switch which file you load
# fill=TRUE pads rows that have fewer fields than the header instead of failing.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- fread( "./data/PGC3_SCZ_wave3_public.v2.tsv", fill=TRUE )
#data <- fread( "./data/SCZ_raw_sample.txt" )

In [None]:
# Inspect the loaded table: its dimensions, then its first rows.
dim( data )
head( data )

In [None]:
# Record the table's dimensions (rows, columns) as loaded, before any filtering below.
data.size <- dim( data )

In [None]:
# Turn chromosome labels into numbers; labels that are not numeric become NA.
data$CHR <- as.numeric(data$CHR)
# Drop the rows whose chromosome could not be converted.
data <- data[!is.na(CHR)]
# Sort by chromosome, then by base-pair position within each chromosome.
data <- data[order(CHR, BP)]

In [None]:
# Check the first rows after the chromosome filter and sort.
head( data )

In [None]:
## Some plot values
# Genome-wide x coordinate (in kb) for each SNP: its own base-pair position
# plus the summed lengths of all lower-numbered chromosomes, where a
# chromosome's length is taken as its largest observed BP. This keeps distances
# on the x axis proportional to genomic distance.
bp <- as.numeric( data$BP )    # doubles, so the running total cannot overflow an integer
chr.len <- tapply( bp, data$CHR, max, na.rm=TRUE )    # one entry per chromosome present, in increasing CHR order
chr.offset <- c( 0, head( cumsum( chr.len ), -1 ) )
names( chr.offset ) <- names( chr.len )
data$ManPos <- ( bp + unname( chr.offset[ as.character( data$CHR ) ] ) ) / 1000

# Axis tick positions: column 1 is the chromosome number, column 2 the midpoint
# of that chromosome's ManPos range. Only chromosomes present in `data` get a
# row, so CHR values above 22 or absent chromosomes cannot break the indexing.
chr.start <- tapply( data$ManPos, data$CHR, min, na.rm=TRUE )
chr.stop <- tapply( data$ManPos, data$CHR, max, na.rm=TRUE )
chr.mids <- cbind( as.numeric( names( chr.start ) ),
                   as.numeric( chr.start + ( chr.stop - chr.start )/2 ) )

In [None]:
#jpeg( "SCZ_QuickManhattan_sample.jpeg", width=1440, height=480)
# Only SNPs with p < 0.01 are drawn.
shown <- data$P < 0.01
# Alternate black / grey by chromosome parity so neighbouring chromosomes differ.
point.col <- c("black", "grey")[data$CHR[shown] %% 2 + 1]
plot(
  x = data$ManPos[shown],
  y = -log10(data$P[shown]),
  col = point.col,
  pch = 16,
  xaxt = "n",
  ylim = c(0, 40),
  xlab = "Genome Position",
  ylab = "-log10(p)"
)
# Horizontal reference line at p = 5e-8.
abline(h = -log10(5e-8), col = "green")
# Custom x axis: one tick per chromosome midpoint, labelled with the chromosome number.
axis(side = 1, at = chr.mids[, 2], labels = chr.mids[, 1])

In [None]:
# Mini qq-plot
#jpeg( "SCZ_QuickQQ_sample.jpeg", width=480, height=480)
# sort() drops NA p-values, so they neither inflate the expected quantiles
# nor turn the axis limits into NA.
p.sorted <- sort( data$P )
n.p <- length( p.sorted )
stopifnot( n.p > 0 )
observed.p <- -log10( p.sorted )
# Expected -log10(p) for the i-th smallest of n.p uniform p-values: i / (n.p + 1).
expected.p <- -log10( seq_len( n.p ) / ( n.p + 1 ) )
# The bulk (expected < 4) is drawn as a line and the tail (expected >= 4) as
# points; using >= means a value of exactly 4 is not dropped from both.
in.tail <- expected.p >= 4
# A p-value of exactly 0 gives Inf; leave it out when choosing the y limit.
obs.max <- max( observed.p[ is.finite( observed.p ) ] )
plot( expected.p[ !in.tail ], observed.p[ !in.tail ],
        type='l',
        lwd=4,
        xlab="Expected -log10(p)", ylab="Observed -log10(p)",
        ylim=c( 0, obs.max+1 ),
        xlim=c( 0, max(expected.p)+1 ) )
points( expected.p[ in.tail ], observed.p[ in.tail ], pch=16 )
# Identity line: where the points would fall if observed matched expected.
abline( 0,1 )
# Vertical guides at expected p = 0.5, 0.25 and 0.1.
abline( v=-log10(0.5), lty=2, col='blue' )
abline( v=-log10(0.25), lty=3, col='blue' )
abline( v=-log10(0.1), lty=4, col='blue' )

## Prep genotype data

In [None]:
# Remove every object created so far (including the summary-statistics table) before loading genotypes.
rm( list=ls() )

In [None]:
# Load target individual
# NOTE(review): the .traw extension suggests PLINK's transposed additive-coded export — confirm.
ajs <- fread( './data/AndrewSchork.traw' )

In [None]:
# Inspect the target individual's table: dimensions, then first rows.
dim( ajs )
head( ajs )

In [None]:
# Load Reference individuals
# Read with the same reader as the target file; the two are combined column-wise below.
kgp <- fread( './data/KGP_GSA_PGS.traw' )

In [None]:
# Inspect the reference table; only the first 10 columns are previewed to keep the output narrow.
dim( kgp )
head( kgp[,1:10] )

In [None]:
# Combine data
# The reference table's first six columns are reused as the annotation for the
# combined table, so both files must describe the same variants row for row.
# Column 2 is assumed to hold the variant ID — TODO confirm against the file header.
stopifnot( nrow( ajs ) == nrow( kgp ), identical( ajs[[2]], kgp[[2]] ) )
# Layout: six annotation columns, then the target individual (column 7 of ajs),
# then every reference sample. The reference column range is taken from the
# table itself rather than hard-coded.
ref.cols <- 7:ncol( kgp )
tped <- cbind( kgp[ ,1:6 ], ajs[ ,7 ], kgp[ ,..ref.cols ] )

In [None]:
# Inspect the combined table; only the first 15 columns are previewed.
dim( tped )
head( tped[,1:15] )

#### Assess fit of the test subject to the reference

In [None]:
# Get a smaller set of SNPs (2000) for computational speed, if you are brave try more!
# Never ask for more rows than exist, otherwise sample() stops with an error.
n.snps <- min( 2000, nrow( tped ) )
snp.rows <- sample( nrow( tped ), n.snps )
# Genotype columns start at column 7 and run to the last column of the table.
geno.cols <- 7:ncol( tped )
tped.small <- tped[ snp.rows, ..geno.cols ]

In [None]:
# Compute PCs to check ancestry
# This step takes a while
# Samples are columns of tped.small, so transpose to get one row per sample.
pca <- prcomp( t( tped.small ) )
# Split each sample column name on "_" into its parts (columns V1, V2, ...).
sample.ids <- as.data.table( do.call( rbind, strsplit( names( tped.small ), split="_" ) ) )
# Bind the ID parts and the first 25 PCs as separate tables so the PC columns
# stay numeric; cbind-ing the two matrices first would turn every score into
# a character string.
pcs <- cbind( sample.ids, as.data.table( pca$x[ ,1:25 ] ) )

In [None]:
# Plot reference and test subject ancestry PCAs
# This step also takes a while
# 2 x 3 grid: five PC-pair panels plus one panel for the legend. The previous
# layout is restored at the end so later plots are not squeezed into this grid.
old.par <- par( mfrow=c(2,3) )
# Blue where the first ID part is 'EUR', red otherwise.
pop.col <- c( "red", "blue" )[ 1 + 1*(pcs[[1]] == 'EUR') ]
for ( k in seq( 1, 9, by=2 ) ) {
    # as.numeric() keeps this working whether the PC columns are stored as
    # numbers or as character strings.
    pc.x <- as.numeric( pcs[[ paste0( "PC", k ) ]] )
    pc.y <- as.numeric( pcs[[ paste0( "PC", k+1 ) ]] )
    plot( pc.x, pc.y, col=pop.col,
          xlab=paste0( "PC", k ), ylab=paste0( "PC", k+1 ) )
    # Row 1 is highlighted as the test subject (labelled AJS in the legend).
    points( pc.x[1], pc.y[1], col="green", pch=16 )
}
# Empty panel that only carries the legend.
plot( 0, 0, type='n', axes=FALSE, xlab="", ylab="" )
legend( "center", legend=c("AFR", "EUR", "AJS" ), col=c( "red", "blue", "green" ), pch=c(1,1,16) )
par( old.par )

In [None]:
# Grab just Europeans
# Keeps every sample whose first ID part is not "AFR". The test subject in
# row 1 is kept as long as its own label is not "AFR".
eurs <- which( pcs[[1]] != "AFR" )
tped.small.eur <- tped.small[ ,..eurs ]

# Redo the PCA on the retained samples only.
pca.eur <- prcomp( t( tped.small.eur ) )
# ID parts come from the retained column names; the PCs are bound as a separate
# table so they stay numeric instead of being coerced to character.
eur.ids <- as.data.table( do.call( rbind, strsplit( names( tped.small.eur ), split="_" ) ) )
pcs.eur <- cbind( eur.ids, as.data.table( pca.eur$x[ ,1:25 ] ) )

In [None]:
# Load the sub-population table and count how many rows carry each value of its second column.
# Later cells match column 1 against sample IDs and column 2 against population codes such as 'FIN'.
subpop <- fread( "data/EUR_SubPop.txt" )
table( subpop[[2]] )

In [None]:
# PC1 vs PC2 from the Europeans-only PCA, with row 1 highlighted in green.
# NOTE(review): row 1 is presumably the test subject, as in the earlier PCA plots — confirm.
plot( pcs.eur$PC1, pcs.eur$PC2 )
points( pcs.eur$PC1[1], pcs.eur$PC2[1], col="green", pch=16, cex=1.5 )

In [None]:
# Plot reference and test subject ancestry PCAs
# Set up empty axes first, then add one colour per sub-population.
plot( pcs.eur$PC1, pcs.eur$PC2, type='n' )

# Drawing order and colours; FIN is annotated as North and TSI as South.
pop.colours <- c( FIN='red', GBR='blue', CEU='black', IBS='goldenrod', TSI='purple' )
for ( pop in names( pop.colours ) ) {
    # Samples whose second ID part is listed under this population in subpop.
    members <- pcs.eur$V2 %in% subpop[[1]][ subpop[[2]] == pop ]
    points( pcs.eur$PC1[ members ], pcs.eur$PC2[ members ], col=pop.colours[[ pop ]] )
}

# Row 1 is drawn last, as a larger filled green point, so it sits on top.
points( pcs.eur$PC1[1], pcs.eur$PC2[1], col="green", pch=16, cex=1.5 )


## Let's explore my PGS, computed from 65,000 SNPs

In [None]:
## I got your back.  Delete everything, and load PGS file for all traits and disorders.
## This removes every object in the workspace, including the genotype and PCA tables.
rm( list=ls() )

In [None]:
# Load the polygenic score table and preview its first rows.
PGS <- fread( './data/PGS.txt' )
head( PGS )

In [None]:
# Distribution of the ht_pgs column as loaded (before the standardisation below).
hist( PGS$ht_pgs )
# Dashed vertical line at the mean across all rows.
abline( v=mean( PGS$ht_pgs ), lty=2, col=1 )
mean( PGS$ht_pgs )
sd( PGS$ht_pgs )

In [None]:
# Standardise columns 3 to 14 to mean 0 and standard deviation 1 across all rows.
# scale() returns a one-column matrix carrying centring/scaling attributes;
# as.numeric() stores a plain numeric vector in the table instead of a matrix column.
for ( i in 3:14 ) {
    PGS[[ i ]] <- as.numeric( scale( PGS[[ i ]] ) )
}

# After standardising, the mean should be ~0 and the standard deviation 1.
hist( PGS$ht_pgs )
mean( PGS$ht_pgs )
sd( PGS$ht_pgs )

In [None]:
# Let's start with height

# The two group means are the same in every panel, so compute them once.
eur.mean <- mean(PGS$ht_pgs[PGS$FID == "EUR"])
afr.mean <- mean(PGS$ht_pgs[PGS$FID == "AFR"])

# Panel 1: all samples.
hist(PGS$ht_pgs, breaks = "fd", xlim = c(-4, 4))
abline(v = eur.mean, lty = 2, col = "blue")
abline(v = afr.mean, lty = 2, col = "green")

# Panel 2: rows with FID 'EUR'.
hist(PGS$ht_pgs[PGS$FID == "EUR"], breaks = "fd", xlim = c(-4, 4))
abline(v = eur.mean, lty = 2, col = "blue")
abline(v = afr.mean, lty = 2, col = "green")

# Panel 3: rows with FID 'AFR'.
hist(PGS$ht_pgs[PGS$FID == "AFR"], breaks = "fd", xlim = c(-4, 4))
abline(v = eur.mean, lty = 2, col = "blue")
abline(v = afr.mean, lty = 2, col = "green")


In [None]:
# Let's add me

# Values drawn on every panel: the two group means and the score(s) of rows with FID 'Schork'.
eur.mean <- mean(PGS$ht_pgs[PGS$FID == "EUR"])
afr.mean <- mean(PGS$ht_pgs[PGS$FID == "AFR"])
my.score <- PGS$ht_pgs[PGS$FID == "Schork"]

# Panel 1: all samples.
hist(PGS$ht_pgs, breaks = "fd", xlim = c(-4, 4))
abline(v = eur.mean, lty = 2, col = "blue")
abline(v = afr.mean, lty = 2, col = "green")
abline(v = my.score, col = "red", lwd = 2)

# Panel 2: rows with FID 'EUR'.
hist(PGS$ht_pgs[PGS$FID == "EUR"], breaks = "fd", xlim = c(-4, 4))
abline(v = eur.mean, lty = 2, col = "blue")
abline(v = afr.mean, lty = 2, col = "green")
abline(v = my.score, col = "red", lwd = 2)

# Panel 3: rows with FID 'AFR'.
hist(PGS$ht_pgs[PGS$FID == "AFR"], breaks = "fd", xlim = c(-4, 4))
abline(v = eur.mean, lty = 2, col = "blue")
abline(v = afr.mean, lty = 2, col = "green")
abline(v = my.score, col = "red", lwd = 2)

In [None]:
## Summarize distribution to look up my percentile

# Empirical CDFs of the height score: all rows, rows with FID 'EUR', rows with FID 'AFR'.
# Each ecdf object is a function returning the fraction of its sample at or below a given value.
all.pgs.ecdf <- ecdf( PGS$ht_pgs )
#all.ct.ecdf <- ecdf( PGS$ht_count )
eur.pgs.ecdf <- ecdf( PGS$ht_pgs[ PGS$FID == 'EUR' ] )
#eur.ct.ecdf <- ecdf( PGS$ht_count[ PGS$FID == 'EUR' ] )
afr.pgs.ecdf <- ecdf( PGS$ht_pgs[ PGS$FID == 'AFR' ] )
#afr.ct.ecdf <- ecdf( PGS$ht_count[ PGS$FID == 'AFR' ] )

# EUR curve with a red line at the score in row 1.
# NOTE(review): row 1 is presumably the test subject (selected elsewhere via FID == 'Schork') — confirm.
plot( eur.pgs.ecdf )
abline( v=PGS$ht_pgs[1], col='red', lwd=2 )

In [None]:
# AFR curve with a red line at the score in row 1.
plot( afr.pgs.ecdf )
abline( v=PGS$ht_pgs[1], col='red', lwd=2 )

In [None]:
## What are my percentiles?

# Evaluate each ECDF at the row-1 score: the fraction of that group scoring at or below it.
all.pgs.ecdf( PGS$ht_pgs[1] )
#all.ct.ecdf( PGS$ht_count[1] )
eur.pgs.ecdf( PGS$ht_pgs[1] )
#eur.ct.ecdf( PGS$ht_count[1] )
afr.pgs.ecdf( PGS$ht_pgs[1] )
#afr.ct.ecdf( PGS$ht_count[1] )

In [None]:
## So, am I tall? I need to know.
## Any speculations?

## My score vs. different backgrounds

##### ADHD

In [None]:
# Working table for this section: columns 1-2 plus columns 5 and 6.
# NOTE(review): columns 5-6 are presumably the ADHD score columns, given the section heading — confirm.
temp <- PGS[ ,c(1,2,5,6) ]

In [None]:
# Preview the selected columns.
head( temp )

In [None]:
# 2 x 2 plotting grid; the two histograms below fill the first two cells.
par(mfrow = c(2, 2))

# Value of the third column of temp in row 1, marked in red on both panels.
row1.score <- temp[[3]][1]

# Rows with FID 'EUR': histogram, row-1 marker, dashed group mean.
hist(temp[[3]][temp$FID == "EUR"], breaks = "fd")
abline(v = row1.score, col = "red", lwd = 2)
abline(v = mean(temp[[3]][temp$FID == "EUR"]), lty = 2, lwd = 2)

# Rows with FID 'AFR': same layout.
hist(temp[[3]][temp$FID == "AFR"], breaks = "fd")
abline(v = row1.score, col = "red", lwd = 2)
abline(v = mean(temp[[3]][temp$FID == "AFR"]), lty = 2, lwd = 2)


In [None]:
# Empirical CDFs of the third column of temp, separately for rows with FID 'EUR' and 'AFR'.
eur.ecdf <- ecdf( temp[[3]][ temp$FID == 'EUR' ] )
afr.ecdf <- ecdf( temp[[3]][ temp$FID == 'AFR' ] )

In [None]:
# Plot each group's ECDF with a red line at the row-1 value.
plot( eur.ecdf )
abline( v=temp[[3]][1], col='red', lwd=2 )
plot( afr.ecdf )
abline( v=temp[[3]][1], col='red', lwd=2 )

In [None]:
# Fraction of each group at or below the row-1 value.
eur.ecdf( temp[[3]][1] )
afr.ecdf( temp[[3]][1] )

##### ASD

In [None]:
# Point temp at columns 7 and 8 (plus columns 1-2); re-run the histogram and ECDF cells above to use it.
# NOTE(review): columns 7-8 are presumably the ASD score columns, given the section heading — confirm.
temp <- PGS[ ,c(1,2,7,8) ]

##### MDD

In [None]:
# Point temp at columns 9 and 10 (plus columns 1-2); re-run the histogram and ECDF cells above to use it.
# NOTE(review): columns 9-10 are presumably the MDD score columns, given the section heading — confirm.
temp <- PGS[ ,c(1,2,9,10) ]

##### SCZ

In [None]:
# Point temp at columns 3 and 4 (plus columns 1-2); re-run the histogram and ECDF cells above to use it.
# NOTE(review): columns 3-4 are presumably the SCZ score columns, given the section heading — confirm.
temp <- PGS[ ,c(1,2,3,4) ]

##### IQ

In [None]:
# Point temp at columns 13 and 14 (plus columns 1-2); re-run the histogram and ECDF cells above to use it.
# NOTE(review): columns 13-14 are presumably the IQ score columns, given the section heading — confirm.
temp <- PGS[ ,c(1,2,13,14) ]