# Challenge 2 solution

Pre-challenge:
```
# Remove the two archaic samples (-s with a leading ^ excludes the listed samples) and trim
# ALT alleles that are no longer observed in the remaining samples (-a). The second call keeps
# only biallelic sites (-m 2 -M 2) and excludes sites matching GT="./." (missing genotypes);
# the result is bgzip-compressed into the "nomis" (no missing) VCF.
bcftools view -a -s ^"DenisovaPinky,AltaiNea" testset.filtered.vcf.gz | bcftools view -e 'GT="./."' -m 2 -M 2 | bgzip > testset.filtered.nomis.vcf.gz
```


## A)

Install gdc (and, as in the previous lecture, the admixtools package):

```
# Clone the gdc scripts into the home directory (vcf2eigenstrat.py is run from ~/gdc later on).
cd ~/
git clone https://github.com/mathii/gdc.git

# Start a clean R session; the remaining lines are typed at the R prompt.
R --vanilla
# Install the CRAN packages from the cloud mirror.
install.packages(c("igraph","dplyr","plotly"),repos="https://cloud.r-project.org",upgrade="never")
# Install admixtools from GitHub.
# NOTE(review): this call requires the devtools package to be installed already - confirm.
devtools::install_github("uqrmaie1/admixtools",repos="https://cloud.r-project.org",upgrade="never")
# Loading the package is a quick check that the installation worked.
library("admixtools")
```

Get data to eigenstrat:
```
# Convert the VCF (-v) with the gdc script; -o gives the output prefix "testset", which is the
# prefix later passed to extract_f2().
# NOTE(review): -i testset_1.ind is presumably the individual file with population labels - confirm.
python2 ~/gdc/vcf2eigenstrat.py -v testset.filtered.vcf.gz -o testset -i testset_1.ind
```

Analyse with admixtools:

```
R --vanilla
library(admixtools)
library(dplyr)
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 100)

# Precompute blocked f2 statistics from the genotype files with prefix
# "testset" and store them in the directory "genos" (overwriting old results).
# NOTE(review): blgsize = 500000 is presumably a block length in base pairs and
# maxmem a memory limit in MB - confirm in the admixtools documentation.
extract_f2(
  pref = "testset",
  outdir = "genos",
  blgsize = 500000,
  overwrite = TRUE,
  maxmem = 500
)

# Load the precomputed f2 blocks and compute f4(YRI, IBS; NEA, DEN).
f2_blocks <- f2_from_precomp("genos")
f4_table1 <- f4(f2_blocks, pop1 = "YRI", pop2 = "IBS", pop3 = "NEA", pop4 = "DEN")

# Print the result; an assignment alone shows nothing at the prompt.
f4_table1
```

The negative f4 estimate (see the output below) means that IBS shares more alleles with NEA than expected. This is due to gene flow from Neanderthals to non-Africans. The z-score is low because this is only one chromosome.

```
pop1  pop2  pop3  pop4       est       se     z      p
  <chr> <chr> <chr> <chr>    <dbl>    <dbl> <dbl>  <dbl>
1 YRI   IBS   NEA   DEN   -0.00202 0.000905 -2.24 0.0254
```

## B)

Calculate matchrate to Denisovan:
```
# Compute match rates against the source individuals listed in deni.list (the Denisovan),
# using the reference (--ref) and target (--tgt) individual lists, the threshold results (--score)
# and the regions in testset.tracts.bed; the output is written to deni.match.rate.results.
sstar matchrate --vcf testset.filtered.nomis.witharch.vcf.gz --ref ref.ind.list --tgt tgt.ind.list --src deni.list --score testset.threshold.results --mapped-region testset.tracts.bed --output deni.match.rate.results
```

Get Denisovan-like fragments regardless of Neanderthals:
```
# Call tracts from the threshold results using only the Denisovan match rates; output files get
# the prefix "deni.final" (deni.final.bed is read into R in the next step).
sstar tract --threshold testset.threshold.results --output-prefix deni.final --match-rate deni.match.rate.results
```

Count the number of fragments:
```
R --vanilla # or RStudio

# Work inside the course data directory.
setwd("/home/jovyan/notebooks/introgression/data")

# Tracts called against the Denisovan; the BED file has no header line.
matchtab <- read.table("deni.final.bed", sep = "\t", header = FALSE)

# Keep the rows whose column 5 exceeds 0.33 and tally them by column 4
# (presumably match rate and individual ID - confirm against the sstar output).
passing <- which(matchtab[, 5] > 0.33)
as.data.frame(table(matchtab[passing, 4]))
```

Directly compare which source fits best, following the sstar tutorial:

```
# Pass both match-rate files at once. The outputs both.final.src1.bed and both.final.src2.bed are
# used below, where src1 is treated as Altai Neanderthal and src2 as Denisovan
# (presumably following the order of the --match-rate arguments - confirm).
sstar tract --threshold testset.threshold.results --output-prefix both.final --match-rate altai.match.rate.results deni.match.rate.results
```

Count the number of fragments:
```
R --vanilla # or RStudio

# Work inside the course data directory.
setwd("/home/jovyan/notebooks/introgression/data")

# Tracts from the second source file of the joint run; no header line.
matchtab <- read.table("both.final.src2.bed", sep = "\t", header = FALSE)

# Select rows with a column-5 value above 0.33, then count rows per column-4
# value (presumably match rate and individual ID - confirm).
above_cutoff <- which(matchtab[, 5] > 0.33)
as.data.frame(table(matchtab[above_cutoff, 4]))
```


## C)

The following is a rather efficient solution (though not the most efficient one). As always, there are different ways of reaching the same goal.

Get the cumulative length per individual on the command line (one could also do this in R, but for practical purposes we stick to the methods from the course). The easiest way is a `while` loop, but it can also be done one individual at a time by copy-pasting the code.

```
# For every target individual, extract its tracts from the src2 (Denisovan) and
# src1 (Altai) files, sort them and merge overlapping intervals, writing one
# BED file per individual and source.
#
# "read -r ind _" takes the first field of each line as the ID without
# interpreting backslashes; "|| [ -n "$ind" ]" also processes a last line that
# lacks a trailing newline.
while read -r ind _ || [ -n "$ind" ]
do
    # Skip blank lines in the list.
    [ -n "$ind" ] || continue
    # -F: treat the ID literally, -w: match it as a whole word, so that one ID
    # cannot match another ID that merely contains it.
    grep -wF -- "$ind" both.final.src2.bed | bedtools sort | bedtools merge > "${ind}.deni.bed"
    grep -wF -- "$ind" both.final.src1.bed | bedtools sort | bedtools merge > "${ind}.altai.bed"
done < tgt.ind.list
```

Otherwise, just do `ind=HG02223` and the same two lines with grep, repeat with the next `ind` etc.


Load the tables into R. The most straightforward way is a `for` loop in R that repeats the same action for each individual and collects the output:

```
R --vanilla # or RStudio
setwd("/home/jovyan/notebooks/introgression/data")

# IDs of the target individuals, one per line.
indlist <- unlist(read.table("tgt.ind.list", sep = "\t", header = FALSE))

# Read a merged BED file. A zero-byte file (an individual without any tract
# for that source) yields an empty table instead of the read.table() error
# "no lines available in input".
read_bed <- function(path) {
  if (isTRUE(file.size(path) == 0)) {
    return(data.frame(V1 = character(0), V2 = numeric(0), V3 = numeric(0)))
  }
  read.table(path, sep = "\t", header = FALSE)
}

# Preallocate the result vectors (A = Altai, D = Denisovan):
# cumulative tract length and number of merged tracts per individual.
n_ind <- length(indlist)
lengsA <- numeric(n_ind)
numsA <- integer(n_ind)
lengsD <- numeric(n_ind)
numsD <- integer(n_ind)

for (j in seq_along(indlist)) {
  matchtab1 <- read_bed(paste0(indlist[j], ".altai.bed"))
  matchtab2 <- read_bed(paste0(indlist[j], ".deni.bed"))
  # Interval length = end (column 3) - start (column 2); summed as doubles.
  lengsA[j] <- sum(as.numeric(matchtab1[, 3]) - as.numeric(matchtab1[, 2]))
  lengsD[j] <- sum(as.numeric(matchtab2[, 3]) - as.numeric(matchtab2[, 2]))
  numsA[j] <- nrow(matchtab1)
  numsD[j] <- nrow(matchtab2)
}
```

Now, create simple barplots:

```
# One colour per individual, so the palette is not recycled when the list
# holds more (or fewer) than ten individuals.
nicecol <- colorRampPalette(c("orange", "blue"))(length(indlist))

# Cumulative fragment length per individual, one plot per source.
barplot(lengsA, beside = TRUE, names.arg = indlist, col = nicecol, las = 2,
        main = "AltaiNea: cumulative fragment length")
barplot(lengsD, beside = TRUE, names.arg = indlist, col = nicecol, las = 2,
        main = "Denisovan: cumulative fragment length")

# Number of fragments per individual, one plot per source.
barplot(numsA, beside = TRUE, names.arg = indlist, col = nicecol, las = 2,
        main = "AltaiNea: number of fragments")
barplot(numsD, beside = TRUE, names.arg = indlist, col = nicecol, las = 2,
        main = "Denisovan: number of fragments")

```

Or both together:

```
# Row 1 (Altai) is drawn in blue, row 2 (Denisovan) in orange; the legend
# makes this mapping visible in the plot.
sepcol <- c("blue", "orange")
srcnames <- c("AltaiNea", "Denisovan")

# Side-by-side bars per individual; every bar is labelled with its individual.
barplot(rbind(lengsA, lengsD), beside = TRUE,
        names.arg = rbind(indlist, indlist), col = sepcol, las = 2,
        legend.text = srcnames, main = "Cumulative fragment length")
barplot(rbind(numsA, numsD), beside = TRUE,
        names.arg = rbind(indlist, indlist), col = sepcol, las = 2,
        legend.text = srcnames, main = "Number of fragments")
```

Solution for prettier violin plots:

```
library(ggplot2)

# Theme settings shared by both panels: no border or grid lines, no legend,
# no x-axis ticks, centred bold title.
violin_theme <- theme(
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  plot.background = element_rect(fill = "white"),
  legend.position = "none",
  axis.text.y = element_text(size = 8),
  axis.title.y = element_text(size = 10),
  axis.ticks.x = element_blank(),
  plot.title = element_text(face = "bold", hjust = 0.5, size = 12)
)

# Panel 1: cumulative fragment length per individual, grouped by source.
mtab <- data.frame(
  lengs = c(lengsA, lengsD),
  sample = c(rep("AltaiNea", length(lengsA)), rep("Denisovan", length(lengsD)))
)
p1 <- ggplot(mtab, aes(factor(sample), lengs)) +
  # Violins scaled to equal width, with the median drawn as a line.
  geom_violin(scale = "width", aes(fill = factor(sample)), adjust = 1.0, draw_quantiles = c(0.5)) +
  # One point per individual, jittered horizontally only.
  geom_jitter(height = 0, width = 0.25, size = 1) +
  # Short segment at x = 0 from y = 0 to y = 1, so y = 0 is always in range.
  geom_segment(aes(x = 0.0, y = 0, xend = 0.0, yend = 1)) +
  theme_bw() +
  violin_theme +
  xlab("") + ylab("Cumulative fragment length") +
  ggtitle(label = "Distribution of cumulative fragment lengths")

# Panel 2: number of fragments per individual, grouped by source.
mtab2 <- data.frame(
  nums = c(numsA, numsD),
  sample = c(rep("AltaiNea", length(numsA)), rep("Denisovan", length(numsD)))
)
p2 <- ggplot(mtab2, aes(factor(sample), nums)) +
  geom_violin(scale = "width", aes(fill = factor(sample)), adjust = 1.0, draw_quantiles = c(0.5)) +
  geom_jitter(height = 0, width = 0.25, size = 1) +
  geom_segment(aes(x = 0.0, y = 0, xend = 0.0, yend = 1)) +
  theme_bw() +
  violin_theme +
  # This panel shows counts, not lengths.
  xlab("") + ylab("Number of fragments") +
  ggtitle(label = "Distribution of fragment numbers")

# Install patchwork only if it is missing (with an explicit mirror, as
# R --vanilla has none configured), then load it; library() fails loudly.
if (!requireNamespace("patchwork", quietly = TRUE)) {
  install.packages("patchwork", repos = "https://cloud.r-project.org")
}
library(patchwork)

# Show both panels side by side.
p1 + p2
```


## D) 

Creating a contour plot, basically taken from here: https://github.com/YingZhou001/sprimepipeline/blob/main/pub.pipeline.pbs/tools/plot_contour.r

```
library(MASS)

# Column 5 of each tract file is paired row by row: Altai values on the x-axis,
# Denisovan values on the y-axis (the axis labels call them match rates).
altai_tab <- read.table("altai.final.bed", sep = "\t", header = FALSE)
deni_tab <- read.table("deni.final.bed", sep = "\t", header = FALSE)
matab <- data.frame(AltaiNea = altai_tab[, 5], Deni = deni_tab[, 5])

# Use only the rows where both values are present.
keep <- complete.cases(matab)
X <- matab$AltaiNea[keep]
Y <- matab$Deni[keep]

# Contour levels: 0.3 to 0.9 in steps of 0.1, then 1 to 10 in steps of 1.
contour_levels <- c(seq(0.3, 0.9, 0.1), seq(1, 10, 1))

# 2D kernel density estimate on a 100 x 100 grid over the unit square.
density2d <- kde2d(X, Y, n = 100, lims = c(0, 1, 0, 1))

# Draw the contours plus a dotted gray grid into contour.png.
png("contour.png", width = 1100, height = 1100, res = 250)
contour(
  density2d,
  levels = contour_levels,
  xaxs = "i", yaxs = "i",
  xlab = "Match to Altai Neanderthal",
  ylab = "Match to Denisovan",
  main = "", las = 1,
  cex.lab = 1.3, cex.axis = 1.2, cex.main = 1.5,
  lty = 5, labcex = 0.8
)
grid(lty = 3, col = "gray")
dev.off()
```
