analysis/PASdescriptiveplots.Rmd

---
title: "PAS descriptive plots"
author: "Briana Mittleman"
date: "4/23/2019"
output: html_document
---

```{r setup, include=FALSE}
# Default chunk option for the whole document: echo the R source of each chunk
# in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
```

In this analysis I will create descriptive plots for the PAS identified in the 54 LCLs.  

```{r}
library(workflowr)
library(tidyverse)
library(reshape2)
library(cowplot)
```


## Peaks per gene

I want to plot how many genes have 0, 1, 2 and more than 2 PAS in the set. I need to join my PAS with the annotation to find out how many genes have 0 PAS.  

```{r}
# Identified PAS in BED layout. The PeakID field is split first on ":" into the
# peak number and a gene annotation, and the annotation is then split on "_"
# into the gene name and the location code.
pas <- read.table(
  "../data/PAS/APAPAS_GeneLocAnno.5perc.bed",
  header = FALSE,
  stringsAsFactors = FALSE,
  col.names = c("Chr", "start", "end", "PeakID", "score", "strand")
) %>%
  separate(PeakID, into = c("peaknum", "geneAnno"), sep = ":") %>%
  separate(geneAnno, into = c("Gene", "Loc"), sep = "_")

# Number of PAS assigned to each gene.
pasbygene <- pas %>%
  group_by(Gene) %>%
  summarise(PAS = n())

# Annotation file reduced to one row per gene name (the `anno` field is split
# on ":" into location and gene).
annotation <- read.table(
  "../../genome_anotation_data/RefSeq_annotations/ncbiRefSeq_FormatedallAnnotation.sort.bed",
  col.names = c("chr", "start", "end", "anno", "score", "strand")
) %>%
  separate(anno, into = c("Loc", "Gene"), sep = ":") %>%
  group_by(Gene) %>%
  summarise(annos = n()) %>%
  dplyr::select(Gene)

# Genes that appear only in the annotation have no PAS row; give them a count
# of 0.
PASallgene <- full_join(annotation, pasbygene, by = "Gene") %>%
  replace_na(list(PAS = 0))

# Indicator columns for genes with zero, exactly one, or more than one PAS.
PASallgene_grouped <- PASallgene %>%
  mutate(
    Zero = as.numeric(PAS == 0),
    One = as.numeric(PAS == 1),
    Multiple = as.numeric(PAS > 1)
  )

```


Plot this:  
```{r}
# Labels of the three PAS-count classes; this vector is reused by later chunks.
PAS <- c("Zero", "One", "Multiple")

# Number of genes in each class, in the same order as `PAS`.
AllPAS <- c(
  sum(PASallgene_grouped$Zero),
  sum(PASallgene_grouped$One),
  sum(PASallgene_grouped$Multiple)
)
Genes <- AllPAS

# Two-column summary table: class label and gene count.
GenebyPAS <- data.frame(PAS = PAS, AllPAS = AllPAS)

# Single stacked bar showing how the genes split across the three classes.
allPASplot <- ggplot(GenebyPAS, aes(x = "", y = AllPAS, fill = PAS)) +
  geom_bar(stat = "identity", width = .5) +
  scale_fill_brewer(palette = "GnBu") +
  labs(
    title = "Identified PAS per Gene",
    y = "Genes",
    x = "All Indentified PAS"
  )
allPASplot

ggsave(
  filename = "../output/GeneswithAPApotentialAllPAS.png",
  plot = allPASplot,
  width = 3,
  height = 5
)
```

### Subset and get stats for UTR

```{r}
# Per-gene PAS counts restricted to PAS whose location code is "utr3".
pasUTR <- pas %>%
  filter(Loc == "utr3") %>%
  group_by(Gene) %>%
  summarise(PAS = n())

# Annotated genes without such a PAS get a count of 0.
pasUTR_allgene <- full_join(annotation, pasUTR, by = "Gene") %>%
  replace_na(list(PAS = 0))

# Indicator columns for zero, exactly one, or more than one PAS.
PASUTRallgene_grouped <- pasUTR_allgene %>%
  mutate(
    Zero = as.numeric(PAS == 0),
    One = as.numeric(PAS == 1),
    Multiple = as.numeric(PAS > 1)
  )

# Gene counts per class, ordered like the global `PAS` label vector.
UTR <- c(
  sum(PASUTRallgene_grouped$Zero),
  sum(PASUTRallgene_grouped$One),
  sum(PASUTRallgene_grouped$Multiple)
)
GenesUTR <- UTR

GenebyPASUTR <- data.frame(PAS = PAS, UTR = UTR)

ggplot(GenebyPASUTR, aes(x = "", y = UTR, fill = PAS)) +
  geom_bar(stat = "identity")

```


### Subset and get stats for UTR and Intron


```{r}
# Per-gene PAS counts restricted to PAS located in an intron or a 3' UTR
# (location codes "intron" and "utr3").
pasIntron <- pas %>%
  filter(Loc %in% c("intron", "utr3")) %>%
  group_by(Gene) %>%
  summarise(PAS = n())

# Annotated genes without such a PAS get a count of 0.
pasIntron_allgene <- full_join(annotation, pasIntron, by = "Gene") %>%
  replace_na(list(PAS = 0))

# Indicator columns for zero, exactly one, or more than one PAS.
pasIntronallgene_grouped <- pasIntron_allgene %>%
  mutate(
    Zero = as.numeric(PAS == 0),
    One = as.numeric(PAS == 1),
    Multiple = as.numeric(PAS > 1)
  )

# Gene counts per class, ordered like the global `PAS` label vector.
UTRandIntron <- c(
  sum(pasIntronallgene_grouped$Zero),
  sum(pasIntronallgene_grouped$One),
  sum(pasIntronallgene_grouped$Multiple)
)

GenebyPASIntron <- data.frame(PAS = PAS, UTRandIntron = UTRandIntron)

ggplot(GenebyPASIntron, aes(x = "", y = UTRandIntron, fill = PAS)) +
  geom_bar(stat = "identity")

```


Make these side by side:  

```{r}
# Reshape each summary table to long format: one row per class, with the
# original count column name stored in `Set` and the count itself in `Genes`.
GenebyPASUTR_melt <- melt(
  GenebyPASUTR,
  id.vars = "PAS",
  variable.name = "Set",
  value.name = "Genes"
)
GenebyPAS_melt <- melt(
  GenebyPAS,
  id.vars = "PAS",
  variable.name = "Set",
  value.name = "Genes"
)
GenebyPASIntron_melt <- melt(
  GenebyPASIntron,
  id.vars = "PAS",
  variable.name = "Set",
  value.name = "Genes"
)

# Stack the three long tables so each set becomes one bar.
GenebyPAStoplot <- rbind(
  GenebyPAS_melt,
  GenebyPASUTR_melt,
  GenebyPASIntron_melt
)

geneswithAPA <- ggplot(
  GenebyPAStoplot,
  aes(x = Set, y = Genes, fill = PAS, by = Set)
) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "YlGnBu") +
  labs(title = "Genes with APA poential")

geneswithAPA

```

```{r}
# Write the side-by-side bar plot to the output directory.
ggsave(
  filename = "../output/GeneswithAPApotential.png",
  plot = geneswithAPA
)
```

```{r}
# Print the long-format table behind the plot (class, set, gene count).
GenebyPAStoplot
```

## Location of PAS

```{r}
# Number of identified PAS per location code.
PAS_loc <- pas %>%
  group_by(Loc) %>%
  summarise(nPAS = n())

# Axis labels keyed by location code. Naming the vector makes ggplot2 match
# each label to its own category by name, so an unexpected or missing `Loc`
# value can no longer shift the labels onto the wrong bars (an unmatched
# category simply keeps its raw code as label).
# NOTE(review): the codes are the five location codes used elsewhere in this
# file (cds, end, intron, utr3, utr5) - confirm they match the values in `Loc`.
loclabel <- c(
  cds = "Coding",
  end = "Downstream",
  intron = "Intronic",
  utr3 = "3' UTR",
  utr5 = "5' UTR"
)

PASLocPlot <- ggplot(PAS_loc, aes(x = Loc, y = nPAS, fill = Loc)) +
  geom_bar(stat = "identity", width = .5) +
  scale_fill_brewer(palette = "YlGnBu") +
  labs(
    x = "Gene location",
    y = "Number of identified PAS",
    title = "Location distribution for identified PAS"
  ) +
  theme(legend.position = "none") +
  scale_x_discrete(labels = loclabel) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
PASLocPlot

ggsave(filename = "../output/PASlocation.png", plot = PASLocPlot)

```

## Number of genes with APA by cutoff

I want to make a script that takes a cutoff and tells me how many genes have 0, 1, or >1 PAS. This way I can put these together to make stacked barplots:  

I can make the plot for total then again for nuclear. 

The gene annotation is the `annotation` data frame loaded above:  

```{r}


# Peak-level phenotype table for the total fraction. Its `chrom` column holds
# the peak identifier; columns 2 to 55 are taken as the individual names.
totapaanno <- read.table(
  "../data/phenotype/APApeak_Phenotype_GeneLocAnno.Total.fc",
  header = TRUE,
  stringsAsFactors = FALSE
)
indiv <- colnames(totapaanno)[2:55]

# Numeric-only version of the same table, one column per individual.
totapanum <- read.table(
  "../data/phenotype/APApeak_Phenotype_GeneLocAnno.Total.CountsOnlyNumeric",
  header = FALSE,
  col.names = indiv
)

# Mean across individuals for every peak.
totapa_mean <- rowMeans(totapanum)

# Pair each peak ID with its mean. The data frame is built directly rather
# than via cbind(), which would coerce both columns to a character matrix and
# force the means through a lossy number -> text -> number round trip before
# they are compared against the usage cutoffs.
totapaMeananno <- data.frame(
  ID = as.character(totapaanno$chrom),
  meanUsage = totapa_mean,
  stringsAsFactors = FALSE
)
```


```{r}
# Count genes by number of PAS passing a mean-usage cutoff (total fraction).
#
# fraction: minimum mean usage a PAS needs in order to be counted.
#
# Returns an integer vector of length 3 with the number of genes that have
# multiple, one, and zero PAS, always in that order. A category that no gene
# falls into is reported as 0 instead of being dropped, so the result has a
# fixed shape for every cutoff and can safely be bound column-wise.
genesbycuttoff_tot <- function(fraction) {
  # PAS per gene among the peaks that pass the cutoff. The ID is split on ":"
  # and its last field on "_" to recover the gene name.
  totapaMeananno_filt <- totapaMeananno %>%
    filter(meanUsage >= fraction) %>%
    separate(ID, into = c("chrom", "start", "end", "peakID"), sep = ":") %>%
    separate(peakID, into = c("Gene", "loc", "strand", "peak"), sep = "_") %>%
    group_by(Gene) %>%
    summarise(PAS = n())

  # Genes without any passing PAS get a count of 0.
  PASallgene <- annotation %>%
    full_join(totapaMeananno_filt, by = "Gene") %>%
    replace_na(list(PAS = 0))

  n_pas <- PASallgene$PAS
  category <- ifelse(n_pas == 0, "Zero", ifelse(n_pas == 1, "One", "Multiple"))

  # Tabulating against fixed levels pins both the order and the length.
  as.vector(table(factor(category, levels = c("Multiple", "One", "Zero"))))
}

```
```{r}
# Row labels, in the order multiple, one, zero.
categories <- c("Multiple_PAS", "One_PAS", "Zero_PAS")
cutoffs <- seq(from = 0, to = .5, by = .05)

# Start from the label column and append one count column per cutoff; only
# the first ten cutoffs of the sequence are used.
FullDF <- Reduce(
  function(acc, val) cbind(acc, val = genesbycuttoff_tot(val)),
  cutoffs[1:10],
  as.data.frame(cbind(categories))
)
colnames(FullDF) <- c("Category", cutoffs[1:10])
```

Melt:

```{r}
# Long format: one row per (category, cutoff), plus the gene count expressed
# as a proportion of the number of rows in `annotation`.
fullDF_melt <- FullDF %>%
  melt(id.vars = "Category", variable.name = "Cutoff", value.name = "NGenes") %>%
  mutate(propGene = NGenes / nrow(annotation))

# Stacked bars, one per cutoff, split by PAS-count category.
ggplot(
  fullDF_melt,
  aes(x = Cutoff, y = propGene, by = Category, fill = Category)
) +
  geom_bar(width = .5, stat = "identity") +
  scale_fill_brewer(palette = "Purples") +
  labs(
    title = "Proportion of genes with Multiple PAS by Total Usage filter",
    y = "Proportion of 27115 genes",
    x = "Usage Filter cutoff"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_discrete(
    name = "Usage Filter cutoff",
    breaks = c("0", "0.1", "0.2", "0.3", "0.4", "0.5")
  )
```
Nuclear:  
```{r}


# Peak-level phenotype table for the nuclear fraction; its `chrom` column
# holds the peak identifier.
nucapaanno <- read.table(
  "../data/phenotype/APApeak_Phenotype_GeneLocAnno.Nuclear.fc",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Numeric-only version of the same table, with columns named by the `indiv`
# vector defined in an earlier chunk.
nucapanum <- read.table(
  "../data/phenotype/APApeak_Phenotype_GeneLocAnno.Nuclear.CountsOnlyNumeric",
  header = FALSE,
  col.names = indiv
)

# Mean across individuals for every peak.
nucapa_mean <- rowMeans(nucapanum)

# Pair each peak ID with its mean. The data frame is built directly rather
# than via cbind(), which would coerce both columns to a character matrix and
# force the means through a lossy number -> text -> number round trip before
# they are compared against the usage cutoffs.
nucapaMeananno <- data.frame(
  ID = as.character(nucapaanno$chrom),
  meanUsage = nucapa_mean,
  stringsAsFactors = FALSE
)
```


```{r}
# Count genes by number of PAS passing a mean-usage cutoff (nuclear fraction).
#
# fraction: minimum mean usage a PAS needs in order to be counted.
#
# Returns an integer vector of length 3 with the number of genes that have
# multiple, one, and zero PAS, always in that order. A category that no gene
# falls into is reported as 0 instead of being dropped, so the result has a
# fixed shape for every cutoff and can safely be bound column-wise.
genesbycuttoff_nuc <- function(fraction) {
  # PAS per gene among the peaks that pass the cutoff. The ID is split on ":"
  # and its last field on "_" to recover the gene name.
  nucapaMeananno_filt <- nucapaMeananno %>%
    filter(meanUsage >= fraction) %>%
    separate(ID, into = c("chrom", "start", "end", "peakID"), sep = ":") %>%
    separate(peakID, into = c("Gene", "loc", "strand", "peak"), sep = "_") %>%
    group_by(Gene) %>%
    summarise(PAS = n())

  # Genes without any passing PAS get a count of 0.
  PASallgene <- annotation %>%
    full_join(nucapaMeananno_filt, by = "Gene") %>%
    replace_na(list(PAS = 0))

  n_pas <- PASallgene$PAS
  category <- ifelse(n_pas == 0, "Zero", ifelse(n_pas == 1, "One", "Multiple"))

  # Tabulating against fixed levels pins both the order and the length.
  as.vector(table(factor(category, levels = c("Multiple", "One", "Zero"))))
}
```


```{r}
# Rows are labelled by `categories` (multiple, one, zero). Start from that
# label column and append one nuclear count column per cutoff; only the first
# ten cutoffs of the sequence are used.
FullDFNuc <- Reduce(
  function(acc, val) cbind(acc, val = genesbycuttoff_nuc(val)),
  cutoffs[1:10],
  as.data.frame(cbind(categories))
)
colnames(FullDFNuc) <- c("Category", cutoffs[1:10])
```


```{r}
# Long format: one row per (category, cutoff), plus the gene count expressed
# as a proportion of the number of rows in `annotation`.
FullDFNuc_melt <- FullDFNuc %>%
  melt(id.vars = "Category", variable.name = "Cutoff", value.name = "NGenes") %>%
  mutate(propGene = NGenes / nrow(annotation))

# Stacked bars, one per cutoff, split by PAS-count category.
ggplot(
  FullDFNuc_melt,
  aes(x = Cutoff, y = propGene, by = Category, fill = Category)
) +
  geom_bar(width = .5, stat = "identity") +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Proportion of genes with Multiple PAS by Nuclear Usage filter",
    y = "Proportion of 27115 genes",
    x = "Usage Filter cutoff"
  )
```

## Location of PAS by filter
I will make these plots but the categories will be location of the PAS.  

```{r}

# Count PAS per gene location after a mean-usage cutoff (total fraction).
#
# fraction: minimum mean usage a PAS needs in order to be counted.
#
# Returns an integer vector of length 5 with the number of passing PAS in the
# locations cds, end, intron, utr3 and utr5, always in that order. A location
# with no passing PAS is reported as 0, and any value of `loc` outside these
# five codes (such as the stray "008559" that the ID split can produce) is
# ignored, so the result has a fixed shape for every cutoff.
locbycutoff_tot <- function(fraction) {
  known_locs <- c("cds", "end", "intron", "utr3", "utr5")

  # The ID is split on ":" and its last field on "_" to recover the location.
  totapaMeananno_filt <- totapaMeananno %>%
    filter(meanUsage >= fraction) %>%
    separate(ID, into = c("chrom", "start", "end", "peakID"), sep = ":") %>%
    separate(peakID, into = c("Gene", "loc", "strand", "peak"), sep = "_")

  # Tabulating against fixed levels pins the order and length and drops
  # every unexpected code.
  as.vector(table(factor(totapaMeananno_filt$loc, levels = known_locs)))
}

```


```{r}

# Row labels: the five location codes.
locations <- c("cds", "end", "intron", "utr3", "utr5")
cutoffs <- seq(from = 0, to = .5, by = .05)

# Start from the label column and append one PAS-count column per cutoff;
# only the first ten cutoffs of the sequence are used.
FullDF_loc <- Reduce(
  function(acc, val) cbind(acc, val = locbycutoff_tot(val)),
  cutoffs[1:10],
  as.data.frame(cbind(locations))
)
colnames(FullDF_loc) <- c("Location", cutoffs[1:10])


```


Melt:

```{r}
# Long format: one row per (location, cutoff). Within each cutoff the PAS
# counts are converted to proportions of that cutoff's total.
FullDF_loc_melt <- FullDF_loc %>%
  melt(id.vars = "Location", variable.name = "Cutoff", value.name = "NPas") %>%
  group_by(Cutoff) %>%
  mutate(propPAS = NPas / sum(NPas))

# Stacked bars, one per cutoff, split by PAS location.
totplotloc <- ggplot(
  FullDF_loc_melt,
  aes(x = Cutoff, y = propPAS, by = Location, fill = Location)
) +
  geom_bar(width = .5, stat = "identity") +
  scale_fill_brewer(palette = "BuPu") +
  labs(
    title = "PAS location\n Total Fraction",
    y = "Proportion of PAS",
    x = "Usage Filter cutoff"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_discrete(
    name = "Usage Filter cutoff",
    breaks = c("0", "0.1", "0.2", "0.3", "0.4", "0.5")
  )
```


Nuclear  

```{r}

# Count PAS per gene location after a mean-usage cutoff (nuclear fraction).
#
# fraction: minimum mean usage a PAS needs in order to be counted.
#
# Returns an integer vector of length 5 with the number of passing PAS in the
# locations cds, end, intron, utr3 and utr5, always in that order. A location
# with no passing PAS is reported as 0, and any value of `loc` outside these
# five codes (such as the stray "008559" that the ID split can produce) is
# ignored, so the result has a fixed shape for every cutoff.
locbycutoff_nuc <- function(fraction) {
  known_locs <- c("cds", "end", "intron", "utr3", "utr5")

  # The ID is split on ":" and its last field on "_" to recover the location.
  nucapaMeananno_filt <- nucapaMeananno %>%
    filter(meanUsage >= fraction) %>%
    separate(ID, into = c("chrom", "start", "end", "peakID"), sep = ":") %>%
    separate(peakID, into = c("Gene", "loc", "strand", "peak"), sep = "_")

  # Tabulating against fixed levels pins the order and length and drops
  # every unexpected code.
  as.vector(table(factor(nucapaMeananno_filt$loc, levels = known_locs)))
}

```


```{r}


cutoffs <- seq(from = 0, to = .5, by = .05)

# Start from the location label column and append one nuclear PAS-count
# column per cutoff; every cutoff in the sequence is used here.
NucFullDF_loc <- Reduce(
  function(acc, val) cbind(acc, val = locbycutoff_nuc(val)),
  cutoffs,
  as.data.frame(cbind(locations))
)
colnames(NucFullDF_loc) <- c("Location", cutoffs)

```


Melt:

```{r}
# Long format: one row per (location, cutoff). Within each cutoff the PAS
# counts are converted to proportions of that cutoff's total.
NucFullDF_locMelt <- NucFullDF_loc %>%
  melt(id.vars = "Location", variable.name = "Cutoff", value.name = "NPas") %>%
  group_by(Cutoff) %>%
  mutate(propPAS = NPas / sum(NPas))

# Stacked bars, one per cutoff, split by PAS location.
nucplotloc <- ggplot(
  NucFullDF_locMelt,
  aes(x = Cutoff, y = propPAS, by = Location, fill = Location)
) +
  geom_bar(width = .5, stat = "identity") +
  scale_fill_brewer(palette = "YlGnBu") +
  labs(
    title = "PAS location\n Nuclear Fraction",
    y = "Proportion of PAS",
    x = "Usage Filter cutoff"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_discrete(
    name = "Usage Filter cutoff",
    breaks = c("0", "0.1", "0.2", "0.3", "0.4", "0.5")
  )

```

Plot next to each other 

```{r}
# Display the PAS location plot for the nuclear fraction.
nucplotloc
```

```{r}
# Display the PAS location plot for the total fraction.
totplotloc
```