-
Notifications
You must be signed in to change notification settings - Fork 2
/
signalsiteanalysis.Rmd
198 lines (141 loc) · 9.38 KB
/
signalsiteanalysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
---
title: "Signal Site analysis"
author: "Briana Mittleman"
date: "4/23/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyverse)
library(workflowr)
library(reshape2)
library(gplots)
library(cowplot)
```
In this analysis I will plot the distribution of signal sites upstream of the PAS I have found.
First I use a python script to make a bed file with the 100 base pairs upsream of the PAS:
```{bash,eval=F}
module load Anaconda3
source activate three-prime-env
mkdir ../data/SignalSiteFiles
python Upstream100Bases_general.py ../data/PAS/APAPAS_GeneLocAnno.5perc.bed ../data/SignalSiteFiles/APAPAS_100up.bed
```
Now I use bedtools nuc to get the sequence for each of these regions:
```{bash,eval=F}
sbatch getSeq100up.sh
```
I can now run the DistPAS2Sig.py which will give me the location for the signal site for each PAS.I am running this with the 12 most common PAS signal sites.
```{bash,eval=F}
sbatch run_distPAS2Sig.sh
```
Upload all of the results:
```{r}
Loc_AATAAA= read.table("../data/SignalSiteFiles/Loc_AATAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="AATAAA")
Loc_AAAAAG= read.table("../data/SignalSiteFiles/Loc_AAAAAG_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="AAAAAG")
Loc_AATACA= read.table("../data/SignalSiteFiles/Loc_AATACA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="AATACA")
Loc_AATAGA= read.table("../data/SignalSiteFiles/Loc_AATAGA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="AATAGA")
Loc_AATATA= read.table("../data/SignalSiteFiles/Loc_AATATA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="AATATA")
Loc_ACTAAA= read.table("../data/SignalSiteFiles/Loc_ACTAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="ACTAAA")
Loc_AGTAAA= read.table("../data/SignalSiteFiles/Loc_AGTAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="AGTAAA")
Loc_ATTAAA= read.table("../data/SignalSiteFiles/Loc_ATTAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="ATTAAA")
Loc_CATAAA= read.table("../data/SignalSiteFiles/Loc_CATAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="CATAAA")
Loc_GATAAA= read.table("../data/SignalSiteFiles/Loc_GATAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="GATAAA")
Loc_TATAAA= read.table("../data/SignalSiteFiles/Loc_TATAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="TATAAA")
Loc_AAAAAA= read.table("../data/SignalSiteFiles/Loc_AAAAAA_Distance2end.txt", header=F, col.names =c( "PAS", "Distance2PAS")) %>% mutate(Site="AAAAAA")
```
Join these together:
```{r}
AllsiteDF=as.data.frame(rbind(Loc_AATAAA,Loc_AAAAAG,Loc_AATACA,Loc_AATAGA,Loc_AATATA,Loc_ACTAAA,Loc_AGTAAA,Loc_ATTAAA, Loc_GATAAA,Loc_TATAAA,Loc_CATAAA, Loc_AAAAAA))
```
```{r}
AllsiteDF_sep = AllsiteDF %>% separate(PAS, int=c("GenePeak", "Location"), sep="_")
ggplot(AllsiteDF_sep, aes(x=Distance2PAS, by=Site, col=Site)) + stat_ecdf() + facet_wrap(~Location)
```
Check to see if any PAS have more than one signal site detected:
```{r}
AllsiteDFmultsites=AllsiteDF %>% group_by(PAS) %>% mutate(nSites=n()) %>% filter(nSites>1)
```
First take the perfect match within 50 bp then use the closest.
Write out the AllSite in order to use it in the chooseSignalSite.py script:
```{r}
write.table(AllsiteDF, file="../data/SignalSiteFiles/AllSignalSite.txt", quote=F, col.names = F, row.names = F, sep="\t")
```
```{bash, eval=F}
python chooseSignalSite.py ../data/SignalSiteFiles/AllSignalSite.txt ../data/SignalSiteFiles/AllSignalSite_1perPAS.txt
```
```{r}
AllsiteDF_1per=read.table(file="../data/SignalSiteFiles/AllSignalSite_1perPAS.txt", col.names = colnames(AllsiteDF)) %>% mutate(NegCount=-1*as.integer(as.character(Distance2PAS)))
```
Plot
```{r}
dist2signalsiteplot=ggplot(AllsiteDF_1per, aes(group=Site, x=NegCount, fill=Site)) + geom_histogram(position="stack",bins=50 ) + labs(x="Distance from PAS", y="N annotated Sites", title="Location of annotated signal sites")
dist2signalsiteplot
ggsave(dist2signalsiteplot, file="../output/SignalSitePlot.png")
```
Plot with proportion:
```{r}
allPAS=read.table("../data/PAS/APAPAS_GeneLocAnno.5perc.bed", stringsAsFactors = F, col.names = c("chr","start","end","PAS","score","strand"))
AllsiteDF_1per_prop= AllsiteDF_1per %>% group_by(Site,NegCount) %>% summarise(CountperPos=n()) %>% mutate(TotCount=sum(CountperPos),prop=CountperPos/nrow(allPAS))
```
Plot with prop:
```{r}
dist2signalsiteplotprop=ggplot(AllsiteDF_1per_prop, aes(group=Site, x=NegCount,y=prop, fill=Site)) + geom_histogram(position="stack",bins=50,stat="identity" ) + labs(x="Distance from PAS", y="Proportion of annotated Sites", title="Location of annotated signal sites")
dist2signalsiteplotprop
```
```{r}
nrow(AllsiteDF_1per)/nrow(allPAS)
```
Seperate by location:
```{r}
AllsiteDF_1per_sep= AllsiteDF_1per %>%separate(PAS, int=c("GenePeak", "Location"), sep="_")
```
```{r}
dist2signalsiteplot_byloc=ggplot(AllsiteDF_1per_sep, aes(group=Site, x=NegCount, fill=Site)) + geom_histogram(position="stack",bins=50 ) + labs(x="Distance from PAS", y="N annotated Sites", title="Location of annotated signal sites") + facet_wrap(~Location)
dist2signalsiteplot_byloc
ggsave(dist2signalsiteplot_byloc, file="../output/SignalSitePlotbyLoc.png")
```
Proportion:
```{r}
allPAS_byloc=allPAS %>% separate(PAS,into=c("peakid", "loc"),sep="_") %>% group_by(loc) %>% summarise(nLoc=n())
allPAS_byloc_new=as.data.frame(allPAS_byloc$nLoc %>% t())
colnames(allPAS_byloc_new) = allPAS_byloc$loc
AllsiteDF_1per_sep_INTRON=AllsiteDF_1per_sep %>% filter(Location=="intron") %>% group_by(Site,NegCount) %>% summarise(CountperPos=n()) %>% mutate(TotCount=sum(CountperPos),prop=CountperPos/(allPAS_byloc_new$intron))
ggplot(AllsiteDF_1per_sep_INTRON, aes(group=Site, x=NegCount,y=prop, fill=Site)) + geom_histogram(position="stack",bins=5, stat="identity" ) + labs(x="Distance from PAS", y="Propotion of Intron annotated Sites", title="Location of annotated signal sites")
```
```{r}
AllsiteDF_1per_sep_UTR=AllsiteDF_1per_sep %>% filter(Location=="utr3") %>% group_by(Site,NegCount) %>% summarise(CountperPos=n()) %>% mutate(TotCount=sum(CountperPos),prop=CountperPos/(allPAS_byloc_new$intron))
ggplot(AllsiteDF_1per_sep_UTR, aes(group=Site, x=NegCount,y=prop, fill=Site)) + geom_histogram(position="stack", stat="identity" ) + labs(x="Distance from PAS", y="Propotion of UTR annotated Sites", title="Location of annotated signal sites in UTR")
```
```{r}
Propwith=c(nrow(AllsiteDF_1per_sep %>% filter(Location=="intron"))/allPAS_byloc_new$intron,nrow(AllsiteDF_1per_sep %>% filter(Location=="utr3"))/allPAS_byloc_new$utr3,nrow(AllsiteDF_1per_sep %>% filter(Location=="utr5"))/allPAS_byloc_new$utr5,nrow(AllsiteDF_1per_sep %>% filter(Location=="cds"))/allPAS_byloc_new$cds,nrow(AllsiteDF_1per_sep %>% filter(Location=="end"))/allPAS_byloc_new$end)
Locations=c("intron", "utr3", "utr5", "Coding", "Downstream")
propdf=as.data.frame(cbind(Location=Locations,Proportion=Propwith))
propdf$Proportion=as.numeric(as.character(propdf$Proportion))
all=ggplot(propdf,aes(x=Location, y=Proportion, fill=Location)) + geom_bar(stat="identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
```{r}
AllsiteDF_1per_sep_noncon=AllsiteDF_1per_sep %>% filter(Site != "AATAAA")
Propwithnotcon=c(nrow(AllsiteDF_1per_sep_noncon %>% filter(Location=="intron"))/allPAS_byloc_new$intron,nrow(AllsiteDF_1per_sep_noncon %>% filter(Location=="utr3"))/allPAS_byloc_new$utr3,nrow(AllsiteDF_1per_sep_noncon %>% filter(Location=="utr5"))/allPAS_byloc_new$utr5,nrow(AllsiteDF_1per_sep_noncon %>% filter(Location=="cds"))/allPAS_byloc_new$cds,nrow(AllsiteDF_1per_sep_noncon %>% filter(Location=="end"))/allPAS_byloc_new$end)
Locations=c("intron", "utr3", "utr5", "Coding", "Downstream")
propdf_noncon=as.data.frame(cbind(Location=Locations,Proportion=Propwithnotcon))
propdf_noncon$Proportion=as.numeric(as.character(propdf_noncon$Proportion))
non=ggplot(propdf_noncon,aes(x=Location, y=Proportion, fill=Location)) + geom_bar(stat="identity")+theme(axis.text.x = element_text(angle = 90, hjust = 1))
non
```
```{r}
AllsiteDF_1per_sep_con=AllsiteDF_1per_sep %>% filter(Site == "AATAAA")
Propwithcon=c(nrow(AllsiteDF_1per_sep_con %>% filter(Location=="intron"))/allPAS_byloc_new$intron,nrow(AllsiteDF_1per_sep_con %>% filter(Location=="utr3"))/allPAS_byloc_new$utr3,nrow(AllsiteDF_1per_sep_con %>% filter(Location=="utr5"))/allPAS_byloc_new$utr5,nrow(AllsiteDF_1per_sep_con %>% filter(Location=="cds"))/allPAS_byloc_new$cds,nrow(AllsiteDF_1per_sep_con %>% filter(Location=="end"))/allPAS_byloc_new$end)
Locations=c("intron", "utr3", "utr5", "Coding", "Downstream")
propdf_con=as.data.frame(cbind(Location=Locations,Proportion=Propwithcon))
propdf_con$Proportion=as.numeric(as.character(propdf_con$Proportion))
con=ggplot(propdf_con,aes(x=Location, y=Proportion, fill=Location)) + geom_bar(stat="identity")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
con
```
```{r}
plot_grid(all, con, non, labels=c("All PAS", "Cononical PAS", "Non-conical PAS"))
```
##Signal site and usage relationship
**Next plot: look at presence of signal site compared to PAS usage**
I need to look at the mean usage and fraction it by if the peak has a signal site.