/
nonNormQTL.Rmd
176 lines (112 loc) · 8.05 KB
/
nonNormQTL.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
---
title: "Usage Effect Sizes"
author: "Briana Mittleman"
date: "5/24/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
In order to compare effect sizes for the QTLs I have previously identified in an interpretable manner, I need to run the linear model with the non normalized usage. To do this I will separate the the usage (with annotation) files by chromosome and run fastqtl on these files.
```{r}
library(workflowr)
library(tidyverse)
library(cowplot)
```
##Prepare files
countsnum= APApeak_Phenotype_GeneLocAnno.Total.5perc.CountsNumeric, APApeak_Phenotype_GeneLocAnno.Nuclear.5perc.CountsNumeric
id file= APApeak_Phenotype_GeneLocAnno.Nuclear.5perc.fc.gz, APApeak_Phenotype_GeneLocAnno.Total.5perc.fc.gz
```{r}
totAnno= read.table("../data/phenotype_5perc/APApeak_Phenotype_GeneLocAnno.Total.5perc.fc.gz", stringsAsFactors = F, header = T) %>% separate(chrom, into=c("Chrchrom", "Start", "End", "ID"),sep=":") %>% mutate(Chr=str_sub(Chrchrom, 4, str_length(Chrchrom)))
colnamesTot= colnames(totAnno)[5:58]
totUsage=read.table("../data/phenotype_5perc/APApeak_Phenotype_GeneLocAnno.Total.5perc.CountsNumeric", stringsAsFactors = F, header = F, col.names = colnamesTot)
totUsageAnno=as.data.frame(cbind(Chr=totAnno$Chr, start=totAnno$Start, end=totAnno$End, ID=totAnno$ID, totUsage ))
write.table(totUsageAnno,file="../data/nonNorm_pheno/TotalUsageAllChrom.txt", col.names = T, row.names = F, quote = F, sep="\t" )
```
```{r}
nucAnno= read.table("../data/phenotype_5perc/APApeak_Phenotype_GeneLocAnno.Nuclear.5perc.fc.gz", stringsAsFactors = F, header = T)%>% separate(chrom, into=c("Chrchrom", "Start", "End", "ID"),sep=":") %>% mutate(Chr=str_sub(Chrchrom, 4, str_length(Chrchrom)))
colnamesNuc= colnames(nucAnno)[5:58]
nucUsage=read.table("../data/phenotype_5perc/APApeak_Phenotype_GeneLocAnno.Nuclear.5perc.CountsNumeric", stringsAsFactors = F, header = F, col.names = colnamesNuc)
nucUsageAnno=as.data.frame(cbind(Chr=nucAnno$Chr, start=nucAnno$Start, end=nucAnno$End, ID=nucAnno$ID, nucUsage ))
write.table(nucUsageAnno,file="../data/nonNorm_pheno/NuclearUsageAllChrom.txt", col.names = T, row.names = F, quote = F, sep="\t" )
```
##Run QTL scripts
I will create a python script to seperate the file into each chromosome for running fastQTL.
```{bash,eval=F}
sbatch run_sepUsagephen.sh
sbatch ZipandTabPheno.sh
sbatch ApaQTL_nominalNonnorm.sh
```
Concatinate files:
```{bash,eval=F}
cat ../data/nonNorm_pheno/TotalUsageChrom*.nominal.out > ../data/nonNorm_pheno/TotalUsageChrom_Nominal_AllChrom.txt
cat ../data/nonNorm_pheno/NuclearUsageChrom*.nominal.out > ../data/nonNorm_pheno/NuclearUsageChrom_Nominal_AllChrom.txt
```
##Pull out real total and nuc QLTs
```{bash,eval=F}
python qtlsPvalOppFrac.py ../data/nonNorm_pheno/TotalUsageChrom_Nominal_AllChrom.txt ../data/QTLoverlap_nonNorm/NuclearQTLinTotalNominal_nonNorm.txt
python qtlsPvalOppFrac.py ../data/apaQTLs/Nuclear_apaQTLs4pc_5fdr.txt ../data/nonNorm_pheno/NuclearUsageChrom_Nominal_AllChrom.txt ../data/QTLoverlap_nonNorm/NuclearQTLinNuclearNominal_nonNorm.txt
python qtlsPvalOppFrac.py ../data/apaQTLs/Total_apaQTLs4pc_5fdr.txt ../data/nonNorm_pheno/TotalUsageChrom_Nominal_AllChrom.txt ../data/QTLoverlap_nonNorm/TotalQTLinTotalNominal_nonNorm.txt
python qtlsPvalOppFrac.py ../data/apaQTLs/Total_apaQTLs4pc_5fdr.txt ../data/nonNorm_pheno/NuclearUsageChrom_Nominal_AllChrom.txt ../data/QTLoverlap_nonNorm/TotalQTLinNuclearNominal_nonNorm.txt
```
```{r}
totAPAinNuc=read.table("../data/QTLoverlap_nonNorm/TotalQTLinNuclearNominal_nonNorm.txt", header = F, stringsAsFactors = F, col.names=c("peakID", "snp", "dist", "pval", "slope"))
nucAPAinTot=read.table("../data/QTLoverlap_nonNorm/NuclearQTLinTotalNominal_nonNorm.txt", header = F, stringsAsFactors = F, col.names=c("peakID", "snp", "dist", "pval", "slope"))
totAPAinTot=read.table("../data/QTLoverlap_nonNorm/TotalQTLinTotalNominal_nonNorm.txt", header = F, stringsAsFactors = F, col.names=c("peakID", "snp", "dist", "pval", "slope")) %>% dplyr::select(peakID, snp, slope) %>% dplyr::rename("Originalslope"=slope)
nucAPAinNuc=read.table("../data/QTLoverlap_nonNorm/NuclearQTLinNuclearNominal_nonNorm.txt", header = F, stringsAsFactors = F, col.names=c("peakID", "snp", "dist", "pval", "slope")) %>% dplyr::select(peakID, snp, slope)%>% dplyr::rename("Originalslope"=slope)
```
##Total
```{r}
TotBoth= totAPAinNuc %>% inner_join(totAPAinTot,by=c("peakID", "snp"))
summary(lm(TotBoth$slope ~ TotBoth$Originalslope))
totbothplot=ggplot(TotBoth, aes(x=Originalslope, y=slope))+geom_point() + geom_smooth(method="lm") + labs(title="Total apaQTL effect sizes", x="Effect size in Total",y="Effect size in Nucler") + geom_density_2d(col="red") + annotate("text", y=1, x=0, label="R2=.64, slope=0.65")
```
##Nuclear
```{r}
NucBoth= nucAPAinTot %>% inner_join(nucAPAinNuc,by=c("peakID", "snp"))
summary(lm(NucBoth$slope ~ NucBoth$Originalslope))
Nucbothplot=ggplot(NucBoth, aes(x=Originalslope, y=slope))+geom_point() + geom_smooth(method="lm") + labs(title="Nuclear apaQTL effect sizes", x="Effect size in Nuclear",y="Effect size in Total") + geom_density_2d(col="red") + annotate("text", y=1, x=0, label="R2=.76, slope=0.71")
```
```{r}
plot_grid(totbothplot,Nucbothplot)
```
For the nuclear plot I want to include the PAS we cannot test in the total fraction. To do this I will write code that only writes out the lines for the PAS not in the total fraction. I need the nominal effect sizes for the nuclear qtls in PAS we could not test in total. First I will get all of the PAS tested in total and figure out the nuclear QTLs not in this set. I will then pull those associations out of the nominal nuclear set.
```{bash,eval=F}
python nucSpeceffectsize.py
```
```{r}
nucSpec=read.table("../data/QTLoverlap_nonNorm/NuclearSpecQTLinNuclearNominal_nonNorm.txt",header = F, stringsAsFactors = F, col.names=c("peakID", "snp", "dist", "pval", "Originalslope")) %>% mutate(slope=0,set="Specific") %>% select(peakID, snp, dist, pval, Originalslope, slope,set)
NucBoth_set= NucBoth %>% mutate(set="Both")
NucBothwSpec=bind_rows(NucBoth_set, nucSpec)
```
```{r}
summary(lm(NucBothwSpec$slope ~ NucBothwSpec$Originalslope))
cor.test(y=NucBothwSpec$slope, x=NucBothwSpec$Originalslope)$p.value
nucspecplot=ggplot(NucBothwSpec, aes(x=Originalslope, y=slope))+geom_point(aes( col=set)) + geom_smooth(method="lm") + labs(title="Nuclear apaQTL effect sizes", x="Effect size in Nuclear",y="Effect size in Total") + annotate("text", y=1, x=0, label="R2=.66, slope=0.62",size=6) + scale_color_brewer(palette = "Dark2")+ theme(text = element_text(size=16), legend.position = "bottom")
```
Do this in the other fraction to see what is looks like.
```{bash,eval=F}
python totSeceffectsize.py
```
```{r}
totSpec=read.table("../data/QTLoverlap_nonNorm/TotalSpecQTLinTotalNominal_nonNorm.txt",header = F, stringsAsFactors = F, col.names=c("peakID", "snp", "dist", "pval", "Originalslope")) %>% mutate(slope=0,set="Specific") %>% select(peakID, snp, dist, pval, Originalslope, slope,set)
TotBoth_set= TotBoth %>% mutate(set="Both")
TotBothwSpec=bind_rows(TotBoth_set, totSpec)
```
```{r}
summary(lm(TotBothwSpec$slope ~ TotBothwSpec$Originalslope))
totspecplot=ggplot(TotBothwSpec, aes(x=Originalslope, y=slope))+geom_point(aes( col=set)) + geom_smooth(method="lm") + labs(title="Total apaQTL effect sizes", x="Effect size in Total",y="Effect size in Nuclear") + annotate("text", y=1, x=0, label="R2=.62, slope=0.64", size=6) + scale_color_brewer(palette = "Dark2") + theme(text = element_text(size=16), legend.position = "bottom")
totspecplot
```
```{r}
grideffectplot=plot_grid(nucspecplot,totspecplot)
grideffectplot
```
```{r figure2D, include=FALSE, dev="pdf", fig.height=4, fig.width=20, crop=FALSE}
grideffectplot
```
```{r}
TotBothwSpec %>% filter(set=="Specific") %>% nrow()
NucBothwSpec %>% filter(set=="Specific") %>% nrow()
prop.test(x=c(178,34), n=c(771,440), alternative ="greater")
```