/
pipeline_writeup.tex
276 lines (257 loc) · 13.6 KB
/
pipeline_writeup.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
\title{Bipolar Exome Sequencing Pipeline}
\author{Duncan Palmer}
\date{}
\documentclass[12pt]{article}
\usepackage[top=0.85in,left=0.85in,right=0.85in,footskip=0.75in]{geometry}
\usepackage{parskip}
\usepackage{amsmath}
\usepackage{mathrsfs}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{mathtools}
\usepackage{subfig}
\usepackage{listings}
\usepackage[T1]{fontenc}
\usepackage{graphics}
\usepackage{float}
\usepackage{cite}
\usepackage{tabulary}
\usepackage{setspace}
\usepackage{booktabs}
\usepackage{parskip}
\usepackage{hhline}
\usepackage{color}
\usepackage{authblk}
% \usepackage{ccaption}
\usepackage{enumerate}
\usepackage[usenames,dvipsnames,svgnames,table]{xcolor}
\usepackage{subfig}
\usepackage{xr-hyper}
\usepackage{hyperref}
\usepackage{xcolor}
\hypersetup{
colorlinks = true,
citecolor = gray,
linkcolor = NavyBlue
}
\begin{document}
\graphicspath{{../../../QC_plots/sample_plots/}{../../../QC_plots/variant_plots/}}
\maketitle
\begin{itemize}
\item Filter genotype content from joint called .vcf. Throw away the following (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/01_load_vcf_filterGT.py}{01\_load\_vcf\_filterGT.py}):
\begin{itemize}
\item If homozygous reference, at least one of:
\begin{itemize}
\item Genotype quality $< 20$
\item Depth $< 10$
\end{itemize}
\item If heterozygous, at least one of:
\begin{itemize}
\item (Reference allele depth + alternative allele depth)/depth $< 0.8$
\item (Alternative allele depth)/depth $< 0.2$
\item Reference phred-scaled genotype posterior $< 20$
\item Depth $< 10$
\end{itemize}
\item If homozygous variant, at least one of:
\begin{itemize}
\item (Alternative allele depth)/depth $< 0.8$
\item Reference phred-scaled genotype posterior $< 20$
\item Depth $< 10$
\end{itemize}
\end{itemize}
\item Using \href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/02_prefilter_variants.py}{02\_prefilter\_variants.py}, Remove variants that either:
\begin{itemize}
\item Fall in a low complexity region.
\item Fail VQSR.
\item Fall outside of padded target intervals (50bp padding).
\item Filter out invariant sites.
\end{itemize}
\item Using \href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/03_initial_sample_qc.py}{03\_initial\_sample\_qc.py}, run `sample\_qc' using hail and remove:
\begin{itemize}
\item sample call rate $< 0.93$
\item FREEMIX contamination (\%) $> 0.02$
\item PCT\_CHIMERAS (\%) $> 0.015$
\item mean depth (dpMean) $< 30$
\item mean genotype quality (gqMean) $< 55$
\end{itemize}
Thresholds used were based on plotting the various distributions. See Figure \ref{fig:initial_sample_qc} for CDFs and the thresholds used.
\item Export common variants (allele frequency between 0.01 to 0.99) with high call rate ($>0.98$) to plink format and prune to independent SNPs using `--indep 50 5 2' (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/04_export_plink.py}{04\_export\_plink.py}).
\item Impute the sexes of the individuals (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/05_impute_sex.py}{05\_impute\_sex.py}; see Figure \ref{fig:impute_sex}) with this pruned set of variants on the X, and create list of samples with incorrect or unknown sex as defined by:
\begin{itemize}
\item Sex is `unknown' in the phenotype files.
\item F-statistic $> 0.6$ and the sex is `female' in the phenotype file.
\item F-statistic $< 0.6$ and the sex is `male' in the phenotype file.
\end{itemize}
\item Compute IBD between all pairs of individuals using the pruned set of variants in the autosomes (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/06_ibd.py}{06\_ibd.py}; see Figure \ref{fig:IBD_plot}).
\begin{itemize}
\item Create sample list of individuals such that no pair has $\hat{\pi}>0.2$ (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/06_ibd_filter.r}{06\_ibd\_filter.r}).
\end{itemize}
\item Run PCA on samples after removing relateds and those that passed initial QC, using the pruned set of variants (`09\_pca.py'; see Figure \ref{fig:PCA}).
\item Run PCA on samples plus all of 1000 genomes (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/10_pca_1kg.py}{10\_pca\_1kg.py} ; see Figure \ref{fig:PCA_1kg}).
\begin{itemize}
\item Train a random forest (10,000 trees) on the super populations of 1000 genomes and predict super populations of BipEx samples.
\item Denote strictly defined European subset as those with probability $> 0.95$ of being European according to the classifier.
\end{itemize}
\item Run PCA on samples restricted to the strictly defined European subset, and check for outliers (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/11_pca_EUR.py}{11\_pca\_EUR.py}; see Figure \ref{fig:PCA_EUR}).
\item Using a much looser definition of European, restrict to US samples from MGH and Johns Hopkins, and run PCA (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/12_pca_aj.py}{12\_pca\_.aj.py}; see Figure \ref{fig:AJ_PCA})
\begin{itemize}
\item Identify Ashkenazi Jewish cluster, and create a list of outliers (AJ or otherwise) for downstream removal.
\end{itemize}
\item Run further PCA on:
\begin{enumerate}
\item Strictly defined Europeans and Ashkenazi Jews (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/13_pca_aj_1kg.py}{13\_pca\_aj\_1kg.py}; see Figure \ref{fig:PCA_EUR_AJ_1kg}).
\begin{itemize}
\item Use Ashkenazi Jewish cluster to train a random forest and determine if there are further Ashkenazi Jews in the remainder of the dataset - there weren't.
\end{itemize}
\item Strictly defined Europeans (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/13_pca_EUR_1kg.py}{13\_pca\_EUR\_1kg.py}; see Figure \ref{fig:PCA_EUR_1kg})
\end{enumerate}
\item Now restrict to samples in the strictly defined European subset, filter to the unrelated list, and filter out samples with incorrectly defined sex or unknown sex, and run variant QC (\href{https://github.com/astheeggeggs/BipEx/blob/master/scripts_BipEx/QC_BipEx/14_final_variant_qc.py}{14\_final\_variant\_qc.py} ; see Figure \ref{fig:variant_QC}). Remove variants that satisfy at least one of:
\begin{itemize}
\item Invariant site in cleaned sample subset.
\item Call rate $< 0.97$
\item Control call rate $< 0.97$
\item Case call rate $< 0.97$
\item |Case call rate - Control call rate| $> 0.02$
\item $p$-value for Hardy Weinberg Equilibrium $< 10^{-6}$
\end{itemize}
\item Remove sample outliers after the variant cleaning in the previous step (see Figure \ref{fig:variant_QC_impact} for their distributions). Remove sample if at least one of
\begin{itemize}
\item Ratio of heterozygous to homozygous variant
\item Ratio of insertions to deletions
\item Ratio of transitions to transversions
\end{itemize}
lies more than three standard deviations away from the mean.
\item Export common (0.01 - 0.99) variants to plink format, prune and evaluate final principal components for downstream analysis, and save cleaned .mt files.
\item Run final PCA, pruning the cleaned set of variants and cleaned set of samples (see Figure \ref{fig:final_PCs}).
\end{itemize}
% INITIAL SAMPLE QC.
\begin{figure}
\centering
\subfloat{\label{fig:callrate_cdf}\includegraphics[width=0.5\textwidth]{03_callRate_cdf.jpg}}
\subfloat{\label{fig:contamination_cdf}\includegraphics[width=0.5\textwidth]{03_contamination_cdf.jpg}}\\
\subfloat{\label{fig:chimeras_cdf}\includegraphics[width=0.5\textwidth]{03_chimeras_cdf.jpg}}
\subfloat{\label{fig:depth_cdf}\includegraphics[width=0.5\textwidth]{03_dpMean_cdf.jpg}}\\
\subfloat{\label{fig:genotype_quality_cdf}\includegraphics[width=0.5\textwidth]{03_gqMean_cdf.jpg}}
\caption{CDFs of various metrics of sample quality with thresholds for inclusion/exclusion.}
\label{fig:initial_sample_qc}
\end{figure}
% IMPUTE SEX.
\begin{figure}
\centering
\subfloat{\label{fig:impute_sex_hist}\includegraphics[width=0.5\textwidth]{05_imputesex_histogram.jpg}}
\subfloat{\label{fig:impute_sex_scatter_box}\includegraphics[width=0.5\textwidth]{05_imputesex_scatter_box.jpg}}\\
\subfloat{\label{fig:impute_sex_scatter}\includegraphics[width=0.8\textwidth]{05_imputesex_scatter.jpg}}
\caption{F-statistic used to determine if individual is male or female. Threshold set at 0.6.}
\label{fig:impute_sex}
\end{figure}
% IBD FILTER.
\begin{figure}
\centering
\includegraphics[width=0.7\textwidth]{06_IBD_plot.jpg}
\caption{IBD plot. Threshold for `related' set at $\hat{\pi} > 0.2$.}
\label{fig:IBD_plot}
\end{figure}
% PCA.
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.5\textwidth]{09_PC1_PC2.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{09_PC1_PC2_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{09_PC3_PC4.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{09_PC3_PC4_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{09_PC5_PC6.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{09_PC5_PC6_collection.jpg}}
\caption{Initial PCA of all samples}
\label{fig:PCA}
\end{figure}
% PCA with 1kg.
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC1_PC2_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC1_PC2_1kg_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC3_PC4_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC3_PC4_1kg_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC5_PC6_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC5_PC6_1kg_collection.jpg}}
\caption{Initial PCA of all samples with 1000 Genomes}
\label{fig:PCA_1kg}
\end{figure}
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC1_PC2_classify_EUR_strict.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC3_PC4_classify_EUR_strict.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{10_PC5_PC6_classify_EUR_strict.jpg}}
\caption{Strict definition of European samples using random forest classifier, 500 trees. $P(European)>0.95$.}
\label{fig:PCA_1kg}
\end{figure}
% PCA europe.
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.5\textwidth]{11_PC1_PC2_EUR.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{11_PC3_PC4_EUR.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{11_PC5_PC6_EUR.jpg}}
\caption{PCA after restricting to strict European subset.}
\label{fig:PCA_EUR}
\end{figure}
% PCA AJ
\begin{figure}
\centering
\subfloat[AJ and half AJ cluster identified]{\includegraphics[width=0.5\textwidth]{12_AJ_in_US_PC1_PC2.jpg}}
\subfloat[Removal of outliers in US European samples]{\includegraphics[width=0.5\textwidth]{12_AJ_remove_in_US_PC1_PC2.jpg}}
\caption{PCA of loosely defined `European' US samples to identify AJ cluster}
\label{fig:AJ_PCA}
\end{figure}
% PCA 1kg AJ
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC1PC2_EUR_and_AJ_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC1PC2_EUR_and_AJ_1kg_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC3PC4_EUR_and_AJ_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC3PC4_EUR_and_AJ_1kg_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC5PC6_EUR_and_AJ_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC5PC6_EUR_and_AJ_1kg_collection.jpg}}
\caption{PCA of European cohorts with 1000 Genomes}
\label{fig:PCA_EUR_AJ_1kg}
\end{figure}
% PCA 1kg EUR
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC1PC2_EUR_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC1PC2_EUR_1kg_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC3PC4_EUR_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC3PC4_EUR_1kg_collection.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC5PC6_EUR_1kg.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{13_PC5PC6_EUR_1kg_collection.jpg}}
\caption{PCA of European cohorts with 1000 Genomes}
\label{fig:PCA_EUR_1kg}
\end{figure}
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.5\textwidth]{14_call_rate_cdf.jpg}}
\subfloat{\includegraphics[width=0.5\textwidth]{14_call_rate_diff.jpg}}\\
\subfloat{\includegraphics[width=0.5\textwidth]{14_pHWE.jpg}}
\caption{Variant QC}
\label{fig:variant_QC}
\end{figure}
% Variant QC before and after
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.45\textwidth]{15_nSingletonsbyBatchColLocation.jpg}}
\subfloat{\includegraphics[width=0.45\textwidth]{15_nSingletonsbyBatchColPheno.jpg}}\\
\subfloat{\includegraphics[width=0.45\textwidth]{15_rHetHomVarbyBatchColLocation.jpg}}
\subfloat{\includegraphics[width=0.45\textwidth]{15_rHetHomVarbyBatchColPheno.jpg}}\\
\subfloat{\includegraphics[width=0.45\textwidth]{15_rInsertionDeletionbyBatchColLocation.jpg}}
\subfloat{\includegraphics[width=0.45\textwidth]{15_rInsertionDeletionbyBatchColPheno.jpg}}\\
\subfloat{\includegraphics[width=0.45\textwidth]{15_rTiTvbyBatchColLocation.jpg}}
\subfloat{\includegraphics[width=0.45\textwidth]{15_rTiTvbyBatchColPheno.jpg}}\\
\caption{Visualise the impact of variant QC on different metrics.}
\label{fig:variant_QC_impact}
\end{figure}
\begin{figure}
\centering
\subfloat{\includegraphics[width=0.45\textwidth]{17_PC1_PC2_final_PCs.jpg}}
\subfloat{\includegraphics[width=0.45\textwidth]{17_PC3_PC4_final_PCs.jpg}}\\
\subfloat{\includegraphics[width=0.45\textwidth]{17_PC5_PC6_final_PCs.jpg}}
\caption{PCA of pruned cleaned variants and samples.}
\label{fig:final_PCs}
\end{figure}
\end{document}