-
Notifications
You must be signed in to change notification settings - Fork 2
/
MRAnalysis_examplePipeline.sh
186 lines (154 loc) · 12.6 KB
/
MRAnalysis_examplePipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
##This code is for Runing MR Analysis over any two Summary statistics datasets folowing the ordered format:
#RsID chr pos EA OA EAF beta se pvalue N
#RsID -> the SNP ID
#chr -> chromosome
#pos -> position
#EA -> effect allele: allele that was selected to calculate the beta on the GWAS
#OA -> other allele: allele that was NOT used to alculate the beta
#EAF -> allele frequency of the effect allele
#beta -> beta
#se -> standard error of the beta
#pvalue -> pvalue of the beta
#N -> sample size
#This Script is conformed by the following steps:
#0. Arranging datasets (scripts to solve some of the following issues):
#0.1 Dataset is lacking either RsID or chr:pos
#0.2 Dataset has OR/TSTAT instead of Beta/SE
#1. Data Extraction and preparation
#1.1 Clumping
#1.2 Instrumental Variables (index SNPs) data extraction from exposure dataset
#1.3 Instrumental Variables (index SNPs) data extraction from outcome dataset
#2. Harmonization
#2.1 Make all the betas positive on the exposure dataset
#2.2 Same Effect Allele Coding Harmonization (to make sure that both the exposure and the outcome datasets have the same allele coding)
#2.3 Calculate Rsquare and F statistic
#3. Input Generation
#3.1 IVW
#3.2 MRPRESSO
#3.3 GSMR
#MR ANALYSIS
#4. IVW/MR-EGGER and MRPRESSO
#5. GSMR
###This is an example code, where the exposure is some summary statistics for intelligence (Davies,2011).
###The user should change the word 'exposure' thoughout the code for the exposure that he or she is evaluating, i.e. TeaDrinking
#cd /data/neurogen/MRandPD/GWASsummaryStats/
#awk '{print $2,$4,$5}' snp150.txt | sed -e 's/ /:/' | sort > reference_chrposrsID
#cd /data/neurogen/MRandPD/GWASsummaryStats/Nalls2019AllSamples/
#echo "RsID chr pos EA OA EAF beta se pvalue N" > header
#join nallsEtAl2019_allSamples_allVariants_sorted /data/neurogen/MRandPD/GWASsummaryStats/reference_chrposrsID | sed -e 's/chr//g' | sed -e 's/:/ /g' | sort | awk '{print $11,$1,$2,$3,$4,$5,$6,$7,$8,$9+$10}' > temp
#cat header temp | sed -e 's/ /\t/g' > /data/neurogen/MRandPD/GWASsummaryStats/Parkinsons_GWASsummarystats.txt
#1. DATA EXTRACTION AND PREPARATION
#1.1 CLUMPING
#1.1.1 Prepare GWAS files for SNP clumping (in PLINK) by extracting the columns (SNP CHR BP P) from the GWAS Results
mkdir /data/neurogen/MRandPD/Results/PLINKClumping
mkdir /data/neurogen/MRandPD/Results/PLINKClumping/GWASData4Clumping
cd /data/neurogen/MRandPD/GWASsummaryStats/
echo "SNP CHR BP P" > header
awk '{ if (NR!=1) print $1,$2,$3,$9}' exposure_GWASsummarystats.txt > temp
cat header temp > /data/neurogen/MRandPD/Results/PLINKClumping/GWASData4Clumping/exposure4clumping
awk '{print $1}' temp > /data/neurogen/MRandPD/Results/PLINKClumping/GWASData4Clumping/AllSNPs_exposure
# 1.1.2 Create .lsf files to run in cluster queue
#Parameters for clumping: --clump-p1 .00000005 --clump-p2 .01 --clump-r2 0.001 --clump-kb 10000 --clump-verbose
#this means that index snps will reach genomewide significance (.00000005) and either have a r2=0.001 with the next index SNP or be 10000 kb appart
#--clump-verbose to get the Rsquare values of the clumped SNPs with the index SNP
mkdir /data/neurogen/MRandPD/Scripts/PLINKClumpScripts/Exposure_ClumpScript/
mkdir /data/neurogen/MRandPD/Results/PLINKClumping/Exposure_Clumping/
for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
do
cp /data/neurogen/MRandPD/Scripts/test.lsf /data/neurogen/MRandPD/Scripts/PLINKClumpScripts/Exposure_ClumpScript
mv /data/neurogen/MRandPD/Scripts/PLINKClumpScripts/Exposure_ClumpScript/test.lsf /data/neurogen/MRandPD/Scripts/PLINKClumpScripts/Exposure_ClumpScript/exposure.clump_chr"$i".lsf
echo "module load plink/1.90b3" >> /data/neurogen/MRandPD/Scripts/PLINKClumpScripts/Exposure_ClumpScript/exposure.clump_chr"$i".lsf
echo "plink --bfile /data/neurogen/MRandPD/1000GenomsPlinkFormat/1kg_phase1_chr"$i" --extract /data/neurogen/MRandPD/Results/PLINKClumping/GWASData4Clumping/AllSNPs_exposure --clump /data/neurogen/MRandPD/Results/PLINKClumping/GWASData4Clumping/exposure4clumping --clump-p1 .00000005 --clump-p2 .01 --clump-r2 0.001 --clump-kb 10000 --clump-verbose --out /data/neurogen/MRandPD/Results/PLINKClumping/Exposure_Clumping/clumped.exposure.chr"$i" " >> /data/neurogen/MRandPD/Scripts/PLINKClumpScripts/Exposure_ClumpScript/exposure.clump_chr"$i".lsf
done
#1.1.3 Submit them to the queue
cd /data/neurogen/MRandPD/Scripts/PLINKClumpScripts/Exposure_ClumpScript/
for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
do
bsub -q normal < exposure.clump_chr"$i".lsf
done
#1.1.4 Verify that all clumping was compleated. to do so, run the following command and it whould return 22. if not, look for the ones missing and rerun them
cd /data/neurogen/MRandPD/Results/PLINKClumping/Exposure_Clumping/
ls | grep 'log' | wc -l
#1.1.5 Extract the list of index SNPs from the clumpling results
cat clumped.exposure.chr*.clumped > temp
grep 'INDEX' temp | sed -e 's/ / /g' | sed -e 's/ / /g' | sed -e 's/ / /g' | cut -d ' ' -f3 > indexSNPs_exposure
#1.2 EXTRACT EXPOSURE GWAS INFO FOR CALCUATING PHENOTYPIC VARIANCE EXPALINED (Rsquare) AND F-STATISTIC
mkdir /data/neurogen/MRandPD/Results/IndexSNPs_inExposuresGWASdata/
echo "RsID chr pos EA OA EAF beta se pvalue N" > header
grep -w -f indexSNPs_exposure /data/neurogen/MRandPD/GWASsummaryStats/exposure_GWASsummarystats.txt | sort | uniq > temp
cat header temp | sed -e 's/ /\t/g' > /data/neurogen/MRandPD/Results/IndexSNPs_inExposuresGWASdata/exposureIVs_inExposureGWASdata
#1.3 EXTRACT THE INFORMATION OF THE INDEX SNPS BUT FROM THE PD GWAS DATASET
mkdir /data/neurogen/MRandPD/Results/IndexSNPs_inParkinsonsGWASdata/
grep -w -f indexSNPs_exposure /data/neurogen/MRandPD/GWASsummaryStats/Parkinsons_GWASsummarystats.txt | sort | uniq > temp
cat header temp | sed -e 's/ /\t/g' > /data/neurogen/MRandPD/Results/IndexSNPs_inParkinsonsGWASdata/exposureIVs_inParkinsonsGWASdata
#2 HARMONIZATION
#2.1 MAKE ALL THE BETAS POSITIVE ON THE EXPOSURE DATASET
#It is important to convert all the betas into the same direction (positive) as it facilitates interpretation of plots and it is requiered for MR-PRESSO (see Hartwig,2016)
#Using linux shell, we'll prepare the data for beta harmonization, which will be performed on an R script. Further details on this step can be found on HarmonizationBetaPositive.R script
cd /data/neurogen/MRandPD/Results/IndexSNPs_inExposuresGWASdata/
mkdir /data/neurogen/MRandPD/Results/BetaPositiveHarmonized/
Rscript --vanilla /data/neurogen/MRandPD/Scripts/Rscripts/BetaPositiveHarmonization.R --input /data/neurogen/MRandPD/Results/IndexSNPs_inExposuresGWASdata/exposureIVs_inExposureGWASdata --exposure exposure
#2.2 SAME EFFECT ALLELE CODING HARMONIZATION
#2.2.1 After harmonizing the IV to all positive betas (BetaPositiveHarmonizarion.rmd) extract the effect allele coding from the exposure dataset. #ver que onda con R2 and F-statistic stimation
cd /data/neurogen/MRandPD/Results/BetaPositiveHarmonized/
mkdir /data/neurogen/MRandPD/Results/BetaPositiveHarmonized/ExposureGWASdata_EA/
mkdir /data/neurogen/MRandPD/Results/R2andFstatCalculation
awk '{ if (NR!=1) print $1}' /data/neurogen/MRandPD/Results/IndexSNPs_inParkinsonsGWASdata/exposureIVs_inParkinsonsGWASdata > temp
grep -w -f temp exposure_betapositive | sed -e 's/ /\t/g' | sort > /data/neurogen/MRandPD/Results/R2andFstatCalculation/exposureIVs_inExposureGWASdata
awk '{print $1,$4}' /data/neurogen/MRandPD/Results/R2andFstatCalculation/exposureIVs_inExposureGWASdata | sed -e 's/ /\t/g' > /data/neurogen/MRandPD/Results/BetaPositiveHarmonized/ExposureGWASdata_EA/exposureIVs_inExposureGWASdata_EA
#2.2.2 EXTRACT THE EFFECT ALLELE CODING FROM THE PD DATASET
mkdir /data/neurogen/MRandPD/Results/BetaPositiveHarmonized/ParkinsonsGWASdata_EA/
awk '{ if (NR!=1) print $1,$4}' /data/neurogen/MRandPD/Results/IndexSNPs_inParkinsonsGWASdata/exposureIVs_inParkinsonsGWASdata | sed -e 's/ /\t/g' | sort > /data/neurogen/MRandPD/Results/BetaPositiveHarmonized/ParkinsonsGWASdata_EA/exposureIVs_inParkinsonsGWASdata_EA
#2.2.3 Find which SNPs Effect Allere are different between the trait and PD datasets, for then flipping them following the same approach as before (R script)
mkdir /data/neurogen/MRandPD/Results/EAHarmonizationNallsPDdata/
mkdir /data/neurogen/MRandPD/Results/EAHarmonizationNallsPDdata/SameEAexposureAndPD/
mkdir /data/neurogen/MRandPD/Results/EAHarmonizationNallsPDdata/SNPsinPD4Flipping/
diff ExposureGWASdata_EA/exposureIVs_inExposureGWASdata_EA ParkinsonsGWASdata_EA/exposureIVs_inParkinsonsGWASdata_EA | sed -e 's/ /\t/g' | cut -f2 | grep 'rs' | sort | uniq > ../diffEA_exposurevsPD
cd /data/neurogen/MRandPD/Results/
grep -v -f diffEA_exposurevsPD IndexSNPs_inParkinsonsGWASdata/exposureIVs_inParkinsonsGWASdata > EAHarmonizationNallsPDdata/SameEAexposureAndPD/exposureIVs_inParkinsonsGWASdata_SameEA
grep -f diffEA_exposurevsPD IndexSNPs_inParkinsonsGWASdata/exposureIVs_inParkinsonsGWASdata > EAHarmonizationNallsPDdata/SNPsinPD4Flipping/exposureIVs_inParkinsonsGWASdata_EA4flip
#2.2.4 Run SameEAHarmonization.R to conduct Same Effect Allele coding harmonization
mkdir /data/neurogen/MRandPD/Results/SameEAHarmonized/
Rscript --vanilla /data/neurogen/MRandPD/Scripts/Rscripts/SameEAHarmonization.R --input /data/neurogen/MRandPD/Results/EAHarmonizationNallsPDdata/SNPsinPD4Flipping/exposureIVs_inParkinsonsGWASdata_EA4flip --exposure exposure
#2.2.5 After fliping the SNPs that needed to be fliped, join them with the ones that already had the same Effect allele coding as their corresponding exposure GWAS data.
cd /data/neurogen/MRandPD/Results/SameEAHarmonized/
mkdir /data/neurogen/MRandPD/Results/Harmonized_IndexSNPsinParkinsonsGWASdata/
cat /data/neurogen/MRandPD/Results/EAHarmonizationNallsPDdata/SameEAexposureAndPD/exposureIVs_inParkinsonsGWASdata_SameEA exposure_EAflipped | sed -e 's/ /\t/g'| sort > /data/neurogen/MRandPD/Results/SameEAHarmonized/exposureIVsinParkinsonsGWASdata_harmonized
#/data/neurogen/MRandPD/Results/SameEAHarmonized/exposureIVsinParkinsonsGWASdata_harmonized will contain Parkinson's GWAS data that will serve as the input for MR
#/data/neurogen/MRandPD/Results/R2andFstatCalculation/exposureIVs_inExposureGWASdata will contain the exposure GWAS data that will serve as the input for MR
#2.3 Calculate Rsquare and Fstatistic
Rscript --vanilla /data/neurogen/MRandPD/Scripts/Rscripts/R2andFstatistic_calculation.R --input /data/neurogen/MRandPD/Results/R2andFstatCalculation/exposureIVs_inExposureGWASdata --exposure exposure
#3 INPUT GENERATION
#INPUT GENERATION
mkdir /data/neurogen/MRandPD/Results/Input4IVW/
mkdir /data/neurogen/MRandPD/Results/Input4MRPRESSO/
mkdir /data/neurogen/MRandPD/Results/Input4GSMR/
#3.1 Input for IVW
cd /data/neurogen/MRandPD/Results/R2andFstatCalculation/
echo "bx bxse by byse snps ea oa eaf" > headerIVW
awk '{ if (NR!=1) print $1,$7,$8,$4,$5,$6}' exposureIVs_inExposureGWASdata | sort > tempIV
awk '{ if (NR!=1) print $1,$7,$8}' /data/neurogen/MRandPD/Results/SameEAHarmonized/exposureIVsinParkinsonsGWASdata_harmonized | sort > tempPD
join tempIV tempPD > temp
awk '{ print $2,$3,$7,$8,$1,$4,$5,$6}' temp > PDIV
cat headerIVW PDIV | sed -e 's/ /\t/g' > /data/neurogen/MRandPD/Results/Input4IVW/exposurevsPD_IVWinput
#3.2 Input for MRPRESSO
echo "bx bxse bxpval by byse bypval" > headerMRPRESSO
awk '{ if (NR!=1) print $1,$7,$8,$9}' exposureIVs_inExposureGWASdata | sort > tempIV
awk '{ if (NR!=1) print $1,$7,$8,$9}' /data/neurogen/MRandPD/Results/SameEAHarmonized/exposureIVsinParkinsonsGWASdata_harmonized | sort > tempPD
join tempIV tempPD > temp
awk '{print $2,$3,$4,$5,$6,$7}' temp > PDIV
cat headerMRPRESSO PDIV | sed -e 's/ /\t/g' > /data/neurogen/MRandPD/Results/Input4MRPRESSO/exposurevsPD_MRPRESSOinput
#3.2 Input for GSMR
echo "SNP a1 a2 a1freq bx bxse bxpval bxn by byse bypval byn" > headerGSMR
awk '{ if (NR!=1) print $1,$4,$5,$6,$7,$8,$9,$10}' exposureIVs_inExposureGWASdata | sort > tempIV
awk '{ if (NR!=1) print $1,$7,$8,$9,$10}' /data/neurogen/MRandPD/Results/SameEAHarmonized/exposureIVsinParkinsonsGWASdata_harmonized | sort > tempPD
join tempIV tempPD > IVPD
cat headerGSMR IVPD | sed -e 's/ /\t/g' > /data/neurogen/MRandPD/Results/Input4GSMR/exposurevsPD_GSMRinput
#4. Running IVW/MR-EGGER and MRPRESSO MR Analysis
#Run the /data/neurogen/MRandPD/Scripts/Rscripts/MRAnalysis_IVWMRPRESSO.Rmd script. In orther to use the MRPRESSO package, it requires R version below 3.5
#5. Running IVW/MR-EGGER and MRPRESSO MR Analysis
cd /data/neurogen/MRandPD/Results/Input4GSMR/
awk '{ if (NR!=1) print $1,$2}' exposurevsPD_GSMRinput > exposurevsPD_GSMRsnps.allele
module load gcta/1.91.1
gcta64 --bfile /data/neurogen/MRandPD/1000GenomsPlinkFormat/allchr --extract exposurevsPD_GSMRsnps.allele --update-ref-allele exposurevsPD_GSMRsnps.allele --recode --out gsmr_exposurevsPD
#Run the /data/neurogen/MRandPD/Scripts/Rscripts/MRAnalysis_GSMR.Rmd script.