-
Notifications
You must be signed in to change notification settings - Fork 0
/
RNAseq
207 lines (153 loc) · 7.69 KB
/
RNAseq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Make a new directory for the RNAseq project
mkdir -p ~/FYP/raw_fastq/FastQC
cd ~/FYP/raw_fastq
# Create symbolic links to the compressed fastq files
ln -s /media/newdrive/data/TREM2/KO_Mo_1_1.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_1_2.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_2_1.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_2_2.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_3_1.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_3_2.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_4_1.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_4_2.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_5_1.fastq.gz
ln -s /media/newdrive/data/TREM2/KO_Mo_5_2.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_1_1.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_1_2.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_2_1.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_2_2.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_3_1.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_3_2.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_4_1.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_4_2.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_5_1.fastq.gz
ln -s /media/newdrive/data/TREM2/WT_Mo_5_2.fastq.gz
# Check links the have been successfully created
ls -l
###################################################
#FastQC
###################################################
# Run fastqc on all fastq files, and output results to
fastqc *.fastq.gz --outdir=/media/newdrive/GCB2024/aaronngai/FYP/raw_fastq/FastQC
###################################################
#MultiQC
###################################################
# Run multiqc on a directory containing fastqc reports
multiqc ~/media/newdrive/GCB2024/aaronngai/FYP/raw_fastq/FastQC -n multiqc_data
# Download and view the html file
###################################################
#STAR
###################################################
# To test a loop to align all fastq files within a directory
for i in *_1.fastq.gz; do
echo STAR --genomeDir /media/newdrive/data/Reference_genomes/Human/UCSC/STAR_UCSC_no_annotations/ --runThreadN 12 --readFilesIn $i ${i%_1.fastq.gz}_2.fastq.gz --readFilesCommand zcat --outFileNamePrefix ${i%_1.fastq.gz} --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard --sjdbGTFfile /media/newdrive/data/Reference_genomes/Human/UCSC/hg38.ncbiRefSeq.gtf --sjdbOverhang 49
done
# To align all fastq files within a directory
nohup sh -c 'for i in *_1.fastq.gz; do
STAR --genomeDir /media/newdrive/data/Reference_genomes/Human/UCSC/STAR_UCSC_no_annotations/ --runThreadN 12 --readFilesIn $i ${i%_1.fastq.gz}_2.fastq.gz --readFilesCommand zcat --outFileNamePrefix ${i%_1.fastq.gz} --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMattributes Standard --sjdbGTFfile /media/newdrive/data/Reference_genomes/Human/UCSC/hg38.ncbiRefSeq.gtf --sjdbOverhang 49
done' &
# Move bams and related files to a new directory and create a multiqc summary
mkdir -p ~/FYP/coordinate_sorted_bams
mv *.out* ~/FYP/coordinate_sorted_bams/
cd ~/FYP/coordinate_sorted_bams
multiqc ~/FYP/coordinate_sorted_bams -n STAR_multiqc
# Download and view the html file
###################################################
#Samtoools
###################################################
# view header only of bam file
samtools view -H KO_Mo_1Aligned.sortedByCoord.out.bam
# view mapping section only of sam file
samtools view KO_Mo_1Aligned.sortedByCoord.out.bam | less -S
# view header and mapping section of sam file
samtools view -h KO_Mo_1Aligned.sortedByCoord.out.bam | less -S
# count number of alignments
for i in *Aligned.sortedByCoord.out.bam; do
samtools view -c $i
done
# index bam files
# -@ specify number of threads to use
for i in *Aligned.sortedByCoord.out.bam; do
samtools index $i -@ 12
done
# Check what files produced
ls -lrt
###################################################
#RSeQC
###################################################
# RSeQC for Human data
# make directory for infer exp
mkdir -p ~/FYP/coordinate_sorted_bams/rseqc/infer_experiment
# Infer whether strand-specific RNA-seq data was performed: Only need to check this for one bam file
for i in *Aligned.sortedByCoord.out.bam; do
infer_experiment.py -r /media/newdrive/data/Reference_genomes/Human/UCSC/hg38.ncbiRefSeq.bed12 -i $i > ~/FYP/coordinate_sorted_bams/rseqc/infer_experiment/${i%Aligned.sortedByCoord.out.bam}.infer_exp.txt
done
# make directory for bam stat
mkdir -p ~/FYP/coordinate_sorted_bams/rseqc/bam_stat
# Summarizing mapping statistics of a BAM or SAM file: bam_stat.py
nohup sh -c 'for i in *Aligned.sortedByCoord.out.bam; do
bam_stat.py -i $i > ~/FYP/coordinate_sorted_bams/rseqc/bam_stat/${i%Aligned.sortedByCoord.out.bam}.bamstats.txt
done' &
# make directory for read dist
mkdir -p ~/FYP/coordinate_sorted_bams/rseqc/read_distribution
# Calculate how mapped reads were distributed over genome feature (like CDS exon, 5’UTR exon, 3’ UTR exon, Intron, Intergenic regions): read_distribution.py
nohup sh -c 'for i in *Aligned.sortedByCoord.out.bam; do
read_distribution.py -i $i -r /media/newdrive/data/Reference_genomes/Human/UCSC/hg38.ncbiRefSeq.bed12 > ~/FYP/coordinate_sorted_bams/rseqc/read_distribution/${i%Aligned.sortedByCoord.out.bam}.read_dist.txt
done' &
# Calculate the RNA-seq reads coverage over gene body: geneBody_coverage.py
# This one in particular takes a long time!
# count number of alignments
for i in *Aligned.sortedByCoord.out.bam; do
samtools view -c $i
done
# subsample a proportion of the aligned reads, to end up with ~200,000
for i in *Aligned.sortedByCoord.out.bam; do
samtools view -s 0.00209 -o ${i%.bam}_subset.bam $i
done
# index subsampled bam files
for i in *Aligned.sortedByCoord.out_subset.bam; do
samtools index $i
done
# count number of alignments in subsampled bam files
for i in *Aligned.sortedByCoord.out_subset.bam; do
samtools view -c $i
done
# make directory for gene body coverage
mkdir -p ~/FYP/coordinate_sorted_bams/rseqc/genebodycoverage
nohup sh -c 'for i in *Aligned.sortedByCoord.out_subset.bam; do
geneBody_coverage.py -i $i -r /media/newdrive/data/Reference_genomes/Human/UCSC/hg38.ncbiRefSeq.bed12 -o ~/FYP/coordinate_sorted_bams/rseqc/genebodycoverage/${i%Aligned.sortedByCoord.out_subset.bam}
done' &
# Run multiqc on the results
cd ~/FYP/coordinate_sorted_bams/rseqc/genebodycoverage
multiqc ~/FYP/coordinate_sorted_bams/rseqc/genebodycoverage -n rseqc_genebodycoverage_multiqc
cd ~/FYP/coordinate_sorted_bams/rseqc/read_distribution
multiqc ~/FYP/coordinate_sorted_bams/rseqc/read_distribution -n rseqc_readdistribution_multiqc
cd ~/FYP/coordinate_sorted_bams/rseqc/infer_experiment
multiqc ~/FYP/coordinate_sorted_bams/rseqc/infer_experiment -n rseqc_inferexperiment_multiqc
###################################################
#featurecounts
###################################################
cd ~/FYP/coordinate_sorted_bams/
mkdir ~/FYP/name_sorted_bams
mkdir ~/FYP/counts
# create name-sorted bam file
# -n sort by name rather than coordinates
# -@ specify number of threads to use
# -o name of output file
# test
for i in *Aligned.sortedByCoord.out.bam; do
echo samtools sort -n -@ 12 -o ~/FYP/name_sorted_bams/${i%Aligned.sortedByCoord.out.bam}_namesorted.bam $i
done
# execute
for i in *Aligned.sortedByCoord.out.bam; do
samtools sort -n -@ 12 -o ~/FYP/name_sorted_bams/${i%Aligned.sortedByCoord.out.bam}_namesorted.bam $i
done
cd ~/FYP/name_sorted_bams
# for reverse-stranded libraries
featureCounts -T 12 -s 2 -p --countReadPairs -C -a /media/newdrive/data/Reference_genomes/Human/UCSC/hg38.ncbiRefSeq.gtf -o ~/FYP/counts/featurecounts.txt *namesorted.bam 2> ~/FYP/counts/featurecounts.screen-output.log
# change directory
cd ~/FYP/counts
# view txt file
less -S featurecounts.txt
# multiqc
multiqc ~/FYP/counts -n featurecounts_multiqc