-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_gene_transcripts_length.sh
executable file
·153 lines (109 loc) · 4.77 KB
/
compute_gene_transcripts_length.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/bin/bash
#type compute_gene_transcripts_length.sh -h (--help) for more info
### ALGORITHM:
### Extract exons, transcript id and gene id from columns 4, 5, and 9
### only exons that belong to transcripts which belong to genes will be extracted
### Be careful that all exon lines have a gene and a transcript id
gene_id="gene_id"
transcript_id="transcript_id"
threads=1
while [[ -n $1 ]];do
case $1 in
-g|--gene_id)
shift
gene_id=$1
;;
-t|--transcript_id)
shift
transcript_id=$1
;;
-f|--gtf_file)
shift
gtf_file=$1
;;
-c|--threads)
shift
threads=$1
;;
-h|--help)
shift
echo ' '-g, --gene_id [tag identifier for gene id\'s in column 12] default: \"gene_id\" $'\n' \
-t, --transcript_id [tag identifier for transcript id\'s in column 12] default: \"transcript_id\"$'\n' \
-f, --gtf_file [annotation file]$'\n' \
-c, --threads [number of parallel threads] default: 1
exit 1
;;
*)
echo -e '\033[0;31m'Error: \"$1\" isnt a recognized parameter
echo Use:
echo ' '-g, --gene_id [tag identifier for gene id\'s in column 12]$'\n' \
-t, --transcript_id [tag identifier for transcript id\'s in column 12]$'\n' \
-f, --gtf_file [annotation file]$'\n' \
-c, --threads [number of parallel threads]
exit 1
;;
esac
shift
done
if [[ ! -n $gtf_file ]];then
echo -e '\033[0;31m'"You forgot to include the --gtf_file"
exit 1
fi
filename=$(echo $gtf_file |rev |cut -d '/' -f 1 |rev)
### Extract exons, transcript id and gene id from columns 4, 5, and 9
### only exons that belong to transcripts which belong to genes will be extracted
### Be careful that all exon lines have a gene and a transcript id
grep $'\t'exon$'\t' $gtf_file |cut -f 4,5,9 | grep $gene_id |grep $transcript_id > reduced_${filename}.temp
# column 9 contains gene id and transcript id
# and is assumed to be ; delimmited according to gtf,gff,gff3 format
if [[ ! -f gene_transcript_exonLengths_$filename ]];then
paste <(grep -Eo "${gene_id}[^;]+;?" reduced_${filename}.temp |sed -r "s/${gene_id}[ ,=,\"]+//g" |sed 's/;//g'|sed 's/\"//g')\
<(grep -Eo "${transcript_id}[^;]+;?" reduced_${filename}.temp |sed -r "s/${transcript_id}[ ,=,\"]+//g" |sed 's/;//g'|sed 's/\"//g') \
<(Rscript <(cut -f 1,2 reduced_${filename}.temp| sed 's/\t/-/g'|sed 's/^/abs(/g'|sed 's/$/)/g' | sed 's/^/1+/g')|sed 's/\[1\] //g') > gene_transcript_exonLengths_$filename
fi
rm reduced_${filename}.temp
echo gene_id$'\t'transcript_id$'\t'transcript_length > transcript_lengths_$filename
# Function for computing transcript lengths from input transcript intermediate file with one line per transcript
get_transcript_lengths () {
transcript_array=($(cat $1))
for gene_transcript in ${transcript_array[@]};do
transcript=$(echo $gene_transcript |cut -d ':' -f 2|sed -r 's/[ ]+//g')
if [[ ${#transcript} -eq 0 ]];then
transcript=$(echo $gene_transcript|cut -d ':' -f 1)
fi
echo $gene_transcript$'\t'$(echo $(grep -E $transcript[^A-Z,a-z,0-9\.]+ gene_transcript_exonLengths_$filename |cut -f 3) |sed 's/ /+/g' |bc )|sed 's/:/\t/g' >> ${1}_$filename
done
}
# Make transcript length files
echo "Making transcript length file"
cut -f 1,2 gene_transcript_exonLengths_$filename | sed 's/\t/:/g' |sort -V |uniq > transcript_list.text_$filename
split -d -n l/$threads transcript_list.text_$filename temp_tscript_${filename}_
for tt in temp_tscript_${filename}_*;do
get_transcript_lengths $tt &
done
wait
cat temp_tscript*_$filename >> transcript_lengths_$filename
rm temp_tscript_${filename}_* transcript_list.text_$filename
# make mean gene length file
echo "Making average gene transcript length file"
echo gene_id$'\t'mean_length > gene_meanLengths_$filename
# Function for computing mean transcript lengths for each gene
get_mean_transcript_lengths () {
# $1 is a file that contains a list of gene names
# $2 is the transcript length file
for gene in $(cut -f 1 $1);do
mean_length=$(echo \( $(echo $(grep ${gene}$'\t' $2 | cut -f 3) |sed 's/ /+/g') \) / $(grep -c ${gene}$'\t' $2) |bc -l | sed -r 's/([0-9]+\.[0-9]{3})[0-9]+$/\1/g')
echo $gene$'\t'$mean_length >> temp_meanLengths_$1
done
}
# make gene list file and split into $threads separate files
cut -f 1 transcript_lengths_$filename |sort -V|uniq > temp_geneid_list323_$filename
split -d -n l/$threads temp_geneid_list323_$filename subtemp_geneid_list323_${filename}_
for genelist_file in subtemp_geneid_list323_${filename}_*;do
get_mean_transcript_lengths $genelist_file transcript_lengths_$filename &
done
wait
cat temp_meanLengths_subtemp_geneid_list323_${filename}_* >> gene_meanLengths_$filename
rm gene_transcript_exonLengths_$filename temp_geneid_list323_$filename subtemp_geneid_list323_${filename}_*
rm temp_meanLengths_subtemp_geneid_list323_${filename}_*
echo Finished