-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathlogcollector.sh
103 lines (93 loc) · 2.88 KB
/
logcollector.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
#Brian Hanratty -- bhanratt@asu.edu
#ASU Bioinformatics Core Lab -- asubioinformatics.org
#Parses a reference genome for orthologs of a protein
#Produces Orthlogs.fa, a fasta of all hits
#FIRST run `Python GetGeneIDs.py <gene>` to generate a necessary input file
#Created -- 9/12/2017
#Dependencies: blat, blastx, pslToBed, bedtools
#Includes: split.pl
if [ $# -lt 3 ]; then
echo "Usage: $0 <genome fasta> <protein fasta> <y/n for length filter> <multiplier for length filter>"
exit 1
fi
genome=$1
peptide=$2
lengthfilter=$3
multiplier=$4
peplength=`cat $peptide | awk '$0 ~ ">" {print c; c=0; } $0 !~ ">" {c+=length($0);} END { print c; }' | tail -n 1`
p=$(($multiplier*$peplength))
genomename=`echo ${1%.fa} | sed 's,^[^/]*/,,'`
peptidename=`echo ${2%.fa} | sed 's,^[^/]*/,,'`
grepline=`cat grepline.txt`
echo 'Ortholog detector by ASU Bioinformatics Core Lab (asubioinformatics.org) - Last Updated: 2/26/2018'
echo 'using genome ' $genomename;
echo 'using peptide ' $peptidename;
echo 'peptide length: ' $peplength;
echo 'multiplier: ' $multiplier;
if [ "$lengthfilter" = "y" ]; then
echo 'Length filter ON. Before blastx, will delete sequences smaller than: '$p;
else
echo 'Length filter OFF.';
fi
date
echo 'Running Step 1: blat';
#can set -minScore=65 for more specific results
tools/blat $1 $2 -t=dnax -q=prot -minIdentity=65 -minScore=55 $genomename.$peptidename.blat.psl;
date
echo 'Running Step 2: pslToBed';
tools/pslToBed $genomename.$peptidename.blat.psl $genomename.$peptidename.blat.psl.bed;
date
echo 'Running Step 3: bedtools';
bedtools getfasta -s -split -fi $genome -bed $genomename.$peptidename.blat.psl.bed -fo Copies.$peptidename;
#split fasta by IDs
perl tools/split.pl --input_file Copies.$peptidename --output_dir ./;
date
#logic to filter out sequences that are smaller than 3X peptide length
if [ "$lengthfilter" = "y" ]; then
for i in *.split
do
size=`cat $i | awk '$0 ~ ">" {print c; c=0; } $0 !~ ">" {c+=length($0);} END { print c; }' | tail -n 1`
if [ "$size" -ge "$p" ]; then
echo 'Passed the length filter: '$i;
else
echo 'Failed the length filter: '$i;
rm $i;
fi
done
fi
echo 'Running Step 4: blastx and parse for protein';
#parse split protein fastas for occurences of TP53 IDs and filter by identity >=65%
for i in *.split
do
blastx -outfmt 6 -query $i -subject tools/humanprotein.fa > $i.blr;
eval $grepline
awk '$3 >= 65.00' $i.blr.tp53 > $i.blr.tp53.passed;
done
#remove non-matches
find . -name '*.passed' -size 0 -delete
#save potential hits
for i in $(ls *.blr.tp53.passed)
do
mv $i ${i%.split.blr.tp53.passed}.orth;
done
#save fasta sequences
for i in $(ls *.orth)
do
mv ${i%.orth}.split ${i%.orth}.orth.fa;
done
echo 'All done before cleanup!';
date
#concatenate results
cat *.orth.fa > Results.fa
cat *.orth > Results.blr
#cleanup temp files
rm *.orth.fa
rm *.orth
rm *.split
rm Copies.*
rm *.split.*
rm *.bed
rm *.psl
rm grepline.txt
echo 'All done!';