/
post_disc
executable file
·60 lines (42 loc) · 2.15 KB
/
post_disc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
#USAGE: ./post_disc <exp_name> <match_thr>
#
# Copyright 2011-2015 Johns Hopkins University (Author: Aren Jansen)
#
# Post-process a discovery experiment: merge the per-job match files into one
# master match file, build the adjacency graph, cluster it, dedup and filter
# the clusters, and write the bags-of-pseudoterms feature file.
#
# Arguments:
#   $1 - experiment name; the experiment lives in $EXPDIR/<exp_name>
#   $2 - match threshold; used both as the lower bound on field 5 of the
#        match lines and as --probthr for scripts/build_graph.py
# Exit status: 0 on success, 1 on a usage error or when any stage fails.
#
# NOTE(review): EXPDIR is not set anywhere in this script; presumably it is
# defined by the sourced `config` file (or the environment) - confirm.
. config
ulimit -c 0          # never write core dumps
set -o pipefail      # a pipeline fails when any of its stages fails

# Print a message to stderr and abort with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

TMP=${TMP:=/tmp}
EXPNAME=$1
DTWTHR=$2
OLAPTHR=0.97
DEDUPTHR=0.8
DURTHR=50
RHOTHR=500

if [[ -z "$EXPNAME" || -z "$DTWTHR" ]]; then
  die "USAGE: ./post_disc <exp_name> <match_thr>"
fi

# An empty EXPDIR would silently turn the experiment path into /<exp_name>.
if [[ -z "$EXPDIR" ]]; then
  die "ERROR: EXPDIR is not set (expected from ./config)"
fi
EXPDIR=$EXPDIR/$EXPNAME

if [[ ! -d "$EXPDIR" ]]; then
  die "ERROR: experiment subfolder $EXPDIR does not exist"
fi

MATCHDIR=$EXPDIR/matches
GRAPH=$MATCHDIR/master_graph

echo "Post-processing experiment $EXPDIR"

echo "Generating master match file: $EXPDIR/matches/master_match"
# Stage 1 (first awk): keep every 2-field line untouched; keep other lines
#   only when field 6 < RHOTHR, field 5 > DTWTHR and both spans
#   ($2-$1 and $4-$3) are longer than DURTHR.
# Stage 2 (uniq): collapse adjacent duplicate lines.
# Stage 3 (second awk): hold back each 2-field line and emit it only when a
#   non-2-field line follows it, so 2-field lines whose match lines were all
#   filtered out are dropped.
cat "$EXPDIR"/matches/out.* \
  | cut -d ' ' -f1-6 \
  | awk -v dtwthr="$DTWTHR" -v durthr="$DURTHR" -v rhothr="$RHOTHR" \
      'NF == 2 || ($6 < rhothr && $5 > dtwthr && $2-$1 > durthr && $4-$3 > durthr) {print $0;}' \
  | uniq \
  | awk '
      NF == 2     { lastpair = $0; lastNF = 2; next }
      lastNF == 2 { print lastpair; print $0; lastNF = 6; next }
                  { print $0; lastNF = 6 }
    ' > "$MATCHDIR/master_match" \
  || die "ERROR: failed to generate $MATCHDIR/master_match"

echo "Building adjacency graph"
python scripts/build_graph.py \
  --input="$MATCHDIR/master_match" \
  --probthr="$DTWTHR" \
  --olapthr="$OLAPTHR" \
  --output="$GRAPH" \
  --list="$EXPDIR/files.base" \
  || die "ERROR: scripts/build_graph.py failed"

echo "Perform clustering"
python ./scripts/conncomp_dfs.py \
  --input="$GRAPH.edges" \
  --output="$GRAPH.clusters" \
  --thresh=0 \
  || die "ERROR: scripts/conncomp_dfs.py failed"

echo "Dedup clusters"
python scripts/dedup_clusters.py \
  --ninput="$GRAPH.nodes" \
  --cinput="$GRAPH.clusters" \
  --dedupthr="$DEDUPTHR" \
  --output="$GRAPH.dedups" \
  || die "ERROR: scripts/dedup_clusters.py failed"

echo "Remove garbage clusters"
# Drop cluster lines that are 60000+ characters long or have 10000+ fields.
# The result replaces the dedups file only when the filter succeeded.
awk 'length($0) < 60000 && NF < 10000 {print $0;}' \
  < "$GRAPH.dedups" > "$GRAPH.dedupsfilt" \
  || die "ERROR: failed to filter $GRAPH.dedups"
mv "$GRAPH.dedupsfilt" "$GRAPH.dedups" \
  || die "ERROR: failed to replace $GRAPH.dedups"

echo "Write the bags-of-pseudoterms feature file"
python scripts/dump_pseudoterm.py \
  --cinput="$GRAPH.dedups" \
  --ninput="$GRAPH.nodes" \
  --output="$GRAPH.feats" \
  --listname="$EXPDIR/files.base" \
  || die "ERROR: scripts/dump_pseudoterm.py failed"

# Make sure the destination exists so the copy cannot fail after the whole
# pipeline has already run.
mkdir -p "$EXPDIR/results" \
  || die "ERROR: cannot create $EXPDIR/results"
cp "$GRAPH.dedups" "$GRAPH.nodes" "$GRAPH.feats" "$EXPDIR/results/" \
  || die "ERROR: failed to copy results to $EXPDIR/results/"