-
Notifications
You must be signed in to change notification settings - Fork 0
/
to_validate_bleu.sh
executable file
·54 lines (42 loc) · 1.69 KB
/
to_validate_bleu.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/bin/bash
#
# Computes a BLEU score for the model on the clang8 test files.
#
# Steps:
#   1. For the source and target side, normalise and tokenise
#      data/clang8/clang8.<lang>.test.txt with the Moses scripts, then encode
#      it with spm_encode using $MODEL/<lang>.spm.
#   2. Run FileCleaner.java on the two encoded files.
#   3. Decode the encoded source side with marian-decoder, post-process the
#      output and score it with multi-bleu-detok.perl.
#
# All paths are relative to the current working directory. The scratch
# directories $TEMP and $TEST are wiped at start-up and removed again on exit.

# Stop at the first failing command (including any pipeline stage) and on any
# use of an unset variable, so a broken stage cannot silently yield a score.
set -euo pipefail

# GPU device ids handed to marian-decoder -d; an array so that several ids
# can be listed, e.g. GPUS=(0 1).
GPUS=(0)
MARIAN=../..
TOOLS=$MARIAN/tools
MOSESSCRIPT_TOKENIZER=$TOOLS/moses-scripts/scripts/tokenizer
MOSESSCRIPT_SCRIPTS=$TOOLS/moses-scripts/scripts
TEMP=temp
SRC=src
TRG=trg
MODEL=model
TEST=TEST
# File-name prefix for the intermediate files in $TEMP. It used to be read
# without ever being assigned, which produced names such as "temp/.unicode.src".
entry=test

# Remove both scratch directories on every exit path (success, error, Ctrl-C).
cleanup() {
  rm -rf -- "$TEMP" "$TEST"
}
trap cleanup EXIT

# Always start from empty scratch directories.
rm -rf -- "$TEMP" "$TEST"
mkdir -p -- "$TEMP" "$TEST"

for lang in "$SRC" "$TRG"; do
  "$MOSESSCRIPT_TOKENIZER/replace-unicode-punctuation.perl" \
    < "data/clang8/clang8.$lang.test.txt" \
    > "$TEMP/$entry.unicode.$lang"

  "$MOSESSCRIPT_TOKENIZER/remove-non-printing-char.perl" \
    < "$TEMP/$entry.unicode.$lang" \
    > "$TEMP/$entry.printingchar.$lang"

  # Normalise punctuation, then collapse runs of spaces and trim both ends.
  # The pattern needs two spaces before '*': 's/ */ /g' also matches the empty
  # string and would insert a space between every character. The end anchor
  # is a single '$'; '$$' is Makefile escaping and matches a literal dollar
  # sign when used from a shell.
  "$MOSESSCRIPT_TOKENIZER/normalize-punctuation.perl" -l en \
    < "$TEMP/$entry.printingchar.$lang" \
    | sed 's/  */ /g;s/^ *//;s/ *$//' \
    > "$TEMP/$entry.punctuation.$lang"

  "$MOSESSCRIPT_TOKENIZER/tokenizer.perl" -q -l en \
    < "$TEMP/$entry.punctuation.$lang" \
    > "$TEMP/$entry.tok.$lang"

  "$MARIAN/build/spm_encode" --model "$MODEL/$lang.spm" \
    < "$TEMP/$entry.tok.$lang" \
    > "$TEST/corpus.test.encoded.$lang"
done

# The intermediate files are no longer needed once the encoded files exist.
rm -rf -- "$TEMP"

# FileCleaner.java is not part of this script; it is handed the encoded source
# and target files. NOTE(review): presumably it filters the pair in place -
# confirm against FileCleaner.java.
java FileCleaner.java "$TEST/corpus.test.encoded.$SRC" "$TEST/corpus.test.encoded.$TRG"

# Decode the source side, strip "@@ " joiners, detruecase and detokenise the
# output, then score it. stderr of the two Moses post-processing scripts is
# discarded on purpose, as before.
# NOTE(review): the reference given to multi-bleu-detok.perl is the *encoded*
# target file, while the hypothesis has been detokenised - verify that this is
# the intended reference.
"$MARIAN/build/marian-decoder" -c "$MODEL/model.npz.orig.npz.decoder.yml" -d "${GPUS[@]}" \
    -b 12 -n 0.6 --mini-batch 8 -w 2500 --max-length 200 \
    < "$TEST/corpus.test.encoded.$SRC" \
  | sed 's/\@\@ //g' \
  | "$MOSESSCRIPT_SCRIPTS/recaser/detruecase.perl" 2> /dev/null \
  | "$MOSESSCRIPT_SCRIPTS/tokenizer/detokenizer.perl" -l en 2> /dev/null \
  | "$MOSESSCRIPT_SCRIPTS/generic/multi-bleu-detok.perl" "$TEST/corpus.test.encoded.$TRG"
# To print only the numeric score, append this stage to the pipeline above:
#   | sed -r 's/BLEU = ([0-9.]+),.*/\1/'

# $TEST is removed by the EXIT trap.