Skip to content

Commit

Permalink
Merge pull request #221 from bitextor/docalign_feat
Browse files (browse the repository at this point in the history)
Replace TensorFlow with an sklearn SVM classifier in feature-based document alignment
  • Loading branch information
lpla committed Sep 24, 2021
2 parents d6f6618 + 49c4812 commit 31cae7a
Show file tree
Hide file tree
Showing 27 changed files with 712 additions and 598 deletions.
9 changes: 2 additions & 7 deletions bitextor/Snakefile
Original file line number | Diff line number | Diff line change
Expand Up @@ -1120,7 +1120,7 @@ rule bleualign:
if [ {params.workers} -gt 1 ]; then
parallel_cmd="parallel --gnu --halt 2 --pipe --j {params.workers} -l 1 --group"
fi
cat {input.indices} \
cut -f 2,3 {input.indices} \
| docjoin \
-l {input.url1} -r {input.url2} \
-l {input.plain1} -r {input.plain2} \
Expand All @@ -1133,11 +1133,8 @@ rule bleualign:
### FILTERING AND CLEANING ######################################

split_input_filename = "06_02.segalign"
split_input_extension = ".gz"

if SEGALIGN == "hunalign":
split_input_filename = "hunalign.06_02.segalign"
split_input_extension = ".xz"


# split segalign results into balanced chunks
Expand All @@ -1152,7 +1149,7 @@ checkpoint split_segalign:
"""
input:
lambda wildcards: [
f"{TRANSIENT}/{SRC_LANG}_{TRG_LANG}/{shard}/{SRC_LANG}{src_batch}_{TRG_LANG}{trg_batch}.{split_input_filename}{split_input_extension}"
f"{TRANSIENT}/{SRC_LANG}_{TRG_LANG}/{shard}/{SRC_LANG}{src_batch}_{TRG_LANG}{trg_batch}.{split_input_filename}.gz"
for (shard, (src_batch, trg_batch)) in get_align_inputs(SRC_LANG, TRG_LANG)
],
output:
Expand Down Expand Up @@ -1188,8 +1185,6 @@ checkpoint split_segalign:
CAT=cat
if [[ {input[0]} == *.gz ]]; then
CAT=zcat
elif [[ {input[0]} == *.xz ]]; then
CAT=xzcat
fi
$CAT {input} \
| ( [ "{SRC_LANG}" = "{LANG1}" ] && cat || awk -F '\t' '{{ print $2,$1,$4,$3,$5 }}' OFS='\t' )\
Expand Down
223 changes: 0 additions & 223 deletions bitextor/bitextor_align_documents.py

This file was deleted.

87 changes: 0 additions & 87 deletions bitextor/bitextor_rank.py

This file was deleted.

0 comments on commit 31cae7a

Please sign in to comment.