Skip to content

Commit

Permalink
Added other half MaCoCu production scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
lpla committed May 29, 2023
1 parent 879d2b8 commit ee2c7ca
Show file tree
Hide file tree
Showing 13 changed files with 107 additions and 0 deletions.
11 changes: 11 additions & 0 deletions bitextor/example/macocu-production/macocu-ca-extra.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-ca pair ("release-2-extra").
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu-release-2
readonly REPORT="${WORK}/reports/10-mt-en-ca-release-2-extra.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-ca-release-2-extra"
  granularity='["sentences", "documents"]'
  dataDir="${WORK}/data/data-mt-en-ca-release-2-extra"
  transientDir="${WORK}/transient-mt-en-ca-release-2-extra"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_ca_release2.list'"
  shards=8
  batches=999999
  lang1=en
  lang2=ca
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-ca.sh"
  translationDirection="ca2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-ca/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
  additionalMetadata=True
  bicleanerExtraArgs="--disable_minimal_length"
  bicleanerThreshold=0.5
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

11 changes: 11 additions & 0 deletions bitextor/example/macocu-production/macocu-is-extra.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-is pair ("extra" input list),
# under the release-1-using-release-2-code work directory.
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu-release-1-using-release-2-code
readonly REPORT="${WORK}/reports/10-mt-en-is-extra.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-is-extra"
  granularity='["sentences", "documents"]'
  dataDir="${WORK}/data/data-mt-en-is-extra"
  transientDir="${WORK}/transient-mt-en-is-extra"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_is_extra.list'"
  shards=8
  batches=999999
  lang1=en
  lang2=is
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-is.sh"
  translationDirection="is2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-is/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
  additionalMetadata=True
  bicleanerExtraArgs="--disable_minimal_length"
  bicleanerThreshold=0.5
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-is pair ("r2code-v2"),
# under the release-1-using-release-2-code work directory.
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu-release-1-using-release-2-code
readonly REPORT="${WORK}/reports/10-mt-en-is-r2code-v2.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-is-r2code-v2"
  granularity='["sentences", "documents"]'
  dataDir="${WORK}/data/data-mt-en-is-r2code-v2"
  transientDir="${WORK}/transient-mt-en-is-r2code-v2"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_is_outgood_cld2.list'"
  shards=8
  batches=999999
  lang1=en
  lang2=is
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-is.sh"
  translationDirection="is2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-is/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
  additionalMetadata=True
  bicleanerExtraArgs="--disable_minimal_length"
  bicleanerThreshold=0.5
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

11 changes: 11 additions & 0 deletions bitextor/example/macocu-production/macocu-is.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-is pair
# ("paragraph-and-loomchild-and-bicleanerai" run).
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu
readonly RUN=mt-en-is-paragraph-and-loomchild-and-bicleanerai
readonly REPORT="${WORK}/reports/10-${RUN}.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-is-paragraph-and-loomchild-and-bicleanerai"
  dataDir="${WORK}/data/data-${RUN}"
  transientDir="${WORK}/transient-${RUN}"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_is.list'"
  shards=1
  batches=512
  lang1=en
  lang2=is
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-is.sh"
  translationDirection="is2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-is/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-mt pair ("r2code-v2"),
# under the release-1-using-release-2-code work directory.
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu-release-1-using-release-2-code
readonly REPORT="${WORK}/reports/10-mt-en-mt-r2code-v2.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-mt-r2code-v2"
  granularity='["sentences", "documents"]'
  dataDir="${WORK}/data/data-mt-en-mt-r2code-v2"
  transientDir="${WORK}/transient-mt-en-mt-r2code-v2"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_mt_outgood_cld2.list'"
  shards=8
  batches=999999
  lang1=en
  lang2=mt
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-mt.sh"
  translationDirection="mt2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-mt/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
  additionalMetadata=True
  bicleanerExtraArgs="--disable_minimal_length"
  bicleanerThreshold=0.5
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

11 changes: 11 additions & 0 deletions bitextor/example/macocu-production/macocu-mt.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-mt pair
# ("paragraph-and-loomchild-and-bicleanerai" run).
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu
readonly RUN=mt-en-mt-paragraph-and-loomchild-and-bicleanerai
readonly REPORT="${WORK}/reports/10-${RUN}.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-mt-paragraph-and-loomchild-and-bicleanerai"
  dataDir="${WORK}/data/data-${RUN}"
  transientDir="${WORK}/transient-${RUN}"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_mt.list'"
  shards=1
  batches=512
  lang1=en
  lang2=mt
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-mt.sh"
  translationDirection="mt2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-mt/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

11 changes: 11 additions & 0 deletions bitextor/example/macocu-production/macocu-sq-release2.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-sq pair ("release-2-v2").
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu-release-2
readonly REPORT="${WORK}/reports/10-mt-en-sq-release-2-v2.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-sq-release-2-v2"
  granularity='["sentences", "documents"]'
  dataDir="${WORK}/data/data-mt-en-sq-release-2-v2"
  transientDir="${WORK}/transient-mt-en-sq-release-2-v2"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_sq_release2.list'"
  shards=8
  batches=999999
  lang1=en
  lang2=sq
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-sq.sh"
  translationDirection="sq2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-sq/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
  additionalMetadata=True
  bicleanerExtraArgs="--disable_minimal_length"
  bicleanerThreshold=0.5
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-tr pair ("r2code-v2"),
# under the release-1-using-release-2-code work directory.
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu-release-1-using-release-2-code
readonly REPORT="${WORK}/reports/10-mt-en-tr-r2code-v2.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-tr-r2code-v2"
  granularity='["sentences", "documents"]'
  dataDir="${WORK}/data/data-mt-en-tr-r2code-v2"
  transientDir="${WORK}/transient-mt-en-tr-r2code-v2"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_tr_outgood_cld2.list'"
  shards=8
  batches=999999
  lang1=en
  lang2=tr
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-tr.sh"
  translationDirection="tr2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-tr/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
  additionalMetadata=True
  bicleanerExtraArgs="--disable_minimal_length"
  bicleanerThreshold=0.5
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

11 changes: 11 additions & 0 deletions bitextor/example/macocu-production/macocu-tr.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# MaCoCu production run of Bitextor for the en-tr pair
# ("paragraph-and-loomchild-and-bicleanerai" run).
# All paths are hard-coded for the machine the run was executed on.
# stdout and stderr of the whole run are written to REPORT.
set -euo pipefail

readonly WORK=/data1/lpla/macocu
readonly RUN=mt-en-tr-paragraph-and-loomchild-and-bicleanerai
readonly REPORT="${WORK}/reports/10-${RUN}.report"

# key=value settings handed to `bitextor --config`, one per line, in the
# same order as the original single command line.
config=(
  profiling=True
  permanentDir="${WORK}/permanent/bitextor-mt-output-en-tr-paragraph-and-loomchild-and-bicleanerai"
  dataDir="${WORK}/data/data-${RUN}"
  transientDir="${WORK}/transient-${RUN}"
  # The inner single quotes are part of the value bitextor receives.
  preverticalsFile="'/data1/lpla/prevertical_tr.list'"
  shards=1
  batches=512
  lang1=en
  lang2=tr
  documentAligner="externalMT"
  alignerCmd="bash /home/lpla/bitextor/bitextor/example/marian-translate-tr.sh"
  translationDirection="tr2en"
  sentenceAligner="bleualign"
  bifixer=True
  bicleaner=True
  bicleanerModel="${WORK}/bicleaner-model-ai/en-tr/metadata.yaml"
  deferred=True
  tmx=True
  boilerplateCleaning=True
  deduped=True
  paragraphIdentification=True
)

# If the redirection target cannot be opened the shell never starts
# bitextor, so make sure the report directory exists first.
mkdir -p -- "${REPORT%/*}"

bitextor --notemp -j 32 --config "${config[@]}" &> "${REPORT}"

2 changes: 2 additions & 0 deletions bitextor/example/marian-translate-ca.sh
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Wrapper around marian-decoder for the "caen" student model.
# The MaCoCu en-ca production script passes a script of this name as its
# alignerCmd (translationDirection "ca2en").
# Paths are hard-coded for the production machine.
readonly MARIAN_DECODER=/data1/lpla/marian-dev/build/marian-decoder
readonly MODEL_CONFIG=/data1/students/caen.student.tiny11/config.intgemm8bit.alphas.yml

# Quiet decoder run on a single CPU thread.
"${MARIAN_DECODER}" -c "${MODEL_CONFIG}" --quiet --cpu-threads 1
2 changes: 2 additions & 0 deletions bitextor/example/marian-translate-is.sh
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Wrapper around marian-decoder for the "isen" student model.
# The MaCoCu en-is production scripts pass a script of this name as their
# alignerCmd (translationDirection "is2en").
# Paths are hard-coded for the production machine.
readonly MARIAN_DECODER=/data1/lpla/marian-dev/build/marian-decoder
readonly MODEL_CONFIG=/data1/students/isen.student.base/config.intgemm8bit.alphas.yml

# Quiet decoder run on two CPU threads.
"${MARIAN_DECODER}" -c "${MODEL_CONFIG}" --quiet --cpu-threads 2
2 changes: 2 additions & 0 deletions bitextor/example/marian-translate-mt.sh
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Wrapper around marian-decoder for the "mten" student model.
# The MaCoCu en-mt production scripts pass a script of this name as their
# alignerCmd (translationDirection "mt2en").
# Paths are hard-coded for the production machine.
readonly MARIAN_DECODER=/data1/lpla/marian-dev/build/marian-decoder
readonly MODEL_CONFIG=/data1/students/mten.student.tiny11/config.intgemm8bit.alphas.yml

# Quiet decoder run on a single CPU thread.
"${MARIAN_DECODER}" -c "${MODEL_CONFIG}" --quiet --cpu-threads 1
2 changes: 2 additions & 0 deletions bitextor/example/marian-translate-sq.sh
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Wrapper around marian-decoder for the "sqen" student model.
# The MaCoCu en-sq production script passes a script of this name as its
# alignerCmd (translationDirection "sq2en").
# Paths are hard-coded for the production machine.
readonly MARIAN_DECODER=/data1/lpla/marian-dev/build/marian-decoder
readonly MODEL_CONFIG=/data1/students/sqen.student.tiny11/config.intgemm8bit.alphas.yml

# Quiet decoder run on a single CPU thread.
"${MARIAN_DECODER}" -c "${MODEL_CONFIG}" --quiet --cpu-threads 1

0 comments on commit ee2c7ca

Please sign in to comment.