In [2]:
import tarfile
import wget
import sys
from pathlib import Path

# local imports
sys.path.append(str(Path().resolve().parent))
from python.definitions import *
from python.utils import *

# Archives to fetch, all hosted on statmt.org (paths are relative to the host).
files = [
    f"http://www.statmt.org/{archive}"
    for archive in (
        "wmt13/training-parallel-europarl-v7.tgz",
        "wmt13/training-parallel-commoncrawl.tgz",
        "wmt13/training-parallel-un.tgz",
        "wmt15/training-parallel-nc-v10.tgz",
        "wmt10/training-giga-fren.tar",
        "wmt13/dev.tgz",
    )
]

# Create the target directory (and any missing parents) up front; a no-op if it already exists.
FRENCH_DATA_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Download french corpus
# Archives that are already on disk are skipped so the cell can be re-run
# without fetching every (multi-GB) archive again. The expected local name is
# the URL's basename, which is the same path the unzip cell looks for.
# NOTE(review): this assumes an existing file is a complete download - delete
# the file by hand to force a re-fetch.
for file in files:
    target = FRENCH_DATA_DIR / Path(file).name
    if target.exists():
        print(f'skipping {file} (already downloaded)')
        continue
    print(f'downloading {file}')
    wget.download(file, out=str(FRENCH_DATA_DIR))

downloading http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz
downloading http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
downloading http://www.statmt.org/wmt13/training-parallel-un.tgz


In [3]:
# unzip
zip_files = map(lambda x: FRENCH_DATA_DIR / Path(x).name, files)

for file in zip_files:
    t_file = tarfile.open(str(file))
    for member in t_file.getmembers():
        if any(s in member.name for s in ('fr-en', 'en-fr', 'fren', 'enfr', 'newstest2012-ref.fr.sgm', 'newstest2012-src.en.sgm')):
            print(f'extracting {member.name}')
            t_file.extract(member, path=str(file.parent))

for file in FRENCH_DATA_DIR.glob('*.gz'):
    gunzip(str(file), '-f')

for file in FRENCH_DATA_DIR.rglob('*.sgm'):
    out_file = FRENCH_DATA_DIR / f'develop.{str(file).split(".")[-2].lower()}'
    ! {MOSES_SGM} < {file} > {out_file}

extracting news-commentary-v10.fr-en.en
extracting news-commentary-v10.fr-en.fr


In [6]:
# Convert to LF line endings
def norm_endings_lang(lang):
    """Apply `normalize_line_ending` to every `*.<lang>` file under FRENCH_DATA_DIR.

    The search is recursive, so files in sub-directories are included. The name
    of each file is printed before it is processed.

    Args:
        lang: file extension (without the dot) selecting the files, e.g. 'en'.
    """
    pattern = f'**/*.{lang}'
    for path in FRENCH_DATA_DIR.glob(pattern):
        print(f'normalizing line ending: {path.name}')
        normalize_line_ending(path)

for language in ('en', 'fr'):
    norm_endings_lang(language)

normalizing file ending: giga-fren.release2.fixed.en
normalizing file ending: commoncrawl.fr-en.en
normalizing file ending: develop.en
normalizing file ending: news-commentary-v10.fr-en.en
normalizing file ending: undoc.2000.fr-en.en
normalizing file ending: europarl-v7.fr-en.en
normalizing file ending: commoncrawl.fr-en.fr
normalizing file ending: giga-fren.release2.fixed.fr
normalizing file ending: news-commentary-v10.fr-en.fr
normalizing file ending: develop.fr
normalizing file ending: undoc.2000.fr-en.fr
normalizing file ending: europarl-v7.fr-en.fr
concatenating: commoncrawl.fr-en.en
3244152
concatenating: europarl-v7.fr-en.en
2007723
concatenating: giga-fren.release2.fixed.en
22520376
concatenating: news-commentary-v10.fr-en.en
200239
concatenating: undoc.2000.fr-en.en
12886831
concatenating: commoncrawl.fr-en.fr
3244152
concatenating: europarl-v7.fr-en.fr
2007723
concatenating: giga-fren.release2.fixed.fr
22520376
concatenating: news-commentary-v10.fr-en.fr
200239
concatenating:

In [4]:
# Combine corpus into one file
def concat_corpus(lang):
    """Concatenate all `*.<lang>` files under FRENCH_DATA_DIR into `corpus.<lang>`.

    Files whose name contains 'corpus' or 'develop' are excluded, so previously
    combined output and the development set are never fed back in.

    The inputs are sorted before concatenation. Glob results come back in
    filesystem order, which is not guaranteed to be stable, and this function
    is called once per language: without a fixed order the `en` and `fr` parts
    could be joined in different sequences, breaking the line-by-line alignment
    the later clean-corpus step relies on.

    Args:
        lang: file extension (without the dot) of the language to combine.
    """
    out_file = FRENCH_DATA_DIR / f'corpus.{lang}'
    parts = sorted(
        x for x in FRENCH_DATA_DIR.glob(f'**/*.{lang}')
        if 'corpus' not in x.name and 'develop' not in x.name
    )
    concatenate(out_file, *parts)

concat_corpus('en')
concat_corpus('fr')

concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/commoncrawl.fr-en.en
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/giga-fren.release2.fixed.en
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/news-commentary-v10.fr-en.en
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/training/europarl-v7.fr-en.en
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/un/undoc.2000.fr-en.en
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/commoncrawl.fr-en.fr
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/giga-fren.release2.fixed.fr
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/news-commentary-v10.fr-en.fr
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/training/europarl-v7.fr-en.fr
concatenating: /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/un/undoc.2000.fr-en.fr


In [5]:
# Cleanup
# Snapshot the tree before deleting anything, so removals cannot disturb the
# directory traversal that is producing the entries.
entries = list(FRENCH_DATA_DIR.rglob('*'))

# Delete every file except those whose name contains 'corpus' or 'develop'.
for entry in entries:
    if entry.is_dir():
        continue
    if 'corpus' in entry.name or 'develop' in entry.name:
        continue
    entry.unlink()

# Remove the directories left behind, deepest first so nested directories are
# emptied before their parents. A directory that still holds retained files is
# left in place instead of making `rmdir` raise.
folders = sorted(
    (entry for entry in entries if entry.is_dir()),
    key=lambda path: len(path.parts),
    reverse=True,
)
for folder in folders:
    if not any(folder.iterdir()):
        folder.rmdir()

In [4]:
# Tokenize corpus
for file in ('develop', 'corpus'):
    for lang in ('fr', 'en'):
        in_file = FRENCH_DATA_DIR / f'{file}.{lang}'
        out_file = FRENCH_DATA_DIR / f'{file}.tok.{lang}'
        ! cat {in_file} | {NORMALIZE_PUNCTUATION} -l {lang} | {TOKENIZER} -q -a -l {lang} >{out_file} 2> /dev/null

# Clean corpus
! {CLEAN_CORPUS} {FRENCH_DATA_DIR / 'corpus.tok'} en fr {FRENCH_DATA_DIR / 'corpus.clean'} 1 80 {FRENCH_DATA_DIR / 'corpus.retained'}
! {CLEAN_CORPUS} {FRENCH_DATA_DIR / 'develop.tok'} en fr {FRENCH_DATA_DIR / 'develop.clean'} 1 80 {FRENCH_DATA_DIR / 'develop.retained'}

clean-corpus.perl: processing /home/TILDE.LV/arturs.stafanovics/gender-bias/data/fr/corpus.tok.en & .fr to corpus.clean, cutoff 1-80, ratio 9
..........(100000)..........(200000)..........(300000)..........(400000)..........(500000)..........(600000)..........(700000)..........(800000)..........(900000)..........(1000000)..........(1100000)..........(1200000)..........(1300000)..........(1400000)..........(1500000)..........(1600000)..........(1700000)..........(1800000)..........(1900000).........Unicode non-character U+FDD3 is not recommended for open interchange in print at /home/TILDE.LV/arturs.stafanovics/gender-bias/tools/moses-scripts/scripts/training/clean-corpus-n.perl line 140, <E> line 1996159.
.(2000000)....Unicode non-character U+FDD3 is not recommended for open interchange in print at /home/TILDE.LV/arturs.stafanovics/gender-bias/tools/moses-scripts/scripts/training/clean-corpus-n.perl line 141, <E> line 2044383.
......(2100000)..........(2200000)..........(2300000)...

In [None]:
# Train truecase
for lang in ('fr', 'en'):
    truecase_model = PROJECT_ROOT / 'models' / f'truecase-model.{lang}'
    corpus = FRENCH_DATA_DIR / f'corpus.tok.{lang}'
    ! {TRAIN_TRUECASER} -model {truecase_model} -corpus {corpus}

In [11]:
# Apply truecase
for file in ('develop', 'corpus'):
    for lang in ('fr', 'en'):
        corpus_in = FRENCH_DATA_DIR / f'{file}.clean.{lang}'
        corpus_out = FRENCH_DATA_DIR / f'{file}.tc.{lang}'
        truecase_model = PROJECT_ROOT / 'models' / f'truecase-model.{lang}'
        ! {TRUECASE} < {corpus_in} > {corpus_out} -model {truecase_model}