In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import csv
import logging
import os
import shutil
import sys
from pathlib import Path

sys.path.insert(0, '../')

from swissdox import SwissdoxData
import tokenization
import utils

In [None]:
# Raise the csv module's per-field size limit as far as the platform allows.
# NOTE(review): presumably required because article bodies in the Swissdox TSV
# exceed the default limit -- confirm against SwissdoxData.
# csv.field_size_limit() raises OverflowError when the value does not fit into
# a C long (e.g. sys.maxsize on 64-bit Windows), so halve it until accepted.
csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(csv_field_limit)
        break
    except OverflowError:
        csv_field_limit //= 2

In [4]:
# Configure the root logger so that records of level INFO and above are emitted.
logging.basicConfig(level=logging.INFO)

In [5]:
# Set the variable in the kernel's own environment. A `!export ...` line runs in
# a short-lived subshell, so the value never reaches this process or the
# libraries and child processes it starts.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
# Language codes processed by this notebook; each one names a raw
# "<code>.tsv" input file and prefixes every derived output file.
LANGUAGES = ["de_CH", "fr_CH", "it_CH", "rm_CH"]

In [7]:
# Root folder of the Swissdox data, relative to the notebook's working directory.
data_dir = Path("../data/swissdox")
# Fail fast if the notebook is run from the wrong directory or the data is missing.
assert data_dir.exists()

In [8]:
# One output folder per vocabulary variant: text prepared for the existing XLM
# vocabulary, and text prepared for the newly trained vocabulary.
out_dir_xlm_vocab = data_dir / "xlm_vocab"
out_dir_new_vocab = data_dir / "new_vocab"
for out_dir in (out_dir_xlm_vocab, out_dir_new_vocab):
    # Re-running the cell is fine: existing folders are left alone.
    out_dir.mkdir(exist_ok=True)

## Extract articles from the Swissdox@LiRI TSV output

In [None]:
def _write_articles(articles, path, metadata_use_special_tokens, add_metadata=True):
    """Write the text form of each article to `path`, separated by blank lines.

    Args:
        articles: iterable of article objects exposing
            `to_txt(add_metadata=..., metadata_use_special_tokens=...)`.
        path: destination text file; it is overwritten.
        metadata_use_special_tokens: forwarded to `to_txt`.
        add_metadata: forwarded to `to_txt`.
    """
    # Explicit UTF-8: the corpora are German/French/Italian/Romansh text, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(path, "w", encoding="utf-8") as f:
        for article in articles:
            text = article.to_txt(
                add_metadata=add_metadata,
                metadata_use_special_tokens=metadata_use_special_tokens,
            )
            f.write(text + "\n\n")


for language in LANGUAGES:
    print(language)
    data = SwissdoxData(data_dir / "raw" / f"{language}.tsv")
    articles = list(data.get_articles())
    # NOTE(review): if utils.create_split shuffles, confirm that it is seeded so
    # the train/valid split is reproducible across runs.
    train, valid = utils.create_split(articles)
    print(f"train: {len(train)} articles")
    print(f"valid: {len(valid)} articles")

    # The same split is written twice; the two variants differ only in whether
    # the metadata is rendered with custom special tokens.
    print("XLM vocab => no custom special tokens")
    for split_name, split_articles in (("train", train), ("valid", valid)):
        _write_articles(
            split_articles,
            out_dir_xlm_vocab / f"{language}.{split_name}.txt",
            metadata_use_special_tokens=False,
        )

    print("Custom vocab => custom special tokens")
    for split_name, split_articles in (("train", train), ("valid", valid)):
        _write_articles(
            split_articles,
            out_dir_new_vocab / f"{language}.{split_name}.txt",
            metadata_use_special_tokens=True,
        )

## Tokenization with XLM vocabulary

In [None]:
# Tokenize every train/valid text file of the XLM-vocabulary variant; the output
# is written next to the input with ".txt" replaced by ".xlm.bpe".
for language in LANGUAGES:
    for split_name in ("train", "valid"):
        txt_path = out_dir_xlm_vocab / f"{language}.{split_name}.txt"
        bpe_path = txt_path.with_suffix(".xlm.bpe")
        tokenization.tokenize_xlm(txt_path, bpe_path)

## Tokenization with new vocabulary

In [None]:
# Build the new vocabulary from the training splits of all languages only
# (validation text is not used).
spm_training_files = [
    out_dir_new_vocab / f"{language}.train.txt"
    for language in LANGUAGES
]
tokenization.create_spm_vocabulary(
    name="swissbert",
    txt_paths=spm_training_files,
    vocab_size=50260,
    sampling_alpha=0.3,
    # "</s>" plus the metadata markers are registered as user-defined symbols.
    user_defined_symbols=["</s>", "<medium>", "<year>", "<month>"],
)

In [None]:
# Move the freshly trained model/vocab files from the working directory into
# ../vocab and remember their new locations for the following cells.
# NOTE(review): assumes create_spm_vocabulary(name="swissbert") writes
# "swissbert.v1.*" into the current directory -- confirm in tokenization.py.
model_path = Path("swissbert.v1.model")
vocab_path = Path("swissbert.v1.vocab")
vocab_dir = Path("../vocab")
for required_path in (model_path, vocab_path, vocab_dir):
    assert required_path.exists()
swissbert_model_path = vocab_dir / model_path.name
swissbert_vocab_path = vocab_dir / vocab_path.name
for source_path, target_path in (
    (model_path, swissbert_model_path),
    (vocab_path, swissbert_vocab_path),
):
    shutil.move(source_path, target_path)

In [None]:
# Tokenize every train/valid text file of the new-vocabulary variant with the
# moved swissbert model; output goes next to the input as "*.new.bpe".
for language in LANGUAGES:
    for split_name in ("train", "valid"):
        txt_path = out_dir_new_vocab / f"{language}.{split_name}.txt"
        bpe_path = txt_path.with_suffix(".new.bpe")
        tokenization.tokenize_hf(swissbert_model_path, txt_path, bpe_path)

## Binarization

In [None]:
# Binarize the XLM-tokenized text with fairseq-preprocess, one destination
# folder per language under "$DATA_DIR/bin/".
# Values are handed to the shell lines through environment variables.
# NOTE(review): LANGUAGE is also consulted by GNU gettext to pick the message
# language of child processes; a less generic variable name may be safer.
os.environ["DATA_DIR"] = str(out_dir_xlm_vocab.resolve())
for language in LANGUAGES:
    os.environ["LANGUAGE"] = language
    !fairseq-preprocess \
      --only-source \
      --trainpref "$DATA_DIR/$LANGUAGE.train.xlm.bpe" \
      --validpref "$DATA_DIR/$LANGUAGE.valid.xlm.bpe" \
      --destdir "$DATA_DIR/bin/$LANGUAGE" \
      --bpe sentencepiece \
      --srcdict ../vocab/xlm.dict.txt \
      --workers 20
    # Drop the per-language dictionary copy; a single shared one is placed below.
    !rm "$DATA_DIR/bin/$LANGUAGE/dict.txt"
# The shared dictionary is identical for every language, so copy it once after
# the loop (the bin/ folder exists by then) instead of once per language.
!cp ../vocab/xlm.dict.txt "$DATA_DIR/bin/dict.txt"

In [None]:
# Convert spm vocab to fairseq format
swissbert_dict_path = swissbert_vocab_path.with_suffix(".dict.txt")
with open(swissbert_vocab_path) as f_in, open(swissbert_dict_path, "w") as f_out:
    for line in f_in:
        token, _ = line.split()
        if token in {"<s>", "<pad>", "</s>", "<unk>"}:
            continue
        f_out.write(f"{token} 1\n")

In [None]:
# Binarize the text tokenized with the new vocabulary, one destination folder
# per language under "$DATA_DIR/bin/", using the converted swissbert dictionary.
# Values are handed to the shell lines through environment variables.
# NOTE(review): LANGUAGE is also consulted by GNU gettext to pick the message
# language of child processes; a less generic variable name may be safer.
os.environ["DATA_DIR"] = str(out_dir_new_vocab.resolve())
os.environ["DICT_PATH"] = str(swissbert_dict_path.resolve())
for language in LANGUAGES:
    os.environ["LANGUAGE"] = language
    !fairseq-preprocess \
      --only-source \
      --trainpref "$DATA_DIR/$LANGUAGE.train.new.bpe" \
      --validpref "$DATA_DIR/$LANGUAGE.valid.new.bpe" \
      --destdir "$DATA_DIR/bin/$LANGUAGE" \
      --bpe sentencepiece \
      --srcdict "$DICT_PATH" \
      --workers 20
    # Drop the per-language dictionary copy; a single shared one is placed below.
    !rm "$DATA_DIR/bin/$LANGUAGE/dict.txt"
# Copy the shared dictionary once, after the loop (the bin/ folder exists by
# then). $DICT_PATH is quoted so a path containing spaces stays one argument.
!cp "$DICT_PATH" "$DATA_DIR/bin/dict.txt"