In [3]:
# Change dir to model trainer base dir
% cd "../"

/home/akali/projects/model-trainer


In [4]:
# Create the data directory if it doesn't exist
import os

os.makedirs("./data", exist_ok=True)

In [5]:
# Download raw wiki dump file
from generic_iterative_stemmer.utils import get_path
import requests

language_code = "he"

if language_code == "en":
    url = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream1.xml-p1p41242.bz2"
    file_name = "wiki-en.xml.bz2"
else:
    url = "https://dumps.wikimedia.org/hewiki/latest/hewiki-latest-pages-articles.xml.bz2"
    file_name = "wiki-he.xml.bz2"

response = requests.get(url, stream=True)
wiki_dump_path = get_path(file_name)
open(wiki_dump_path, "wb").write(response.content)

257157873

In [11]:
# Build a corpus file
from generic_iterative_stemmer.training.base.create_corpus import generate_wiki_corpus_file, hebrew_tokenizer_no_suffix

corpus_folder = get_path(f"wiki-{language_code}")
corpus_file_path = os.path.join(corpus_folder, "corpus.txt")
os.makedirs(corpus_folder, exist_ok=True)
tokenizer = hebrew_tokenizer_no_suffix if language_code == "he" else None

generate_wiki_corpus_file(
    articles_file_path=wiki_dump_path,
    output_file_path=corpus_file_path,
    tokenizer_func=tokenizer,
)

Generate wiki corpus: 20704it [03:09, 109.26it/s]


In [14]:
# Train a stemmed model

from generic_iterative_stemmer.training.stemming import FastTextStemmingTrainer
from generic_iterative_stemmer.training.stemming.default_stem_generator import DefaultStemGenerator
from generic_iterative_stemmer.training.stemming.stemming_trainer import IterationProgram
from generic_iterative_stemmer.utils import configure_logging

configure_logging(level="INFO")

training_program = [
    IterationProgram(stem_generator=DefaultStemGenerator(min_cosine_similarity=0.87, max_edit_distance=0)),
    IterationProgram(
        stem_generator=DefaultStemGenerator(
            min_cosine_similarity=0.85, max_edit_distance=1, min_cosine_similarity_for_edit_distance=0.90
        ),
    ),
]

training_params = {"vector_size": 100, "epochs": 6, "window": 5}
default_stemming_params = {"min_cosine_similarity": 0.65, "min_cosine_similarity_for_edit_distance": 0.75}
trainer = FastTextStemmingTrainer(
    corpus_folder=corpus_folder,
    max_iterations=2,
    completed_iterations=0,
    training_program=training_program,
    default_training_params=training_params,
    default_stem_generator_params=default_stemming_params,
)

trainer.train()

[17:12:06] Starting iterations stemmer training... [generic_iterative_stemmer.training.stemming.stemming_trainer]
[17:12:06] Running stemming iteration number 1. [generic_iterative_stemmer.training.stemming.stemming_trainer]
[17:12:06] loading KeyedVectors object from ./data/wiki-en/iter-1/model.kv [gensim.utils]
[17:12:06] loading vectors_vocab from ./data/wiki-en/iter-1/model.kv.vectors_vocab.npy with mmap=None [gensim.utils]
[17:12:06] loading vectors_ngrams from ./data/wiki-en/iter-1/model.kv.vectors_ngrams.npy with mmap=None [gensim.utils]
[17:12:08] setting ignored attribute vectors to None [gensim.utils]
[17:12:08] setting ignored attribute buckets_word to None [gensim.utils]
[17:12:20] FastTextKeyedVectors lifecycle event {'fname': './data/wiki-en/iter-1/model.kv', 'datetime': '2022-02-03T17:12:20.598247', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-40-generic-x86_64-with-glibc2.29', 'event': 'loaded'} [gensim.

Add stemming tasks: 100%|██████████| 177307/177307 [00:05<00:00, 33387.87it/s]
Generate stem dict: 100%|██████████| 177307/177307 [21:46<00:00, 135.74it/s]

[17:34:12] Total 19303 stems generated [generic_iterative_stemmer.training.stemming.stem_generator]





[17:34:13] Stemming corpus... [generic_iterative_stemmer.training.stemming.corpus_stemmer]


Corpus stemming: 20704it [00:29, 704.64it/s] 

[17:34:42] Stemming corpus done [generic_iterative_stemmer.training.stemming.corpus_stemmer]
[17:34:42] Stemming iteration 1 completed. [generic_iterative_stemmer.training.stemming.stemming_trainer]





[17:34:42] Running stemming iteration number 2. [generic_iterative_stemmer.training.stemming.stemming_trainer]
[17:34:42] loading KeyedVectors object from ./data/wiki-en/iter-2/model.kv [gensim.utils]
[17:34:43] loading vectors_vocab from ./data/wiki-en/iter-2/model.kv.vectors_vocab.npy with mmap=None [gensim.utils]
[17:34:43] loading vectors_ngrams from ./data/wiki-en/iter-2/model.kv.vectors_ngrams.npy with mmap=None [gensim.utils]
[17:34:44] setting ignored attribute vectors to None [gensim.utils]
[17:34:44] setting ignored attribute buckets_word to None [gensim.utils]
[17:34:50] FastTextKeyedVectors lifecycle event {'fname': './data/wiki-en/iter-2/model.kv', 'datetime': '2022-02-03T17:34:50.667090', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-40-generic-x86_64-with-glibc2.29', 'event': 'loaded'} [gensim.utils]
[17:34:50] Vectors loaded [generic_iterative_stemmer.utils.loader]
[17:34:50] Generating stem dict for word

Add stemming tasks: 100%|██████████| 151866/151866 [00:04<00:00, 36544.54it/s]
Generate stem dict: 100%|██████████| 151866/151866 [15:37<00:00, 161.91it/s]

[17:50:32] Total 5192 stems generated [generic_iterative_stemmer.training.stemming.stem_generator]





[17:50:33] Stemming corpus... [generic_iterative_stemmer.training.stemming.corpus_stemmer]


Corpus stemming: 20704it [00:26, 795.03it/s] 


[17:50:59] Stemming corpus done [generic_iterative_stemmer.training.stemming.corpus_stemmer]
[17:50:59] Stemming iteration 2 completed. [generic_iterative_stemmer.training.stemming.stemming_trainer]
[17:50:59] Reached 2 iterations, quitting. [generic_iterative_stemmer.training.stemming.stemming_trainer]
