In [1]:
# Move the working directory one level up before the project imports below run.
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
%cd "../"
# Turn on autoreload so edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

/home/akali/projects/model-trainer


In [2]:
# %cd "./model-trainer"

In [15]:
import json
import logging
from typing import Iterable, Tuple, List

from generic_iterative_stemmer.training.stemming import Word2VecStemmingTrainer
from generic_iterative_stemmer.utils import get_path, configure_logging

# Set up logging through the project helper; the recorded output shows a "Logging configured" line.
configure_logging()


def similarities_to_words(similarities: Iterable[Tuple[str, float]]) -> List[str]:
    """Drop the scores from ``(word, score)`` pairs and keep the words, in their original order."""
    words: List[str] = []
    for pair in similarities:
        # Each entry must unpack into exactly two items: the word and its score.
        word, _score = pair
        words.append(word)
    return words


# Logger named after the current module (``__name__``).
log = logging.getLogger(__name__)

[12:56:49] Logging configured [generic_iterative_stemmer.utils.logging]


In [38]:
# Corpus to inspect; get_path turns the name into the folder handed to the trainer.
corpus_name = "wiki-he-ft"
corpus_folder = get_path(corpus_name)
# Rebuild the trainer from the state file stored for that corpus.
# NOTE(review): presumably this resumes from the last completed iteration — confirm against load_from_state_file.
trainer = Word2VecStemmingTrainer.load_from_state_file(corpus_folder=corpus_folder)

In [7]:
# trainer.train()

KeyboardInterrupt: 

In [4]:
# Write the trainer's stem dictionary to disk (the log output reports a "Stem dict saved" path).
# NOTE(review): the recorded output refers to wiki-he-cbow-2, not the "wiki-he-ft" corpus loaded in
# this notebook, and the execution count is out of order — re-run on a fresh kernel to refresh it.
trainer.save_stem_dict()

[22:03:27] Reducing stem dict of size 39677 [generic_iterative_stemmer.training.stemming.stem_generator]
[22:03:27] Stem dict saved: ./data/wiki-he-cbow-2/iter-9/model.kv.stem-dict.json. [generic_iterative_stemmer.models.stemmed_keyed_vectors]


In [39]:
# Obtain the stemmed keyed-vectors model from the trainer; the query cells below all use `model`.
model = trainer.get_stemmed_keyed_vectors()

[14:12:12] loading KeyedVectors object from ./data/wiki-he-ft/iter-5/model.kv [gensim.utils]
[14:12:12] loading vectors_vocab from ./data/wiki-he-ft/iter-5/model.kv.vectors_vocab.npy with mmap=None [gensim.utils]
[14:12:12] loading vectors_ngrams from ./data/wiki-he-ft/iter-5/model.kv.vectors_ngrams.npy with mmap=None [gensim.utils]
[14:12:15] setting ignored attribute vectors to None [gensim.utils]
[14:12:15] setting ignored attribute buckets_word to None [gensim.utils]
[14:12:24] FastTextKeyedVectors lifecycle event {'fname': './data/wiki-he-ft/iter-5/model.kv', 'datetime': '2022-01-22T14:12:24.937846', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.11.0-40-generic-x86_64-with-glibc2.29', 'event': 'loaded'} [gensim.utils]


In [40]:
# Ten nearest neighbours of the word for "Mars".
# The query ends in a regular (non-final) letter form, as do the words in the recorded results.
model.most_similar("מאדימ", topn=10)

[('מארינרמאדימ', 0.8637520670890808),
 ('הירח', 0.7906030416488647),
 ('אנשי-המאדימ', 0.7521252036094666),
 ('למאדימ', 0.7250083088874817),
 ('הגשושית', 0.7064836025238037),
 ('מאדימי', 0.7051835656166077),
 ('גשושית', 0.6916646361351013),
 ("ג'מינידימ", 0.6877880692481995),
 ('שביטימ', 0.6837199926376343),
 ('באטמוספירה', 0.6762114763259888)]

In [31]:
# Analogy query: "woman" + "king" - "man", top five candidates.
# NOTE(review): "מלך" ends in a final-form letter, whereas other queries in this notebook
# (e.g. "ירושלימ", "לונדונ") end in the regular form — if the vocabulary is normalised that way,
# this word should probably be spelled "מלכ"; confirm before trusting the result.
model.most_similar(positive=["אישה", "מלך"], negative=["גבר"], topn=5)

[('מללה', 0.6541310548782349),
 ('מהדבורה', 0.6261004209518433),
 ('מלורה', 0.6221399307250977),
 ('מלה', 0.6206751465797424),
 ('מלולה', 0.618818998336792)]

In [42]:
# Similarity score between the words for "king" and "queen".
# NOTE(review): "מלך" ends in a final-form letter while other queries in this notebook end in
# regular forms (e.g. "ברלינ") — confirm the spelling matches the model vocabulary.
model.similarity("מלך", "מלכה")

0.37712467

In [43]:
# Capital/country analogy: "Jerusalem" + "Germany" - "Berlin", top five candidates.
model.most_similar(positive=["ירושלימ", "גרמניה"], negative=["ברלינ"], topn=5)

[('מצרים-ישראל', 0.7028128504753113),
 ('ישראל-אפריקה', 0.7020168900489807),
 ('ספרד-ירושלימ', 0.6924636363983154),
 ('צרפת-ישראל', 0.6680106520652771),
 ('ישראל-טורקיה', 0.6668156981468201)]

In [44]:
# Capital/country analogy: "London" + "Germany" - "Berlin", top five candidates.
model.most_similar(positive=["לונדונ", "גרמניה"], negative=["ברלינ"], topn=5)

[('בריטניה', 0.8269242644309998),
 ('אנגליה', 0.8238611221313477),
 ('אירלנד', 0.8103494644165039),
 ('אירלנדבריטניה', 0.8073332905769348),
 ('סקוטלנד', 0.7803863286972046)]

In [32]:
# Words closest to the combination of "sun", "Saturn" and "Jupiter", top five candidates.
# NOTE(review): the last query word can also mean "justice", which may explain the recorded results — confirm.
model.most_similar(positive=["שמש", "שבתאי", "צדק"], topn=5)

[('ספירא', 0.6659421920776367),
 ('אביצדק', 0.6564679145812988),
 ("שמשון'", 0.6377360820770264),
 ("קדם'", 0.6315962672233582),
 ('ותאור', 0.62503582239151)]

In [59]:
import pandas as pd

# Word whose neighbours are analysed in this cell.
base_word = "גבר"
# First 2000 vocabulary entries, used as the reference word set for association_similarity.
# NOTE(review): presumably index_to_key is ordered by frequency, making these the most common words — confirm.
others = model.index_to_key[0:2000]
# Top-20 nearest neighbours of base_word as (word, score) pairs; more entries are appended further down.
similarities: list = model.most_similar(base_word, topn=20)


def get_similarity(model, word, base=None):
    """Return ``(word, score)`` where ``score`` is ``model.similarity(base, word)``.

    Args:
        model: Object exposing ``similarity(w1, w2)``.
        word: Word to compare against the base word.
        base: Word to compare with. When omitted, the notebook-level ``base_word``
            is used, so existing two-argument calls behave exactly as before.

    Returns:
        A ``(word, similarity)`` tuple.
    """
    # Resolve the fallback at call time so later reassignments of ``base_word`` are honoured.
    reference = base_word if base is None else base
    return word, model.similarity(reference, word)


# Add a few hand-picked words to the neighbour list so they appear in the comparison table.
# NOTE(review): presumably chosen as unrelated control words — confirm.
similarities.extend(
    get_similarity(model, extra_word)
    for extra_word in ("קיפוד", "תפוח", "קילו", "פסיפס")
)


def representative_distances(model, representative: str, others: List[str]):
    """Distances from ``representative`` to every word in ``others``.

    Thin wrapper that forwards to ``model.distances``, passing ``others`` as ``other_words``.
    """
    distances = model.distances(representative, other_words=others)
    return distances


def association_similarity(w1: str, w2: str, others):
    """Score how alike ``w1`` and ``w2`` are in their distances to the reference words ``others``.

    Each word's distance vector to ``others`` is computed with the notebook-level ``model``;
    the result is the dot product of the two vectors divided by ``len(others)``.

    NOTE(review): the distance vectors are neither centred nor normalised, and the recorded
    scores all sit close to 1 — confirm this is the intended measure.
    """
    first_profile = representative_distances(model, w1, others)
    second_profile = representative_distances(model, w2, others)
    dot_product = first_profile.T @ second_profile
    return dot_product / len(others)


# One row per candidate word: (decorated word, similarity score, association score).
# The word is wrapped in "x ... x" — presumably to keep right-to-left text readable in the
# printed table; confirm.
records = [
    (f"x {candidate} x", score, association_similarity(base_word, candidate, others))
    for candidate, score in similarities
]

df = pd.DataFrame(records, columns=["word", "cosine_similarity", "association_similarity"])

# Rank by cosine similarity (highest first) and print every fifth row.
print(df.sort_values(by="cosine_similarity", ascending=False)[::5])

          word  cosine_similarity  association_similarity
0     x גבר' x           0.721087                1.072382
5     x גברד x           0.653109                1.048479
10    x יגבר x           0.609564                1.096053
15  x גבריינ x           0.590570                1.026747
20   x קיפוד x           0.190626                1.010865
