This file trains Word2Vec models for task of filling in gaps of Ancient Egyptian sentences. The code performs parameter search in effort to identify the best parameters to use with Word2Vec.

&#9888; Warning: With the default parameters the testing will run for around three hours.

The code is hidden by default for brevity. Click button Show code (or equivalent in your language) to delve in to the details.



# Setup

In [1]:
# @title Setup work space and download files { display-mode: "form" }

# Create working directory and move to it.
!echo -n "Populating working directory: "
!mkdir -p /content/data-word2vec-params
%cd /content/data-word2vec-params

from tqdm.notebook import tqdm, tqdm_notebook
import requests
import os

misc_files = [
  'data/marete-ramses/aligned/combined_dev.txt',
  'data/marete-ramses/aligned/combined_test.txt',
]
egy_path='https://raw.githubusercontent.com/annasahola/egy-gaps/main/'

for path_file in tqdm_notebook(misc_files, desc="Downloading"):
  filename = os.path.basename(path_file)
  if not os.path.exists(filename):
    print(f"Downloading {filename}")
    response = requests.get(egy_path + path_file)
    if not response.ok:
      raise(Exception(f"Failed to download {egy_path + path_file}"))
    with open(filename, "wb") as file_out:
      file_out.write(response.content)

# Setup path for models
!mkdir -p models


Populating working directory: /content/data-word2vec-params


Downloading:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# @title Check files exist. { display-mode: "form" }
import os


for fi in ['combined_dev.txt', 'combined_test.txt']:
  if os.path.exists(fi) != True:
    print(f"file {fi} does not exist on File System")
    raise Exception("Missing file(s)")


print("\U0001f44D All required files located.")

👍 All required files located.


In [3]:
# @title Load required libraries. { display-mode: "form" }
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import json
import io
import re
import string
import random
from tqdm.notebook import tqdm, tqdm_notebook
import numpy as np
import collections
import gensim.models
#import Levenshtein
import heapq
#from unidecode import unidecode
#from geneticalgorithm2 import geneticalgorithm2 as ga2
from collections import Counter
import time
from tabulate import tabulate
#from gardiner2unicode import GardinerToUnicodeMap
from ipywidgets import interact, interactive, fixed, interact_manual, widgets
from gensim.models.callbacks import CallbackAny2Vec
from IPython.display import clear_output

Import libraries (all libraries should be installed above)

# [Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html)

Generate Word2Vec and FastText models, test them and allow user to view some results.

The models are implemented using gensim libraries.


In [4]:
# @title Details: Helper functions. Click `Show code` to view details. { display-mode: "form" }
# @markdown Helper functions are useful to make actual model generation functions easy to read.
# @markdown `print_sentence`, `token_to_text`, `start_pad`, `end_pad`, `is_pad`, `has_pad`, `token_seq`, `token_seq_tr`, `token_seq_rtc`
def print_sentence(sentence):
  print(f'{sentence["text"]}')
  print(f'. tokens: {sentence["token_list"]}')
  print(f'. transliteration: {sentence["sentence_transliteration"]}')
  print(f'. translation: {sentence["translation"]}')
  # Optional English translation (just one sentence here)
  # The sentence dictionary for German to English translations can be used for more.
  if sentence["sentence_transliteration"] == "bn ftt =tw =f":
    print('. It (the inscription on the stele) should not be erased.') # Google Translate (from de)
  elif 'en' in sentence: # Does dictionary lookup translation exist?
    print(f'. en: {sentence["en"]}')

def token_to_text(token):
  s = token
  if token in aecorpus.aed:
    t = aecorpus.aed[token]
    if 'form' in t:
      s += "/" + t['form']
    if 'translations' in t:
      tr = t['translations']
      if len(tr) > 0 and len(tr[0]) > 0:
        tr1 = tr[:1][:1]
        tr0 = list(filter(lambda translation: translation[1] == 'en', tr))
        if len(tr0) > 0:
          tr1 = tr0
        s += "/" + str(tr1[0][0])
  return s

def start_pad(padlen=2):
  if padlen == 2:
    return ['<s0>', '<s1>']
  return [f'<s{i}>' for i in range(padlen)]

def end_pad(padlen=2):
  if padlen == 2:
    return ['</s1>', '</s0>']
  return [f'</s{padlen-i-1}>' for i in range(padlen)]

def is_pad(x):
  return x in start_pad(9) + end_pad(9)

def has_pad(a):
  for x in a:
    if is_pad(x):
      return True
  return False

def token_seq(sentence, replace_numbers=False, padlen=2, nopad=False):
    # Note: replace numbers is no-op here. replacement has already been performed
    token_seq = [token[3] for token in sentence['tokens']] # With lemma id
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)

def token_seq_tr(sentence, replace_numbers=False, padlen=2, nopad=False):
    # Note: replace numbers is no-op here. replacement has already been performed
    token_seq = sentence["sentence_transliteration"].split(" ") # Before lemmatization
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)

re_fraction=re.compile('^[0-9]+\/[0-9]+$')
def token_seq_rtc(sentence, replace_numbers=False, padlen=2, nopad=False):
    try:
      token_seq = [token[0] for token in sentence['tokens']] # With text - for RTC currently best pick
    except:
      print("Error with: ", sentence)
      raise "Errorred"
    if replace_numbers:
      # Handle numbers
      global re_fraction
      for idx in range(len(token_seq)):
        if token_seq[idx].isnumeric():
          token_seq[idx] = "1...n"
        if re_fraction.match(token_seq[idx]):
          token_seq[idx] = "1...n"
    #token_seq = [token[3] for token in sentence['tokens']] # With lemma id
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)


To work in beginning or ending of sentences padding may be useful.
Two pads seemed to work better than 1.

In [5]:
# @title Prediction statistics -- `PredictionStats`. Click `Show code` in the code cell. { display-mode: "form" }
# @markdown
class PredictionStats:
    """Collect statistics regarding correctness of tests"""

    def __init__(self, name):
      self.name = name
      self.c = Counter()
      self.fails = 0
      self.start = time.time()
      self.end = float("nan")

    def start_testing(self):
      self.start = time.time()
      return self
    def end_testing(self):
      self.end = time.time()
      return self
    def miss(self):
      self.c[-1] += 1
    def fail(self):
      self.fails += 1   # Count failures outside N
    def predicted(self, item, predictions):
      if item in predictions:
        idx = predictions.index(item) + 1
        self.c[idx] += 1
      else:
        self.miss()
        idx = -1
      return idx
    def get_name(self):
      return self.name
    def get_n(self):
      # Failures are counted in n.
      return self.c.total() + self.fails
    def get_heading(self, include_mrr5=False):
      if include_mrr5:
        return ['hit', 'hit@5', 'hit@10', 'missed', 'MRR@5', 'MRR@10', 'N', 'untestable', 'ms/test']
      return ['hit', 'hit@5', 'hit@10', 'missed', 'MRR', 'N', 'untestable', 'ms/test']
    def get_stats(self, include_mrr5=False):
      accu = 0
      accu_ = 0
      accu5 = 0
      accu10 = 0
      for r in range(1, 6):
        accu_ += self.c[r]/r
      for r in range(1, 11):
        accu += self.c[r]/r
        accu5 += self.c[r] if r <= 5 else 0
        accu10 += self.c[r]
      n = self.get_n()
      if n < 1:
        n = 1
      if include_mrr5:
        return [
          self.c[1] / n,
          accu5 / n,
          accu10 / n,
          self.c[-1] / n,
          accu_ / n,
          accu / n,
          n,
          self.fails,
          ((self.end - self.start) * 1000) / n
        ]
      return [
          self.c[1] / n,
          accu5 / n,
          accu10 / n,
          self.c[-1] / n,
          accu / n,
          n,
          self.fails,
          ((self.end - self.start) * 1000) / n
      ]

# These variables collect statistics
prs = [None] * 2
stats = dict()

In [6]:
# @title Prediction Function `predict_words`. Use model(s) to predict word. Click `Show code` in the code cell. { display-mode: "form" }
# @markdown Prediction function has restrictions as follows:
# @markdown
# @markdown - It can only predict one part of text.
# @markdown - It can only work on single word missing parts.

def predict_words(tokens,model,model_score=None,lacuna="LACUNA",verbose=False,omitted=None,vocab=None,algo=0,topn=10,padlen=2,stat_append=None,stat_id=None,nopad=False):
  """Predict missing word(s) from sentence.

    Parameters
    ------------
        tokens: list of str
            Sentence (token sequence) to predict (including start_pad() and end_pad())
        model: Word2Vec model
            Model used for predicting with predict_output_word
        model_score: Word2Vec model
            Model used for scoring with score
        lacuna: str
            Placeholder for missing word
        verbose: bool
            Perform debug prints (default=False)
        omitted: str
            Provide omitted token (only needed for verbose mode debug prints)
        vocab: list of str or None
            Possible words - if none, all words are possible. (only for algo=2)
        algo: int [-1, 2]
            Select algorithm to perform prediction
        topn: int
            Select how many result to return (default=10)

    Return
    -----------
        predictions: list of tuples (str, float)
            List of predictions in the best first order with weights.

        May raise exceptions on exceptional situations.

    Bugs
    ------------
        current implementation is able to predict only one missing word.
  """
  if vocab is None:
    if hasattr(model, 'wv'):
      vocab = model.wv.index_to_key

  try:
    idx = tokens.index(lacuna)
  except ValueError:
    return [(1.0), " ".join(tokens)]*topn

  predict = predict_words_internal(idx, tokens,model,model_score,vocab,algo,topn,padlen,nopad)
  if verbose:
    print(f"OMITTED = {omitted}")
    for i, p in enumerate(predict):
      if p[0] == omitted:
        print(f"PREDICTED {i} {token_to_text(p[0])} (prob: {p[1]}) match [rank={i+1}]")
      else:
        if hasattr(model, 'wv'):
          similarity = model.wv.n_similarity(omitted, p[0])
        else:
          similarity = 1.0 - i / topn # No similarity function
        print(f"PREDICTED {i} {token_to_text(p[0])} (prob: {p[1]}) similarity with orig: {similarity}")
  if stat_append is not None:
    tokens_ = filter(lambda t: not has_pad(t), tokens)
    simi=[]
    hit_idx = None
    for i, p in enumerate(predict):
      if hasattr(model, 'wv'):
        similarity = model.wv.n_similarity(omitted, p[0])
      else:
        similarity = 1.0 - i / topn # No similarity function
      simi.append([p[0],p[1],similarity])
      if similarity == 1:
        hit_idx = i + 1
    stat_append.append((stat_id, tokens_, omitted, hit_idx, tuple([p[0] for p in predict]), simi))
  return predict

# Underlying worker function
def predict_words_internal(idx, tokens,model,model_score=None,vocab=None,algo=0,topn=10,padlen=2,nopad=False):
  token_seq_before = tokens[:idx]
  token_seq_after = tokens[idx + 1:]

  if algo == -1:
    # Simply predict top vocabulary words.
    i = 0
    while is_pad(vocab[i]):
      i = i + 1
    return [(vocab[i+l], 1/(l+1)) for l in range(0, 10)]
  if algo == 0:
    if len(token_seq_before) > 0 and len(token_seq_after) > 0:
      if len(token_seq_before) > len(token_seq_after):
        token_seq_before = token_seq_before[-len(token_seq_after):]
      elif len(token_seq_before) < len(token_seq_after):
        token_seq_after = token_seq_after[:len(token_seq_before)]
      maxpad=4
      predict = model.predict_output_word(token_seq_before + token_seq_after,topn=topn+maxpad)
      # Filter predictions, remove padding symbols
      predict_out = []
      for x in predict:
        if not is_pad(x[0]) and x[0] is not None:
          predict_out.append(x)
        if len(predict_out) == topn:
          break
      return predict_out
    else:
      return ValueError("Input not supported: no before or after tokens")
  if algo == 1:
    if len(token_seq_before) > 0 and len(token_seq_after) > 0:
      if len(token_seq_before) > len(token_seq_after):
        token_seq_before = token_seq_before[-len(token_seq_after):]
      elif len(token_seq_before) < len(token_seq_after):
        token_seq_after = token_seq_after[:len(token_seq_before)]
      predict = model.predict_output_word(token_seq_before + token_seq_after,topn=topn*10)
      sentences = []
      for pr in predict:
        tokens = token_seq_before + [pr[0]] + token_seq_after
        if nopad:
          tokens = tokens[len(start_pad(padlen)):len(tokens) - len(end_pad(padlen))]
        sentences.append(" ".join(tokens))
      scores = model_score.score(sentences)
      adjusted_predictions=[(predict[i][0], scores[i]) for i in range(len(scores))]
      adjusted_predictions.sort(key = lambda x: -x[1])
      predict = adjusted_predictions[:topn]
      return predict
    else:
      return ValueError("Input not supported: no before or after tokens")
  if algo == 2:
    sentences = []
    for token in vocab:
      tokens = token_seq_before + [token] + token_seq_after
      if nopad:
        tokens = tokens[len(start_pad(padlen)):len(tokens) - len(end_pad(padlen))]
      sentences.append(" ".join(tokens))
    scores = model_score.score(sentences)
    adjusted_predictions=[(vocab[i], scores[i]) for i in range(len(scores))]
    # TODO: Quicker sorting here
    adjusted_predictions.sort(key = lambda x: -x[1])
    predict = adjusted_predictions[:topn]
    return predict
  raise ValueError("algo is not supported")



In [7]:
# @title Define Model Generation Progress Class. Click `Show code` in the code cell. { display-mode: "form" }
# @markdown

class TqdmModelProgress(CallbackAny2Vec):
     '''Callback to log information about training'''

     def __init__(self, epochs):
         self.epoch = 0
         self.epochs = epochs
         self.tqdm = tqdm_notebook(desc="Starting", total=self.epochs,
                                   unit="epoch", leave=False)

     def on_epoch_begin(self, model):
         if self.epoch == 0:
             self.tqdm.desc = "Training"

     def on_epoch_end(self, model):
         self.epoch += 1
         if self.epoch == self.epochs:
             self.tqdm.description = "Finished"
         loss = model.get_latest_training_loss()
         if loss is not None:
             self.tqdm.set_postfix({'loss=': loss})
         self.tqdm.update(1)

     def close(self):
         self.tqdm.close()

In [8]:
# @title Define Model Testing Functions
# @markdown `test_w2v` for testing AES with lemma id. Use `seq` parameter to override `token_seq`to test AES/TR or RTC.
# @markdown Alternatively you may use convenience wrappers `test_w2v_tr` and `test_w2v_rtc`.
def test_w2v(name, w2v, sentences, silent=False, padlen=2, algo=0, seq=token_seq, omitted_indexes=None):
  pr = PredictionStats(name)
  spadlen = padlen
  fpadlen = padlen * 2
  random.seed(42)
  if silent == False:
    print("Processing test sentences with single hidden word")
  c=0
  idx=0
  for sentence in sentences:
    tokens = seq(sentence,padlen=padlen)
    if len(tokens) == fpadlen:
      pr.fail() # Failed processing
      continue
    if omitted_indexes is None:
      token_omitted = random.randrange(len(tokens) - fpadlen) + spadlen
    else:
      token_omitted = omitted_indexes[idx] + spadlen
      idx = idx + 1
    token_orig = tokens[token_omitted]
    tokens[token_omitted] = "LACUNA"
    verbose = False
    if c < 5 and silent == False:
      verbose = True
    if c == 5 and silent == False:
        print("Processing remaining sentences quietly")
    c += 1

    try:
      predict = predict_words(tokens, w2v, None, verbose=verbose, omitted=token_orig, algo=algo, padlen=padlen)
      if len(predict) == 0:
        raise Exception("No predictions")
      try:
        pr.predicted(token_orig, [x[0] for x in predict])
      except:
        pr.fail() # Failed processing predictions
    except:
      pr.fail() # Failed processing predicting

  return pr.end_testing()



In [9]:
# @title Model Training function `get_gensim_gen`

# @markdown The function automates storage and retrieval of models.
# @markdown
# @markdown Update model id when inputs or model settings are updated.
#
gensim_model_prefix="2024-03-13a" # @param {type:"string"}
gensim_model_store = True # @param {type:"boolean"}
all_models_pregenerated = False # @param {type:"boolean"}

def get_gensim_gen(corpus, vector_size=200, epochs=100, padlen=2,
                   silent=False, token_seq_func = token_seq, nobar=False,
                   model = gensim.models.Word2Vec, id=None,
                   noload=False, nosave=False, test=None, testlimit=0.1,
                   prev_model = None,
                   **kwargs):
  global gensim_model_id
  global gensim_model_store
  if id is None:
    id = model.__class__.__name__
  filepath = f"models/{gensim_model_prefix}-{id}.model"
  # If all_models_pregenerated is set we expect to
  # find all models encountered in processing and
  # raise error if model is not found.
  if all_models_pregenerated == True:
    if not os.path.exists(filepath):
      raise(Exception(f"Model path {filepath} does not exist"))
  if gensim_model_store and not noload and os.path.exists(filepath):
    model_out = model.load(filepath)
    if test is not None:
      if test(model_out) < testlimit:
        raise Exception("Loaded Model failed to validate")
    return model_out

  c, cn, cn2 = (0, 0, 0)
  sentences_list = []
  token_kwargs = {key:kwargs[key] for key in ['nopad', 'replace_numbers'] if key in kwargs}
  for sentence in corpus:
    tokens = token_seq_func(sentence, padlen=padlen, **token_kwargs)
    if silent == False:
      if c < 5:
        print(tokens)
        c += 1
      elif cn < 2 and '1...n' in tokens:
        print(tokens)
        cn += 1
      elif cn2 < 2 and '123' in tokens:
        print(tokens)
        cn2 += 1
    sentences_list.append(tokens)
  in_kwargs = dict(kwargs)
  if 'window' not in in_kwargs:
    in_kwargs['window']=5
  if 'min_count' not in in_kwargs:
    in_kwargs['min_count']=1
  if 'replace_numbers' in in_kwargs:
    del in_kwargs['replace_numbers']
  if 'nopad' in in_kwargs:
    del in_kwargs['nopad']
  callbacks = []
  if not nobar:
    callbacks = [TqdmModelProgress(epochs)]

  if prev_model is None:
    model_out = model(sentences=sentences_list, vector_size=vector_size,
                      workers=10, epochs=epochs, callbacks=callbacks,
                      **in_kwargs)
  else:
    #model_out = prev_model.train(sentences=sentences_list, vector_size=vector_size,
    #                  workers=10, epochs=epochs-prev_model.epochs, callbacks=callbacks,
    #                  **in_kwargs)
    # Some arguments only apply in creation of model.
    if 'vector_size' in in_kwargs:
      del in_kwargs['vector_size']
    model_out = prev_model.train(corpus_iterable=sentences_list,
                      epochs=epochs, callbacks=callbacks,
                      **in_kwargs)

  if test is not None:
    if test(model_out) < testlimit:
      raise Exception("Trained model failed to validate")
    if gensim_model_store and not nosave:
      model_out.save(filepath)
  callbacks[0].close()
  return model_out

def get_gensim_ft_rtc(corpus, **kwargs):
  return get_gensim_gen(corpus, model=gensim.models.FastText,
                        token_seq_func = token_seq_rtc, **kwargs)

def get_gensim_ft_aes(corpus, **kwargs):
  return get_gensim_gen(corpus, model=gensim.models.FastText,
                        token_seq_func = token_seq_tr, **kwargs)




In [10]:
# @title Use MaReTe combined AES and Ramses transliterations as training + test source

# @markdown There are combined transliterations, using both AES and Ramses sentences.
# @markdown These use slightly different transliteration than AES MdC so use different models.


# Remove exact same sentences from testing data than
# are present in training data. Default is false as
# some overlap is to be expected.
filter_duplicates=False # @ param {type:"boolean"}
w2v_tr_combined_variants=1 # @ param{type:"int"}


def sentence_transform_in(sentence):
  return {"sentence_transliteration": sentence}

def token_seq_out(sentence, replace_numbers=False, padlen=2, nopad=False):
    # Note: replace numbers has not been implemented
    token_seq = sentence["sentence_transliteration"].split(" ")
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)

# Read test materials
with open("combined_dev.txt", "r") as train_file:
  marete_train_file_lines = train_file.readlines()
  marete_train_file_lines = [x.replace("\n", "") for x in marete_train_file_lines]

with open("combined_test.txt", "r") as test_file:
  marete_test_file_lines = test_file.readlines()
  marete_test_file_lines = [x.replace("\n", "") for x in marete_test_file_lines]

marete_train_sentences_in = [sentence_transform_in(sentence) for sentence in marete_train_file_lines]
if filter_duplicates:
  marete_test_sentences_in = [sentence_transform_in(sentence) for sentence in marete_test_file_lines
                                if sentence not in marete_train_file_lines]
else:
  marete_test_sentences_in = [sentence_transform_in(sentence) for sentence in marete_test_file_lines]

for x in marete_train_file_lines:
  if '\n' in x:
    print("Found linefeed in marete_train_file_lines")
  if '\xA0' in x:
    print("Found nonbreaking space in marete_train_file_lines")
for x in marete_test_file_lines:
  if '\n' in x:
    print("Found linefeed in marete_test_file_lines")
  if '\xA0' in x:
    print("Found nonbreaking space in marete_test_file_lines")




In [11]:
# @title Reduce material used for testing/validation
# @markdown MaReTe Transliterations has a largish amount of sentences. For coming up with Word2Vec parameter recommendations quickly, it is recommended to reduce testing set.
# @markdown Training set will be divided by this amount:
training_divisor=3 # @param{type:"slider",min:2,max:100}
# @markdown A validation set is taken from non-overlapping part of training sentences. It is further reduced by this amount:
testing_extra_divisor=3 # @param{type:"slider",min:1,max:100}

#Note: for this we use training sentences also for testing
marete_train_sentences_in_q3=marete_train_sentences_in[::training_divisor]
marete_test_sentences_in_q9=marete_train_sentences_in[1::training_divisor][::testing_extra_divisor]

print(f"Using {len(marete_train_sentences_in_q3)} training sentences and {len(marete_test_sentences_in_q9)} test sentences.")


Using 33822 training sentences and 11274 test sentences.


In [12]:
# @title Define Word2Vec configuration parameter sets to test

resprev = [[0.75, 5, 0, 5, 100, 5, False, 0.001, 0, 0.06560665169247744],
 [0.75, 5, 0, 15, 100, 5, False, 0.001, 0, 0.11450746551526787],
 [0.75, 5, 1, 5, 100, 5, False, 0.001, 0, 0.04956021919610997],
 [0.75, 5, 1, 15, 100, 5, False, 0.001, 0, 0.07332330050145396],
 [0.17955186844534993, 22, 1, 15, 259, 2, True, 0.001, 4, 0.12372699063206216],
 [0.21821168418246245, 21, 1, 15, 227, 2, True, 0.001, 4, 0.11205389394596156],
 [0.17955186844534993, 22, 1, 15, 259, 2, False, 0.001, 4, 0.12372699063206216],
 [0.21821168418246245, 21, 1, 15, 227, 2, False, 0.001, 4, 0.11205389394596156],
 [-0.5, 21, 0, 50, 200, 3, True, 0.001, 4, 0.07551879560982291],
 [0.03, 21, 0, 50, 200, 3, False, 1e-05, 4, 0.11347916279645798],
 [0.03, 21, 0, 50, 200, 3, True, 0.0001, 0, 0.11611021666288246],
 [0.03, 21, 0, 50, 200, 3, False, 0.0001, 2, 0.11765531342904553],
 [0.03, 21, 0, 50, 200, 3, True, 0.0001, 4, 0.11714724494308369],
 [0.03, 21, 0, 50, 200, 3, False, 0.001, 4, 0.15862814159303107],
 [-0.5, 21, 0, 50, 200, 5, True, 0.001, 4, 0.0709201276496465],
 [0.03, 21, 0, 50, 200, 5, False, 1e-05, 4, 0.11153222109008845],
 [0.03, 21, 0, 50, 200, 5, True, 0.0001, 0, 0.10729944813169781],
 [0.03, 21, 0, 50, 200, 5, False, 0.0001, 2, 0.10782739150749553],
 [0.03, 21, 0, 50, 200, 5, True, 0.0001, 4, 0.1074507381074351],
 [0.03, 21, 0, 50, 200, 5, False, 0.001, 4, 0.148848022612652],
 [-0.5, 21, 0, 50, 100, 3, True, 0.001, 4, 0.07345856234022684],
 [0.03, 21, 0, 50, 100, 3, False, 1e-05, 4, 0.11280427658841183],
 [0.03, 21, 0, 50, 100, 3, True, 0.0001, 0, 0.10875556919119991],
 [0.03, 21, 0, 50, 100, 3, False, 0.0001, 2, 0.11029039231249896],
 [0.03, 21, 0, 50, 100, 3, True, 0.0001, 4, 0.11109029798886759],
 [0.03, 21, 0, 50, 100, 3, False, 0.001, 4, 0.1540269247681471],
 [0.05, 21, 0, 50, 200, 3, True, 0.001, 4, 0.16148804598089386],
 [0.05, 21, 0, 50, 100, 3, False, 0.001, 4, 0.153679247112277],
 [0.1, 21, 0, 50, 100, 3, True, 0.001, 4, 0.15531557134287952],
 [0.05, 21, 0, 50, 200, 5, False, 0.001, 4, 0.15164769247994211],
 [0.1, 21, 0, 50, 200, 5, True, 0.001, 4, 0.16021316014814063],
 [0.05, 21, 0, 50, 200, 2, False, 0.001, 4, 0.16556127251835964],
 [-0.5, 21, 1, 150, 100, 3, False, 1e-04, 0, 0],
 [-0.5, 21, 1, 110, 100, 3, False, 1e-05, 0, 0],
 [-0.5, 21, 1, 130, 100, 3, False, 1e-05, 0, 0],
 [-0.5, 21, 1, 140, 100, 3, False, 1e-03, 0, 0],
 [-0.5, 21, 1, 150, 100, 7, False, 1e-05, 0, 0],
 [-0.5, 21, 1, 150, 100, 7, False, 1e-05, 0, 0],
 [0.05, 21, 0, 50, 200, 3, True, 0.01, 4, 0.16148804598089386],
 [0.1, 21, 0, 50, 200, 5, True, 0.01, 4, 0.16021316014814063],
 [0.05, 21, 0, 50, 200, 2, False, 0.01, 4, 0.16556127251835964],
 [-0.1, 21, 0, 50, 200, 3, True, 0.001, 4, 0.16148804598089386],
 [-0.2, 21, 0, 50, 200, 5, True, 0.001, 4, 0.16021316014814063],
 [-0.3, 21, 0, 50, 200, 2, False, 0.001, 4, 0.16556127251835964],
 [-0.2, 21, 0, 50, 200, 3, True, 0.01, 4, 0.16148804598089386],
 [-0.2, 21, 0, 50, 200, 5, True, 0.01, 4, 0.16021316014814063],
 [-0.2, 21, 0, 50, 200, 2, False, 0.01, 4, 0.16556127251835964],
 [0.05, 21, 0, 50, 200, 3, True, 0.01, 4, 0.16148804598089386],
 [0.05, 21, 0, 50, 200, 5, True, 0.01, 4, 0.16021316014814063],
 [0.05, 21, 0, 50, 200, 2, False, 0.01, 4, 0.16556127251835964],
 [-0.2, 21, 1, 50, 200, 3, True, 0.01, 4, 0.16148804598089386],
 [-0.2, 21, 1, 50, 200, 5, True, 0.01, 4, 0.16021316014814063],
 [-0.2, 21, 1, 50, 200, 2, False, 0.01, 4, 0.16556127251835964]]

w2v_tr_combined_variants_ = 100
w2v_tr_opts_ = dict()
w2v_tr_id_ = dict()
w2v_tr_list_ = []
for params in resprev:
  ns_exponent, negative, sg, epochs, vector_size, window, shrink_window, sample, min_count, prevscore = params
  # override number of epochs
  epochs = 10
  t=(ns_exponent, negative, sg, vector_size, window, shrink_window, sample, min_count, epochs)
  if t in w2v_tr_opts_:
    continue
  opts = {'ns_exponent': ns_exponent, 'negative': negative, 'sg': sg,
          'vector_size': vector_size, 'window': window,
          'sample': sample,
          'shrink_windows': (shrink_window == 1), 'compute_loss': True, 'epochs': epochs}

  w2v_tr_opts_[t] = opts
  w2v_tr_id_[t] = f"W2V TR nse={ns_exponent}-neg={negative}-{'sg' if sg else 'cbow'}-vs={vector_size}-w={window}-{'shrink' if shrink_window == 1 else 'static'}-sample={sample}-min={min_count}-epochs={epochs}"
  if len(w2v_tr_opts_) >= w2v_tr_combined_variants_:
    break
  w2v_tr_list_.append((ns_exponent, negative, sg, vector_size, window, shrink_window, sample, min_count, epochs))

column_names = [ 'ns_exponent', 'negative', 'sg', 'vector_size', 'window', 'shrink_windows', 'sample', 'min_count', 'epochs' ]
text_stats_df = pd.DataFrame(w2v_tr_list_, columns=column_names)
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 100)
print("Evaluating parameter sets:")
display(text_stats_df)


Evaluating parameter sets:


Unnamed: 0,ns_exponent,negative,sg,vector_size,window,shrink_windows,sample,min_count,epochs
0,0.75,5,0,100,5,False,0.001,0,10
1,0.75,5,1,100,5,False,0.001,0,10
2,0.179552,22,1,259,2,True,0.001,4,10
3,0.218212,21,1,227,2,True,0.001,4,10
4,0.179552,22,1,259,2,False,0.001,4,10
5,0.218212,21,1,227,2,False,0.001,4,10
6,-0.5,21,0,200,3,True,0.001,4,10
7,0.03,21,0,200,3,False,1e-05,4,10
8,0.03,21,0,200,3,True,0.0001,0,10
9,0.03,21,0,200,3,False,0.0001,2,10


In [13]:
# @title Training models and testing
w2v_tr_combined_array_ = dict()
w2v_tr_combined_stats_ = dict()
w2v_tr_combined_score_ = 0.0
w2v_tr_combined_data_ = []
for t, opts in tqdm_notebook(w2v_tr_opts_.items(), desc="Variants"):
  id = w2v_tr_id_[t]
  w2v_tr_combined_array_[t] = get_gensim_gen(marete_train_sentences_in_q3, token_seq_func = token_seq_out,
                                            silent=True, id="W2V-TR-combined-1.98", noload=True,
                                            nosave=False, **opts) # Takes around 2 minutes
  res = test_w2v(id, w2v_tr_combined_array_[t], marete_test_sentences_in_q9, seq=token_seq_out, silent=True)
  mrr = res.get_stats()[4]
  w2v_tr_combined_stats_[id] = res
  if mrr > w2v_tr_combined_score_:
    w2v_tr_combined_score_ = mrr
    w2v_tr_combined = w2v_tr_combined_array_[t]
    w2v_tr_combined_opts = opts
  w2v_tr_combined_data_.append([*t,mrr])
pd.set_option('display.width', 1000)



Variants:   0%|          | 0/47 [00:00<?, ?it/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

In [14]:
#@title Results of testing
#@markdown Results presented are MRR.
#@markdown in addition, a statical model is built to estimate statistical relevancy and coefficients of parameters

import statsmodels.api as sm
def stats_analysis(data):
  pd.set_option('display.width', 1000)
  column_names = [ 'ns_exponent', 'negative', 'sg', 'vector_size', 'window', 'shrink_window', 'sample', 'min_count', 'epochs', 'mrr' ]
  text_stats_df = pd.DataFrame(data, columns=column_names)
  display(text_stats_df)
  formula = "mrr ~ ns_exponent + negative + sg + vector_size + window + C(shrink_window) + sample + min_count"
  model = sm.formula.glm(formula = formula, data=text_stats_df).fit()

if w2v_tr_combined_variants_ > 1:
  stats_analysis(w2v_tr_combined_data_)


Unnamed: 0,ns_exponent,negative,sg,vector_size,window,shrink_window,sample,min_count,epochs,mrr
0,0.75,5,0,100,5,False,0.001,0,10,0.07996
1,0.75,5,1,100,5,False,0.001,0,10,0.062123
2,0.179552,22,1,259,2,True,0.001,4,10,0.122736
3,0.218212,21,1,227,2,True,0.001,4,10,0.109955
4,0.179552,22,1,259,2,False,0.001,4,10,0.123139
5,0.218212,21,1,227,2,False,0.001,4,10,0.116317
6,-0.5,21,0,200,3,True,0.001,4,10,0.092476
7,0.03,21,0,200,3,False,1e-05,4,10,0.093054
8,0.03,21,0,200,3,True,0.0001,0,10,0.090466
9,0.03,21,0,200,3,False,0.0001,2,10,0.09675


In [15]:
# @title Statistical model of the first round results.

import statsmodels.api as sm
def stats_analysis(data):
  pd.set_option('display.width', 1000)
  column_names = [ 'ns_exponent', 'negative', 'sg', 'vector_size', 'window', 'shrink_window', 'sample', 'min_count', 'epochs', 'mrr' ]
  text_stats_df = pd.DataFrame(data, columns=column_names)
  formula = "mrr ~ ns_exponent + negative + sg + vector_size + window + C(shrink_window) + sample + min_count"
  model = sm.formula.glm(formula = formula, data=text_stats_df).fit()
  display(model.summary())

if w2v_tr_combined_variants_ > 1:
  stats_analysis(w2v_tr_combined_data_)


0,1,2,3
Dep. Variable:,mrr,No. Observations:,47.0
Model:,GLM,Df Residuals:,38.0
Model Family:,Gaussian,Df Model:,8.0
Link Function:,Identity,Scale:,0.00014142
Method:,IRLS,Log-Likelihood:,146.6
Date:,"Sun, 21 Apr 2024",Deviance:,0.005374
Time:,16:36:35,Pearson chi2:,0.00537
No. Iterations:,3,Pseudo R-squ. (CS):,0.659
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0184,0.018,1.023,0.306,-0.017,0.054
C(shrink_window)[T.True],-0.0010,0.004,-0.253,0.801,-0.008,0.006
ns_exponent,0.0397,0.009,4.183,0.000,0.021,0.058
negative,0.0041,0.001,4.585,0.000,0.002,0.006
sg,0.0003,0.004,0.077,0.939,-0.008,0.009
vector_size,-9.205e-06,4.54e-05,-0.203,0.839,-9.81e-05,7.97e-05
window,0.0002,0.002,0.138,0.890,-0.003,0.003
sample,1.7243,0.509,3.390,0.001,0.727,2.721
min_count,-9.611e-05,0.002,-0.063,0.949,-0.003,0.003


In [16]:
#@title Prepare for Second Round of evaluation: The top candidates with varying epochs
best=5 # @param{type:"slider",min:1,max:40}

w2v_tr_combined_variants__ = 100
w2v_tr_opts__ = dict()
w2v_tr_opts_5 = []
w2v_tr_id__ = dict()

#print(w2v_tr_combined_data_)
data = w2v_tr_combined_data_
data.sort(key=lambda element: (-element[9]))
top=data[:best]
default = list(filter(lambda element: element[0] == 0.75, data))

#resprev.sort(key=lambda element: (-element[0], element[2], element[1]))
for params in top + default:
  ns_exponent, negative, sg, vector_size, window, shrink_window, sample, min_count, epochs, prevscore = params
  # override number of epochs
  for epochs in [5, 10, 20, 30, 40, 50, 100, 150]:
    t=(ns_exponent, negative, sg, vector_size, window, shrink_window, sample, min_count, epochs)
    if t in w2v_tr_opts__:
      continue
    opts = {'ns_exponent': ns_exponent, 'negative': negative, 'sg': sg,
            'vector_size': vector_size, 'window': window,
            'sample': sample,
            'shrink_windows': (shrink_window == 1), 'compute_loss': True, 'epochs': epochs}

    w2v_tr_opts__[t] = opts
    if epochs == 5:
      w2v_tr_opts_5.append((ns_exponent, negative, sg, vector_size, window, shrink_window, sample, min_count))
    w2v_tr_id__[t] = f"W2V TR nse={ns_exponent}-neg={negative}-{'sg' if sg else 'cbow'}-vs={vector_size}-w={window}-{'shrink' if shrink_window == 1 else 'static'}-sample={sample}-min={min_count}-epochs={epochs}"
    if len(w2v_tr_opts__) >= w2v_tr_combined_variants__:
      break

print("Candidates to test on the next round:")
column_names = [ 'ns_exponent', 'negative', 'sg', 'vector_size', 'window', 'shrink_window', 'sample', 'min_count' ]
text_stats_df = pd.DataFrame(w2v_tr_opts_5, columns=column_names)
display(text_stats_df)


Candidates to test on the next round:


Unnamed: 0,ns_exponent,negative,sg,vector_size,window,shrink_window,sample,min_count
0,0.05,21,0,200,2,False,0.01,4
1,0.1,21,0,200,5,True,0.01,4
2,0.05,21,0,200,3,True,0.01,4
3,0.05,21,0,200,5,True,0.01,4
4,0.179552,22,1,259,2,False,0.001,4
5,0.75,5,0,100,5,False,0.001,0
6,0.75,5,1,100,5,False,0.001,0


In [17]:
# @title Second round, training and testing


w2v_tr_combined_array__ = dict()
w2v_tr_combined_stats__ = dict()
w2v_tr_combined_score__ = 0.0
w2v_tr_combined_data__ = []
prev_model = None
for t, opts in tqdm_notebook(w2v_tr_opts__.items(), desc="Variants"):
  id = w2v_tr_id__[t]
  w2v_tr_combined_array__[t] = get_gensim_gen(marete_train_sentences_in_q3, token_seq_func = token_seq_out,
                                              silent=True, id="W2V-TR-combined-1.98b", noload=True,
                                              nosave=False, **opts) # Takes around 2 minutes
  res = test_w2v(id, w2v_tr_combined_array__[t], marete_test_sentences_in_q9, seq=token_seq_out, silent=True)
  #w2v_tr_combined_array[t] = get_gensim_gen(aecorpus.aes_1, token_seq_func = token_seq_tr,
  #                                          silent=True, id="W2V-AES-TR-combined-1.5", noload=True,
  #                                          nosave = True, **opts) # Takes around 2 minutes
  #res = test_w2v(id, w2v_tr_combined_array[t], aecorpus.aes_t, seq=token_seq_tr, silent=True)
  mrr = res.get_stats()[4]
  w2v_tr_combined_stats__[id] = res
  if mrr > w2v_tr_combined_score_:
    w2v_tr_combined_score__ = mrr
    w2v_tr_combined = w2v_tr_combined_array__[t]
    w2v_tr_combined_opts = opts
  w2v_tr_combined_data__.append([*t,mrr])
  prev_model = None
  if opts['epochs'] < 50:
    # Continue learning at the next round
    prev_model =  w2v_tr_combined_array__[t]



Variants:   0%|          | 0/56 [00:00<?, ?it/s]

Starting:   0%|          | 0/5 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/20 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/30 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/40 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/50 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/100 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/150 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/5 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/20 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/30 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/40 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/50 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/100 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/150 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/5 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/20 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/30 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/40 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/50 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/100 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/150 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/5 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/20 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/30 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/40 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/50 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/100 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/150 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/5 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/20 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/30 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/40 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/50 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/100 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/150 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/5 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/20 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/30 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/40 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/50 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/100 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/150 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/5 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/10 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/20 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/30 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/40 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/50 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/100 [00:00<?, ?epoch/s]

Starting:   0%|          | 0/150 [00:00<?, ?epoch/s]

In [18]:
# @title Full Result
pd.set_option('display.width', 1000)
#stats_show(w2v_tr_combined_stats__)
#store_stats(res)
import statsmodels.api as sm
def stats_analysis_(data):
  pd.set_option('display.width', 1000)
  column_names = [ 'ns_exponent', 'negative', 'sg', 'vector_size', 'window', 'shrink_window', 'sample', 'min_count', 'epochs', 'mrr' ]
  text_stats_df = pd.DataFrame(data, columns=column_names)
  display(text_stats_df)

if w2v_tr_combined_variants__ > 1:
  stats_analysis_(w2v_tr_combined_data__)


Unnamed: 0,ns_exponent,negative,sg,vector_size,window,shrink_window,sample,min_count,epochs,mrr
0,0.05,21,0,200,2,False,0.01,4,5,0.114412
1,0.05,21,0,200,2,False,0.01,4,10,0.145476
2,0.05,21,0,200,2,False,0.01,4,20,0.170142
3,0.05,21,0,200,2,False,0.01,4,30,0.179951
4,0.05,21,0,200,2,False,0.01,4,40,0.189451
5,0.05,21,0,200,2,False,0.01,4,50,0.194912
6,0.05,21,0,200,2,False,0.01,4,100,0.203877
7,0.05,21,0,200,2,False,0.01,4,150,0.209922
8,0.1,21,0,200,5,True,0.01,4,5,0.112312
9,0.1,21,0,200,5,True,0.01,4,10,0.150016


In [19]:
# @title Statistical model, including epochs
def stats_analysis_(data):
  pd.set_option('display.width', 1000)
  column_names = [ 'ns_exponent', 'negative', 'sg', 'vector_size', 'window', 'shrink_window', 'sample', 'min_count', 'epochs', 'mrr' ]
  text_stats_df = pd.DataFrame(data, columns=column_names)
  formula = "mrr ~ ns_exponent + negative + sg + vector_size + window + C(shrink_window) + sample + min_count + epochs"
  model = sm.formula.glm(formula = formula, data=text_stats_df).fit()
  display(model.summary())

if w2v_tr_combined_variants__ > 1:
  stats_analysis_(w2v_tr_combined_data__)



0,1,2,3
Dep. Variable:,mrr,No. Observations:,56.0
Model:,GLM,Df Residuals:,48.0
Model Family:,Gaussian,Df Model:,7.0
Link Function:,Identity,Scale:,0.00046875
Method:,IRLS,Log-Likelihood:,139.49
Date:,"Sun, 21 Apr 2024",Deviance:,0.0225
Time:,18:43:26,Pearson chi2:,0.0225
No. Iterations:,3,Pseudo R-squ. (CS):,0.9888
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.0243,0.081,-0.299,0.765,-0.184,0.135
C(shrink_window)[T.True],-0.0008,0.014,-0.056,0.955,-0.029,0.027
ns_exponent,0.1370,0.217,0.633,0.527,-0.287,0.561
negative,0.0070,0.008,0.839,0.401,-0.009,0.023
sg,-0.0246,0.011,-2.274,0.023,-0.046,-0.003
vector_size,-0.0003,0.000,-0.691,0.489,-0.001,0.001
window,-0.0012,0.005,-0.231,0.818,-0.012,0.009
sample,-0.0002,0.001,-0.281,0.779,-0.002,0.001
min_count,0.0244,0.073,0.334,0.738,-0.119,0.168


In [33]:
# @title Best parameter sets

display_top_n = 10 # @param {type:"slider",min:1,max:100}
# @title Full Result
pd.set_option('display.width', 1000)
#stats_show(w2v_tr_combined_stats__)
#store_stats(res)
import statsmodels.api as sm
def stats_analysis_(data):
  pd.set_option('display.width', 1000)
  column_names = [ 'ns_exponent', 'negative', 'sg', 'vector_size', 'window', 'shrink_window', 'sample', 'min_count', 'epochs', 'mrr' ]
  data_ = data.copy()
  data_.sort(key=lambda element: (-element[9]))
  top=data_[:display_top_n]
  text_stats_df = pd.DataFrame(top, columns=column_names, copy=True)
  #text_stats_df.sort_values(by=['mrr'], axis=0, ascending=False)
  display(text_stats_df)
  #text_stats_df_head = text_stats_df.head(display_top_n)
  #display(text_stats_df_head)

if w2v_tr_combined_variants__ > 1:
  stats_analysis_(w2v_tr_combined_data__)

Unnamed: 0,ns_exponent,negative,sg,vector_size,window,shrink_window,sample,min_count,epochs,mrr
0,0.05,21,0,200,2,False,0.01,4,150,0.209922
1,0.05,21,0,200,3,True,0.01,4,150,0.20863
2,0.1,21,0,200,5,True,0.01,4,150,0.208078
3,0.1,21,0,200,5,True,0.01,4,100,0.207397
4,0.05,21,0,200,2,False,0.01,4,100,0.203877
5,0.05,21,0,200,3,True,0.01,4,100,0.201863
6,0.05,21,0,200,5,True,0.01,4,150,0.198775
7,0.05,21,0,200,5,True,0.01,4,100,0.196229
8,0.1,21,0,200,5,True,0.01,4,50,0.195625
9,0.05,21,0,200,2,False,0.01,4,50,0.194912
