# FastText implementation in Gensim
For code examples etc. see the [tutorial](https://github.com/RaRe-Technologies/gensim/blob/de8657e9b8d5192750296b6765175c31c8bb3298/docs/notebooks/FastText_Tutorial.ipynb).

The code currently suppresses two kinds of warnings; check with future installations whether this is still necessary. The first filter (`UserWarning`) covers a warning that only occurs on Windows machines and tells the user that an alternative function will be used. The second filter (`FutureWarning`) suppresses "FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated", which comes from NumPy.

In [6]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore', category=FutureWarning, module='gensim' )
import gensim
import pickle
import pandas as pd
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim

In [2]:
# Training corpus, wrapped in LineSentence so it can be handed to the model.
# NOTE(review): presumably one sentence per line with words separated by
# blanks (the LineSentence convention) — confirm against the corpus file.
file = "corpus/sux_utf8.txt"
data = LineSentence(file)

In [3]:
# Train the FastText model on the corpus loaded above; the resulting
# `model_gensim` object is queried by the cells and functions below.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# is understood to call them `vector_size` and `epochs` — confirm against the
# installed version before upgrading.
model_gensim = FT_gensim(data, min_count=5, window=10, size=100, negative=20, sorted_vocab=1, min_n=1, 
                         max_n=6, sg=1, iter=100)
# iter is for epochs; sg is skipgram (sg = 0 is cbow)

In [10]:
# Save the trained model to disk.
filename = "model/model_cuneiform.model"
model_gensim.save(filename)

In [7]:
# Load the pickled sign list and build the lookup table used by cun():
# `d` maps each entry of the "value" column to the entry of the "utf8"
# column in the same row.
# NOTE(review): presumably "value" holds transliterated sign readings and
# "utf8" the matching cuneiform glyphs — confirm against the code that
# produced ogsl.p.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with open("output/ogsl.p", "rb") as p:
    o = pd.read_pickle(p)
d = dict(zip(o["value"], o["utf8"]))

In [8]:
def cun(text):
    """Convert a transliterated line into cuneiform.

    Write index numbers as Unicode subscripts, separate the signs of a word
    with hyphens and separate words with blanks.
    Examples: 'ma-an-gi₄'; 'd-en-lil₂ nibru-ki'.

    The input is lowercased before the lookup, so capitalization does not
    matter. Transliteration style (sugal₇ vs. sukkal; dug₄ vs. du₁₁;
    gen vs. ŋen; etc.) is meant to be unimportant as well.

    Every sign is looked up in the module-level dictionary `d`; a sign that
    is not found there is kept as typed. The signs of one word are joined
    without a separator, and the words are joined with a single blank.
    """
    def convert(word):
        # Unknown signs fall through unchanged instead of raising KeyError.
        return ''.join(d.get(sign, sign) for sign in word.split('-'))

    return ' '.join(convert(word) for word in text.lower().split())

In [6]:
def nearestcun(text, n=10, word_freq=None):
    """Return the nearest neighbors of a transliterated word form.

    text: transliteration of a word form, signs separated by hyphens.
        Examples: "ma-an-gi₄", "d-en-lil₂", or "urim₅-ki".
        Transliteration style (gu₄ vs. gud; gen vs. ŋen) is unimportant;
        the text is converted with cun() before the model is queried.
    n: the number of nearest neighbors to return.
    word_freq: the minimum word frequency for a neighbor to be considered.
        None (the default) disables the frequency filter.

    Returns the list of (word, similarity) tuples produced by
    model_gensim.wv.most_similar, holding at most n entries.
    """
    word = cun(text)
    vectors = model_gensim.wv
    if word_freq is None:
        return vectors.most_similar(positive=[word], topn=n)

    # NOTE(review): relies on the gensim 3.x vocabulary layout
    # (wv.vocab[word].count), which matches the size=/iter= keywords used to
    # train the model — confirm against the installed gensim version.
    vocab = vectors.vocab
    # Rank the whole vocabulary before filtering: filtering only the top n
    # would return fewer than n neighbors whenever rare words crowd the top.
    ranked = vectors.most_similar(positive=[word], topn=len(vocab))
    frequent = [
        (neighbor, score)
        for neighbor, score in ranked
        if vocab[neighbor].count >= word_freq
    ]
    return frequent[:n]

In [7]:
# The 15 nearest neighbors of the word form ŋeš-ad-da.
nearestcun("ŋeš-ad-da", 15)

[('𒄑𒀜𒍑', 0.76911461353302),
 ('𒄑𒀜', 0.7375223636627197),
 ('𒄑𒄀𒈲', 0.7371855974197388),
 ('𒄤𒋧𒄤', 0.732872486114502),
 ('𒄑𒈣𒁍𒁕', 0.7244366407394409),
 ('𒄑𒈪𒌷𒍝', 0.7211812138557434),
 ('𒄑𒄷𒋛', 0.718241810798645),
 ('𒄑𒍣𒃶', 0.7150076031684875),
 ('𒄑𒀜𒆰', 0.7129932641983032),
 ('𒄑𒀀𒁕', 0.7123159170150757),
 ('𒄑𒈨𒁶', 0.7120527029037476),
 ('𒄑𒋗𒉽𒉽', 0.7095718383789062),
 ('𒄑𒋾𒋾', 0.7089521884918213),
 ('𒄑𒀀𒊏', 0.7083549499511719),
 ('𒄑𒇲𒉌', 0.7068991661071777)]

Example of a query that combines positive and negative terms:
model_wrapper.most_similar(positive=['baghdad', 'england'], negative=['london'])

In [8]:
# Analogy-style query: gud and sila₄ are the positive terms, amar is the
# negative term; show the 3 best matches.
model_gensim.wv.most_similar(positive=[cun("gud"), cun("sila₄")], negative=[cun("amar")], topn=3)

[('𒇇', 0.9742987155914307),
 ('𒈧', 0.9709023237228394),
 ('𒇻', 0.9708880186080933)]

In [9]:
# Ask the model which of the four words fits least with the others.
model_gensim.wv.doesnt_match([cun("sila₄"), cun("udu"), cun("u₈"), cun("amar")])

'𒀫'

In [20]:
# Analogy-style query: lugal and munus are the positive terms, nita₂ is the
# negative term; show the 3 best matches.
model_gensim.wv.most_similar(positive=[cun("lugal"), cun("munus")], negative=[cun("nita₂")], topn=3)

[('𒀭', 0.9684085249900818),
 ('𒈤', 0.9660917520523071),
 ('𒆠', 0.9650214910507202)]

In [31]:
# Analogy-style query: lugal and nu-banda₃ are the positive terms, sukkal-mah
# is the negative term; topn is not given, so the library default applies.
model_gensim.wv.most_similar(positive=[cun("lugal"), cun("nu-banda₃")], negative=[cun("sukkal-mah")])

[('𒁕𒁕', 0.951117217540741),
 ('𒌤𒀀', 0.9501250982284546),
 ('𒉺', 0.9496690034866333),
 ('𒉺𒀠', 0.9495517015457153),
 ('𒆠𒀀', 0.9494520425796509),
 ('𒁀𒊷', 0.9493198394775391),
 ('𒌨𒈩', 0.9478766918182373),
 ('𒌨𒇷', 0.9475545883178711),
 ('𒆠', 0.9475405216217041),
 ('𒈗𒀀', 0.9465681314468384)]

In [58]:
# The 10 nearest neighbors of šim-gig.
model_gensim.wv.most_similar(cun("šim-gig"), topn=10)

[('𒊺𒇷', 0.8900678157806396),
 ('𒄞𒆪𒊒', 0.8851251602172852),
 ('𒋗𒌫𒈨', 0.8850780725479126),
 ('𒋆𒃵𒃵𒈠', 0.8815943002700806),
 ('𒋢𒁲', 0.8805724382400513),
 ('𒋆𒅎', 0.8791651725769043),
 ('𒍝𒁀𒈝', 0.8783516883850098),
 ('𒃵𒃵𒈠', 0.8764652013778687),
 ('𒉺𒅇𒆪', 0.8720640540122986),
 ('𒇴𒃲', 0.8666182160377502)]

In [42]:
# Load the pickled dictionary that similar() uses to annotate each neighbor
# returned by the model.
# NOTE(review): judging by the output of similar(), keys are cuneiform word
# forms and values are sets of readings and lemmas — confirm against the code
# that produced utf8_lemm_d.p.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with open("output/utf8_lemm_d.p", "rb") as p:
    utf8_lemm_d = pickle.load(p)

In [46]:
def similar(translit, topn=10):
    """Return the nearest neighbors of a word, annotated from utf8_lemm_d.

    translit is converted with cun() and passed to
    model_gensim.wv.most_similar, which is asked for topn results.

    Returns a dict whose keys are the (word, similarity) tuples returned by
    the model and whose values are the utf8_lemm_d entries for those words.
    Raises KeyError when a neighbor has no entry in utf8_lemm_d.
    """
    neighbors = model_gensim.wv.most_similar(cun(translit), topn=topn)
    # Key on the whole (word, similarity) tuple so the score stays visible
    # next to the annotation.
    return {entry: utf8_lemm_d[entry[0]] for entry in neighbors}

In [55]:
# Nearest neighbors of šeg₉-bar together with their utf8_lemm_d entries
# (topn is left at its default of 10).
similar("šeg₉-bar")

{('𒄎', 0.9597214460372925): {'lulim', 'lulim[stag]N', '|GIR₃×(LU.IGI)|'},
 ('𒀫', 0.9346693158149719): {'AMAR',
  'amar',
  'amar[young]N',
  'zur',
  'zur[break]V/t',
  'zur[grain-pile]N',
  'zur[roil]V/t',
  'zur[take-care-of]V/t'},
 ('𒁰', 0.9322395324707031): {'DARA₃',
  'dara₃',
  'dar₃',
  'durah',
  'durah[goat]N'},
 ('𒈦𒆕', 0.9320837259292603): {'Mašda[1]RN',
  'mas-da₃',
  'maš-da₃',
  'maš-du₃',
  'maš-gag',
  'maš-kak',
  'mašda[gazelle]N',
  'mašda[stone]N',
  'mašgaʾen[a-social-status-or-profession]N',
  '|MAŠ.GAG|',
  '|MAŠ.KAK|'},
 ('𒊾', 0.9314689636230469): {'kiši[ant]N',
  'kiši₆',
  'sabar₂',
  'ŠEG₉',
  'šeg[animal]N',
  'šeg[frost]N',
  'šeg₉',
  'šenbar',
  'šeŋbar[animal]N'},
 ('𒁱', 0.9258366823196411): {'DARA₄',
  'dara[red]V/i',
  'dara₄',
  'durah[goat]N',
  'ganam[ewe]N',
  'inna[fleece]N',
  'lahar[ewe]N',
  'u[ewe]N',
  'u[sheep-ewe]N'},
 ('𒊾X', 0.9255743026733398): {'ŠEG₉-x', 'šeg₉-x', 'šeŋbar[animal]N'},
 ('𒆸𒁀', 0.9248313307762146): {'gur[thick]V/i',
  'hab[m