# Fasttext implementation in Gensim
For code examples etc. see the [tutorial](https://github.com/RaRe-Technologies/gensim/blob/de8657e9b8d5192750296b6765175c31c8bb3298/docs/notebooks/FastText_Tutorial.ipynb).

The code currently suppresses two kinds of warnings; check with future installations whether these filters are still necessary. The first filter silences a warning that occurs only on Windows machines and tells the user that an alternative function will be used. The second filter silences the "FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated" warning, which comes from Numpy.

In [6]:
import warnings
# Register both filters before gensim is imported; the original order is kept
# on purpose (presumably some of these warnings are emitted at import time).
# Ignore UserWarning raised from modules whose name starts with 'gensim'.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
# Ignore FutureWarning raised from modules whose name starts with 'gensim'.
warnings.filterwarnings(action='ignore', category=FutureWarning, module='gensim' )
import gensim
import pickle
# LineSentence wraps the corpus file for training; FastText is aliased to
# FT_gensim for use in the training cell.
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim

In [8]:
# Path of the training corpus, relative to the working directory.
# NOTE(review): judging by LineSentence, the file presumably holds one
# sentence/line of whitespace-separated tokens per text line — confirm.
file = "corpus/sux_utf8.txt"
# Wrap the corpus file in a LineSentence object that is passed to the model.
data = LineSentence(file)

In [9]:
# Train a FastText model on the corpus wrapped in `data`.
# NOTE(review): min_n/max_n presumably bound the character n-gram lengths and
# size is presumably the vector dimensionality — see the gensim FastText docs.
model_gensim = FT_gensim(data, min_count=5, window=10, size=100, negative=20, sorted_vocab=1, min_n=1, 
                         max_n=6, sg=1, iter=100)
# iter is the number of training epochs; sg=1 selects skip-gram (sg=0 is CBOW).

In [11]:
# Load the pickled sign list.
# NOTE: pickle.load can execute arbitrary code; only load a file you created
# yourself or otherwise trust.
with open("output/ogsl.p", "rb") as p:
    o = pickle.load(p)
# Build the lookup table used by cun(): each entry of o["value"] is paired
# with the entry of o["utf8"] at the same position.
# NOTE(review): presumably "value" holds sign readings and "utf8" the
# corresponding cuneiform characters — confirm against the file that was pickled.
d = dict(zip(o["value"], o["utf8"]))

In [20]:
def cun(text, sign_map=None):
    """Transform transliterated input into cuneiform.

    Use unicode subscript numbers and separate all signs with hyphens;
    separate words with blanks.
    Examples: 'ma-an-gi₄'; 'd-en-lil₂ nibru-ki'.
    Transliteration style (sugal₇ vs. sukkal; dug₄ vs. du₁₁; gen vs. ŋen; etc.)
    is unimportant as long as the reading is a key of the sign mapping.
    Capitalization is unimportant because the input is lowercased first.

    Parameters:
        text: transliterated word or line.
        sign_map: optional mapping from sign reading to replacement string.
            When omitted (None), the module-level dictionary ``d`` is used.

    Returns:
        A string in which every sign found in the mapping is replaced by its
        mapped value and every unknown sign is kept as typed (lowercased).
        The signs of one word are concatenated without separator; words are
        joined with single blanks.
    """
    # Resolve the mapping at call time; `is None` keeps an explicitly passed
    # empty mapping from falling back to the global table.
    mapping = d if sign_map is None else sign_map
    cun_words = []
    # Lowercasing once is enough: every word is a slice of the lowered text.
    for word in text.lower().split():
        signs = word.split('-')
        cun_words.append(''.join(mapping.get(sign, sign) for sign in signs))
    return ' '.join(cun_words)

In [24]:
def nearestcun(text, n=10, word_freq=None):
    """Return the nearest neighbors of a transliterated word form.

    Input is the transliteration of a word form, signs separated by hyphens.
    Examples: "ma-an-gi₄", "d-en-lil₂", or "urim₅-ki".
    Transliteration style (gu₄ vs. gud; gen vs. ŋen) is unimportant.
    The input is converted with cun() and the result is looked up in the
    word vectors of model_gensim.

    Parameters:
        text: transliterated word form.
        n: the number of nearest neighbors to return (default 10).
        word_freq: the minimum corpus frequency for a neighbor to be
            considered; None (default) applies no frequency filter.

    Returns:
        A list of at most n (word, similarity) tuples, most similar first.
    """
    word = cun(text)
    vectors = model_gensim.wv
    if word_freq is None:
        # Previously n was ignored and gensim's default (10) was always used.
        return vectors.most_similar(positive=[word], topn=n)
    # Rank the whole vocabulary first so that filtering out rare words still
    # leaves up to n neighbors. Uses the gensim 3.x vocabulary API
    # (wv.vocab[word].count), matching the size=/iter= arguments used to
    # train the model.
    ranked = vectors.most_similar(positive=[word], topn=len(vectors.vocab))
    frequent = [
        (neighbor, similarity)
        for neighbor, similarity in ranked
        if vectors.vocab[neighbor].count >= word_freq
    ]
    return frequent[:n]

In [26]:
# Example: nearest neighbors of the word form "ur-mah", default arguments.
nearestcun("ur-mah")

[('𒌨𒂠', 0.7202852964401245),
 ('𒌨𒂠𒁶', 0.6840295791625977),
 ('𒌨𒁇', 0.6684911251068115),
 ('𒌨𒂠𒊏', 0.6628358960151672),
 ('𒌨𒊩𒌨', 0.6617624759674072),
 ('𒌨𒁇𒊏', 0.6486069560050964),
 ('𒌨𒂠𒂠', 0.6349743604660034),
 ('𒌨𒈤𒁶', 0.6264798045158386),
 ('𒌨𒂠𒊏𒅗', 0.6139445900917053),
 ('𒆪𒌌𒋾', 0.5975202322006226)]

In [28]:
# Example: query the model directly with two positive words, both converted
# to cuneiform with cun() first.
# NOTE(review): with several positive words gensim presumably ranks
# neighbors against their combined vector — see the most_similar docs.
model_gensim.wv.most_similar(positive=[cun("ku-ul-ti"), cun("sipad")])

[('𒌨𒂠𒂠', 0.7906512022018433),
 ('𒌨𒊏', 0.7796335220336914),
 ('𒌨𒊏𒆤', 0.7416032552719116),
 ('𒌨𒈩𒌉', 0.7353944778442383),
 ('𒌨𒂠𒊏𒆤', 0.7347965836524963),
 ('𒌨𒆤', 0.7313037514686584),
 ('𒄿𒆪𒈨𒊬', 0.7218989133834839),
 ('𒌨𒂠𒊏𒂠', 0.7049010992050171),
 ('𒄷𒈨', 0.695325493812561),
 ('𒀲𒋙𒀭𒈨', 0.6947768926620483)]