# Fasttext
Build a fasttext model and do some basic analyses.
Before running this code parse the ogsl data set to produce a dataframe of OGSL sign readings, sign names and utf8 representations, and parse the Ur 3 dataset to produce a text file in cuneiform utf8. 

The code below runs the Python implementation of FastText, which is rather underdeveloped. It is better to run the Gensim implementation, see https://github.com/RaRe-Technologies/gensim/blob/de8657e9b8d5192750296b6765175c31c8bb3298/docs/notebooks/FastText_Tutorial.ipynb

In [14]:
import gensim
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim

In [15]:
file = "output/sux.txt"
data = LineSentence(file)

In [32]:
model_gensim = FT_gensim(data, size=100, sg=1, iter=10)
# iter is for epochs

In [31]:
model_gensim.wv.most_similar("𒇻𒀴")

  if np.issubdtype(vec.dtype, np.int):


[('𒇻𒀴𒃲', 0.8541927337646484),
 ('𒇻𒀴𒂠', 0.7803254127502441),
 ('𒇻𒃲', 0.7718027234077454),
 ('𒁇𒋢', 0.7701417803764343),
 ('𒉡𒌴', 0.7648806571960449),
 ('𒇇', 0.7630270719528198),
 ('𒀸𒌴', 0.7606909275054932),
 ('𒁇𒅅', 0.7484268546104431),
 ('𒁀𒌴', 0.7276779413223267),
 ('𒁇𒋢𒂵', 0.7138701677322388)]

In [1]:
import pickle
from fastText import train_unsupervised
from fastText import util
import pandas as pd
import numpy as np

In [34]:
model = train_unsupervised('output/sux.txt', model='skipgram', ws=10, neg=70, epoch=100)
# increase negative sampling to 100 or 200 (70?)
# increase epochs 1000?

In [35]:
model.save_model("output/suxmodel")

In [4]:
labels = model.get_words(include_freq=True)

In [5]:
df = pd.DataFrame({"label": list(labels[0]), "count":list(labels[1])})

In [6]:
len(df)

24181

In [7]:
df

Unnamed: 0,label,count
0,X,172348
1,𒁹,145656
2,𒋡,95239
3,</s>,88788
4,𒈫,78594
5,𒈬,73147
6,𒌋,70695
7,𒐊,65993
8,𒂆,59175
9,𒐈,55188


In [8]:
class FastTextNN:
    """by Ali Abul Hawa; see https://github.com/facebookresearch/fastText/pull/552"""    
    def __init__(self, ft_model, ft_matrix=None):
        self.ft_model = ft_model        
        self.ft_words = ft_model.get_words()
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i,:] = ft_model.get_word_vector(word)
    
    def find_nearest_neighbor(self, query_word, vectors, n=10,  cossims=None):
        """
        vectors is a 2d numpy array corresponding to the vectors you want to consider

        cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
        returns the index of the closest n matches to query within vectors and the cosine similarity (cosine the angle between the vectors)

        """
        
        query  = self.ft_model.get_word_vector(query_word)
        if cossims is None:
            cossims = np.matmul(vectors, query, out=cossims)

        norms = np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))
        cossims = cossims/norms
        if query_word in self.ft_words:
            result_i = np.argpartition(-cossims, range(n+1))[1:n+1]
        else:
            result_i = np.argpartition(-cossims, range(n+1))[0:n]
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=10, word_freq=None):
        result = self.find_nearest_neighbor(word, self.ft_matrix, n=n)
        if word_freq:
            return [(self.ft_words[r[0]], r[1]) for r in result if self.word_frequencies[self.ft_words[r[0]]] >= word_freq]
        else:
            return [(self.ft_words[r[0]], r[1]) for r in result]

# Basic usage of nearest_words

In [9]:
fasttext_nn = FastTextNN(model)
fasttext_nn.nearest_words('𒈗', word_freq=15)

[('𒋀𒀊𒆠𒈠', 0.6943220077703823),
 ('𒀭X𒀭𒂗𒍪', 0.6832159791023636),
 ('𒋀𒀊𒆠𒆤', 0.6669362790812129),
 ('𒀭𒋗𒀭𒂗𒍪', 0.6657241623248064),
 ('𒋀𒀊𒆠𒈠𒅗', 0.6474558773367978),
 ('𒋀𒀊𒆠𒈠𒆤', 0.6286432464111233)]

# Allow input in transliteration

In [10]:
# create conversion dictionary value to utf8
import pickle
with open("output/ogsl.p", "rb") as p:
    o = pickle.load(p)
d = dict(zip(o["value"], o["utf8"]))

In [11]:
def nearestcun(text, n=10, word_freq=None):
    """input transliteration of a word form, signs separated by hyphens.
    Examples: "ma-an-gi₄", "d-en-lil₂", or "urim₅-ki".
    Transliteration style (gu₄ vs. gud; gen vs. ŋen) is unimportant
    The result is fed to nearest_words.
    n is the number of nearest neigbors that nearest_words will return
    word_freq is the minimum word frequency for a neighbor to be considered"""
    signs = text.lower().split('-')
    seq = [d[s] if s in d else s for s in signs]
    seq = ''.join(seq)
    print(seq), print(text)
    return fasttext_nn.nearest_words(seq, n, word_freq)

In [22]:
nearestcun("udu", n=15, word_freq=10)

𒇻
udu


[('𒈧𒃲', 0.7768052066308614),
 ('𒃢', 0.7618290590316584),
 ('𒈧', 0.7441898657146736),
 ('𒇇', 0.7261994709373571),
 ('𒇻𒄾', 0.7232916205782008),
 ('𒊩𒀾𒃼', 0.7029518594779494),
 ('𒈦𒃲', 0.6948568639190539),
 ('𒈧X', 0.6836468687946554),
 ('𒁀𒋆𒂊𒆠', 0.682936915141539),
 ('𒁇𒁀𒍣𒂵', 0.6752486306557356),
 ('𒀊𒀊𒂠', 0.6713824826827895),
 ('𒀀𒈝', 0.6681028141017539)]

# TODO
Transliterate output by finding the word in the corpus

In [None]:
with open("output/sux.p", "rb") as p:
    sux = pickle.load(p)

In [None]:
sux_2 = sux.drop_duplicates(["transliteration"])

In [None]:
sux_2[sux_2["utf-8"]=="𒊺"]

In [None]:
d2 = {}
for item in sux_2["utf-8"].unique():
    d[item] = list(sux_2[sux_2['utf-8']==item]["transliteration"])
    

In [None]:
d2 = {item : d2[item] for item in d2 if not item[:5] == "Start"}

In [None]:
with open("output/translit_to_signs.p", "wb") as w:
    pickle.dump(d2, w)