# Fasttext
Build a fasttext model and do some basic analyses.
Before running this code, parse the OGSL data set to produce a dataframe of OGSL sign readings, sign names, and utf8 representations, and parse the Ur III data set to produce a text file in cuneiform utf8.

The code below runs the Python implementation of FastText, which is rather underdeveloped. It is better to run the Gensim implementation; see https://github.com/RaRe-Technologies/gensim/blob/de8657e9b8d5192750296b6765175c31c8bb3298/docs/notebooks/FastText_Tutorial.ipynb

In [1]:
import pickle
from fastText import train_unsupervised
from fastText import util
import pandas as pd
import numpy as np

In [2]:
# Train skip-gram embeddings on the cuneiform corpus file.
# Still to try: more negative samples (100 or 200 instead of 70?)
# and many more epochs (1000?).
model = train_unsupervised(
    'output/sux.txt',
    model='skipgram',
    ws=10,
    neg=70,
    epoch=100,
)

In [13]:
# Write the trained model to disk at output/suxmodel.
model.save_model("output/suxmodel")

In [3]:
# Fetch the vocabulary together with per-entry frequencies; the result is
# indexed below as labels[0] (entries) and labels[1] (counts).
labels = model.get_words(include_freq=True)

In [4]:
# One row per vocabulary entry: the entry itself and how often it occurs.
df = pd.DataFrame(
    {
        "label": [*labels[0]],
        "count": [*labels[1]],
    }
)

In [6]:
# Number of vocabulary entries (rows in the frame).
df.shape[0]

24008

In [5]:
# Preview only the first rows instead of rendering the whole vocabulary frame.
df.head(10)

Unnamed: 0,label,count
0,X,172395
1,𒁹,145656
2,𒋡,95239
3,</s>,88789
4,𒈫,78594
5,𒈬,73152
6,𒌋,70695
7,𒐊,65993
8,𒂆,59179
9,𒐈,55188


In [7]:
class FastTextNN:
    """Brute-force cosine nearest-neighbor search over a fastText vocabulary.

    by Ali Abul Hawa; see https://github.com/facebookresearch/fastText/pull/552

    ft_model must provide get_words(), get_words(include_freq=True),
    get_dimension() and get_word_vector(word).
    ft_matrix may be a precomputed 2d array holding one row per entry of
    ft_model.get_words(), in the same order; when omitted it is built here by
    calling get_word_vector for every vocabulary entry.
    """

    def __init__(self, ft_model, ft_matrix=None):
        self.ft_model = ft_model
        self.ft_words = ft_model.get_words()
        # word -> corpus frequency; also used for O(1) vocabulary membership tests
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i, :] = ft_model.get_word_vector(word)

    def find_nearest_neighbor(self, query_word, vectors, n=10, cossims=None):
        """Return the n rows of vectors most similar to query_word.

        vectors is a 2d numpy array corresponding to the vectors you want to consider.
        cossims is an optional 1d numpy array of size len(vectors) holding the
        dot products of vectors with the query vector, which can be passed for
        efficiency; it is computed here when omitted.

        Returns a list of (row_index, cosine_similarity) pairs ordered from most
        to least similar. When query_word is in the model vocabulary the single
        best match is dropped, on the assumption that it is the query word
        itself. Fewer than n pairs are returned when vectors does not contain
        enough rows. Rows (or a query) with zero norm get similarity 0.0
        instead of NaN.
        """
        query = self.ft_model.get_word_vector(query_word)
        if cossims is None:
            cossims = np.matmul(vectors, query)

        norms = np.sqrt((query ** 2).sum() * (vectors ** 2).sum(axis=1))
        # Guard against zero norms: divide by 1 there and report similarity 0.0.
        nonzero = norms > 0
        cossims = np.where(nonzero, cossims / np.where(nonzero, norms, 1.0), 0.0)

        in_vocab = query_word in self.word_frequencies
        # One extra candidate is needed when the top hit (the query itself) is dropped.
        wanted = max(n, 0) + (1 if in_vocab else 0)
        # argpartition rejects kth values >= len(array), so never ask for more
        # candidates than there are rows.
        k = min(wanted, len(cossims))
        if k == 0:
            return []
        # Passing every position 0..k-1 as kth leaves the first k entries fully sorted.
        ranked = np.argpartition(-cossims, list(range(k)))[:k]
        result_i = ranked[1:] if in_vocab else ranked
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=10, word_freq=None):
        """Return (word, cosine_similarity) pairs for the nearest neighbors of word.

        n is the number of neighbors looked up in the full vocabulary matrix.
        word_freq, when given, is a minimum corpus frequency: neighbors that
        occur less often are removed from the n that were found, so the result
        can be shorter than n.
        """
        result = self.find_nearest_neighbor(word, self.ft_matrix, n=n)
        neighbors = [(self.ft_words[i], sim) for i, sim in result]
        if word_freq:
            return [(w, sim) for w, sim in neighbors if self.word_frequencies[w] >= word_freq]
        return neighbors

# Basic usage of nearest_words

In [8]:
# Build the neighbor lookup from the trained model; with no matrix supplied,
# the constructor fills one with a vector per vocabulary entry.
fasttext_nn = FastTextNN(model)
# Neighbors of 𒈗 (default n=10), dropping any that occur fewer than 15 times.
fasttext_nn.nearest_words('𒈗', word_freq=15)

[('𒋀𒀊𒆠𒈠', 0.751829543903749),
 ('𒀭𒋗𒀭𒂗𒍪', 0.7354096040879524),
 ('𒆗𒂵', 0.73087789111477),
 ('𒀭𒄿𒉈𒀭𒂗𒍪', 0.6750294782463115),
 ('𒆠', 0.6718792040930646),
 ('𒌒𒁕', 0.6614835742377728),
 ('𒉺𒋼𒋛', 0.656594949190874),
 ('</s>', 0.6539255706468889),
 ('𒀭', 0.6537283810392776),
 ('𒈬', 0.6498351259367814)]

# Allow input in transliteration

In [9]:
# Build the lookup table from sign reading ("value") to cuneiform utf8.
import pickle
# NOTE(review): unpickling can execute arbitrary code; only load an ogsl.p
# file that you generated yourself.
with open("output/ogsl.p", "rb") as ogsl_file:
    o = pickle.load(ogsl_file)
# If a reading occurs more than once, the last occurrence wins.
d = {value: sign for value, sign in zip(o["value"], o["utf8"])}

In [10]:
def nearestcun(text, n=10, word_freq=None):
    """Look up nearest neighbors for a word form given in transliteration.

    text is a transliterated word form with its signs separated by hyphens.
    Examples: "ma-an-gi₄", "d-en-lil₂", or "urim₅-ki".
    Transliteration style (gu₄ vs. gud; gen vs. ŋen) is unimportant.

    The text is lower-cased and split on hyphens; each sign reading found in
    the lookup table d is replaced by its cuneiform sign (readings missing
    from d are kept as typed) and the pieces are joined into one string.
    The cuneiform string and the original text are printed, and the string
    is then fed to fasttext_nn.nearest_words.

    n is the number of nearest neighbors that nearest_words looks up.
    word_freq is the minimum word frequency for a neighbor to be considered.
    """
    cuneiform = []
    for reading in text.lower().split('-'):
        cuneiform.append(d.get(reading, reading))
    seq = ''.join(cuneiform)
    print(seq)
    print(text)
    return fasttext_nn.nearest_words(seq, n, word_freq)

In [12]:
# Example lookup by transliteration: request 15 neighbors of "huš-a" and
# keep only those with a word frequency of at least 100.
nearestcun("huš-a", n=15, word_freq=100)

𒄭𒄊𒀀
huš-a


[('𒆬𒄀', 0.7370118849526337),
 ('𒀵𒀭𒋀𒆠𒋫', 0.6668041363550139),
 ('𒍝𒄵', 0.6326766648603797)]

# TODO
Transliterate output by finding the word in the corpus

In [23]:
# Load the pickled corpus table (used below as a pandas DataFrame).
# NOTE(review): unpickling can execute arbitrary code; only load a sux.p
# file that you generated yourself.
with open("output/sux.p", "rb") as corpus_file:
    sux = pickle.load(corpus_file)

In [24]:
# Keep one row per distinct transliteration (the first occurrence).
sux_2 = sux.drop_duplicates(subset=["transliteration"])

In [27]:
# Show every transliteration that is written with the single sign 𒊺.
sux_2.loc[sux_2["utf-8"] == "𒊺"]

Unnamed: 0,transliteration,words,names,utf-8,lemm
2,še,[še],[ŠE],𒊺,še[barley]N
38770,ŠE,[ŠE],[ŠE],𒊺,X
43486,niga,[niga],[ŠE],𒊺,niga[fattened]


In [36]:
# Map each cuneiform (utf-8) word form to the list of transliterations
# attested for it in sux_2.
# The result must go into d2, not d: d is the reading -> sign table that
# nearestcun depends on, and writing lists into it would corrupt that lookup
# while leaving d2 empty.
# A single groupby pass replaces re-filtering the whole frame once per unique
# form; sort=False keeps the forms in order of first appearance.
d2 = (
    sux_2
    .groupby("utf-8", sort=False)["transliteration"]
    .apply(list)
    .to_dict()
)
    

In [40]:
# Keep only the entries whose key does not begin with "Start".
d2 = {
    sign: readings
    for sign, readings in d2.items()
    if sign[:5] != "Start"
}

In [44]:
# Persist the d2 lookup so it can be reloaded without rebuilding it.
with open("output/translit_to_signs.p", "wb") as out_file:
    pickle.dump(d2, out_file)