In [168]:
import numpy as np
import pandas as pd

Datasets:

- nimi pi - a dataset of toki pona words to several possible english translations
- word freq - english words along with part of speech, sorted by frequency. We can approximate this with Zipf's law.
- conceptnetw2v - a word vectorization based on semantic meaning. This will allow us to decide between candidates.
- synonyms, definitions - English Dictionary and Thesaurus

In [169]:
#Word, Part of Speech (ranked by popularity)
freq = pd.read_csv("word-freq-top5000.csv")

#toki pona words and translations
toki = pd.read_csv("nimi_pi.csv")
tp_vocab = toki.TOKI.drop_duplicates().tolist()

tokisentences = pd.read_csv('tokisentences.csv')

#Word -> Synonyms. This is a graph, we can search it.
synonyms    = pd.read_csv("thesauri\synonyms.csv")
hypernyms    = pd.read_csv("thesauri\WordnetHypernyms.csv")
hyponyms    = pd.read_csv("thesauri\WordnetHyponyms.csv")
noundefs = pd.read_csv("thesauri\WordnetNouns.csv")
verbdefs = pd.read_csv("thesauri\WordnetVerbs.csv")
adjcdefs = pd.read_csv("thesauri\WordnetAdjectives.csv")
advbdefs = pd.read_csv("thesauri\WordnetAdverbs.csv")

nouns = freq[freq['Part of speech'] == 'Noun']['Word'].tolist()
#['year', 'people', 'way', 'day', 'man', 'thing', 'woman', 'life', 'child']

verbs = freq[freq['Part of speech'] == 'Verb']['Word'].tolist()
#['have', 'do', 'say', 'go', 'can', 'get', 'would', 'make', 'know']

adjcs = freq[freq['Part of speech'] == 'Adjective']['Word'].tolist()
#['new', 'good', 'high', 'old', 'great', 'big', 'American', 'small', 'large']

advbs = freq[freq['Part of speech'] == 'Adverb']['Word'].tolist()
#['not', "n't", 'up', 'so', 'out', 'just', 'now', 'how', 'then', 'more']

In [170]:
freq[freq['Part of speech'] == 'Article']['Word'].tolist()

['the', 'a', 'his', 'their', 'her', 'my', 'your', 'its', 'our', 'no', 'every']

In [171]:
freq[freq['Part of speech'] == 'Determiner']['Word'].tolist()

['this',
 'that',
 'what',
 'all',
 'which',
 'some',
 'these',
 'more',
 'many',
 'those',
 'any',
 'another',
 'much',
 'own',
 'same',
 'such',
 'few',
 'most',
 'each',
 'both',
 'several',
 'former',
 'whose',
 'half',
 'less',
 'whatever',
 'little',
 'enough',
 'fewer',
 'either',
 'latter',
 'neither',
 'matter',
 'no']

Our data is split into 10 parts of speech:
 - Nouns, Verbs, and Adjectives - the meat of it
 - Adverbs - basically just adjectives conjugated differently. I cannot stand adverbs.
 - Prepositions, Conjunctions, Pronouns, Exclamations, Determiners, Measures - Toki Pona only has a few of these, as described below.

In [172]:
# THESE PARTS OF SPEECH ARE FAIRLY SIMPLE TO TRANSLATE

# <DETERMINER> := ali  - all, every, full, infinite
#                 mute - many, more
#                 ni - this, that
#                 seme - what, which

detes = freq[freq['Part of speech'] == 'Determiner']['Word'].tolist()
#['that', 'what', 'all', 'which', 'some', 'these', 'more', 'many', 'those']

# <PREPOSITION> := sewi - above, top
#                  anpa - below, bottom
#                  monsi - back, behind
#                  poka - in front of, next to, beside
#                  insa - inside, between, centered
#                  kama - coming, future, eventual
#                  awen - continuing, present, stay, constant
#                  tawa - toward, past, for
#                  ipa  - part, portion, segment, some
#                  teje - left
#                  soto - right
#                  lon  - true, on, at, in

preps = freq[freq['Part of speech'] == 'Preposition']['Word'].tolist()
#['in', 'to', 'for', 'with', 'on', 'at', 'from', 'by', 'about']

# <NUMBER> := nanpa - number, ordinal suffix
#             lo  - 0, none, empty
#             wan - 1
#             tu  - 2            
#             sa  - 3            
#             po  - 4            
#             ki  - 5   
#   Base six, so po po ki = 445_6 = 5  +  4*6  +  4 * 6^2 = 173 

meass = freq[freq['Part of speech'] == 'Measure']['Word'].tolist()
#['two', 'first', 'last', 'three', 'next', 'million', 'four', 'five', 'second']

# <INTERJECTION> := a - ah!, !, realization, laughter, bravo
#                   mu - meow, moo, animal noise
#                   n - hmm, wow, woah
#                   ajo - hello, hi
#                   tata - goodbye
#                   wa - oh, uh, well, conversational styrofoam
#                   ak - frustation, ouch, fuck (of aversion), sudden fear
#                   ja - yes

excls = freq[freq['Part of speech'] == 'Exclamation']['Word'].tolist()
#['oh', 'yeah', 'no', 'hey', 'hi', 'hello', 'mm-hmm', 'ah', 'wow']

# <PRONOUNS> := ijo - thing, event, object, matter, something, stuff
#               ona - he, she, it, they 
#               mi - I, me, we, us
#               sina - you 

prons = freq[freq['Part of speech'] == 'Pronoun']['Word'].tolist()
#['it', 'I', 'you', 'he', 'they', 'we', 'she', 'who', 'them']

# <CONJUNCTION> := sama - same, similar
#                  ante - different, other
#                  anu - or, minus, intersection, minimum
#                  en  - and, plus, union, maximum     
#                  taso - but, however                 
#                  tan  - by, from, times, because of, x tan y -> if y then x
#                  powe - false, pretend, imposter
#                  ala  - no, not


conjs = freq[freq['Part of speech'] == 'Conjunction']['Word'].tolist()
#['that', 'but', 'or', 'as', 'if', 'when', 'than', 'because', 'while']


In [173]:
adjcs

['other',
 'new',
 'good',
 'high',
 'old',
 'great',
 'big',
 'American',
 'small',
 'large',
 'national',
 'young',
 'different',
 'black',
 'long',
 'little',
 'important',
 'political',
 'bad',
 'white',
 'real',
 'best',
 'right',
 'social',
 'only',
 'public',
 'sure',
 'low',
 'early',
 'able',
 'human',
 'local',
 'late',
 'hard',
 'major',
 'better',
 'economic',
 'strong',
 'possible',
 'whole',
 'free',
 'military',
 'true',
 'federal',
 'international',
 'full',
 'special',
 'easy',
 'clear',
 'recent',
 'certain',
 'personal',
 'open',
 'red',
 'difficult',
 'available',
 'likely',
 'short',
 'single',
 'medical',
 'current',
 'wrong',
 'private',
 'past',
 'foreign',
 'fine',
 'common',
 'poor',
 'natural',
 'significant',
 'similar',
 'hot',
 'dead',
 'central',
 'happy',
 'serious',
 'ready',
 'simple',
 'left',
 'physical',
 'general',
 'environmental',
 'financial',
 'blue',
 'democratic',
 'dark',
 'various',
 'entire',
 'close',
 'legal',
 'religious',
 'cold',
 'fi

For each word, we associate to both its definition and synonyms

In [174]:
merged_data_nouns = pd.merge(noundefs, synonyms,          left_on='Word',  right_on='word', how='inner')
merged_data_nouns = pd.merge(merged_data_nouns, hyponyms,  left_on='Word', right_on='lemma', how='inner')
merged_data_nouns = pd.merge(merged_data_nouns, hypernyms, left_on='Word', right_on='lemma', how='inner')
merged_data_verbs = pd.merge(verbdefs, synonyms, left_on='Word',           right_on='word', how='inner')
merged_data_verbs = pd.merge(merged_data_verbs, hyponyms, left_on='Word',  right_on='lemma', how='inner')
merged_data_verbs = pd.merge(merged_data_verbs, hypernyms, left_on='Word', right_on='lemma', how='inner')
merged_data_adjcs = pd.merge(adjcdefs, synonyms, left_on='Word',           right_on='word', how='inner')
merged_data_adjcs = pd.merge(merged_data_adjcs, hyponyms, left_on='Word',  right_on='lemma', how='inner')
merged_data_adjcs = pd.merge(merged_data_adjcs, hypernyms, left_on='Word', right_on='lemma', how='inner')

# Filter rows where 'Definition' and at least one of 'synonyms', 'hyponyms', or 'hypernyms' is not null
filtered_data_nouns = merged_data_nouns[~(merged_data_nouns['Definition'].isnull() | merged_data_nouns['synonyms'].isnull() | merged_data_nouns['hyponyms'].isnull() | merged_data_nouns['hypernyms'].isnull())]
toDefNouns      = dict(zip(filtered_data_nouns['Word'], zip(filtered_data_nouns['Definition'].tolist(), filtered_data_nouns[['synonyms', 'hyponyms', 'hypernyms']].apply(lambda x: x.str.replace('|', ';').str.split(';')).values.tolist())))

filtered_data_verbs = merged_data_verbs[~(merged_data_verbs['Definition'].isnull() | merged_data_verbs['synonyms'].isnull() | merged_data_verbs['hyponyms'].isnull() | merged_data_verbs['hypernyms'].isnull())]
toDefVerbs      = dict(zip(filtered_data_verbs['Word'], zip(filtered_data_verbs['Definition'].tolist(), filtered_data_verbs[['synonyms', 'hyponyms', 'hypernyms']].apply(lambda x: x.str.replace('|', ';').str.split(';')).values.tolist())))

filtered_data_adjcs = merged_data_adjcs[~(merged_data_adjcs['Definition'].isnull() | merged_data_adjcs['synonyms'].isnull() | merged_data_adjcs['hyponyms'].isnull() | merged_data_adjcs['hypernyms'].isnull())]
toDefAdjectives = dict(zip(filtered_data_adjcs['Word'], zip(filtered_data_adjcs['Definition'].tolist(), filtered_data_adjcs[['synonyms', 'hyponyms', 'hypernyms']].apply(lambda x: x.str.replace('|', ';').str.split(';')).values.tolist())))


  toDefNouns      = dict(zip(filtered_data_nouns['Word'], zip(filtered_data_nouns['Definition'].tolist(), filtered_data_nouns[['synonyms', 'hyponyms', 'hypernyms']].apply(lambda x: x.str.replace('|', ';').str.split(';')).values.tolist())))
  toDefVerbs      = dict(zip(filtered_data_verbs['Word'], zip(filtered_data_verbs['Definition'].tolist(), filtered_data_verbs[['synonyms', 'hyponyms', 'hypernyms']].apply(lambda x: x.str.replace('|', ';').str.split(';')).values.tolist())))
  toDefAdjectives = dict(zip(filtered_data_adjcs['Word'], zip(filtered_data_adjcs['Definition'].tolist(), filtered_data_adjcs[['synonyms', 'hyponyms', 'hypernyms']].apply(lambda x: x.str.replace('|', ';').str.split(';')).values.tolist())))


In [175]:
toDefNouns['lozenge'] #->

# Definition('a small aromatic or medicated candy',
# Synonyms  [['pill', 'tablet', 'tab'],
# Instances  ['cachou', 'cough drop', 'bolus', 'capsule', 'dragee', 'sleeping pill'],
# Categories  ['candy', 'dose']])

('a small aromatic or medicated candy',
 [['pill', 'tablet', 'tab'],
  ['cachou', 'cough drop', 'bolus', 'capsule', 'dragee', 'sleeping pill'],
  ['candy', 'dose']])

In [176]:
toDefNouns['week']

('any period of seven consecutive days; it rained for a week',
 [['hebdomad', 'workweek', 'calendar week'],
  ['holy week', 'rag', 'shiva', 'week from monday'],
  ['time period', 'work time', 'time period']])

For each toki word, we associate to all valid translations (ignoring part of speech for now). Two dictionaries going both ways here.

In [177]:
tokidict = dict()
for i in range(len(toki)):
    l = list(toki.loc[i])
    if l[0] in tokidict:
        tokidict[l[0]] += [x for x in l[2:] if str(x) == x]
    else:
        tokidict[l[0]]  = [x for x in l[2:] if str(x) == x]

In [178]:
tokidict['palisa']

['long',
 'long hard thing; branch',
 'rod',
 'stick',
 'pointy thing',
 'stretch',
 'beat',
 'poke',
 'stab',
 'sexually arouse']

In [179]:
edict = dict()
for k in tokidict:
    for v in tokidict[k]:
        edict[v] = k

edict['reptile']

'akesi'

Naive method: Attempt to translate words, synonyms, and definitions into Toki Pona.

In [180]:
tok2eng_level_0 = dict([(n, []) for n in nouns + verbs + adjcs + advbs])

In [181]:
tok2eng_level_1 = tok2eng_level_0
for k in tokidict:
    for en in tokidict[k]:
        if en in tok2eng_level_1:
            if k not in tok2eng_level_1[en]:
                tok2eng_level_1[en].append(k)

In [182]:
fs = {}
for key in tok2eng_level_0:
    if tok2eng_level_1[key] != '':
        fs[key] = tok2eng_level_0[key]
fs

{'time': ['tenpo'],
 'year': [],
 'people': ['jan', 'kulupu'],
 'way': ['nasin'],
 'day': [],
 'man': ['mije'],
 'thing': ['ijo'],
 'woman': ['meli'],
 'life': ['konwe'],
 'child': [],
 'world': [],
 'school': [],
 'state': [],
 'family': [],
 'student': [],
 'group': ['kulupu'],
 'country': ['ma'],
 'problem': [],
 'hand': ['luka'],
 'part': ['ipa'],
 'place': ['jeno'],
 'case': [],
 'week': [],
 'company': ['kulupu'],
 'system': ['nasin'],
 'program': [],
 'question': [],
 'work': ['pali'],
 'government': [],
 'number': ['nanpa'],
 'night': [],
 'Mr': [],
 'point': [],
 'home': ['tomo'],
 'water': ['telo'],
 'room': ['tomo'],
 'mother': ['mama'],
 'area': [],
 'money': ['mani'],
 'story': [],
 'fact': ['kin'],
 'month': [],
 'lot': [],
 'right': ['pona'],
 'study': ['sona'],
 'book': ['lipu'],
 'eye': ['oko'],
 'job': [],
 'word': ['nimi'],
 'business': ['esun'],
 'issue': [],
 'side': ['poka'],
 'kind': [],
 'head': ['lawa'],
 'house': [],
 'service': [],
 'friend': [],
 'father': [

In [183]:
f = pd.merge(hypernyms,hyponyms)
f

Unnamed: 0,lemma,Count,part_of_speech,hypernyms,hyponyms
0,0.22,4,noun,firearm,twenty-two pistol;twenty-two rifle
1,1,1,noun,digit,monad;singleton
2,2,1,noun,digit,couple;craps
3,12,2,noun,large integer,boxcars
4,1000,4,noun,large integer,millenary
...,...,...,...,...,...
33611,zoroastrianism,14,noun,religion,parsiism
33612,zoysia,6,noun,grass,korean lawn grass;manila grass;mascarene grass
33613,zygote,6,noun,cell,heterozygote;homozygote
33614,zymolysis,9,noun,chemical process,bottom fermentation;top fermentation;vinification


In [184]:
ok = set(nouns + verbs + adjcs + advbs)
defs  = dict([(x,y) for x,y in zip(list(noundefs.Word), list(noundefs.Definition))])
defsV = dict([(x,y) for x,y in zip(list(verbdefs.Word), list(verbdefs.Definition))])
defsA = dict([(x,y) for x,y in zip(list(adjcdefs.Word), list(adjcdefs.Definition))])
defs.update(defsV)
defs.update(defsA)
tok2eng_level_2 = tok2eng_level_1
for k in tok2eng_level_1:
    if tok2eng_level_1[k] == '':
        if k in synonyms:
            syns = synonyms[k]
            for c in f[f.lemma == k]:
                syns += str(c['hypernyms']).split('|')
                syns += str(c['hyponyms']).split(';')
            for s in syns:
                if s in edict:
                    tok2eng_level_2[k] += (f'{edict[s]}')
                if s in defs:
                    defn = defs[s]
                    for word in defn.split():
                        if word in edict and word in ok:
                            tok2eng_level_2[k] += (f'{edict[word]}')
        if k in defs:
            defn = defs[k]
            for word in defn.split():
                if word in edict and word in ok:
                    tok2eng_level_2[k] += (f'{edict[word]}')
# c=1
# for i in tok2eng_level_2.keys():
#     if tok2eng_level_2[i] != []:
#         print(c, i, tok2eng_level_2[i])
#         c+=1
tok2eng_level_2

{'time': ['tenpo'],
 'year': [],
 'people': ['jan', 'kulupu'],
 'way': ['nasin'],
 'day': [],
 'man': ['mije'],
 'thing': ['ijo'],
 'woman': ['meli'],
 'life': ['konwe'],
 'child': [],
 'world': [],
 'school': [],
 'state': [],
 'family': [],
 'student': [],
 'group': ['kulupu'],
 'country': ['ma'],
 'problem': [],
 'hand': ['luka'],
 'part': ['ipa'],
 'place': ['jeno'],
 'case': [],
 'week': [],
 'company': ['kulupu'],
 'system': ['nasin'],
 'program': [],
 'question': [],
 'work': ['pali'],
 'government': [],
 'number': ['nanpa'],
 'night': [],
 'Mr': [],
 'point': [],
 'home': ['tomo'],
 'water': ['telo'],
 'room': ['tomo'],
 'mother': ['mama'],
 'area': [],
 'money': ['mani'],
 'story': [],
 'fact': ['kin'],
 'month': [],
 'lot': [],
 'right': ['pona'],
 'study': ['sona'],
 'book': ['lipu'],
 'eye': ['oko'],
 'job': [],
 'word': ['nimi'],
 'business': ['esun'],
 'issue': [],
 'side': ['poka'],
 'kind': [],
 'head': ['lawa'],
 'house': [],
 'service': [],
 'friend': [],
 'father': [

Underwhelming, expectedly. Our vectors have 300 dimensions, so we will need to do PCA on them.

In [185]:
word2vec = pd.read_csv("conceptnetw2v.csv")

In [186]:
list(word2vec[word2vec.word == 'bumpy'].to_numpy()[0][2:302])

[-0.0705,
 -0.0289,
 0.1469,
 0.002,
 0.0617,
 -0.184,
 -0.0325,
 0.1357,
 0.067,
 0.0914,
 -0.0949,
 0.1012,
 0.0309,
 0.0633,
 0.0749,
 -0.0266,
 -0.1353,
 -0.0297,
 0.1159,
 0.0857,
 0.0507,
 0.0444,
 0.0516,
 -0.0854,
 -0.0745,
 0.0182,
 0.0222,
 0.0678,
 -0.1286,
 0.0112,
 0.0311,
 0.0422,
 0.0061,
 -0.0143,
 -0.0041,
 0.0953,
 -0.0687,
 0.0362,
 0.0784,
 -0.0192,
 -0.0886,
 0.0973,
 0.0593,
 0.0283,
 -0.0267,
 0.1029,
 -0.118,
 0.2267,
 0.0763,
 -0.0746,
 -0.0068,
 -0.0086,
 0.1846,
 -0.0283,
 0.034,
 0.0976,
 -0.0696,
 -0.0372,
 -0.0394,
 0.0799,
 0.0451,
 0.1086,
 0.0545,
 -0.0524,
 -0.0805,
 0.0261,
 0.0395,
 -0.062,
 -0.0155,
 -0.0583,
 0.0968,
 -0.0134,
 0.0541,
 0.0026,
 -0.0712,
 0.0533,
 0.0226,
 0.0289,
 -0.0037,
 -0.0513,
 -0.0579,
 -0.1049,
 0.0327,
 -0.0177,
 -0.0161,
 0.0042,
 -0.0508,
 -0.0538,
 -0.1249,
 0.018,
 0.0212,
 0.0003,
 -0.094,
 -0.0079,
 -0.0417,
 0.0409,
 -0.1392,
 0.1093,
 0.055,
 0.0393,
 -0.0328,
 0.0166,
 0.0445,
 -0.094,
 0.022,
 -0.0468,
 -0.0289,

In [187]:
tpvecs = dict([(k, [0.0] * 300) for k in tokidict.keys()])
for k in tokidict.keys():
    denom = len(tokidict[k])
    for v in tokidict[k]:
        l = word2vec[word2vec.word == v].to_numpy()
        if len(l) > 0:
            tpvecs[k] += l[0, 2:302]
    if denom > 0:
        tpvecs[k] = [tpvecs[k][i] / denom for i in range(300)]

In [188]:
tpvecs

{'a': [-0.026771428571428568,
  -0.04698571428571428,
  0.06342857142857142,
  -0.04728571428571428,
  -0.003757142857142857,
  0.007528571428571429,
  0.014900000000000002,
  -0.08175714285714286,
  0.04762857142857142,
  0.03502857142857142,
  0.059885714285714285,
  0.052899999999999996,
  -0.10062857142857143,
  -0.007257142857142857,
  0.09805714285714287,
  0.05577142857142857,
  -0.031099999999999996,
  0.04511428571428571,
  0.030585714285714282,
  0.01902857142857143,
  0.010300000000000002,
  -0.028499999999999998,
  0.0177,
  0.07757142857142856,
  -0.00012857142857142855,
  -0.015142857142857142,
  -0.03887142857142857,
  0.019542857142857143,
  0.058871428571428575,
  -0.0006285714285714307,
  -0.05845714285714287,
  0.05644285714285714,
  0.07531428571428571,
  0.05555714285714286,
  -0.004371428571428571,
  0.059871428571428575,
  -0.021871428571428573,
  -0.06575714285714286,
  -0.02027142857142857,
  0.026071428571428572,
  -0.053085714285714285,
  -0.06271428571428572

In [189]:
# https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists

def cosinesimilarity(xs : [int], ys : [int]):
    return np.dot(xs,ys)/(np.linalg.norm(xs)*np.linalg.norm(ys))

In [190]:
tokidict['linja']

['elongated',
 'oblong',
 'long',
 'long and flexible thing; string',
 'rope',
 'hair',
 'thread',
 'cord',
 'chain',
 'line',
 'yarn']

PCA
src: https://builtin.com/data-science/step-step-explanation-principal-component-analysis
1. Find mean and standard deviation of each parameter in big-dimensional space.
2. Find covariance of each pair of attributes
   - (cov(x,y) = sum((x_i - x_m) * (y_i - y_m)) / n )
   - Positive covariance -> x goes up when y goes up and vice versa
   - Negative covariance -> x goes down when y goes up and vice versa
3. Find eigenvectors of resulting matrix with 
   largest eigenvalues
4. Recast initial data with those eigenvectors 
   as axes

In [191]:
def transpose(xss):
    ssx = []
    fs,*rs = xss
    for x in fs:
        ssx.append([x])
    for xs in rs:
        c=0
        for x in xs:
            ssx[c].append(x)
            c+=1
    return ssx

In [192]:
means = []
stdevs = []
for i in range(300):
    mean  = 0
    for k in tpvecs.keys():
        mean += tpvecs[k][i]
    mean /= len(tpvecs)
    means.append(mean)
for i in range(300):
    sumstdev = 0
    for k in tpvecs.keys():
        sumstdev += tpvecs[k][i] ** 2
    sumstdev /= len(tpvecs)
    stdevs.append(sumstdev ** 0.5)





covmatrix = []
for x in range(300):
    covmatrix.append([])
    for y in range(300):
        meanx = means[x]
        meany = means[y]
        summ = 0
        for k in tp_vocab:
            summ += ((tpvecs[k][x] - meanx) * (tpvecs[k][y] - meany))
        covmatrix[-1].append(summ)

eigens = np.linalg.eig(covmatrix)
eigenvalues = list([(eigens[0][y], y) for y in range(300)])
eigenvectors = list(eigens[1])

sortedeigenvalues = sorted(enumerate(eigenvalues), key=lambda x: x[1], reverse=True)
top_10_indices = [index for index, _ in sortedeigenvalues[:10]]
feature_vectors = [eigenvectors[i] for i in top_10_indices]


feature_vectors = feature_vectors
# sortedeigenvalues


tpvecsmatrix = []
for k in tp_vocab:
    tpvecsmatrix.append(tpvecs[k])
tpvecsmatrix = transpose(tpvecsmatrix)

len(tpvecsmatrix), len(tpvecsmatrix[0])

pca_reduced_dimensions = np.matmul(feature_vectors, tpvecsmatrix)

pca_reduced_dimensions = transpose(pca_reduced_dimensions)

In [193]:
len(pca_reduced_dimensions), len(pca_reduced_dimensions[0])

(154, 10)

In [211]:
words = (nouns + verbs + adjcs + advbs + preps + conjs)
np.random.shuffle(words)
for word in words[0:250]:
    l = list(word2vec[word2vec.word == word].to_numpy())
    if len(l) > 0:
        trialvec = l[0][2:302]
    reduced = np.matmul(feature_vectors,trialvec)
    distmeasures = []
    for t in range(len(tp_vocab)):
        dm = 0
        for i,p in enumerate(reduced):
            re = (np.real(pca_reduced_dimensions[t][i]) - np.real(p)) ** 2
            im = (np.imag(pca_reduced_dimensions[t][i]) - np.imag(p)) ** 2
            dm += (re+im)
        distmeasures.append((tp_vocab[t], dm))
    distmeasures
    print(word, [(x[0],round(x[1],3)) for x in sorted(distmeasures, key=lambda x: x[1])[0:3]])


music [('usawi', 0.042), ('kalama', 0.044), ('len', 0.047)]
competition [('kulupu', 0.026), ('alasa', 0.026), ('sin', 0.029)]
indicate [('selo', 0.017), ('weka', 0.022), ('waleja', 0.025)]
mandate [('pana', 0.029), ('utala', 0.03), ('ken', 0.032)]
scientific [('utala', 0.013), ('pana', 0.014), ('kulupu', 0.014)]
entrance [('o', 0.009), ('la', 0.013), ('mu', 0.013)]
wheat [('pan', 0.03), ('kili', 0.046), ('kasi', 0.047)]
raw [('moku', 0.019), ('seli', 0.022), ('lete', 0.027)]
particularly [('taso', 0.009), ('ala', 0.011), ('seme', 0.011)]
couple [('mije', 0.018), ('taso', 0.019), ('pata', 0.023)]
count [('nanpa', 0.009), ('ko', 0.013), ('tu', 0.018)]
elbow [('sinpin', 0.006), ('palisa', 0.016), ('monsi', 0.019)]
presumably [('ni', 0.016), ('anu', 0.017), ('ala', 0.019)]
conduct [('pana', 0.014), ('ken', 0.021), ('pali', 0.022)]
powerful [('ale', 0.024), ('wawa', 0.025), ('suno', 0.031)]
assault [('weka', 0.021), ('utala', 0.024), ('ken', 0.027)]
lower [('ekon', 0.008), ('anpa', 0.017), 

In [195]:
ns = [np.random.randint(300) for i in range(10)]
for n in ns:
    print(tpvecs['nena'][n])
    print(tpvecs['supa'][n])

-0.0020461538461538455
0.009510000000000001
-0.030369230769230766
-0.03626
-0.004815384615384615
-0.018979999999999997
-0.0187
0.0004099999999999979
-0.021107692307692313
-0.02015
-0.009084615384615385
-0.000669999999999999
-0.03813076923076923
0.006520000000000001
0.0070846153846153845
-0.020499999999999997
0.0034307692307692307
-0.04161000000000001
0.014492307692307691
0.00936


In [196]:
tokidict['akesi']

['reptile', 'amphibian']

In [197]:
tp_vocab = toki.TOKI.drop_duplicates().tolist()

In [198]:
''.join(['t','elep','hone'])

'telephone'

In [199]:
# From http://www.textfixer.com/resources/common-english-words.txt
bannedWords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your".split(",")


In [200]:
from matplotlib import pyplot as plt

In [201]:
data = np.array()
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into dev and training sets

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255.

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape

TypeError: array() missing required argument 'object' (pos 0)

In [202]:
tokisentences

Unnamed: 0,toki,english
0,mi suli,I am tall
1,mi suli,Im important
2,mi suli,Im fat
3,mi suli,Im big
4,kili li tawa anpa poka kasi,The apple does not fall far from the tree
...,...,...
25532,tenpo pi mute lili la ijo ike li kama e jan,Sometimes bad things happen to people
25533,tenpo pi mute lili la ijo ike il kama e jan pona,Sometimes bad things happen to good people
25534,ona li weka e kon tawa soweli kepeken luka lon...,She strangled a cat
25535,mi mute li awen ala lon tomo tan seme,Why dont we stay home


In [203]:
from collections import defaultdict


    
        
class QTable:
    def __init__(self, states, targetVisits, discount, rateConstant):
        self.q      = [[0] * len(states)] * len(states)
        self.visits = [[0] * len(states)] * len(states)
        self.targetVisits = targetVisits
        self.d  = discount
        self.rc = rateConstant
        self.ls = 0
        self.la = 0

In [204]:
class DTNode:


    class DTLeaf:
        def __init__(self, ls : list[int]):
            self.label = ls
        def classify(self, v : float):
            return self.label
        def addAllLabels(self, sl : set(int)):
            sl.add(self.label)

    class DTInterior:
        def __init__(self, leftSubtree, rightSubtree, decisionFeature, maxFeatureValue, getFeatureValue, successor):
            self.leftSubtree = leftSubtree
            self.rightSubtree = rightSubtree
            self.decisionFeature = decisionFeature
            self.maxFeatureValue = maxFeatureValue
            self.getFeatureValue = getFeatureValue
            self.successor       = successor
        
        def classify(self, v):
            if self.maxFeatureValue < self.getFeatureValue(v, self.decisionFeature):
                return self.rightSubtree.classify(v)
            else:
                return self.leftSubtree.classify(v)

        def addAllLabels(self, sl):
            self.leftSubtree.addAllLabels(sl)
            self.rightSubtree.addAllLabels(sl)        

        

TypeError: 'type' object is not iterable

In [None]:
# data = dictionary of english string to toki pona sequence

def splitOn(data: dict, feature, featureValue, getFeatureValue):
    less = dict()
    more = dict()
    for k in data.keys():
        if getFeatureValue(k) > getFeatureValue:
            more[k] = data[k]
        else:
            less[k] = data[k]
    return (more, less)

In [None]:
def gini(data : dict):
    g = 0
    lbs = Histogram()
    for k in data.keys():
        lbs.inc(data[k])
    for l in lbs.hg.keys():
        p_i = lbs.getCountFor(l) / len(data)
        g += p_i * p_i
    return 1 - g

def gain(parent, child1, child2):
    return gini(parent) - gini(child1) - gini(child2)

In [None]:
def train(data : dict):
    if len(data.keys()) == 1:
        return data[data.keys()[0]]
    if len(data.keys()) == 0:
        return DTNode.DTLeaf([0])

In [None]:
# with open('tokisentences.txt', 'r') as l:
#     with open('output.txt', 'w') as f:
#         for line in l.readlines():
#             tokens = line.strip().split(',')
#             if len(tokens) == 2:
#                 output_string = ""
#                 for char in line:
#                     if char.isalpha() or char == '\n':
#                         output_string += char
#                 f.write(output_string)

In [None]:
from hmmlearn import hmm
from typing import *
import time

In [None]:
tokitoint = dict()
for i,w in enumerate(tp_vocab):
    tokitoint[w] = i

engtoint = dict()
for i,w in enumerate(freq.Word):
    engtoint[w] = i

In [222]:
tokidata = dict()
for index in range(len(tokisentences)):
    tp,en = str(tokisentences['toki'][index]),str(tokisentences['english'][index])
    if en[0] == ' ':
        en = en[1:]
    tpsep = tp.split()
    ts = []
    es = []
    n=False
    for t in tpsep:
        if t in tokitoint:
            ts.append(tokitoint[t])
        else:
            n=True
    if not n:
        tokidata[en] = ts

In [None]:
engcoder = dict()
for i,c in enumerate(" abcdefghijklmnopqrstuvwxyz"):
    engcoder[c] = i

In [None]:
def build_model_score(num_states: int, training_sequence: List[int]) -> Tuple[hmm.CategoricalHMM, float]:
    model = hmm.CategoricalHMM(n_components=num_states, n_iter=100)
    model.fit(training_sequence)
    return model, model.score(training_sequence)

# STATISTICAL MACHINE TRANSLATION

$f$ = string in  given language, like "I love my cat."

$e$ = string in target language, like "mi olin e soweli suno"

We want to return whatever $e$ maximizes $p(e|f)$.

Bayes' Theorem Micro: CS Edition shows us that $p(f|e)p(e)$ works here. 


In [229]:
class Histogram:
    def __init__(self):
        self.hg = dict()

    def incby(self, key, n):
        if key not in self.hg:
            self.hg[key] = n
        else: 
            p = self.hg[key] 
            self.hg[key] = p + n

    def inc(self, key):
        self.incby(key,1)

    def getCountFor(self, value):
        if value not in self.hg:
            return 0        
        return self.hg[value]
    
    def getTotalCounts(self):
        total = 0
        for x in self.hg.keys():
            if x in self.hg:
                total += self.hg[x]
        return total

    def getPortionFor(self, value):
        return self.getCountFor(value) / self.getTotalCounts()

featuresbylabel = dict()
priors = Histogram()
for k in tokidata.keys():
    fs = k.split()
    
    for f in fs:
        for i in tokidata[k]:
            if i in featuresbylabel:
                featuresbylabel[i].inc(f)
            else:
                featuresbylabel[i] = Histogram()
                featuresbylabel[i].inc(f)
    for i in tokidata[k]:
        priors.inc(i)

def classify(w : str):
    m = 0,""
    for l in tokitoint.keys():
        if tokitoint[l] in featuresbylabel:
            v = featuresbylabel[tokitoint[l]].getPortionFor(w) 
            v /= priors.getCountFor(tokitoint[l])
            if v > m[0]:
                m = v,l
    return m[1]

classify('dog')

'soweli'

In [231]:
print(f"nuts: {classify('nuts')}, dead: {classify('dead')}, music: {classify('music')}, grave: {classify('grave')}, sudden: {classify('sudden')}")

nuts: nasa, dead: moli, music: kalama, grave: , sudden: 


In [None]:
featuresbylabel[75].hg

{'dead': 1,
 'man': 1,
 'I': 4,
 'will': 1,
 'die': 3,
 'for': 1,
 'you': 1,
 'want': 3,
 'to': 4,
 'stay': 1,
 'alive': 1,
 'used': 1}

In [None]:
def filter_text(text: str) -> (str,int):
    result = " "
    lowerbet = "qwertyuiopasdfghjklzxcvbnm "
    upperbet = "QWERTYUIOPASDFGHJKLZXCVBNM"
    for c in text:
        if c in lowerbet:
            result += c
        elif c in upperbet:
            for i in range(26):
                if upperbet[i] == c:
                    result += lowerbet[i]
        else:
            continue
    return result