# Welcome to pyiunstir!

This is a very basic tool to perform spellchecking using a Hidden Markov Model

In [1]:
import json
import numpy as np
from collections import defaultdict
from scipy.stats import median_abs_deviation as mad

from hmmlearn import hmm

from pyiunstir.encoding import *

# Load the encodings
f_json = open('corpus/iberian.json', 'r')
symbols = json.load(f_json)
f_json.close()

# Load a collection of text written using the Northeastern script
f_json = open('corpus/NE_database.json', 'r')
corpus_NE = json.load(f_json)
f_json.close()

### Extract the simplified text for each entry in the database and encode it using the simplified script (remove duality)

In [2]:

words = []

for instance in corpus_NE:
    line = instance['text_simplified']
    for row in line:
        for word in row.split(':'):
            #word = encode_iberian(standarize_word(word),symbols,dual=False)
            word = encode_iberian(word,symbols,dual=False)
            words.append(word)

### Let's check how the encoded words look like

In [3]:
words

[[4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 25, 4],
 [4, 3, 12, 17, 4, 2, 4],
 [6, 7, 35, 17, 6],
 [46, 14, 10, 46],
 [46, 14, 10, 46],
 [46, 14, 10, 46],
 [46, 10, 2, 22, 4, 15, 4],
 [46, 10, 2, 22, 4, 15, 4],
 [46, 10, 2, 22, 4, 15],
 [46, 10, 2, 22, 4, 15, 17],
 [46, 10, 2, 22, 4, 15, 17],
 [14, 4, 15, 35, 12],
 [19, 4, 60, 4, 7, 25, 10, 3],
 [19, 4, 60, 4, 7, 25, 10, 3],
 [14, 4, 15, 25, 5, 25, 4],
 [14, 4, 15, 25, 5],
 [3, 58, 12, 58, 12],
 [14, 4, 15, 25, 5, 25, 4],
 [5, 3, 13, 58, 4],
 [3, 11],
 [1, 4],
 [14, 4, 15, 25, 5, 25, 4],
 [3, 11],
 [1, 4],
 [14, 4, 15, 25, 5, 25, 4],
 [3, 58, 12, 58, 12],
 [14, 4, 15, 25, 5, 25, 4],
 [3, 58, 12, 58, 12],
 [14, 4, 15, 25, 5, 25, 4],
 [3, 58, 12, 58, 12],
 [14, 4, 15, 25, 5, 25, 4],
 [3, 58, 12, 5

### Paste all words as sequences for the HMM

In [4]:
all_chars = []
final = []
seqs = []
lseq = []


for word in words:
    cseq = []
    for q in list(word):
        cseq.append([q])
    if len(cseq) > 0:
        lseq.append(len(cseq))
        seqs.append(cseq)

aseqs = np.concatenate(seqs)        

### Train the model with the texts

In [5]:
model = hmm.GaussianHMM(n_components=5, covariance_type="full")
model.fit(aseqs, lseq)

GaussianHMM(covariance_type='full', n_components=5)

### Texts with different sizes will have different scores. Let's use a simple normalization scheme (Robust z-score)

In [6]:
lh = []
cpos = 0
for i in range(len(lseq)):
    cnext = lseq[i] + cpos
    if cnext >= aseqs.shape[0]:
        break
    #print(cpos, cnext)
    frag=aseqs[cpos:cnext]
    rw = []
    for qc in frag.tolist():
        rw.append(qc[0])
    rw = decode_iberian(rw,symbols)
    lh.append([i,model.score(frag),rw])
    cpos = cnext
lh = sorted(lh,key=lambda s: s[1])

logs_per_size = defaultdict(list)
for row in lh:
    logs_per_size[len(row[2])].append(row[1])

    

### Let's try to identify which words are "less iberian"

In [7]:
norm_loglk = []
for row in lh:
    #if row[2] in places or row[2] in galos or row[2] in nps or row[2] in gs or row[2] in nbs:
    #    continue
    mu = np.median(logs_per_size[len(row[2])])
    sigma = mad(logs_per_size[len(row[2])])
    ns = len(logs_per_size[len(row[2])])
    limit = mu - 1*sigma
    #if row[1] < limit:
    if sigma != 0:
        norm_loglk.append([row[0],row[2],(row[1]-mu)/sigma, row[1], ns])
nlh = sorted(norm_loglk,key=lambda s: s[2])

In [8]:
nlh

[[3473, 'abuloŕaune', -9.558623493515304, -40.6657450781639, 303],
 [3650, 'alaśbuŕ', -8.931242042598667, -31.442320841758594, 569],
 [3677, 'alaśbuŕ', -8.931242042598667, -31.442320841758594, 569],
 [3707, 'alaśbuŕ', -8.931242042598667, -31.442320841758594, 569],
 [1769, 'buranalir', -8.516153873555595, -36.526894732676304, 379],
 [433, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [434, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [435, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [436, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [437, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [438, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [439, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [2897, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [2898, 'buŕsau', -8.078441140544466, -27.771008808272896, 544],
 [1387, 'abultumantiŕ', -8.075771786056622, -45.90451487740755, 89],
 [3628, 'kanbulo', -

### As can be seen, most of the outliers are actually foreign personal/city names (celtic)