### Importing libraries

In [1]:
from modules.stemmer import *
from modules.ngrams_extractor import *
from modules.assembler import *

### Loading data

#### Loading the training data

In [2]:
# Load the training split: one example per line, fields separated by tabs.
trn_file_path = 'task0-data/SURPRISE-LANGUAGES/Uto-Aztecan/ood.trn'
with open(trn_file_path, encoding='utf8') as f:
    # Drop empty lines explicitly instead of popping the last element:
    # pop(-1) discards a real example when the file has no trailing newline,
    # and leaves [''] rows behind when the file contains blank lines.
    trn_data = [line.split('\t') for line in f.read().split('\n') if line]
len(trn_data)

1123

#### Loading validation data

In [3]:
# Load the validation split: one example per line, fields separated by tabs.
val_file_path = 'task0-data/SURPRISE-LANGUAGES/Uto-Aztecan/ood.dev'
with open(val_file_path, encoding='utf8') as f:
    # Drop empty lines explicitly instead of popping the last element:
    # pop(-1) discards a real example when the file has no trailing newline,
    # and leaves [''] rows behind when the file contains blank lines.
    val_data = [line.split('\t') for line in f.read().split('\n') if line]
len(val_data)

160

#### Loading test data

In [4]:
# Load the test split: one example per line, fields separated by tabs.
# The evaluation cell unpacks every row into exactly three fields, so a stray
# empty row would make that loop fail.
tst_file_path = 'task0-data/GOLD-TEST/ood.tst'
with open(tst_file_path, encoding='utf8') as f:
    # Drop empty lines explicitly instead of popping the last element:
    # pop(-1) discards a real example when the file has no trailing newline,
    # and leaves [''] rows behind when the file contains blank lines.
    tst_data = [line.split('\t') for line in f.read().split('\n') if line]
len(tst_data)

314

### Fitting the stemmer

In [5]:
# Build the stemmer, fit it on the training rows, and keep its grouped stems
# so they can be handed to Assembler.fit() further down.
# Stemmer is presumably brought into scope by the star imports at the top of
# the notebook -- TODO confirm which module exports it.
n_neighbors = 2  # sole constructor argument of Stemmer (presumably a neighbour count -- confirm)
stemmer = Stemmer(n_neighbors)
stemmer.fit(trn_data)

# Attribute read after fit(); consumed by the assembler cell.
grouped_stems = stemmer.grouped_stems

### Fitting the NgramsExtractor

In [6]:
# Cut-off passed to get_ngrams_per_attr_group(); its exact semantics are
# defined by NgramsExtractor -- TODO confirm.
threshold = 0.1

# Fit the extractor on the training rows (niter=3 is forwarded as-is).
extractor = NgramsExtractor()
extractor.fit(trn_data, niter=3)

# Result is consumed by Assembler.fit() in the next cell.
ngrams_per_attr_group = extractor.get_ngrams_per_attr_group(threshold)

### Fitting the Assembler

In [7]:
# Fit the assembler on the two artefacts produced by the previous cells:
# the stemmer's grouped stems and the n-grams per attribute group.
assembler = Assembler()
assembler.fit(grouped_stems, ngrams_per_attr_group)

### Evaluation on the test set

In [8]:
# Evaluate on the test set.
# Reports, in this order:
#   precision - fraction of predictions that exactly match the gold form
#   mean_dist - mean Levenshtein distance between prediction and gold form
#   max_dist  - largest Levenshtein distance observed
#   mean_len  - mean length of the gold forms
# `enchant` is not imported in this notebook directly; it is expected to come
# into scope through the star imports at the top -- TODO make that explicit.
precision = 0   # exact-match count inside the loop, normalised afterwards
mean_dist = 0   # running sum inside the loop, normalised afterwards
max_dist = 0
mean_len = 0    # running sum inside the loop, normalised afterwards
preds = []
n_failed = 0    # how many predictions fell back to "" because the assembler raised
i = 0           # keeps its value of 0 when tst_data is empty
for i, (lemma, final, attr_grp) in enumerate(tst_data, start=1):

    # Lightweight progress indicator.
    if (i % 100) == 0:
        print(i)

    lemma_grp, stem = stemmer.predict(lemma)

    try:
        pred = assembler.predict(stem, lemma_grp, attr_grp)
    except Exception:
        # Best effort: an example the assembler cannot handle is scored as an
        # empty prediction. Catch Exception rather than using a bare except so
        # that KeyboardInterrupt / SystemExit still stop the cell.
        pred = ""
        n_failed += 1

    preds.append(pred)

    if pred == final:
        precision += 1

    mean_len += len(final)

    dist = enchant.utils.levenshtein(pred, final)
    mean_dist += dist

    if dist > max_dist:
        max_dist = dist

# Guard the averages so an empty test set yields zeros instead of raising
# ZeroDivisionError.
n_examples = len(tst_data)
if n_examples:
    precision /= n_examples
    mean_dist /= n_examples
    mean_len /= n_examples
precision, mean_dist, max_dist, mean_len

100
200
300


(0.40445859872611467, 1.2484076433121019, 6, 6.987261146496815)

In [9]:
# Write one line per test example: lemma, predicted form, tag (tab-separated).
# - "w" instead of "a": in append mode every re-run of this cell added another
#   full copy of the predictions to the file. NOTE(review): switch back to "a"
#   if something else is meant to contribute to this file.
# - explicit utf8 encoding to match how the input files are read.
# - `with` guarantees the handle is closed even if a write fails.
with open("outputs-non-neural/ood.out", "w", encoding='utf8') as out_file:
    for (lemma, _, tag), pred in zip(tst_data, preds):
        out_file.write("\t".join([lemma, pred, tag]) + "\n")