### Importing libraries

In [1]:
from modules.stemmer import *
from modules.ngrams_extractor import *
from modules.assembler import *
import time

In [2]:
# Seeded NumPy random generator (fixed seed so any random draw is reproducible).
# In the visible cells it is referenced only by the commented-out validation
# subsampling.
rng = np.random.default_rng(42)

### Selecting a language and loading the data

#### Loading the training data

In [3]:
# Load the training split: one example per line, fields separated by tabs.
# NOTE(review): rows are presumably (lemma, inflected form, attribute group),
# matching how the validation rows are unpacked in the grid search -- confirm.
trn_file_path = 'task0-data/DEVELOPMENT-LANGUAGES/oto-manguean/zpv.trn'
with open(trn_file_path, encoding='utf8') as f:
    # Skip empty lines instead of unconditionally dropping the last entry:
    # the final record is kept when the file has no trailing newline, and a
    # stray blank line no longer produces a one-field row.
    trn_lines = (line.rstrip('\n') for line in f)
    trn_data = [line.split('\t') for line in trn_lines if line]
trn_data = np.array(trn_data)
# Number of training examples (displayed as the cell output).
len(trn_data)

805

#### Loading the validation data

In [4]:
# Load the validation split: one example per line, fields separated by tabs.
# Each row is later unpacked as (lemma, final, attr_grp) by the grid search.
val_file_path = 'task0-data/DEVELOPMENT-LANGUAGES/oto-manguean/zpv.dev'
with open(val_file_path, encoding='utf8') as f:
    # Skip empty lines instead of unconditionally dropping the last entry:
    # the final record is kept when the file has no trailing newline, and a
    # stray blank line no longer produces a one-field row.
    val_lines = (line.rstrip('\n') for line in f)
    val_data = [line.split('\t') for line in val_lines if line]
val_data = np.array(val_data)
# Optional: evaluate on a random subset of 100 validation rows (drawn with the
# seeded `rng`) to speed up experimentation.
# permutation = rng.permutation(len(val_data)).astype(int)
# val_data = val_data[permutation][:100]
# Number of validation examples (displayed as the cell output).
len(val_data)

113

#### Loading the test data

In [5]:
# Load the test split: one example per line, fields separated by tabs.
# NOTE(review): rows are presumably laid out like the validation rows
# (lemma, inflected form, attribute group) -- confirm.
tst_file_path = 'task0-data/DEVELOPMENT-LANGUAGES/oto-manguean/zpv.tst'
with open(tst_file_path, encoding='utf8') as f:
    # Skip empty lines instead of unconditionally dropping the last entry:
    # the final record is kept when the file has no trailing newline, and a
    # stray blank line no longer produces a one-field row.
    tst_lines = (line.rstrip('\n') for line in f)
    tst_data = [line.split('\t') for line in tst_lines if line]
tst_data = np.array(tst_data)
# Optional: cap the test set at its first 1000 rows.
#tst_data = tst_data[:1000]
# Number of test examples (displayed as the cell output).
len(tst_data)

228

### Optimizing the hyper-parameters on the validation set

In [6]:
# Exhaustive grid search over the three hyper-parameters. Every combination
# fits a fresh Stemmer / NgramsExtractor / Assembler on the training data and
# is scored on the validation data by exact-match accuracy and by the mean
# Levenshtein distance between the prediction and the expected form.
# NOTE(review): `np` and `enchant` are not imported explicitly in the visible
# cells; presumably they arrive through the star imports -- confirm.
n_neighbors_values = [1,2,3,4,5]
niter_values = [3,4,5]
threshold_values = [0.1,0.3,0.4,0.5,0.6]

# (n_neighbors, niter, threshold) -> (accuracy, mean_dist)
perf = {}
min_dist = np.inf
min_dist_params = None
# Start below any reachable accuracy so that a parameter tuple is recorded
# even when every combination scores 0 (mirrors how min_dist starts at +inf).
max_acc = -np.inf
max_acc_params = None

# Number of combinations evaluated so far (used for progress output only).
i = 0

start = time.time()

for n_neighbors in n_neighbors_values:
    for niter in niter_values:
        for threshold in threshold_values:

            # Progress marker every 10 combinations.
            i += 1
            if i % 10 == 0: print(i)

            # NOTE(review): the Stemmer fit depends only on n_neighbors and the
            # extractor fit only on niter; they could be hoisted out of the
            # inner loops if fitting/predicting does not mutate shared state --
            # confirm before changing.

            # Fitting the Stemmer
            stemmer = Stemmer(n_neighbors)
            stemmer.fit(trn_data)

            grouped_stems = stemmer.grouped_stems

            # Fitting the NgramsExtractor
            extractor = NgramsExtractor()
            extractor.fit(trn_data,niter=niter)

            ngrams_per_attr_group = extractor.get_ngrams_per_attr_group(threshold)

            # Fitting the Assembler
            assembler = Assembler()
            assembler.fit(grouped_stems, ngrams_per_attr_group)

            # Evaluation of the performance on the validation set
            mean_dist = 0
            accuracy = 0

            for lemma, final, attr_grp in val_data:

                lemma_grp, stem = stemmer.predict(lemma)

                # Best effort: a prediction that raises is scored as the empty
                # string. Catch `Exception` rather than everything so that a
                # kernel interrupt (KeyboardInterrupt) still stops the search.
                try:
                    pred = assembler.predict(stem, lemma_grp, attr_grp)
                except Exception:
                    pred = ""

                if pred == final:
                    accuracy += 1

                dist = enchant.utils.levenshtein(pred,final)
                mean_dist += dist

            # Turn the running sums into per-example averages.
            accuracy /= len(val_data)
            mean_dist /= len(val_data)

            # Strict comparisons: on ties the earliest combination is kept.
            if accuracy > max_acc:
                max_acc = accuracy
                max_acc_params = (n_neighbors,niter,threshold)

            if mean_dist < min_dist:
                min_dist = mean_dist
                min_dist_params = (n_neighbors,niter,threshold)

            perf[(n_neighbors,niter,threshold)] = (accuracy, mean_dist)

end = time.time()
duration = end - start

print('Best mean distance: {} with {}'.format(min_dist,min_dist_params))
print('Best accuracy:      {} with {}'.format(max_acc,max_acc_params))
print('Duration of the grid-search: {}'.format(duration))

10
20
30
40
50
60
70
Best mean distance: 0.8495575221238938 with (3, 3, 0.1)
Best accuracy:      0.584070796460177 with (3, 3, 0.1)
Duration of the grid-search: 52.004246950149536


In [8]:
# Uncomment to display the full grid-search results, a dict of
# {(n_neighbors, niter, threshold): (accuracy, mean_dist)}.
#perf