In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
import models.Model_Evaluation as me

import spacy
import numpy as np

import random
import math
import time

%load_ext autoreload
%autoreload 2

In [2]:
# Load the spaCy pipelines whose tokenizers back tokenize_de / tokenize_en below.
# NOTE(review): 'de' and 'en' are shortcut-link names. spaCy v3 removed shortcut links,
# so a newer install would presumably need full package names (e.g. 'de_core_news_sm') —
# confirm against the spaCy version this notebook is pinned to.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """Tokenize a German string with the spaCy tokenizer held in ``spacy_de``.

    Args:
        text: The raw sentence to split.

    Returns:
        A list with the ``.text`` of every token, in order.
    """
    pieces = []
    for token in spacy_de.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Tokenize an English string with the spaCy tokenizer held in ``spacy_en``.

    Args:
        text: The raw sentence to split.

    Returns:
        A list with the ``.text`` of every token, in order.
    """
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [3]:
# The source (German) and target (English) fields differ only in their tokenizer:
# both lowercase the text and use '<sos>' / '<eos>' as init and end-of-sentence tokens.
SRC, TRG = (
    Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)
    for tokenizer in (tokenize_de, tokenize_en)
)

# Multi30k splits; SRC is paired with the '.de' files and TRG with the '.en' files.
train_data, valid_data, test_data = Multi30k.splits(
    fields=(SRC, TRG),
    exts=('.de', '.en'),
)

# Vocabularies are built from the training split only, with a minimum token frequency of 5.
SRC.build_vocab(train_data, min_freq=5)
TRG.build_vocab(train_data, min_freq=5)

BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)

In [34]:
# Print every target-side token of the test set that is not in the TRG vocabulary.
#
# The unknown-token index is looked up by name instead of being hard-coded as 0, and
# membership is checked with .get(): torchtext's Vocab.stoi is a defaultdict, so
# indexing it with an unseen word (stoi[word]) would insert that word into the mapping
# as a side effect of this purely diagnostic cell.
trg_stoi = TRG.vocab.stoi
trg_unk_idx = trg_stoi[TRG.unk_token]
for pair in test_data:
    for word in pair.trg:
        if trg_stoi.get(word, trg_unk_idx) == trg_unk_idx:
            print(word)

boston
breaking
snowmobiles
strapless
igloo
avoid
pretend
statutes
somersault
sponge
reenacting
union
harpsichord
moderately
populated
such
hoe
wakeboarder
towed
depicting
sundown
limousines
pail
wakeboarding
genders
alligator
hops
perplexed
cubicles
palms
refreshment
rises
mountaineer
descend
wades
retrieve
teaches
retrieving
bruised
rental
heard
joke
muzzles
hoisted
brazil
informal
tabs
abseiling
fox
frowns
snub
smells
days
taco
rose
idle
natural
region
comfy
focuses
tastes
wineglass
shave
lumber
consumed
technician
sheperds
snarling
fetching
advertises
elders
bullhorn
springs
tunes
sponges
mending
chilling
redish
mock
gi
tends
parka
el
corazon
drops
weights
pacific
2007
dried
manner
rising
revolutionary
period
ringing
bell
nuzzling
hummer
insect
demonstarting
texts
highlighted
agility
pinning
gates
pounds
protecting
shots
bunk
alien
taped
dali
lama
participants
brought
carnations
parasols
prayer
proud
loudspeaker
costumed
veil
steeple
vault
drumsticks
kinds
harvest
coal
flapping
gua

In [40]:
# Group the test examples by source-sentence length.
# NOTE(review): splitDatabySentenceLength is defined in models.Model_Evaluation, which is
# not visible here. Judging by how the next cell uses the result (a.values(), len(s.src))
# and by its output, this presumably returns a dict of example lists with tick=5 as the
# bucket width in tokens — confirm against the helper.
a = me.splitDatabySentenceLength(test_data, tick=5, choice='src')

In [42]:
# For each bucket in `a` (in iteration order) print a numbered banner, then the
# source-token count of every example in that bucket, one per line.
for bucket_no, bucket in enumerate(a.values()):
    print(f'#######{bucket_no}#######')
    for n_tokens in map(lambda example: len(example.src), bucket):
        print(n_tokens)

#######0#######
#######1#######
7
9
7
7
8
7
8
6
8
8
9
6
9
8
7
9
6
7
8
8
9
7
8
8
8
7
9
9
7
8
9
9
8
8
9
7
9
7
7
9
8
9
9
7
9
9
9
9
9
8
6
7
7
8
9
9
8
9
9
7
5
8
8
8
8
8
9
9
9
7
9
8
7
9
6
9
6
8
7
7
7
7
8
9
8
9
9
8
9
8
9
6
8
8
9
7
7
9
9
8
5
7
6
5
8
9
9
9
8
8
8
8
9
9
8
8
9
7
7
7
8
9
7
9
9
8
7
8
7
9
5
8
9
9
9
9
8
9
9
8
9
9
9
5
7
8
9
9
8
8
8
8
8
8
9
7
8
9
7
8
9
8
9
8
8
8
9
7
6
9
9
8
8
9
9
7
8
8
9
8
5
9
7
6
7
7
9
7
7
8
8
9
8
9
7
9
9
8
9
9
9
9
9
8
8
8
9
8
6
6
8
9
8
9
7
7
7
9
9
5
8
8
7
7
7
9
7
7
9
8
9
7
6
8
8
9
8
8
9
7
7
8
9
7
9
8
9
8
9
7
9
9
9
8
6
8
8
8
8
8
9
9
9
9
7
9
9
9
6
7
6
7
9
6
9
9
9
8
7
#######2#######
11
12
12
13
13
13
11
10
10
14
11
11
10
11
14
13
12
10
14
13
10
14
13
11
10
13
13
12
13
12
10
11
10
12
12
11
14
12
10
14
10
13
12
13
11
11
10
11
13
13
11
11
11
10
13
14
13
12
11
11
12
11
12
12
12
13
11
10
11
10
12
12
11
11
11
10
14
13
13
11
11
10
12
13
10
11
13
10
12
12
12
13
11
13
14
11
13
10
13
11
12
11
11
13
13
13
12
11
10
10
10
10
12
12
11
12
12
11
12
12
13
10
12
13
13
13
13
10
13
14
12
1

In [43]:
# Group the test examples by how many unknown source tokens they contain.
# NOTE(review): splitDatabyNumberOfUnknownWords is defined in models.Model_Evaluation,
# which is not visible here. The next cell iterates sorted(b.keys()) and b[idx], so the
# result is presumably a dict keyed by unknown-word count — confirm against the helper.
b = me.splitDatabyNumberOfUnknownWords(test_data, choice='src', field=SRC)

In [45]:
# Cross-check the grouping in `b`: for every key (in sorted order) print a banner, then
# recount the unknown source tokens of each sentence stored under that key.
#
# The unknown-token index is looked up by name instead of being hard-coded as 0, and
# .get() is used for the lookup: torchtext's Vocab.stoi is a defaultdict, so indexing it
# with an unseen word (stoi[word]) would insert that word into the mapping as a side
# effect of this check.
src_stoi = SRC.vocab.stoi
src_unk_idx = src_stoi[SRC.unk_token]
for idx in sorted(b.keys()):
    print(f'#######{idx}########')
    for sentence in b[idx]:
        num_unk = sum(
            src_stoi.get(word, src_unk_idx) == src_unk_idx
            for word in sentence.src
        )
        print(num_unk)

#######0########
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
#######1###

In [7]:
# Scratch check of the argument validation: only 'trg' and 'hyp' are accepted.
choice = 'trg'
assert choice in ('trg', 'hyp'), 'ah oh'

In [4]:
# Empty buckets keyed 0.0, 0.1, ..., 0.9 (true division already yields floats).
{tenth / 10: [] for tenth in range(10)}

{0.0: [],
 0.1: [],
 0.2: [],
 0.3: [],
 0.4: [],
 0.5: [],
 0.6: [],
 0.7: [],
 0.8: [],
 0.9: []}

In [18]:
import math

# Scratch check: print the floor of each scaled value, one per line.
for scaled in (
    0.9432*10/2,
    0.97656*10,
    0.91235*10,
    0.42312312*10/2,
    0.4999566*10,
):
    print(math.floor(scaled))

4
9
9
2
4
