In [1]:
# Recursively copy the `data` folder from the GCS bucket into the parent directory.
# Needs the `gsutil` CLI on PATH: the output recorded for this cell is
# "gsutil: command not found". Later cells read ../data/french_english_dataset/fra.txt,
# so make sure this copy succeeds or the data is already present.
!gsutil cp -r gs://translation-dataset-aa/data ../

/bin/bash: gsutil: command not found


In [3]:
from translation_machine import dataset_mod
import numpy as np
import torch
import os

# Load every sentence pair, then split deterministically by interleaving:
# sample k belongs to fold (k % nb_folds); folds 0..nb_folds-3 -> train,
# fold nb_folds-2 -> val, fold nb_folds-1 -> test.
_whole_dataset = dataset_mod.DatasetFromTxt("../data/french_english_dataset/fra.txt")

nb_folds = 9
n_samples = len(_whole_dataset)
# Kept as a plain list of numpy integers, fold after fold.
idxs_train = list(
    np.concatenate([np.arange(first, n_samples, nb_folds) for first in range(nb_folds - 2)])
)
idxs_val = np.arange(nb_folds - 2, n_samples, nb_folds)
idxs_test = np.arange(nb_folds - 1, n_samples, nb_folds)

# TODO: check that 'DatasetFromTxt' is fully loaded
train_dataset = torch.utils.data.Subset(_whole_dataset, idxs_train)
val_dataset = torch.utils.data.Subset(_whole_dataset, idxs_val)

# Persist the split indices (np.save appends the ".npy" extension).
os.makedirs("../dataset_splitting", exist_ok=True)
for stem, indices in (
    ("../dataset_splitting/idx_train", idxs_train),
    ("../dataset_splitting/idx_val", idxs_val),
    ("../dataset_splitting/idx_test", idxs_test),
):
    np.save(stem, indices)

# Round-trip check: what was written to disk matches the in-memory indices.
for npy_path, expected in (
    ("../dataset_splitting/idx_train.npy", idxs_train),
    ("../dataset_splitting/idx_test.npy", idxs_test),
    ("../dataset_splitting/idx_val.npy", idxs_val),
):
    assert np.all(np.load(npy_path) == expected)

In [4]:
# Fraction of the whole dataset in each split, displayed as (train, test, val).
tuple(len(split) / len(_whole_dataset) for split in (idxs_train, idxs_test, idxs_val))

(0.7777777777777778, 0.1111111111111111, 0.1111111111111111)

In [5]:
import itertools
# Materialise the train samples followed by the val samples into a single list.
train_val_dataset = [*train_dataset, *val_dataset]

In [7]:
from torchtext.data.utils import get_tokenizer
# Both languages go through torchtext's spaCy tokenizer wrapper, each with its own
# spaCy model (both models need to be installed in the environment).
english_tokenizer, french_tokenizer = (
    get_tokenizer('spacy', language=spacy_model)
    for spacy_model in ('en_core_web_sm', 'fr_core_news_sm')
)

In [8]:
from collections import Counter, OrderedDict
from itertools import islice
import torchtext
from torchtext.vocab import build_vocab_from_iterator

# tqdm is not used in this cell; the import is kept in case other cells rely on it.
from tqdm import tqdm

# Token frequencies are counted over train + val only.
english_counter, french_counter = Counter(), Counter()
for pair in train_val_dataset:
    # pair[0] is the English text, pair[1] the French text.
    english_counter.update(english_tokenizer(pair[0]))
    french_counter.update(french_tokenizer(pair[1]))

# Tokens seen fewer than 10 times are left out of the vocabularies.
# Only the French vocab gets '<sos>'/'<eos>' specials
# (presumably because French is the decoding side -- confirm).
vocab_english = torchtext.vocab.vocab(english_counter, min_freq=10, specials=['<unk>'])
vocab_french = torchtext.vocab.vocab(
    french_counter, min_freq=10, specials=['<unk>', '<sos>', '<eos>']
)

# Out-of-vocabulary lookups fall back to the index of '<unk>'.
for vocabulary in (vocab_english, vocab_french):
    vocabulary.set_default_index(vocabulary['<unk>'])

len(vocab_english), len(vocab_french)

(4076, 5407)

In [7]:
# Longest English and French entries over train + val.
# NOTE(review): len() is applied to the raw dataset items (the same objects that are
# handed to the tokenizers), so these are presumably character counts, not token
# counts -- confirm this is the unit consumers of "max_sentence_train_val" expect.
length_en_sentences = [len(pair[0]) for pair in train_val_dataset]
length_fr_sentrences = [len(pair[1]) for pair in train_val_dataset]
max_length_english = max(length_en_sentences)
max_length_french = max(length_fr_sentrences)


### saving the vocabulary along with the model

In [8]:
import os
# Make sure ../models/vocabs exists (this also creates ../models); no error if present.
# NOTE(review): the cells visible here save to ../models/language_info.pth, not vocabs/.
os.makedirs(name="../models/vocabs", exist_ok=True)

In [9]:
# Per-language bundle: the vocabulary plus the longest train/val entry length.
french_language_info = dict(vocab=vocab_french, max_sentence_train_val=max_length_french)
english_language_info = dict(vocab=vocab_english, max_sentence_train_val=max_length_english)

# Both bundles are stored together in one file so they can be loaded with the model.
language_info = dict(french=french_language_info, english=english_language_info)
torch.save(language_info, "../models/language_info.pth")

### Some sanity tests for the I/O of the vocabs

In [10]:
# Reload the bundle from disk and take the vocabularies from the *reloaded* object.
# Reading them from the in-memory `language_info` instead would compare each vocab
# with itself, so the equality assertions that follow could never fail.
# NOTE: torch.load unpickles arbitrary Python objects -- only load files you produced yourself.
language_info_from_file = torch.load("../models/language_info.pth")
vocab_english_from_file = language_info_from_file["english"]["vocab"]
vocab_french_from_file = language_info_from_file["french"]["vocab"]

In [11]:
# vocab_french and vocab_french_from_file must expose the same index->token list
# and the same token->index mapping.
french_in_memory, french_reloaded = vocab_french.vocab, vocab_french_from_file.vocab
assert french_in_memory.get_itos() == french_reloaded.get_itos()
assert french_in_memory.get_stoi() == french_reloaded.get_stoi()

In [12]:
# vocab_english and vocab_english_from_file must expose the same index->token list
# and the same token->index mapping.
english_in_memory, english_reloaded = vocab_english.vocab, vocab_english_from_file.vocab
assert english_in_memory.get_itos() == english_reloaded.get_itos()
assert english_in_memory.get_stoi() == english_reloaded.get_stoi()