In [1]:

from datasets import load_dataset
from tqdm import tqdm
dataset = load_dataset("wmt/wmt16", 'cs-en')

In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 997240
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2656
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2999
    })
})


In [3]:
print(dataset['train']['translation'][0])

{'cs': 'Následný postup na základě usnesení Parlamentu: viz zápis', 'en': "Action taken on Parliament's resolutions: see Minutes"}


In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


In [5]:
def tokenize_text(text):
    # Tokenize the text into words
    tokens = word_tokenize(text)
    return tokens


In [6]:
tokenize_text("Hello, how are you")

['Hello', ',', 'how', 'are', 'you']

In [7]:
a = word_tokenize(dataset['train']['translation'][0]['cs'], language='czech')

In [8]:
len(a)

9

In [9]:
english_sentences = []
czech_sentences = []
for item in dataset['train']['translation']:
    english_sentences.append(word_tokenize(item['en']))
    czech_sentences.append(word_tokenize(item['cs'], language='czech'))

In [10]:
# get number of unique words in the dataset
english_words = set()
czech_words = set()
for sentence in english_sentences:
    for word in sentence:
        english_words.add(word)
for sentence in czech_sentences:
    for word in sentence:
        czech_words.add(word)
        
print("Number of unique English words: ", len(english_words))
print("Number of unique Czech words: ", len(czech_words))

Number of unique English words:  235981
Number of unique Czech words:  431445


In [14]:
# Get top common words
from collections import Counter
english_word_freq = Counter()
czech_word_freq = Counter()
for sentence in english_sentences:
    for word in sentence:
        english_word_freq[word] += 1
for sentence in czech_sentences:
    for word in sentence:
        czech_word_freq[word] += 1

english_most_common_words = [word for word, _ in english_word_freq.most_common(200)]
czech_most_common_words = [word for word, _ in czech_word_freq.most_common(200)]


In [10]:
# save the tokenized sentences
import pickle
with open('english_sentences.pkl', 'wb') as f:
    pickle.dump(english_sentences, f)
with open('czech_sentences.pkl', 'wb') as f:
    pickle.dump(czech_sentences, f)

In [11]:
import multiprocessing

cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [12]:
cores

16

In [17]:
from tqdm import tqdm
from gensim.models import Word2Vec
english_model = Word2Vec(english_sentences, vector_size=100, window=5, min_count=5)
czech_model = Word2Vec(czech_sentences, vector_size=100, window=5, min_count=5)

In [18]:
#save the models
english_model.save('english_model.bin')
czech_model.save('czech_model.bin')


In [15]:
english_model.build_vocab(english_sentences, progress_per=10000)

In [16]:
english_model.wv['learn']

array([-1.8469446 ,  2.0154119 ,  1.041538  ,  1.3715504 ,  0.4510828 ,
        0.27229163, -0.90095884,  1.4303962 , -2.984903  ,  0.1789526 ,
        1.2223547 , -1.0638099 ,  1.8196449 , -0.04832289, -1.8261926 ,
       -1.4057215 , -0.09073389,  0.78078526, -0.66417646,  1.1698984 ,
        1.3537631 , -0.7713773 ,  0.68272245, -0.975316  ,  1.5082344 ,
        0.16980742,  0.7679376 , -1.5086436 , -2.5321705 ,  0.7470751 ,
       -1.187493  , -0.44315642,  0.42992514, -0.5700873 ,  1.4696245 ,
        3.579392  , -1.7902557 , -1.4915887 , -0.21909037,  2.8189087 ,
        0.66733664, -0.8593181 , -1.6884806 ,  1.753546  ,  1.6210189 ,
        1.005262  , -0.45152336,  1.5141109 ,  0.42287797,  2.3115773 ],
      dtype=float32)

In [11]:
# read in model
from gensim.models import Word2Vec
english_model = Word2Vec.load('english_model.bin')
czech_model = Word2Vec.load('czech_model.bin')


In [13]:
# length of the vocabulary
len(english_model.wv)
len(czech_model.wv)

119086