In [1]:
from datasets import load_dataset
from tqdm import tqdm
# Load the 'cs-en' (Czech-English) configuration of the wmt/wmt16 dataset.
dataset = load_dataset("wmt/wmt16", 'cs-en')

In [2]:
# Show the available splits with their features and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 997240
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2656
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2999
    })
})


In [3]:
# Peek at the first training pair. Index the row first and the column second:
# this reads a single example instead of pulling the whole 'translation'
# column (~1M rows) just to take element 0.
print(dataset['train'][0]['translation'])

{'cs': 'Následný postup na základě usnesení Parlamentu: viz zápis', 'en': "Action taken on Parliament's resolutions: see Minutes"}


In [4]:
import ssl

import nltk

# Workaround so nltk.download() can run where HTTPS certificate verification
# fails (presumably a local Python install without usable root certificates).
# WARNING: this swaps the process-wide default HTTPS context factory for an
# unverified one, so certificate checks stay disabled for the rest of this
# kernel session, not only for the NLTK download below.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The private hook is not available; keep the default (verified) context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the 'punkt' tokenizer data before importing word_tokenize.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sungwonkim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def tokenize_text(text, language='english'):
    """Split ``text`` into word and punctuation tokens using NLTK.

    Args:
        text: The string to tokenize.
        language: Name of the Punkt model passed through to
            ``word_tokenize`` (e.g. ``'english'`` or ``'czech'``).
            Defaults to ``'english'``, matching the previous behaviour.

    Returns:
        A list of token strings, e.g.
        ``['Hello', ',', 'how', 'are', 'you']`` for ``"Hello, how are you"``.
    """
    return word_tokenize(text, language=language)


In [6]:
# Quick sanity check of the tokenizer on a short English phrase.
tokenize_text("Hello, how are you")

['Hello', ',', 'how', 'are', 'you']

In [7]:
# Tokenize the Czech side of the first training pair. The row is indexed
# before the column so only one example is read, not the whole
# 'translation' column.
a = word_tokenize(dataset['train'][0]['translation']['cs'], language='czech')

In [8]:
# Number of tokens produced for the first Czech sentence.
len(a)

9

In [5]:
# Tokenize both sides of every training pair; the two lists stay index-aligned
# because they are built from the same sequence of pairs in the same order.
train_pairs = dataset['train']['translation']
english_sentences = [word_tokenize(pair['en']) for pair in train_pairs]
czech_sentences = [word_tokenize(pair['cs'], language='czech') for pair in train_pairs]

In [6]:
# Lowercase every token of every sentence (rebinding the same two names).
english_sentences = [list(map(str.lower, tokens)) for tokens in english_sentences]
czech_sentences = [list(map(str.lower, tokens)) for tokens in czech_sentences]

In [7]:
# Vocabulary size: collect the distinct tokens seen on each side of the corpus.
english_words = {token for tokens in english_sentences for token in tokens}
czech_words = {token for tokens in czech_sentences for token in tokens}

print("Number of unique English words: ", len(english_words))
print("Number of unique Czech words: ", len(czech_words))

Number of unique English words:  206508
Number of unique Czech words:  378946


In [8]:
# Count token frequencies per language, then keep the 1000 most frequent tokens.
from collections import Counter

english_word_freq = Counter()
for tokens in english_sentences:
    english_word_freq.update(tokens)

czech_word_freq = Counter()
for tokens in czech_sentences:
    czech_word_freq.update(tokens)

# most_common() yields (token, count) pairs; only the token is kept here.
english_most_common_words = [entry[0] for entry in english_word_freq.most_common(1000)]
czech_most_common_words = [entry[0] for entry in czech_word_freq.most_common(1000)]


In [9]:
# Save the 1000 most frequent tokens of each language as "<token> <count>" lines.
# The encoding is pinned to UTF-8 so that Czech diacritics are written
# correctly regardless of the platform's default locale encoding.
with open('english_word_freq.txt', 'w', encoding='utf-8') as f:
    for word, count in english_word_freq.most_common(1000):
        f.write("%s %d\n" % (word, count))
with open('czech_word_freq.txt', 'w', encoding='utf-8') as f:
    for word, count in czech_word_freq.most_common(1000):
        f.write("%s %d\n" % (word, count))

In [10]:
# Save the most common tokens of each language, one token per line.
# The encoding is pinned to UTF-8 so that Czech diacritics are written
# correctly regardless of the platform's default locale encoding.
with open('english_most_common_words.txt', 'w', encoding='utf-8') as f:
    for word in english_most_common_words:
        f.write("%s\n" % word)
with open('czech_most_common_words.txt', 'w', encoding='utf-8') as f:
    for word in czech_most_common_words:
        f.write("%s\n" % word)

In [11]:
# Serialize both tokenized sentence lists to disk with pickle.
import pickle

for pickle_path, tokenized_corpus in (
    ('english_sentences.pkl', english_sentences),
    ('czech_sentences.pkl', czech_sentences),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(tokenized_corpus, out_file)

In [12]:
import multiprocessing

# Number of CPUs in this machine, as reported by multiprocessing.cpu_count().
cores = multiprocessing.cpu_count()

In [13]:
# Display the detected CPU count.
cores

16

In [14]:
import scipy
# Display the installed SciPy version.
# NOTE(review): presumably checked for compatibility with gensim — confirm.
scipy.__version__

'1.12.0'

In [15]:
from tqdm import tqdm
from gensim.models import Word2Vec
# Train one Word2Vec model per language on the tokenized sentences.
# Passing the corpus to the constructor builds the vocabulary and trains the
# model in one call, so no separate build_vocab()/train() step is required.
#   vector_size=100 -> dimensionality of the word vectors
#   window=5        -> context window size
#   min_count=5     -> words with a total frequency below 5 are ignored
#   workers=cores   -> use the CPU count computed earlier in this notebook
#                      rather than gensim's small default number of threads
english_model = Word2Vec(english_sentences, vector_size=100, window=5, min_count=5, workers=cores)
czech_model = Word2Vec(czech_sentences, vector_size=100, window=5, min_count=5, workers=cores)

In [16]:
# Save both trained models with Word2Vec.save(); they are read back later in
# this notebook with Word2Vec.load().
# NOTE: despite the '.bin' suffix these files are written by gensim's own
# save(), not in the original word2vec C binary format.
english_model.save('english_model.bin')
czech_model.save('czech_model.bin')


In [17]:
# NOTE: no build_vocab() call belongs here. The Word2Vec constructor was given
# the corpus, so it already built the vocabulary and trained the model.
# Calling build_vocab() again on the trained model (without update=True)
# rescans the whole corpus and resets the model's internal training weights,
# so the call was removed.

In [19]:
# Look up the embedding vector stored for the word 'learn'.
english_model.wv['learn']

array([-0.54806614, -0.01819965,  1.1773138 ,  0.32080886,  0.8152069 ,
       -0.14914533,  0.18360068, -2.3476794 , -0.9791172 , -0.08045336,
        0.7193804 ,  0.17005268, -0.7254727 , -1.0667393 ,  2.4598668 ,
       -0.2210985 ,  0.03522401, -0.7874495 ,  1.684779  , -0.13761666,
        0.5328858 ,  0.507696  ,  0.06550439, -1.6921133 , -0.89277107,
        0.23387471,  0.356072  ,  0.3973815 , -0.480898  , -1.0152174 ,
        2.1866965 , -1.398485  ,  0.05035563,  0.76237655,  0.93540263,
       -0.45010066,  0.5803226 ,  0.17633936, -0.150698  , -1.0869824 ,
        0.70152026, -2.6999655 , -0.5494913 , -0.5049243 , -2.375663  ,
       -0.5883161 ,  0.26958856,  0.01465278, -1.8970444 , -0.45376408,
       -0.43732616, -0.98177135,  1.8450038 , -1.5685921 , -0.01842064,
       -2.1469214 , -0.67472625, -2.896647  , -0.577647  ,  0.8362844 ,
        0.6472836 , -0.3450616 ,  0.29168335, -1.2197721 ,  0.63091123,
        0.12482826,  1.823888  , -1.3342676 , -1.2550029 ,  0.97

In [22]:
# Read both models back from the files written by the save() calls above,
# rebinding english_model / czech_model to the loaded copies.
# NOTE: gensim's load() is pickle-based; only load model files you created
# yourself or otherwise trust.
from gensim.models import Word2Vec
english_model = Word2Vec.load('english_model.bin')
czech_model = Word2Vec.load('czech_model.bin')


In [24]:
# Vocabulary size of each model: the number of words that have a vector.
for model in (english_model, czech_model):
    print(len(model.wv))

60219
119086
