In [1]:
from datasets import load_dataset
import spacy

from tqdm import tqdm
dataset = load_dataset("wmt/wmt16", 'de-en')

In [2]:
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")

In [23]:
# get rid of the punctuation
def lemmatize_en(text):
    return [tok.lemma_.lower() for tok in nlp_en(text) if not tok.is_punct]

In [24]:
# get rid of the punctuation and lower case the words
def lemmatize_de(text):
    return [tok.lemma_.lower() for tok in nlp_de(text) if not tok.is_punct]

In [45]:
print(lemmatize_en("Hello, how are The things you America?"))

['hello', 'how', 'be', 'the', 'thing', 'you', 'america']


In [28]:
print(lemmatize_de("Hallo, wie geht es dir?"))

['hallo', 'wie', 'gehen', 'es', 'dir']


In [12]:
dataset = load_dataset("wmt/wmt16", 'de-en')

In [30]:
truncated_dataset = dataset['train']['translation'][:1000000]

In [31]:
english_sentences = []
german_sentences = []

for item in tqdm(truncated_dataset):
    english_sentences.append(lemmatize_en(item['en']))
    german_sentences.append(lemmatize_de(item['de']))
    

100%|██████████| 1000000/1000000 [2:47:49<00:00, 99.31it/s]  


In [35]:
# save sentences
import pickle
with open('english_sentences_lemmatized.pkl', 'wb') as f:
    pickle.dump(english_sentences, f)

with open('german_sentences_lemmatized.pkl', 'wb') as f:
    pickle.dump(german_sentences, f)

In [46]:
# remove stopwords from sentences
from nltk.corpus import stopwords
stop_words_en = set(stopwords.words('english'))
stop_words_de = set(stopwords.words('german'))

english_sentences = [[word for word in sentence if word not in stop_words_en] for sentence in english_sentences]
german_sentences = [[word for word in sentence if word not in stop_words_de] for sentence in german_sentences]

In [47]:
with open('english_sentences_lemmatized_no_stopwords.pkl', 'wb') as f:
    pickle.dump(english_sentences, f)
    
with open('german_sentences_lemmatized_no_stopwords.pkl', 'wb') as f:
    pickle.dump(german_sentences, f)

In [48]:
from gensim.models import Word2Vec

english_model = Word2Vec(english_sentences, vector_size=50, window=5, min_count=4)
german_model = Word2Vec(german_sentences, vector_size=50, window=5, min_count=4)


In [49]:
english_model.save("english_model_lemmatized")
german_model.save("german_model_lemmatized")


In [50]:
# print german model
print(german_model.wv.most_similar("hallo"))

[('weiterfahren', 0.8777409791946411), ('hochdruck', 0.8745231628417969), ('longbridge', 0.8726409077644348), ('336', 0.8722585439682007), ('nachdenke', 0.8708760142326355), ('riesenproblem', 0.8705061078071594), ('0509', 0.8702282905578613), ('freiland', 0.8701116442680359), ('0495', 0.8690578937530518), ('drum', 0.8688647747039795)]


In [51]:
# print frequent words
print(german_model.wv.index_to_key[:10])


['europäisch', 'herr', 'kommission', 'parlament', 'müssen', 'sollen', 'union', 'möchten', 'präsident', 'bericht']


In [52]:
print(english_model.wv.index_to_key[:10])

['european', 'mr', 'commission', 'would', 'also', 'president', 'member', 'must', 'make', 'country']
