### Word2Vec models

Both models were trained on lemmatized sentences with stopwords removed.

In [48]:
from gensim.models import Word2Vec
import os

# Load the previously saved English and German Word2Vec models from the
# 'data' directory (English first, then German).
english_model, german_model = (
    Word2Vec.load(os.path.join('data', model_name))
    for model_name in ('english_model_lemmatized', 'german_model_lemmatized')
)


#### Some examples of closest words. Maybe they are too similar?

In [49]:
# Print each (word, similarity) neighbour of 'dog' in the English model,
# one pair per line.
print('Most similar words to "dog" in English:')
for word, similarity in english_model.wv.most_similar('dog'):
    print((word, similarity))

Most similar words to "dog" in English:
('cat', 0.9153165221214294)
('fur', 0.8387186527252197)
('meal', 0.8324460387229919)
('infected', 0.8271117210388184)
('toothpaste', 0.8145505785942078)
('chicken', 0.8092467784881592)
('fake', 0.8056871294975281)
('clothe', 0.8050008416175842)
('calf', 0.7986875176429749)
('coat', 0.7941100597381592)


In [50]:
# Print each (word, similarity) neighbour of 'hund' (German for "dog") in the
# German model, one pair per line.
print('Most similar words to "dog" in German:')
for word, similarity in german_model.wv.most_similar('hund'):
    print((word, similarity))

Most similar words to "dog" in German:
('katze', 0.9377791285514832)
('hühner', 0.8825260996818542)
('fernseher', 0.8692821860313416)
('vogel', 0.8529067039489746)
('großmutter', 0.8459739089012146)
('quälen', 0.838893711566925)
('kühlschrank', 0.833003044128418)
('brot', 0.8319242000579834)
('flasche', 0.8301399350166321)
('leiche', 0.8292429447174072)


In [51]:
# Print each (word, similarity) neighbour of 'cat' in the English model,
# one pair per line.
print('Most similar words to "cat" in English:')
for word, similarity in english_model.wv.most_similar('cat'):
    print((word, similarity))

Most similar words to "cat" in English:
('dog', 0.9153164625167847)
('fur', 0.8994118571281433)
('chicken', 0.8519833087921143)
('meal', 0.8342467546463013)
('bonemeal', 0.8240389227867126)
('fake', 0.8229784965515137)
('vaccinated', 0.8228892683982849)
('infected', 0.8187688589096069)
('clothe', 0.8175815939903259)
('cocoa', 0.8104070425033569)


In [52]:
# Print each (word, similarity) neighbour of 'katze' (German for "cat") in the
# German model, one pair per line.
print('Most similar words to "cat" in German:')
for word, similarity in german_model.wv.most_similar('katze'):
    print((word, similarity))

Most similar words to "cat" in German:
('hund', 0.9377790689468384)
('fernseher', 0.8905958533287048)
('hühner', 0.8890235424041748)
('quälen', 0.884523332118988)
('fressen', 0.8778293132781982)
('trinken', 0.8750383853912354)
('füttern', 0.8726291060447693)
('kühlschrank', 0.8669960498809814)
('vogel', 0.8615708947181702)
('sack', 0.8605101108551025)


In [53]:
# Common words: the first 20 vocabulary entries of the English model.
print('Top 20 words in English:')
# Every word is followed by ', ' (including the last one) and no newline is
# emitted afterwards, so the whole list appears on a single line.
print(''.join(f'{word}, ' for word in english_model.wv.index_to_key[:20]), end='')
    

Top 20 words in English:
european, mr, commission, would, also, president, member, must, make, country, union, parliament, take, report, council, need, states, like, one, europe, 

In [54]:
# Common words: the first 20 vocabulary entries of the German model.
# A header line is printed first so the output is labeled like the English
# top-20 listing instead of being an anonymous list of words.
print('Top 20 words in German:')
for item in german_model.wv.index_to_key[:20]:
    print(item, end=', ')

europäisch, herr, kommission, parlament, müssen, sollen, union, möchten, präsident, bericht, mitgliedstaat, land, geben, jahr, europa, rat, frau, frage, neu, sagen, 

In [55]:
# Vocabulary size of each model: English on the first line, German on the second.
print(
    len(english_model.wv.index_to_key),
    len(german_model.wv.index_to_key),
    sep='\n',
)

26019
60960


### Sentences

In [56]:
# Locations of the pickled sentence lists (lemmatized, stopwords removed),
# both inside the 'data' directory.
english_sentence_pth, german_sentence_pth = (
    os.path.join('data', pickle_name)
    for pickle_name in (
        'english_sentences_lemmatized_no_stopwords.pkl',
        'german_sentences_lemmatized_no_stopwords.pkl',
    )
)


In [57]:
import pickle

# Deserialize the pre-processed sentence lists for both languages.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
with open(english_sentence_pth, mode='rb') as english_file:
    english_sentences = pickle.load(english_file)

with open(german_sentence_pth, mode='rb') as german_file:
    german_sentences = pickle.load(german_file)


Each sentence is a list of words, and sentences are indexed by sentence number.
There are 1,000,000 sentences in total for each language.

In [58]:
# Report how many sentences were loaded for each language (English, then German).
print('Number of sentences:', len(english_sentences))
print('Number of sentences:', len(german_sentences))

# Show the first two sentences of each corpus, English followed by German,
# with an empty line separating sentence 0 from sentence 1.
print(
    english_sentences[0],
    german_sentences[0],
    '',
    english_sentences[1],
    german_sentences[1],
    sep='\n',
)

Number of sentences: 1000000
Number of sentences: 1000000
['resumption', 'session']
['wiederaufnahme', 'sitzungsperiode']

['declare', 'resume', 'session', 'european', 'parliament', 'adjourn', 'friday', '17', 'december', '1999', 'would', 'like', 'wish', 'happy', 'new', 'year', 'hope', 'enjoy', 'pleasant', 'festive', 'period']
['erklären', 'freitag', '17.', 'dezember', 'unterbrochen', 'sitzungsperiode', 'europäisch', 'parlament', 'wiederaufnehmen', 'wünschen', 'nochmals', 'gut', 'jahreswechsel', 'hoffen', 'schön', 'ferien']


In [59]:
# Save the first 1000 vocabulary entries of each model as text files,
# one word per line.
english_top_1000 = english_model.wv.index_to_key[:1000]
german_top_1000 = german_model.wv.index_to_key[:1000]

# The encoding is set explicitly to UTF-8 so that non-ASCII words (e.g. German
# words with umlauts) are written the same way on every platform, instead of
# depending on the locale's default encoding.
with open(os.path.join('data', 'english_top_1000.txt'), 'w', encoding='utf-8') as f:
    for word in english_top_1000:
        f.write(word + '\n')

with open(os.path.join('data', 'german_top_1000.txt'), 'w', encoding='utf-8') as f:
    for word in german_top_1000:
        f.write(word + '\n')

In [60]:
# Read the translations of the German top-1000 words, one entry per line.
# NOTE(review): presumably line k is the translation of german_top_1000[k];
# the file itself is produced outside this notebook - confirm.
path = os.path.join('words', 'german_top_1000_translated.txt')
# Decode explicitly as UTF-8 rather than with the platform's default encoding,
# so the file is read the same way on every machine.
with open(path, 'r', encoding='utf-8') as f:
    german_top_1000_translated = f.read().splitlines()

In [61]:
# Quick check of the translated list: its first ten entries, then its length.
print(
    german_top_1000_translated[:10],
    len(german_top_1000_translated),
    sep='\n',
)


['European', 'Mr', 'Commission', 'houses of Parliament', 'must', 'should', 'union', 'want', 'president', 'report']
1000


In [62]:
# Lower-case every translated entry so it can be compared with the
# lower-case words of the English vocabulary.
german_top_1000_translated = list(map(str.lower, german_top_1000_translated))

In [63]:
# Find anchor words: German top-1000 words whose (lower-cased) translation is
# itself one of the English top-1000 words.

# Map each English word to the position of its first occurrence, replacing the
# repeated linear `in` / `.index()` scans of the list with constant-time
# lookups. Iterating in reverse lets earlier positions overwrite later ones,
# which matches what list.index() returns.
english_index_of = {
    word: index
    for index, word in reversed(list(enumerate(english_top_1000)))
}

# Each entry is (english_word, german_word, english_index, german_index).
# NOTE(review): several German words may share the same translation, in which
# case the same English word/index appears in more than one entry.
common_words = [
    (translation, german_top_1000[german_index], english_index_of[translation], german_index)
    for german_index, translation in enumerate(german_top_1000_translated)
    if translation in english_index_of
]
        
        
        

In [64]:
# Save the anchor words, one per line, as four space-separated fields:
#   english_word german_word english_index german_index
# UTF-8 is requested explicitly so that German words containing non-ASCII
# characters are written identically regardless of the platform's locale.
with open(os.path.join('words', 'anchor_words.txt'), 'w', encoding='utf-8') as f:
    for english_word, german_word, english_index, german_index in common_words:
        f.write(f'{english_word} {german_word} {english_index} {german_index}\n')

In [65]:
# Number of anchor entries collected in common_words.
len(common_words)

654

In [66]:
# Number of entries in the translated German word list.
print(len(german_top_1000_translated))

1000
