In [1]:
Dutch = 'https://www.gutenberg.org/cache/epub/39181/pg39181-images.html'
German = "https://www.gutenberg.org/cache/epub/8085/pg8085.html"
Italian = "https://www.gutenberg.org/cache/epub/1000/pg1000-images.html"
English = "https://www.gutenberg.org/cache/epub/1004/pg1004-images.html"
Spanish = "https://www.gutenberg.org/cache/epub/57303/pg57303-images.html"
Finnish = "https://www.gutenberg.org/cache/epub/12546/pg12546.html"
languages = [Dutch, German, Italian, English, Spanish, Finnish]
names = ['Dutch', 'German', 'Italian', 'English', 'Spanish', 'Finnish']

In [2]:
import gensim
from gensim.models import word2vec
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim.downloader as api
import numpy as np
from urllib.request import urlopen
import re
from nltk.corpus import stopwords
import pandas as pd
from nameof import nameof

In [3]:
def data_tokenizer(url, language, encoding = 'utf-8'):
    with urlopen(url) as file:
        divine_comedy = file.read().decode(encoding)

    f = divine_comedy.replace("\n", " ")    
    f = re.sub(r'[^\w\s]', '', f)
    data = []
    for i in sent_tokenize(f):
        temp = []
     
    # tokenize the sentence into words
        for j in word_tokenize(i):
            temp.append(j.lower())
 
        data.append(temp)

    stop_words = set(stopwords.words(language))
    stopped = [[i for i in j if i not in stop_words] for j in data]
    return stopped

In [4]:
var_holder = {}
for i in languages:
     langname = names[languages.index(i)]
     var_holder['tokenized_' + langname]= data_tokenizer(i, langname)

locals().update(var_holder)

In [11]:
tokenized_languages = [tokenized_Dutch, tokenized_German, tokenized_Italian, tokenized_English, tokenized_Spanish, tokenized_Finnish]
tokenized_names = 'tokenized_'+pd.Series(names)

In [15]:
for tk in tokenized_languages:
    print(tokenized_names[tokenized_languages.index(tk)],": ", tk)

tokenized_Dutch : [['doctype', 'html', 'html', 'langnl', 'head', 'meta', 'charsetutf8style', 'pgheader', 'div', 'pgfooter', 'div', 'all', 'initial', 'display', 'block', 'margintop', '1em', 'marginbottom', '1em', 'marginleft', '2em', 'pgfooter', 'divagate', 'fontsize', '90', 'margintop', '0', 'marginbottom', '0', 'textalign', 'center', 'pgfooter', 'li', 'all', 'initial', 'display', 'block', 'margintop', '1em', 'marginbottom', '1em', 'textindent', '06em', 'pgfooter', 'divsecthead', 'fontsize', '110', 'fontweight', 'bold', 'pgfooter', 'projectgutenberglicense', 'fontsize', '110', 'margintop', '0', 'marginbottom', '0', 'textalign', 'center', 'pgheaderheading', 'all', 'inherit', 'textalign', 'center', 'fontsize', '110', 'pgfooterheading', 'all', 'inherit', 'textalign', 'center', 'fontsize', '120', 'fontweight', 'normal', 'margintop', '0', 'marginbottom', '0', 'pgheader', 'pgmachineheader', 'p', 'textindent', '4em', 'paddingleft', '4em', 'margintop', '1em', 'pgheader', 'pgheaderauthlist', 'p

In [16]:
def skipgram(language):
    return gensim.models.Word2Vec(language, min_count = 1,
                              vector_size = 100, window = 5, sg = 1).wv
def cbow(language):
    return gensim.models.Word2Vec(language, min_count = 1,
                              vector_size = 100, window = 5, sg = 0).wv

In [20]:
skipgram_English = skipgram(tokenized_English)
cbow_English = cbow(tokenized_English)
skipgram_Dutch = skipgram(tokenized_Dutch)
cbow_Dutch = cbow(tokenized_Dutch)
skipgram_German = skipgram(tokenized_German)
cbow_German = cbow(tokenized_German)
skipgram_Italian = skipgram(tokenized_Italian)
cbow_Italian = cbow(tokenized_Italian)
skipgram_Spanish = skipgram(tokenized_Spanish)
cbow_Spanish = cbow(tokenized_Spanish)
skipgram_Finnish = skipgram(tokenized_Finnish)
cbow_Finnish = cbow(tokenized_Finnish)


In [23]:
print(skipgram_English.index_to_key[:10])
print(cbow_English.index_to_key[:10])

['p', 'classnoindent', 'thou', 'one', 'unto', 'upon', 'thee', 'thy', 'said', 'us']
['p', 'classnoindent', 'thou', 'one', 'unto', 'upon', 'thee', 'thy', 'said', 'us']


In [24]:
print(skipgram_Dutch.index_to_key[:10])
print(cbow_Dutch.index_to_key[:10])

['p', 'classregel', 'classp2a', 'den', 'classp3a', 'a', 'pginternal', 'classnoot', 'gij', 'div']
['p', 'classregel', 'classp2a', 'den', 'classp3a', 'a', 'pginternal', 'classnoot', 'gij', 'div']


In [25]:
print(skipgram_German.index_to_key[:10])
print(cbow_German.index_to_key[:10])

['p', 'sprach', 'sah', 'drum', 'the', 'wohl', 'schon', 'mehr', 'gleich', 'wer']
['p', 'sprach', 'sah', 'drum', 'the', 'wohl', 'schon', 'mehr', 'gleich', 'wer']


In [26]:
print(skipgram_Italian.index_to_key[:10])
print(cbow_Italian.index_to_key[:10])

['p', 'sì', 'de', 'quel', 'me', 'così', 'poi', 'là', 'quando', 'già']
['p', 'sì', 'de', 'quel', 'me', 'così', 'poi', 'là', 'quando', 'già']


In [27]:
print(skipgram_Spanish.index_to_key[:10])
print(cbow_Spanish.index_to_key[:10])

['si', 'div', 'classpagenuma', 'tan', 'hacia', 'aquel', 'así', 'pues', 'pa', 'modo']
['si', 'div', 'classpagenuma', 'tan', 'hacia', 'aquel', 'así', 'pues', 'pa', 'modo']


In [28]:
print(skipgram_Finnish.index_to_key[:10])
print(cbow_Finnish.index_to_key[:10])

['p', 'mi', 'ma', 'näin', 'stylemargintop', 'sa', 'jo', 'mun', 'kaikki', 'mut']
['p', 'mi', 'ma', 'näin', 'stylemargintop', 'sa', 'jo', 'mun', 'kaikki', 'mut']
