https://deepset.ai/german-word-embeddings
https://www.analyticsvidhya.com/blog/2020/03/pretrained-word-embeddings-nlp/


In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Bidirectional,Dropout

import numpy as np
import pickle as pkl

In [2]:
# STEP 1:
# Read the corpus from disk and fit the tokenizer on it.

tokenizer = Tokenizer()

# Read data/el1.txt ... data/el10.txt. Each file is opened in a `with` block
# so its handle is closed as soon as it has been read (previously all ten
# handles were left open), and the pieces are joined once instead of growing
# a string with `+=`.
# NOTE(review): the files are concatenated without a separator, so if a file
# does not end with a newline its last line fuses with the first line of the
# next file -- confirm this is intended.
parts = []
for i in range(1, 11):
    with open(f'data/el{i}.txt', 'r') as f:
        parts.append(f.read())
data = ''.join(parts)

# One corpus entry per line of text.
corpus = data.split('\n')

# The fitted tokenizer is also saved to disk, because it is needed again in a
# following step.
tokenizer.fit_on_texts(corpus)
with open('files/tokenizer.pkl', 'wb') as f:
    pkl.dump(tokenizer, f, protocol=pkl.HIGHEST_PROTOCOL)



In [3]:
# STEP 2:
# Determine the vocabulary size from the tokenizer's word dictionary
# (key => word, value => integer token for that word).

# NOTE(review): the extra 1 used to be explained as "for OOV token", but the
# tokenizer is created without an oov_token. The extra slot more likely
# accounts for index 0, which Keras does not assign to any word -- confirm.
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)
print (f'The total of different words in the corpus is {total_words}')

The total of different words in the corpus is 2287


In [4]:
# STEP 3:
# Build the training examples: for every corpus line, every prefix of its
# token list that is at least two tokens long
# (first two words, first three words, first four words, ...).

print ('=================')
print ('Generating input-sequences')

input_sequences = [
    tokens[:end]
    # tokens: the list of integer tokens representing the words of one line
    for tokens in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    for end in range(2, len(tokens) + 1)
]

print (f'The length of the input_sequence is {len(input_sequences)}')


Generating input-sequences
The length of the input_sequence is 5683


In [5]:
# STEP 5:
# Find the longest input sequence; all other sequences are padded to this
# length afterwards.

# The length is measured on the entries of input_sequences, i.e. it is a
# number of tokens.
print ('Finding the length of the longest sentence in the corpus')
max_sequence_length = max(len(x) for x in input_sequences)
# Fix: this print call was missing its closing parenthesis, which is a
# SyntaxError and prevented the cell from running at all.
print (f'The longest sentence in the corpus is {max_sequence_length } words.')


Finding the length of the longest sentence in the corpus
The longest sentence in the corpus is 13 words.


In [6]:
# Bring every sequence to the same length (max_sequence_length) by padding at
# the front ('pre'), then store the result as a NumPy array.
padded_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_length,
)
input_sequences = np.array(padded_sequences)

# Each row is later split into inputs and a label: the last token of a row is
# the label, the tokens before it are the x's.

# Shown as the cell output.
input_sequences.shape

(5683, 13)

In [8]:
# STEP 6:
# Split the padded sequences into a training and a cross-validation set, then
# separate each set into an X matrix (every column except the last) and a
# label vector (the last column).
m, n = input_sequences.shape

# The first 80% of the rows, in their existing order, form the training set;
# the remaining rows form the validation set.
split_at = int(m * .8)
data_train = input_sequences[:split_at]
data_cv = input_sequences[split_at:]

X_train = data_train[:, :-1]
labels_train = data_train[:, -1]

X_cv = data_cv[:, :-1]
labels_cv = data_cv[:, -1]

In [None]:
# STEP 7:
# One-hot encode the integer labels of both sets, using total_words classes.
ys_train, ys_cv = (
    tf.keras.utils.to_categorical(labels, num_classes=total_words)
    for labels in (labels_train, labels_cv)
)

In [9]:
# STEP 8:
# Build the network that has to find out what the next word should be, given
# the max_sequence_length - 1 tokens that precede it.
#
# Disabled alternative for the recurrent part:
#   Bidirectional(LSTM(200)) followed by Dropout(.2)
#
# NOTE(review): this definition uses LSTM(500), whereas the summary of the
# pre-trained model loaded further down reports an LSTM layer with 200 units.
# Confirm which size is intended before re-enabling training.
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_length - 1),
    LSTM(500),
    # One neuron per word, which will light up if that is the predicted word.
    Dense(total_words, activation="softmax"),
])

In [15]:
# Actually training the model takes too long for this demonstration, so a
# version that has already been trained is loaded instead.
#
# Training code, kept for reference:
# model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# hist = model.fit(X_train, ys_train, epochs=250, validation_data=[X_cv, ys_cv])
# print ("=== [SAVE HISTORY] =====")
# with open('history.pkl', 'wb') as file_pi:
#     pkl.dump(hist.history, file_pi)


# `keras` is already imported at the top of the notebook, so the loader is
# reached through it rather than through an extra import in this cell.
model = keras.models.load_model('files/rilke_model.h5')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 12, 64)            146368    
_________________________________________________________________
lstm (LSTM)                  (None, 200)               212000    
_________________________________________________________________
dense (Dense)                (None, 2287)              459687    
Total params: 818,055
Trainable params: 818,055
Non-trainable params: 0
_________________________________________________________________


In [19]:
# predict the next 100 words

def generate_text(seed_text, n_words, model, tokenizer, max_len):
    """Extend ``seed_text`` with ``n_words`` predicted words.

    On every step the text so far is tokenised and padded at the front to
    ``max_len`` tokens, the model is asked for its output, and the word whose
    index has the highest score is appended after a single space.

    Args:
        seed_text: The text to start from.
        n_words: How many words to append.
        model: Model whose ``predict`` gives one score per word index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` (index -> word) dictionary.
        max_len: Length the token list is padded to before prediction.

    Returns:
        The seed text followed by the predicted words, space-separated.
    """
    text = seed_text
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')
        # verbose=0 keeps predict() from printing progress output on every
        # one of the n_words iterations.
        scores = model.predict(token_list, verbose=0)
        # argmax yields a one-element array for the single input row; take the
        # plain int so it can be used as a dictionary key.
        predicted = int(np.argmax(scores, axis=-1)[0])
        # Reverse look-up through the tokenizer's index -> word dictionary
        # instead of scanning word_index for every generated word. An index
        # without a word gives '', as the scan did when it found no match.
        text += ' ' + tokenizer.index_word.get(predicted, '')
    return text


line = generate_text('Wer', 100, model, tokenizer, max_sequence_length - 1)
line += "\n"
print (line)

Wer zeigt ein kind so wie es steht wer stellt will war er ists wieder so blieb sie doch nur mir einmal ich in den andern bezug sei sei das früchte was dir dir weiter daß nie nicht hörte mich doch an alles ist der engel schielaug nackens er grade herzens rief wo er im herzen – über er leise konnte leise mehr – mehr neue seltsam die leere welt dir sich läßt menschen und er so mehr furchtbar neue an der engel wäre ihm er nicht schreiten daß es nicht weil die engel o an an wenig großsein furchtbar dorten

