In [1]:
from gensim.models import Word2Vec
from dataset import load_tokenized_sentences

dataset = load_tokenized_sentences("../datasets/pickled/books_3500_flat.pickle")
occurs = load_tokenized_sentences("../datasets/pickled/books_3500_occurs.pickle")
# model = Word2Vec(sentences=books_3500_dataset, vector_size=100, window=5, min_count=5, workers=12, epochs=10)
# model.save("../w2v/word2vec_large_model.model")
model = Word2Vec.load("../w2v/word2vec_large_model.model")

embedding_dim = len(model.wv)
vector_size = model.vector_size

print({"embedding_dim": embedding_dim, "vector_size": vector_size})
print("Dataset len: ", len(dataset))

{'embedding_dim': 691922, 'vector_size': 100}
Dataset len:  207673043


In [2]:
for i in range(10):
    print(dataset[i])

tytuł
a
b
c
chrześcijaństwaautor
alfred
cholewiński
sjprzedmowawprowadzeniekerygmat
apostolskijezus
nie


In [3]:
for i in range(15):
    word = dataset[i]
    try:
        similar_words = model.wv.most_similar(word, topn=5)

        print(f"Most similar words to '{word}':", [word for word, _ in similar_words])
    except:
        pass

Most similar words to 'tytuł': ['przydomek', 'pseudonim', 'kryptonim', 'podtytuł', 'rodowód']
Most similar words to 'a': ['zaś', 'ale', 'lecz', 'bo', 'i']
Most similar words to 'b': ['d', 'c', 'p', 'h', 'g']
Most similar words to 'c': ['d', 'b', 'p', 'g', 'n']
Most similar words to 'alfred': ['albert', 'jacob', 'walter', 'edmund', 'gilbert']
Most similar words to 'nie': ['nigdy', 'oczywiście', 'też', 'przecież', 'jednak']
Most similar words to 'tylko': ['nawet', 'jedynie', 'też', 'przynajmniej', 'zawsze']
Most similar words to 'otworzył': ['otwarł', 'zamknął', 'otworzywszy', 'odemknął', 'rozwarł']
Most similar words to 'nam': ['im', 'wam', 'mi', 'ci', 'mu']
Most similar words to 'niebo': ['słońce', 'morze', 'słonce', 'obłoki', 'stonce']
Most similar words to 'okrojone': ['przetrzebione', 'zniszczone', 'osłabione', 'uszczuplone', 'przeludnione']


In [4]:
import itertools
import os
import random
from typing import Dict, List, Tuple
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Embedding
from keras import optimizers, Model
from gensim.models import Word2Vec
import keras
import math

def get_log_probs(occurs: Dict[str, List[int]], gamma: float = 1.0) -> Dict[str, float]:    
    probs = {}
    for word in occurs:
        probs[word] = math.log((len(occurs[word]) ** gamma + 1))
    _sum = sum(probs.values())
    for word in probs:
        probs[word] /= _sum
    return probs

def dataset_batch_generator1(
    model: Word2Vec, dataset: List[str], sequence_length: int = 5, batch_size: int = 64, randomize_len = False
) -> Tuple[np.ndarray, np.ndarray]:
    while True:
        perms = np.array_split(
            np.random.permutation(len(dataset) - sequence_length),
            len(dataset) // batch_size,
        )
        if randomize_len:
            length = np.random.randint(1, sequence_length, len(perms))
        else:
            length = np.full(len(perms), sequence_length)
        
        for indexes, s_len in zip(perms, length):
            vector_sequences = []
            vector_targets = []
            indexes = np.random.randint(0, len(dataset) - s_len, batch_size)
            for index in indexes:
                sequence = dataset[index : index + s_len]
                target = dataset[index + s_len]
                vector_sequence = []
                for word in sequence:
                    try:
                        vector_sequence.append(model.wv[word])
                    except KeyError:
                        vector_sequence.append(np.random.rand(vector_size))
                vectors_padded = np.zeros((1, sequence_length, model.vector_size))
                vectors_padded[0, sequence_length - s_len:, :] = np.array(vector_sequence)
                
                vector_sequences.append(vectors_padded)
                
                try:
                    vector_targets.append(model.wv[target])
                except KeyError:
                    vector_targets.append(np.random.rand(vector_size))
                    
            vector_sequences = np.concatenate(vector_sequences, axis=0)
            vector_targets = np.array(vector_targets)
            yield vector_sequences, vector_targets
            
def dataset_batch_generator2(
    model: Word2Vec, dataset: List[str], sequence_length: int = 5, batch_size: int = 64, randomize_len = False
) -> Tuple[np.ndarray, np.ndarray]:

    vectorized_dataset = []
    for word in dataset:
        try:
            vectorized_dataset.append(model.wv[word])
        except KeyError:
            vectorized_dataset.append(np.random.rand(vector_size))
    
    while True:
        perms = np.array_split(
            np.random.permutation(len(vectorized_dataset) - sequence_length),
            len(vectorized_dataset) // batch_size,
        )
        if randomize_len:
            length = np.random.randint(1, sequence_length, len(perms))
        else:
            length = np.full(len(perms), sequence_length)
        
        for indexes, s_len in zip(perms, length):
            vector_sequences = []
            vector_targets = []
            indexes = np.random.randint(0, len(vectorized_dataset) - s_len, batch_size)
            for index in indexes:
                sequence = vectorized_dataset[index : index + s_len]
                target = vectorized_dataset[index + s_len]


                vectors_padded = np.zeros((1, sequence_length, model.vector_size))
                vectors_padded[0, sequence_length - s_len:, :] = sequence
                
                vector_sequences.append(vectors_padded)
                vector_targets.append(target)
                    
            vector_sequences = np.concatenate(vector_sequences, axis=0)
            vector_targets = np.array(vector_targets)
            yield vector_sequences, vector_targets
            
def dataset_batch_generator3(
    model: Word2Vec, dataset: List[str], occurs: Dict[str, List[int]], sequence_length: int = 5, batch_size: int = 64, randomize_len = False, gamma=5.0
) -> Tuple[np.ndarray, np.ndarray]:

    vectorized_dataset = {}
    for word in occurs:
        try:
            vectorized_dataset[word] = model.wv[word]
        except KeyError:
            vectorized_dataset[word] = np.random.rand(vector_size)
    probs = get_log_probs(occurs, gamma)

    while True:
        choosen_words = random.choices(list(probs.keys()), list(probs.values()), k=len(dataset))
        random_indexes = []
        for word in choosen_words:
            index = random.choice(occurs[word])
            if index > sequence_length:
                random_indexes.append(index)
        random.shuffle(random_indexes)

        if randomize_len:
            seq_lengths = np.random.randint(2, sequence_length, len(random_indexes))
        else:
            seq_lengths = np.full(len(random_indexes), sequence_length)

        random_indexes = np.array_split(random_indexes, len(random_indexes) // batch_size)
        seq_lengths = np.array_split(seq_lengths, len(seq_lengths) // batch_size)
        
        for indexes, lengths in zip(random_indexes, seq_lengths):
            vector_sequences = []
            vector_targets = []

            for index, length in zip(indexes, lengths):
                sequence = [vectorized_dataset[word] for word in dataset[index - length : index]]
                target = vectorized_dataset[dataset[index]]

                if randomize_len:
                    vectors_padded = np.zeros((1, sequence_length, model.vector_size))
                    vectors_padded[0, sequence_length - length:, :] = sequence
                    sequence = vectors_padded
                else:
                    sequence = [sequence]
                
                vector_sequences.append(sequence)
                vector_targets.append(target)
        
            yield np.concatenate(vector_sequences, axis=0), np.array(vector_targets)

def preprocess_input_text1(text: str, word2vec_model: Word2Vec, max_len=5):
    # Transform input text into vector representation
    input_vectors = []
    words = text.split()
    words = words[len(words) - max_len :]
    for word in words:
        try:
            input_vectors.append(word2vec_model.wv[word])
        except KeyError:
            input_vectors.append(
                np.random.rand(word2vec_model.vector_size)
            )  # initialize randomly

    # Pad the input text to the required length
    if max_len != len(words):
        input_vectors_padded = np.zeros(
            (1, max_len, word2vec_model.vector_size), dtype="float32"
        )
        input_vectors_padded[0, max_len - len(words):, :] = np.array(input_vectors)
        return input_vectors_padded
    return np.array(input_vectors)[np.newaxis]

def generate_text1(
    model: Model,
    word2vec_model: Word2Vec,
    input_text: str,
    num_words_to_generate=10,
    max_len=5,
    radomize=False,
):
    generated_text = input_text

    for _ in range(num_words_to_generate):
        # Preprocess and pad the input text
        input_vectors_padded = preprocess_input_text1(
            generated_text, word2vec_model, max_len
        )

        # Predict the next word using the LSTM model
        prediction = model.predict_on_batch(input_vectors_padded)
        predicted_vector = prediction[0]

        # Find the corresponding word for the predicted index
        predicted_word = word2vec_model.wv.most_similar(
            positive=[predicted_vector], topn=10
        )
        if radomize:
            top_word = random.choice([word for word, _ in predicted_word])
        else:
            top_word = predicted_word[0][0]

        # Append the predicted word to the input text
        generated_text += " " + top_word

    return generated_text


class PredictCallback(keras.callbacks.Callback):
    def __init__(
        self,
        w2v_model: Word2Vec,
        input_text="Dawno dawno temu był sobie",
        num_words_to_generate=8,
        max_len=5,
        randomize=False,
    ):
        self.w2v_model = w2v_model
        self.input_text = input_text
        self.num_words_to_generate = num_words_to_generate
        self.max_len = max_len
        self.randomize = randomize

    def on_epoch_begin(self, epoch, logs=None):
        text = generate_text1(
            self.model,
            self.w2v_model,
            self.input_text,
            self.num_words_to_generate,
            self.max_len,
            self.randomize,
        )
        print(f"Start epoch {epoch} of training; Generated text:", text)


class SaveCallback(keras.callbacks.Callback):
    def __init__(self, save_path: str):
        self.save_path = save_path

    def on_epoch_end(self, epoch, logs=None):
        self.model.save_weights(os.path.join(self.save_path, f"{epoch}.h5"))

In [5]:
sequence_length = 25
batch_size = 128
gen = dataset_batch_generator3(model, dataset, occurs, sequence_length, batch_size, randomize_len=False, gamma=10)
a, b = next(gen)
print(a.shape, b.shape)

for i, wb in enumerate(b):
    all_words = ""
    for wa in a[i]:
        words = model.wv.most_similar(positive=[wa], topn=1)[0][0]
        all_words += " " + words
    words = model.wv.most_similar(positive=[wb], topn=3)
    print(all_words, words)

(129, 25, 100) (129, 100)
 swój autorytet teza dalsze zaoenia everton to ksztacenie nowej rewolucyjnej wiadomoci i rozszerzanie jej wród ludu lub ptaszycy systemie przygotowanie nowych kierowniczych orodków które rozwin [('awulsów', 0.4050384759902954), ('sejmie', 0.3951239287853241), ('bogdańcu', 0.39329272508621216)]
 tułowia a rozwojowo należą do mięśniówki brzusznej o czym świadczy między innymi źródło ich unerwienia są unerwione przez gałęzie brzuszne nerwów rdzeniowych rami ventrales nervorum [('spinalium', 1.0), ('ventrales', 0.8902422785758972), ('lymphatica', 0.8534058332443237)]
 dobra tylko jeść i z palców zlizywać jak jej ta mufce nie smakuje to niepostrzeżone do choroby bedziem my jo karmić pytam sie dunajów nie [('bojś', 0.9999998807907104), ('warknł', 0.6302300095558167), ('pękaj', 0.6272857189178467)]
 wszystko na komodę fotografię podstawkę szkiełko tekturkę sofia aleksandrowna opadła na fotel i rozpłakała się zakrywając twarz dłońmi funkcjonariusz grzebał teraz w wyci

In [6]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Embedding, Bidirectional, Dropout, LeakyReLU
from keras import optimizers, Model, activations
from gensim.models import Word2Vec

embedding_dim = len(model.wv)
vector_size = model.vector_size

# Train the LSTM model on the prepared data
kmodel = Sequential()
kmodel.add(Input(shape=(sequence_length, vector_size)))
kmodel.add(LSTM(512))
kmodel.add(Dense(vector_size))
kmodel.compile(loss="mse", optimizer=optimizers.Adam(1e-3))
kmodel.summary()


input_text = "nastał dzień w którym zbliżał się ślub królewskiego syna więc"
kmodel.fit(
    gen,
    epochs=1000,
    steps_per_epoch=5000,
    verbose=1,
    callbacks=[
        PredictCallback(model, input_text, max_len=sequence_length, randomize=True),
        PredictCallback(model, input_text, max_len=sequence_length, randomize=False),
        SaveCallback('../lstm_models/')
    ],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 512)               1255424   
                                                                 
 dense (Dense)               (None, 100)               51300     
                                                                 
Total params: 1,306,724
Trainable params: 1,306,724
Non-trainable params: 0
_________________________________________________________________
Start epoch 0 of training; Generated text: nastał dzień w którym zbliżał się ślub królewskiego syna więc pczytywny łac nieświadomemu oczui kolor oznaczałoby wysysanie wyrzucanie
Start epoch 0 of training; Generated text: nastał dzień w którym zbliżał się ślub królewskiego syna więc pczytywny pczytywny pczytywny nasuwający zwierzecia koszemego włosom poławiacze
Epoch 1/1000
Start epoch 1 of training; Generated text: nastał dzień w którym z

KeyboardInterrupt: 

In [8]:
# Example usage
input_text = "Czy grube mury twierdz zakonnych ogromne bogactwa skrzętna niemal fanatyczna gospodarność straszliwy głód władzy nie stanowią znaków głębokiego duchowego konfliktu chrześcijan znaków braku zaufania do Bożych obietnic znaków rozpaczliwych starań"
num_words_to_generate = 16
generated_text = generate_text1(
    kmodel, model, input_text.lower(), num_words_to_generate, max_len=sequence_length
)
print(generated_text)

czy grube mury twierdz zakonnych ogromne bogactwa skrzętna niemal fanatyczna gospodarność straszliwy głód władzy nie stanowią znaków głębokiego duchowego konfliktu chrześcijan znaków braku zaufania do bożych obietnic znaków rozpaczliwych starań zgubnych zgubnych występków obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych obłędnych
