In [1]:
from gensim.models import FastText
import gensim.downloader as api

import ipywidgets as widgets
import IPython.display as ipd

from spacy.tokens import Doc, DocBin

import pandas as pd
import numpy as np
import tensorflow as tf

import spacy
import os
import ssl

In [2]:
# NOTE(review): this swaps the default HTTPS context factory for the unverified one, so
# certificate verification is turned off for code in this kernel that relies on the `ssl`
# module default. Presumably a workaround for the corpus download below — confirm, and
# prefer fixing the local certificate store over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
# Locations of the artifacts this notebook writes and later reads back.
# Serialized spaCy DocBin (written by export_corpus_documents, read by import_corpus_documents).
docs_data_path = r'../application/resources/models/docs.bin'
# Saved FastText model (written by export_vectors, read by import_vectors).
vectors_data_path = r'../application/resources/models/vectors.bin'
# Saved Keras model (target of model.save / tf.keras.models.load_model).
model_data_path = r'../application/resources/models/bin/model/'

# Preprocessing

In [4]:
# English spaCy pipeline with the tagger, parser and NER components disabled; the code
# below only reads token-level attributes (lemma_, lower_, is_alpha, is_ascii, is_stop).
# NOTE(review): the 'en' shortcut name and the '-PRON-' lemma used below look like
# spaCy 2.x conventions — confirm the installed version before upgrading.
nlp = spacy.load('en', disable = ['tagger', 'parser', 'ner'])

In [5]:
# Threshold the corpus loops (export_corpus_documents, get_tokens_set) compare the
# article index against to stop iterating the corpus.
article_number = 500

In [6]:
def tokenize(text):
    """Turn raw text into a list of normalized tokens.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    resulting document is handed to ``doc_tokenizer``, so raw text and
    pre-parsed documents always go through one and the same filter
    (alphabetic, ASCII, non-stop-word, alphabetic lemma or pronoun) instead of
    two copy-pasted expressions that can drift apart.

    Args:
        text: Raw input string.

    Returns:
        List of lower-cased, stripped lemmas; pronouns (lemma ``'-PRON-'``)
        are represented by their lower-cased surface form.
    """
    return doc_tokenizer(nlp(text))

def tokenize_sequence(tokens, token_index, size=None):
    """Return the window of tokens that ends just before position ``token_index - 1``.

    For in-range indices this is ``tokens[token_index - size - 1 : token_index - 1]``.
    Both bounds are clamped at 0: without the clamp a negative start or stop is
    interpreted by Python as an offset from the END of the list, so early
    positions would return unrelated tokens (or nothing) instead of the shorter
    leading window.

    Args:
        tokens: Sequence of tokens (any sliceable sequence).
        token_index: Position the window is computed for.
            NOTE(review): the ``- 1`` offset suggests a 1-based index — confirm
            against callers.
        size: Optional window length; defaults to the module-level
            ``sequence_size`` when omitted.

    Returns:
        A slice of ``tokens`` with at most ``size`` elements.
    """
    window = sequence_size if size is None else size
    stop = max(token_index - 1, 0)
    start = max(stop - window, 0)
    return tokens[start:stop]

def doc_tokenizer(doc):
    """Extract normalized tokens from an already parsed document.

    A token is kept only when it is alphabetic, ASCII, not a stop word, and its
    lemma is either alphabetic or the pronoun placeholder ``'-PRON-'``.
    Kept pronouns are emitted as their lower-cased surface form; every other
    kept token is emitted as its lower-cased, stripped lemma.

    Args:
        doc: Iterable of token objects exposing ``lemma_``, ``lower_``,
            ``is_alpha``, ``is_ascii`` and ``is_stop``.

    Returns:
        List of normalized token strings, in document order.
    """
    normalized = []
    for token in doc:
        # Surface-level checks first: alphabetic ASCII words that are not stop words.
        if not (token.is_alpha and token.is_ascii) or token.is_stop:
            continue
        lemma = token.lemma_
        is_pronoun = lemma == '-PRON-'
        if not (lemma.isalpha() or is_pronoun):
            continue
        if is_pronoun:
            normalized.append(token.lower_)
        else:
            normalized.append(lemma.lower().strip())
    return normalized

def _is_exported_token(token):
    """Return True when `token` passes the filter used by `tokenize` / `doc_tokenizer`."""
    lemma = token.lemma_
    return (
        token.is_alpha
        and token.is_ascii
        and not token.is_stop
        and (lemma.isalpha() or lemma == '-PRON-')
    )

def export_corpus_documents(corpus):
    """Parse the first `article_number` articles and store them as a DocBin file.

    Each article's section texts are joined with spaces and parsed with `nlp`.
    Tokens rejected by the shared token filter are dropped, and a reduced `Doc`
    carrying only the LEMMA and LOWER attributes of the kept tokens is added to
    a `DocBin`, which is finally written to `docs_data_path`.

    Changes relative to the previous version:
      * the keep/remove decision is the exact filter used by `doc_tokenizer`
        (the old hand-negated condition had an `and` where an `or not` was
        needed, so tokens with non-alphabetic lemmas slipped through);
      * exactly `article_number` articles are exported (the old loop checked
        the limit after adding, exporting one article too many);
      * kept tokens are selected with a boolean mask instead of repeated
        `index not in list` lookups;
      * the output file is closed even if writing fails.

    Args:
        corpus: Iterable of articles; each item must provide a
            ``'section_texts'`` list of strings.
    """
    attrs = ['LEMMA', 'LOWER']
    doc_bin = DocBin(attrs=attrs)
    for i, item in enumerate(corpus):
        if i >= article_number:
            break
        doc = nlp(' '.join(item['section_texts']))

        # One boolean per token: True for tokens that survive the filter.
        keep_mask = np.array([_is_exported_token(token) for token in doc], dtype=bool)
        kept_words = [token.text for token, keep in zip(doc, keep_mask) if keep]

        filtered_doc = Doc(doc.vocab, words=kept_words)
        filtered_doc.from_array(attrs, doc.to_array(attrs)[keep_mask])
        doc_bin.add(filtered_doc)

    with open(docs_data_path, 'wb') as file:
        file.write(doc_bin.to_bytes())

def get_tokens_set(corpus):
    """Tokenize the first `article_number` articles of the corpus.

    The limit is checked before an article is processed, so exactly
    `article_number` articles are tokenized (the previous version checked
    after appending and therefore processed one article too many), and a
    limit of 0 yields an empty result.

    Args:
        corpus: Iterable of articles; each item must provide a
            ``'section_texts'`` list of strings.

    Returns:
        List with one token list (see `tokenize`) per processed article.
    """
    tokens_set = []
    for i, item in enumerate(corpus):
        if i >= article_number:
            break
        tokens_set.append(tokenize(' '.join(item['section_texts'])))
    return tokens_set
    
def import_corpus_documents():
    """Load the documents previously written by `export_corpus_documents`.

    Reads the serialized DocBin from `docs_data_path` (the file is closed even
    when reading fails) and rebuilds the documents against `nlp.vocab`.

    Returns:
        List of spaCy `Doc` objects.
    """
    with open(docs_data_path, 'rb') as file:
        docs_data = file.read()
    doc_bin = DocBin().from_bytes(docs_data)
    return list(doc_bin.get_docs(nlp.vocab))

In [None]:
# Load the 'wiki-english-20171001' dataset through gensim's downloader API.
# The functions above read each item's 'section_texts' field.
corpus = api.load('wiki-english-20171001')

In [None]:
# Parse the corpus articles and write the filtered documents to docs_data_path.
export_corpus_documents(corpus)

In [5]:
# Read the exported documents back from docs_data_path; get_data() iterates over `docs`.
docs = import_corpus_documents()

In [51]:
# Token lists per article, used as the FastText training sentences below.
# Requires `corpus` to have been loaded in this kernel session.
tokens_set = get_tokens_set(corpus)

# Vectorization

In [5]:
# Number of preceding tokens that make up one model input (see vectorize_sequence);
# also passed to FastText as its context window.
sequence_size = 5

In [7]:
def vectorize(word):
    """Look up the embedding of `word` in the module-level FastText model `vectors`."""
    keyed_vectors = vectors.wv
    return keyed_vectors[word]

def export_vectors(vectors):
    """Save the given model to `vectors_data_path` via its own `save` method.

    Note that the parameter shadows the module-level `vectors` inside this function.
    """
    destination = vectors_data_path
    vectors.save(destination)
    
def import_vectors():
    """Load and return the FastText model stored at `vectors_data_path`."""
    loaded_model = FastText.load(vectors_data_path)
    return loaded_model

def vectorize_sequence(tokens, token_index):
    """Build the embedding window for the `sequence_size` tokens before `token_index`.

    Positions before the start of `tokens` and tokens for which `vectorize`
    raises `KeyError` are represented by an all-zero vector. The zero vector
    takes its width from the loaded model (`vectors.wv.vector_size`) instead of
    a hard-coded 100, so the window stays rectangular when the embeddings are
    trained with a different dimensionality.

    Args:
        tokens: Sequence of token strings.
        token_index: Exclusive end of the window; the window covers positions
            ``token_index - sequence_size`` .. ``token_index - 1``.

    Returns:
        Array with one row per window position (``sequence_size`` rows).
    """
    padding = np.zeros(vectors.wv.vector_size, dtype='float32')
    sequence_vectors = []
    for i in range(token_index - sequence_size, token_index):
        if i < 0:
            # Window reaches past the beginning of the text: left-pad with zeros.
            sequence_vectors.append(padding)
            continue
        try:
            sequence_vectors.append(vectorize(tokens[i]))
        except KeyError:
            # No vector available for this token: fall back to zeros.
            sequence_vectors.append(padding)
    return np.array(sequence_vectors)

In [52]:
# Train FastText embeddings on the tokenized articles, using hierarchical softmax (hs=1),
# the same context window as the model input length, and 100 training iterations.
# No vector size is passed, so the library default applies.
vectors = FastText(tokens_set, hs=1, window=sequence_size, iter=100)

In [53]:
# Persist the trained embeddings to vectors_data_path.
export_vectors(vectors)

In [7]:
# Reload the embeddings from vectors_data_path (lets a fresh kernel skip the training cell).
vectors = import_vectors()

# Model

In [9]:
def get_data():
    """Assemble the training set from the module-level `docs` and `vectors`.

    For every token of every document that is present in the embedding
    vocabulary, one sample is produced: the input is the embedding window of
    the tokens preceding it (see `vectorize_sequence`) and the target is the
    token's index in the vocabulary. Tokens outside the vocabulary yield no
    sample.

    Returns:
        Tuple ``(inputs, targets)`` of NumPy arrays with one entry per sample.
    """
    samples = []
    labels = []
    for document in docs:
        document_tokens = doc_tokenizer(document)
        for position, word in enumerate(document_tokens):
            if word not in vectors.wv.vocab:
                continue
            samples.append(vectorize_sequence(document_tokens, position))
            labels.append(vectors.wv.vocab[word].index)

    return np.array(samples), np.array(labels)

In [10]:
# Materialize the training inputs (embedding windows) and targets (vocabulary indices).
input_data, output_data = get_data()

In [11]:
# Create a TF1-compat session configured with device_count={'CPU': 8} and register it
# as the Keras backend session.
# NOTE(review): presumably intended to let training use 8 CPU cores — confirm that
# `device_count` (rather than a thread-count option) is the right setting for that.
config = tf.compat.v1.ConfigProto(device_count={'CPU': 8})
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

In [None]:
# Next-word classifier: two stacked GRU layers (each followed by dropout) over the
# embedding window, ending in a softmax over the whole embedding vocabulary.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.GRU(
        256,
        return_sequences=True,  # the second GRU consumes the full sequence
        input_shape=(input_data.shape[1], input_data.shape[2]),
    )
)
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.GRU(256))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(len(vectors.wv.vocab), activation='softmax'))

# Targets are integer vocabulary indices, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 50 epochs with mini-batches of 64 samples.
model.fit(input_data, output_data, epochs=50, batch_size=64)

In [18]:
# Persist the trained model to model_data_path.
model.save(model_data_path)

INFO:tensorflow:Assets written to: ./bin/model/assets


In [8]:
# Reload the model from model_data_path (lets a fresh kernel skip the training cells).
model = tf.keras.models.load_model(model_data_path)

In [8]:
def predict_words(tokens, count=5):
    """Suggest the most probable next words for a token sequence.

    The last `sequence_size` tokens are vectorized (see `vectorize_sequence`),
    passed through `model`, and the `count` highest-scoring vocabulary entries
    are returned ordered from most to least probable. `np.argpartition` alone
    only guarantees that the selected indices are the top ones, not their
    order, so the selection is sorted explicitly; `count` is also clamped to
    the number of model outputs so small vocabularies do not raise.

    Args:
        tokens: List of token strings typed so far (may be empty).
        count: Maximum number of suggestions to return. Defaults to 5.

    Returns:
        List of up to `count` words, best suggestion first.
    """
    input_vectors = np.asarray([vectorize_sequence(tokens, len(tokens))])
    output = model.predict(input_vectors)[0]

    count = min(count, len(output))
    if count <= 0:
        return []

    # Select the top `count` indices, then order them by descending probability.
    top_indexes = np.argpartition(output, -count)[-count:]
    top_indexes = top_indexes[np.argsort(output[top_indexes])[::-1]]
    return [vectors.wv.index2word[index] for index in top_indexes]

# Prototype

In [9]:
# Interactive prototype: a text area for typing and a read-only style text field
# that shows the suggested next words.
layout = widgets.Layout(width='700px')
in_textbox = widgets.Textarea(layout=layout)
out_textbox = widgets.Text(layout=layout)


def on_text_changed(change):
    """Refresh the suggestions whenever the input text changes.

    Only 'change' notifications for the 'value' trait are handled. An empty
    input clears the suggestions; otherwise new suggestions are computed only
    when the text ends with a space (i.e. a word has just been completed), and
    the previous suggestions are left untouched while a word is being typed.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    text = in_textbox.value
    if not text:
        out_textbox.value = ''
        return
    if not text.endswith(' '):
        return

    suggestions = predict_words(tokenize(text))
    # Rendered as "1) first 2) second ..." with a trailing space after every entry.
    out_textbox.value = ''.join(
        str(position) + ') ' + word + ' '
        for position, word in enumerate(suggestions, start=1)
    )


in_textbox.observe(on_text_changed)

display(in_textbox)
display(out_textbox)

Textarea(value='', layout=Layout(width='700px'))

Text(value='', layout=Layout(width='700px'))