In [3]:
import numpy as np
import pandas as pd
import operator

# Corus - NLP datasets
import corus
from corus import load_lenta

#NLTK - Natural Language Tool Kit
import nltk
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import bigrams
from nltk import ngrams

#Other
from collections import Counter
import re
import string
from tqdm import notebook

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.utils import to_categorical as to_ct

import tensorflow as tf

[nltk_data] Downloading package punkt to /home/aptmess/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aptmess/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aptmess/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def tokenize(corpus):
    """Tokenize the corpus text.
    :param corpus: list containing a string of text (example: ["I like playing football with my friends"])
    :return corpus_tokenized: indexed list of words in the corpus, in the same order as the original corpus (the example above would return [[1, 2, 3, 4]])
    :return V: size of vocabulary
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    corpus_tokenized = tokenizer.texts_to_sequences(corpus)
    V = len(tokenizer.word_index)
    return corpus_tokenized, V

def to_categorical(y, num_classes=None):
    """Converts a class vector (integers) to binary class matrix.
    E.g. for use with categorical_crossentropy.
    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes.
    # Returns
        A binary matrix representation of the input.
    """
    y = np.array(y, dtype='int')
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes))
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    return categorical

def corpus2io(corpus_tokenized, V, window_size):
    """Converts corpus text into context and center words
    # Arguments
        corpus_tokenized: corpus text
        window_size: size of context window
    # Returns
        context and center words (arrays)
    """
    for words in corpus_tokenized:
        L = len(words)
        for index, word in enumerate(words):
            contexts = []
            labels = []
            s = index - window_size
            e = index + window_size + 1
            contexts.append([words[i]-1 for i in range(s, e) if 0 <= i < L and i != index])
            labels.append(word-1)
            x = np_utils.to_categorical(contexts, V)
            y = np_utils.to_categorical(labels, V)
            yield (x, y.ravel())

            
def softmax(x):
    """Calculate softmax based probability for given input vector
    # Arguments
        x: numpy array/list
    # Returns
        softmax of input array
    """
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def cbow(context, label, W1, W2, loss, lr):
    """
    Implementation of Continuous-Bag-of-Words Word2Vec model
    :param context: all the context words (these represent the inputs)
    :param label: the center word (this represents the label)
    :param W1: weights from the input to the hidden layer
    :param W2: weights from the hidden to the output layer
    :param loss: float that represents the current value of the loss function
    :return: updated weights and loss
    """
    x = np.mean(context, axis=(0, 1))
    h = np.dot(W1.T, x)
    u = np.dot(W2.T, h)
    y_pred = softmax(u)

    e = -label + y_pred
    dW2 = np.outer(h, e)
    dW1 = np.outer(x, np.dot(W2, e))

    new_W1 = W1 - lr * dW1
    new_W2 = W2 - lr * dW2

    loss += -float(u[label == 1]) + np.log(np.sum(np.exp(u)))

    return new_W1, new_W2, loss


def text_prepare(text, language='russian', delete_stop_words=False):
    """
        text: a string
        
        return: modified string
    """
    lemmatizer = WordNetLemmatizer()

    # 1. Перевести символы в нижний регистр
    text = text.lower() #your code
    
    # 2.1 Заменить символы пунктуации на пробелы
    text = re.sub(r'[{}]'.format(string.punctuation), ' ', text)
    
    
    
    # 2.2 Удалить "плохие" символы
    text = re.sub('[^A-Za-z0-9]' if language == 'english' else '[^А-яа-я]', ' ', text)

    
    # 3. Применить WordNetLemmatizer
    word_list = nltk.word_tokenize(text)
    text = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    
    # 4. Удалить стопслова.
    if delete_stop_words:
        stopWords = set(stopwords.words(language))
        for stopWord in stopWords:
            text = re.sub(r'\b{}\b'.format(stopWord), '', text)
        
    # 5. Удаляю пробелы у получая просто строку слов через пробел
    text = ' '.join(text.split())
    
    return text


def get_text(path='../../../data/lenta-ru-news.csv.gz', 
                        amount_of_sentense=1000, 
                        verbose=True, 
                        show_how_much=1000, **kwargs):
    records = load_lenta(path)
    a = []
    count = 1
    try:
        while True and count != amount_of_sentense:
            item = next(records).text
            if verbose:
                print(f'Sentence {count}') if count % show_how_much == 0 else 'pass'
            a.append(text_prepare(item))
            count +=1
    except StopIteration:
        pass
    finally:
        del records
    return a

In [13]:
class Word2Vec():
    
    def __init__(self, d=50, h=5):
        self.d = d
        self.h = h
        
    def fit(self, coprusI, num_epochs=1, lr=0.1):
        np.random.seed(100)
        print('Start counting dictionary')
        self.corpus_tokenized, self.V = tokenize(coprusI)
        print(f'Vocabulary size {self.V}')
        my = zip(self.corpus_tokenized, coprusI)
        print('All Words')
        self.vocabulary = {}
        self.r = {}
        for i, j in my:
            u = j.split()
            for m, n in zip(i, u):
                self.vocabulary[n] = m-1
                self.r[m-1] = n
        print('Start fitting')
        E = np.random.rand(self.V, self.d)
        C = np.random.rand(self.d, self.V)
        print(f'Emb: {E.shape}, Context: {C.shape}, {len(list(self.vocabulary.keys()))}')
        loss = 0.
        for num in range(num_epochs):
            print(f'epoch {num}')
            for i, (context, label) in enumerate(corpus2io(self.corpus_tokenized, self.V, self.h)):
                E, C, loss = cbow(context, label, E, C, loss, lr)
                if i % 1000 ==0 :
                    print(f"\n\t loss = {loss}\n")
                    print(f'Word {i}')
        self.embedding = E
        self.context = C
        
    def predict(self, x):
        prob = softmax(np.dot(self.context.T, np.dot(self.embedding.T, to_ct(self.vocabulary[x], num_classes=self.V))))
        return self.r[np.argmax(prob)]
    
    def emb(self, word):
        if word in self.vocabulary:
            
            return self.embedding[self.vocabulary[word]-1]
        else:
            return f'No {word}'

In [19]:
corpus = get_text(amount_of_sentense=100000)

Sentence 1000
Sentence 2000
Sentence 3000
Sentence 4000
Sentence 5000
Sentence 6000
Sentence 7000
Sentence 8000
Sentence 9000
Sentence 10000
Sentence 11000
Sentence 12000
Sentence 13000
Sentence 14000
Sentence 15000
Sentence 16000
Sentence 17000
Sentence 18000
Sentence 19000
Sentence 20000
Sentence 21000
Sentence 22000
Sentence 23000
Sentence 24000
Sentence 25000
Sentence 26000
Sentence 27000
Sentence 28000
Sentence 29000
Sentence 30000
Sentence 31000
Sentence 32000
Sentence 33000
Sentence 34000
Sentence 35000
Sentence 36000
Sentence 37000
Sentence 38000
Sentence 39000
Sentence 40000
Sentence 41000
Sentence 42000
Sentence 43000
Sentence 44000
Sentence 45000
Sentence 46000
Sentence 47000
Sentence 48000
Sentence 49000
Sentence 50000
Sentence 51000
Sentence 52000
Sentence 53000
Sentence 54000
Sentence 55000
Sentence 56000
Sentence 57000
Sentence 58000
Sentence 59000
Sentence 60000
Sentence 61000
Sentence 62000
Sentence 63000
Sentence 64000
Sentence 65000
Sentence 66000
Sentence 67000
Sent

In [10]:
corpus2 = get_text(amount_of_sentense=2)

In [None]:
w2vec = Word2Vec()
w2vec.fit(coprusI=corpus)

Start counting dictionary
Vocabulary size 337838
All Words
Start fitting
Emb: (337838, 50), Context: (50, 337838), 337838
epoch 0

	 loss = 13.344753096447512

Word 0

	 loss = 11906.455836463838

Word 1000

	 loss = 23151.27835878937

Word 2000

	 loss = 33970.182995615745

Word 3000

	 loss = 44517.539929755956

Word 4000

	 loss = 54921.5257844792

Word 5000

	 loss = 64900.16113174904

Word 6000

	 loss = 74958.96059930391

Word 7000

	 loss = 84936.59507630354

Word 8000


In [16]:
w2vec.predict('вице')

'чем'

In [18]:
w2vec.emb('вице')

array([ 0.78828751,  0.75991898,  0.59237894,  0.28243193,  0.67849068,
        0.8328829 ,  0.61232885,  0.99207672,  0.95076009,  0.18659157,
        0.01499938,  0.13537804,  0.91353095,  0.91108076,  0.19866811,
        0.34967891,  0.52696145,  0.26165538,  0.45716451,  0.69872437,
        0.49284879,  0.7126208 ,  0.50817143, -0.0291303 ,  0.38000492,
        0.47147017,  0.3816719 ,  0.3623444 ,  0.5015089 ,  0.40974543,
        0.07842258,  0.25780519,  0.91474975,  0.0160981 ,  0.04048922,
        0.27057861,  0.56805811,  0.9954855 ,  0.9617149 ,  0.99384862,
        0.12599985,  0.67234866,  0.50216426,  0.16905834,  0.91656985,
        0.26050474,  0.97218019,  0.58778283,  0.18946909,  0.38970682])