## Импорт библиотек

In [89]:
from gensim.models import Word2Vec
import pandas as pd
from reach import Reach

import spacy
import nltk
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from collections import Counter, defaultdict
import itertools
from spacy.symbols import NOUN, PROPN, PUNCT, SYM, ADP, DET, ADJ

nlp = spacy.load("ru_core_news_lg")
stopwords_ru = stopwords.words("russian")
tqdm.pandas(desc="progress-bar")

In [17]:
#! pip3 install reach
#! python3 -m spacy download ru_core_news_lg
# nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Подготовка датасетов

In [90]:
# для word2vec

df_train = pd.read_excel('tmp.xlsx')

df_train = df_train[~df_train['text'].isnull()]
df_train = df_train[df_train['text'] != 'string']
df_train = df_train.head(5000)

corpus = [x.lower().replace('...', '.').replace('.', '. ').replace(' .', '. ').strip().split() for x in df_train['text']]

In [11]:
df = pd.read_excel('review.xlsx')
df = df.drop('Unnamed: 0', axis=1)

In [104]:
df_train = df_train.head(5000)

## Word2Vec

In [56]:
f = Word2Vec(corpus,
             sg=0,
             negative=5,
             window=10,
             size=200,
             min_count=2,
             workers=10
            )
f.wv.save_word2vec_format("my_word_vectors.vec")

In [114]:
 r = Reach.load("my_word_vectors.vec", unk_word="<UNK>")
 r.vectors[r.items["<UNK>"]] = r.vectors.max()

## Кандидаты в аспектные термины

In [26]:
def get_noun_phrases(texts):
    
    features_dict = Counter()
    a = []
    
    for item in tqdm(texts):
        text_nlp = nlp(item)
        a = [str(word) for word in text_nlp if word.tag_ in('NOUN', 'PROPN')] + a
    features_dict.update(a)
    return features_dict

def sent_token(text):
    """
    Токенизация по предложениям
    """
    text = str(text).lower()
    text = text.replace('...', '.').replace('.', '. ')
    return [nlp(x, disable=["ner"]) for x in nltk.sent_tokenize(text)]


def process_sent(text, remove_stopwords=False, min_token_length=3):
    """ 
    Apply text preprocessing steps 
    """
    if remove_stopwords:
        return " ".join([token.lemma_.lower() for token in text if not token.is_stop 
                         and len(token.text) >= min_token_length])
    else:
        return " ".join([token.lemma_.lower() for token in text if len(token.text) >= min_token_length])


In [18]:
df["sentences"] = df["review"].progress_map(lambda x: sent_token(x))
df = df.explode('sentences')
df.index = np.arange(0, len(df))
df['clean_sent'] = df['sentences'].progress_map(lambda x: process_sent(x))
features_dict = get_noun_phrases(list(df['clean_sent']))

progress-bar: 100%|██████████| 284/284 [00:36<00:00,  7.83it/s]
progress-bar: 100%|██████████| 3466/3466 [00:00<00:00, 51550.54it/s]
100%|██████████| 3466/3466 [00:40<00:00, 85.31it/s]


In [105]:
df_train["sentences"] = df_train["text"].progress_map(lambda x: sent_token(x))
df_train = df_train.explode('sentences')
df_train.index = np.arange(0, len(df_train))
df_train['clean_sent'] = df_train['sentences'].progress_map(lambda x: process_sent(x))
features_dict = get_noun_phrases(list(df_train['clean_sent']))

aspects = [[k] for k, v in features_dict.most_common(200)]
aspect_vecs = [x.mean(0) for x in r.transform(aspects, remove_oov=False)]
aspect_vecs = np.stack(aspect_vecs)

progress-bar: 100%|██████████| 5000/5000 [02:13<00:00, 37.36it/s] 
progress-bar: 100%|██████████| 15722/15722 [00:00<00:00, 70565.22it/s]
100%|██████████| 15722/15722 [02:53<00:00, 90.67it/s] 


In [94]:
features_dict = get_noun_phrases(list(df['sentences']))
aspects = [[k] for k, v in features_dict.most_common(200)]
aspect_vecs = [x.mean(0) for x in r.transform(aspects, remove_oov=False)]
aspect_vecs = np.stack(aspect_vecs)

100%|██████████| 3466/3466 [00:33<00:00, 103.07it/s]


In [113]:
features_dict.most_common(200)[0:30]

[('бургер', 1764),
 ('место', 1093),
 ('обслуживание', 700),
 ('кухня', 653),
 ('еда', 617),
 ('ресторан', 611),
 ('заведение', 577),
 ('цена', 549),
 ('блюдо', 473),
 ('персонал', 427),
 ('официант', 411),
 ('меню', 396),
 ('заказ', 374),
 ('раз', 362),
 ('атмосфера', 282),
 ('вкус', 265),
 ('музыка', 251),
 ('мясо', 245),
 ('очередь', 245),
 ('человек', 227),
 ('качество', 225),
 ('минута', 222),
 ('интерьер', 221),
 ('стол', 213),
 ('перчатка', 210),
 ('вечер', 189),
 ('время', 185),
 ('спасибо', 182),
 ('народ', 180),
 ('котлета', 179)]

## Attention

In [107]:
from sklearn.metrics.pairwise import rbf_kernel, euclidean_distances

def rbf_attention(vec, memory, gamma=0.1, **kwargs):
    """
    Single-head attention using RBF kernel.
    Parameters
    ----------
    vec : np.array
        an (N, D)-shaped array, representing the tokens of an instance.
    memory : np.array
        an (M, D)-shaped array, representing the memory items
    gamma : float
        the gamma of the RBF kernel.
    Returns
    -------
    attention : np.array
        A (1, N)-shaped array, representing a single-headed attention mechanism
    """
    z = rbf_kernel(vec, memory, gamma)
    s = z.sum()
    if s == 0:
        # If s happens to be 0, back off to uniform
        return np.ones((1, len(vec))) / len(vec)
    return (z.sum(1) / s)[None, :]


def softmax(x, axis=1):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x, axis, keepdims=True))
    s = e_x.sum(axis=axis, keepdims=True)
    return e_x / s


def attention(vec, memory, **kwargs):
    """
    Standard multi-head attention mechanism.
    Parameters
    ----------
    vec : np.array
        an (N, D)-shaped array, representing the tokens of an instance.
    memory : np.array
        an (M, D)-shaped array, representing the memory items
    Returns
    -------
    attention : np.array
        A (M, N)-shaped array, representing the attention over all memories.
    """
    z = memory.dot(vec.T)
    return softmax(z)

def normalize(x):
    """Normalize a vector while controlling for zero vectors."""
    x = np.copy(x)
    if np.ndim(x) == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / np.linalg.norm(x)
    norm = np.linalg.norm(x, axis=-1)
    mask = norm > 0
    x[mask] /= norm[mask][:, None]
    return x

In [108]:
instances = [str(x).strip().split() for x in df['sentences']]
t = r.transform(instances, remove_oov=False)

In [116]:
label_vecs = normalize(r.vectorize(['еда', 'обслуживание', 'нет']))
assert all([x in r.items for x in ['еда', 'обслуживание']])

In [130]:
out = []
print(len(t))
for vec in t:
    att = rbf_attention(vec, aspect_vecs)
    print(att.shape)
    # Att = (n_heads, n_words)
    z = att.dot(vec)
    print(z.shape)
    # z = (n_heads, n_dim)
    x = normalize(z).dot(label_vecs.T)
    # x = (n_heads, n_labels)
    print(x, x.sum(0))
    out.append(x.sum(0))
    print(out)
    break

p = np.stack(out)
pred = p.argmax(1)

3466
(1, 8)
(1, 200)
[[-0.01693357  0.05593684 -0.00727988]] [-0.01693357  0.05593684 -0.00727988]
[array([-0.01693357,  0.05593684, -0.00727988])]


In [118]:
instances_a = [y for x in instances for y in x]

In [132]:
list(zip(pred, instances))

[(1, ['день', '8-го', 'марта', 'прошёл,', 'можно', 'и', 'итоги', 'подвести.'])]

In [None]:
p = list(np.stack(out).argmax(1))