In [1]:
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd

In [3]:
test_texts = pd.read_csv('dev_reviews.txt', delimiter='\t', names=['text_id','text'])

In [4]:
! pip install stanza



In [5]:
import stanza
stanza.download('ru')

HBox(children=(FloatProgress(value=0.0, description='Downloading https://raw.githubusercontent.com/stanfordnlp…

2021-12-27 04:14:16 INFO: Downloading default packages for language: ru (Russian)...





HBox(children=(FloatProgress(value=0.0, description='Downloading https://huggingface.co/stanfordnlp/stanza-ru/…




2021-12-27 04:15:29 INFO: Finished downloading models and saved to /Users/macbook/stanza_resources.


In [6]:
nlp = stanza.Pipeline('ru', processors='tokenize,lemma')

2021-12-27 04:15:35 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| lemma     | syntagrus |

2021-12-27 04:15:35 INFO: Use device: cpu
2021-12-27 04:15:35 INFO: Loading: tokenize
2021-12-27 04:15:36 INFO: Loading: lemma
2021-12-27 04:15:36 INFO: Done loading processors!


In [7]:
! pip install pymorphy2



In [8]:
from pymorphy2 import MorphAnalyzer
from pymorphy2.tokenizers import simple_word_tokenize

In [9]:
import re

In [10]:
# Лемматизация текстов
m = MorphAnalyzer()
def lemmatize(text):
    text = text.lower().replace('ё', 'е')
    lemmas = []
    for t in simple_word_tokenize(text):
        lemmas.append(
            m.parse(t)[0].normal_form.replace('ё', 'е')
        )
    return ' '.join(lemmas)

In [11]:
from collections import defaultdict, Counter

In [12]:
test_texts['text_lemmatized'] = [lemmatize(m) for m in test_texts['text']]

In [13]:
test_texts

Unnamed: 0,text_id,text,text_lemmatized
0,13823,"Зашли в""аппетит"" случайно.Не смотря на то,что ...","заслать в "" аппетит "" случайно . не смотреть н..."
1,1427,Здравствуйте!Посетили ваше заведение вчера пер...,здравствовать ! посетить ваш заведение вчера о...
2,16714,"Были в пятницу (19.03.10), заказывали столик д...","быть в пятница ( 19 . 03 . 10 ) , заказывать с..."
3,797,"Были в ресторане 2 раза. Один раз днем, все по...","быть в ресторан 2 раз . один раз днем , весь п..."
4,34710,Удивляюсь отзывам про хорошее обслуживание. Бы...,удивляться отзыв про хороший обслуживание . бы...
...,...,...,...
66,9216,Вы брали этот ресторан так как он близко от до...,вы брать этот ресторан так как он близко от до...
67,8996,"Были с друзьями в пабе Метрополь, всё очень по...","быть с друг в паб метрополь , весь очень понра..."
68,38299,"Случайно увидели акцию на сайте купонов, решил...","случайно увидеть акция на сайт купон , решить ..."
69,37819,Очень долго выбирали ресторан на Новогодний ка...,очень долго выбирать ресторан на новогодний ка...


In [14]:
import nltk
nltk.download('stopwords')
from string import punctuation
from nltk.corpus import stopwords
stops = stopwords.words('russian') + ['это', 'все', 'еще']


[nltk_data] Downloading package stopwords to /Library/Frameworks/Pytho
[nltk_data]     n.framework/Versions/3.7/lib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

c_vec = TfidfVectorizer(max_features=50000, stop_words=stops, ngram_range=(1, 5))

ngrams = c_vec.fit_transform(test_texts['text_lemmatized'])

vocab = c_vec.vocabulary_

count_values = ngrams.toarray().sum(axis=0)
ngrams = []
for ng_count, ng_text in sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True):
  ngrams.append(ng_text)

In [16]:
patterns = [['NOUN'], ['ADJF', 'NOUN'], ['NOUN', 'ADJF']]

In [17]:
from tqdm.notebook import tqdm

In [18]:
def check_pattern(key_words_list, patterns):
    filtered_keywords = []
    for item in tqdm(key_words_list):
        check_pattern = []
        if ' ' in item:
            i = item.split(' ')
        else:
            i = [item]
        for word in i:
            m = MorphAnalyzer()
            p = m.parse(word)[0]
            pos = p.tag.POS
            check_pattern.append(pos)
        for pattern in patterns:
            if check_pattern == pattern:
                filtered_keywords.append(item)
    return filtered_keywords

In [19]:
filtered_ngrams = check_pattern(ngrams[:5000], patterns)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [20]:
!pip install gensim



In [21]:
from gensim.models.keyedvectors import KeyedVectors

In [22]:
!wget https://rusvectores.org/static/models/rusvectores4/ruwikiruscorpora/ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz

/bin/bash: wget: command not found


In [None]:
!gzip -d ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz

In [23]:
wv = KeyedVectors.load_word2vec_format('ruwikiruscorpora_upos_skipgram_300_2_2018.vec')

In [24]:
classes = ['Whole', 'Food', 'Interior', 'Service', 'Price']
base_words = ['ресторан_NOUN', 'еда_NOUN', 'интерьер_NOUN', 'обслуживание_NOUN', 'цена_NOUN']
base_vectors = []
for c, word in zip(classes, base_words):
    base_vectors.append(wv[word])

In [25]:
import numpy as np
def get_most_similar(word):
    sim = wv.cosine_similarities(wv[word], base_vectors)
    clas = classes[np.argmax(sim)]
    if sim[np.argmax(sim)] >= 0.3:
      return clas
    return 'Not a category'

In [26]:
best_mention = {}
for k in filtered_ngrams:
  key = k + '_NOUN'
  try:
    most_similar = get_most_similar(key)
    if most_similar != 'Not a category':
        best_mention[tuple([k])] = most_similar
  except KeyError:
    pass

In [27]:
def label_texts(text, mentions, max_len=5): #sentiments
    tokenized = [word for sent in nlp(text).sentences for word in sent.words]
    text_end = len(tokenized)
    for i, token in enumerate(tokenized):
        for l in reversed(range(max_len)):
            if i + l > text_end:
                continue
            span = tokenized[i:i + l]
            key = tuple([t.lemma for t in span])
            if key in mentions:
                start, end = span[0].start_char, span[-1].end_char
                yield mentions[key], text[start:end], start, end #sentiments[key]
                break

In [28]:
with open('aspects.txt', 'w') as f:
    for text, idx in tqdm(zip(test_texts['text'], test_texts['text_id'])):
        for asp in label_texts(text, best_mention):
            print(idx, *asp, sep="\t", file=f)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


