Туториал по запуску Stanford NLP
https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024

In [17]:
from pycorenlp import StanfordCoreNLP
import pandas as pd
import re
from tqdm import tqdm
import spacy
import json

In [18]:
# Load the hotel reviews table; the file is tab-separated despite the .csv extension.
reviews = pd.read_csv('reviews.csv', sep='\t')

In [19]:
# Distinct values of the `id` column as a plain Python list (one entry per hotel).
ids = reviews['id'].unique().tolist()

Бьем тексты на предложения, потому что CoreNLP не обрабатывает слишком длинные тексты.

In [20]:
# spaCy English pipeline, used below to split review texts into sentences.
# NOTE(review): the name `nlp` is rebound to the CoreNLP client further down,
# so re-running the sentence-splitting cell after that point would use the wrong object.
nlp = spacy.load("en_core_web_sm")

In [23]:
# Map every hotel id to the flat list of sentence spans found in its reviews.
hotels = {}
for id_ in tqdm(ids):
    # All review texts that belong to the current hotel.
    hotel_reviews = reviews.loc[reviews['id'] == id_, 'reviews.text'].tolist()
    # Run each review through the pipeline and flatten the resulting sentences.
    hotels[id_] = [
        sentence
        for review in hotel_reviews
        for sentence in nlp(review).sents
    ]

100%|██████████████████████████████████████████████████████████████████████████████| 2197/2197 [03:51<00:00,  9.50it/s]


In [46]:
# Count (and print) the sentences that are a bare "More" stub or end with "...More".
counter = 0
for sentences in hotels.values():
    for sentence in sentences:
        sentence_text = str(sentence)
        if sentence_text == 'More' or sentence_text.endswith('...More'):
            print(sentence)
            counter += 1

More
We...More
More
Hotel also has...More
More
More
More
We look forward to your...More
More
We are happy to hear you were provided with friendly and helpful service and appreciate the acknowledgement of...More
More
More
More
More
and...More
More
More
More
More
More
More
More
More
More
when I...More
More
Joy, Jillian and I are always pleased to assist you in any way and look forward to seeing you again in the...More
More
More
More
And even though...More
More
More
More
Thank you for your...More
More
More
All...More
More
As you were able to see, we are located in the ideal location for those families who...More
Eden Roc...More
I am happy that you enjoyed your...More
More
More
More
We hope you were able to take advantage of our...More
More
More
More
...More
More
More
More
More
More
More
The front lobby is not quite as exquisite as The Four Seasons, so if that is...More
More
More
More
More
More
More
More
It...More
More
More
Skipping on breakfasts, however,...More
More
After this trouble, I

More
More
More
More
More
More
We...More
More
It will be my honor to pass on your kind words to...More
More
More
More
More
More
More
More
More
More
More
More
More
I am not affiliated as a shareholder, supplier, or otherwise with any of...More
More
More
This man in a grey suit practically took the keys out of our hand and was...More
It...More
More
More
More
More
I sincerely apologize that we did not live up to your...More
More
We were also treated with 4...More
More
More
More
The Ritz provides the coffee pods, but ask at the front desk for creamers if...More
For that first...More
More
More
...More
...More
More
We did stay in a premier room, which...More
More
More
More
More
More
More
On a side note for...More
More
More
More
arranged by the...More
More
When I confirmed with the Indian that it was indeed slow, I...More
More
More
More
There is no service in the conference area but there...More
More
From the valet parking attendants...More
More
More
More
More
...More
More
More
More
More
More


In [47]:
# Show how many "More" / "...More" sentences the scan above found.
counter

884

Эти предложения в основном обрезаны; судя по просмотру, особо важной информации в них нет. Удаляем их.

In [50]:
# Drop the "More" / "...More" stubs and keep the remaining sentences as plain strings.
for key in hotels.keys():
    as_strings = (str(sentence) for sentence in hotels[key])
    hotels[key] = [
        s for s in as_strings
        if not (s.endswith('...More') or s == 'More')
    ]

In [52]:
#with open('hotel_sentences.json', 'w', encoding='utf-8') as f:
#    f.write(json.dumps(hotels))

Обработка CoreNLP, получение деревьев.

In [13]:
# Start the Stanford CoreNLP HTTP server from the current directory (all jars on the classpath).
# NOTE(review): this call keeps the cell busy until it is interrupted (see the ^C in the output),
# so the server presumably has to be started in a separate terminal instead - confirm.
# NOTE(review): the server is started with `-timeout 5000`, while the requests below ask for
# 'timeout': 10000 - verify which of the two limits actually applies.
!java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000

^C


In [14]:
# Client for the CoreNLP server on localhost:9000.
# NOTE(review): this rebinds `nlp`, which earlier in the notebook held the spaCy pipeline.
nlp = StanfordCoreNLP('http://localhost:9000')

In [15]:
# hotel id -> list of sentiment-tree strings; filled by the annotation loop below.
hotel_trees = {}

In [16]:
# Request settings shared by every CoreNLP call in this cell.
SENTIMENT_PROPERTIES = {
    'annotators': 'sentiment',
    'outputFormat': 'json',
    'timeout': 10000,
}


def _sentiment_trees(text):
    """Return the sentiment trees CoreNLP produced for `text`, or None on failure.

    A successful request yields a parsed JSON document (a dict) with a
    'sentences' list. Failures (e.g. 'CoreNLP request timed out. Your document
    may be too long.') come back as a plain string instead, so any non-dict
    reply is treated as a failure rather than matching one exact message and
    crashing on every other one.
    """
    result = nlp.annotate(text, properties=SENTIMENT_PROPERTIES)
    if not isinstance(result, dict):
        return None
    return [sent['sentimentTree'] for sent in result['sentences']]


for id_ in tqdm(list(hotels.keys())):
    trees = []
    for text in hotels[id_]:
        text_trees = _sentiment_trees(text)
        if text_trees is not None:
            trees.extend(text_trees)
            continue
        # The sentence could not be processed as a whole (typically it is too
        # long): retry it piece by piece, splitting on commas.
        for part in text.split(','):
            part_trees = _sentiment_trees(part)
            # If even the piece fails, skip it.
            if part_trees is not None:
                trees.extend(part_trees)
    hotel_trees[id_] = trees

  3%|██▍                                                                           | 69/2197 [15:13<7:49:22, 13.23s/it]


KeyboardInterrupt: 

In [97]:
#hotel_trees_json = json.dumps(hotel_trees)
#with open('hotel_trees.json', 'w', encoding='utf-8') as f:
#    f.write(hotel_trees_json)

Чистим деревья.

In [100]:
# Restore the sentiment trees saved by an earlier run instead of re-annotating everything.
with open('hotel_trees.json', mode='r', encoding='utf-8') as trees_file:
    hotel_trees_json = trees_file.read()
# Parse the raw JSON text back into the {hotel id: [tree, ...]} dictionary.
hotel_trees = json.loads(hotel_trees_json)

In [101]:
# Put every tree on a single line: remove CRLF line breaks, then collapse each
# run of whitespace (including the indentation that followed them) into one space.
# Raw strings keep the regex escapes from being read as (invalid) string escapes.
for key in hotel_trees.keys():
    hotel_trees[key] = [
        re.sub(r'\s+', ' ', re.sub(r'\r\n', '', tree))
        for tree in hotel_trees[key]
    ]

In [102]:
# Inspect the cleaned trees of a single hotel.
hotel_trees['AVwc252WIN2L1WUfpqLP']

['(ROOT|sentiment=3|prob=0,549 (@S|sentiment=3|prob=0,499 (NP|sentiment=2|prob=0,972 (NP|sentiment=2|prob=0,980 (PRP$|sentiment=2|prob=0,993 Our) (NN|sentiment=2|prob=0,995 experience)) (PP|sentiment=2|prob=1,000 (IN|sentiment=2|prob=0,997 at) (NP|sentiment=2|prob=0,528 (NNP|sentiment=2|prob=0,625 Rancho) (NNP|sentiment=2|prob=0,625 Valencia)))) (VP|sentiment=3|prob=0,607 (VBD|sentiment=2|prob=0,997 was) (ADJP|sentiment=3|prob=0,661 (RB|sentiment=3|prob=0,970 absolutely) (@ADJP|sentiment=3|prob=0,621 (JJ|sentiment=4|prob=0,950 perfect) (PP|sentiment=2|prob=0,798 (IN|sentiment=2|prob=1,000 from) (S|sentiment=2|prob=0,731 (VBG|sentiment=2|prob=0,997 beginning) (S|sentiment=2|prob=1,000 (TO|sentiment=2|prob=0,999 to) (VP|sentiment=2|prob=0,999 end)))))))) (.|sentiment=2|prob=0,625 !!!!))',
 '(ROOT|sentiment=3|prob=0,500 (@S|sentiment=3|prob=0,604 (NP|sentiment=2|prob=0,999 We) (VP|sentiment=3|prob=0,589 (@VP|sentiment=3|prob=0,734 (VBD|sentiment=2|prob=0,998 felt) (ADJP|sentiment=3|prob=0

Парсим деревья.

In [103]:
def _split_top_level(inner):
    """Return the balanced top-level "(...)" chunks of `inner`, in order.

    `inner` is expected to start with "(". A chunk can only be completed by a
    closing parenthesis, so the candidate substring is built only when the
    nesting depth returns to zero instead of being re-sliced at every character.
    """
    chunks = []
    depth = 0
    start = 0
    for pos, char in enumerate(inner):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                candidate = inner[start:pos + 1].strip(' ')
                if candidate.startswith('('):
                    chunks.append(candidate)
                    start = pos + 1
    return chunks


# Replace each hotel's list of trees with the list of all their sub-trees
# (the tree itself, then every nested bracketed group, breadth-first).
for key in tqdm(hotel_trees.keys()):
    groups_total = []
    for tree in hotel_trees[key]:
        groups = [tree]
        # `groups` grows while it is being iterated: every group appended here
        # is visited later and split into its own children.
        for group in groups:
            # Open the outer pair of brackets.
            inner = group[1:len(group) - 1]
            # Drop the leading label; keep the span from the first "(" to the last ")".
            nested = re.findall(r'\(.+\)', inner)
            if len(nested) == 0:
                # Leaf node: nothing nested inside.
                continue
            elif len(nested) > 1:
                print('Regrex error')
                break
            # Queue every stand-alone pair of brackets found at the top level.
            groups.extend(_split_top_level(nested[0]))
        groups_total.extend(groups)
    hotel_trees[key] = groups_total

100%|██████████████████████████████████████████████████████████████████████████████| 2197/2197 [05:59<00:00,  6.11it/s]


Наконец, выбираем элементы с выраженным сентиментом.

In [104]:
# Label of a group: "(<tag>|sentiment=<digit>|..." at the very start of the string.
_SENTIMENT_LABEL = re.compile(r'^\(.+?\|sentiment=(\d)\|')

# Split each hotel's groups into positive and negative ones by the sentiment
# class of the group's root node; neutral groups (class 2) are dropped.
for key in tqdm(hotel_trees.keys()):
    pos = []
    neg = []
    for group in hotel_trees[key]:
        label = _SENTIMENT_LABEL.match(group)
        if label is None:
            # Fail loudly (e.g. when this cell is re-run on already converted data)
            # instead of silently producing empty lists.
            raise ValueError('No sentiment label found in group: %r' % (group,))
        sentiment = int(label.group(1))
        if sentiment > 2:
            pos.append(group)
        elif sentiment < 2:
            # Classes 0 and 1 are both negative; the previous `== 1` check
            # dropped class 0 while the positive side kept both 3 and 4.
            neg.append(group)
    hotel_trees[key] = {'pos': pos, 'neg': neg}

100%|█████████████████████████████████████████████████████████████████████████████| 2197/2197 [00:10<00:00, 211.53it/s]


Чистим полученные группы от тегов, оставляем только текст.

In [105]:
# Either an opening "(<tag>|sentiment=N|prob=D,DDD " label or a closing bracket.
# The output shown in this notebook uses a comma as the decimal mark in `prob=`;
# presumably this depends on the server's locale, so a dot is accepted as well.
_TAG_OR_CLOSER = re.compile(r'\(.+?\|sentiment=\d\|prob=\d[.,]\d\d\d |\)')


def _strip_tags(group):
    """Return the plain text of `group`: labels and brackets removed, spaces collapsed."""
    text = _TAG_OR_CLOSER.sub(' ', group)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')


# Replace every positive / negative group with its plain text.
for key in tqdm(hotel_trees.keys()):
    group_dict = hotel_trees[key]
    group_dict['pos'] = [_strip_tags(group) for group in group_dict['pos']]
    group_dict['neg'] = [_strip_tags(group) for group in group_dict['neg']]
    hotel_trees[key] = group_dict

100%|█████████████████████████████████████████████████████████████████████████████| 2197/2197 [00:09<00:00, 222.55it/s]


In [110]:
# Persist the per-hotel positive / negative phrases as JSON.
with open('keywords.json', 'w', encoding='utf-8') as keywords_file:
    json.dump(hotel_trees, keywords_file)

In [78]:
#with open('keywords_NP.json', 'w', encoding='utf-8') as f:
#    f.write(json.dumps(hotel_trees))

In [111]:
# Pick the second review text (position 1) as a sample for a one-off annotation.
text = reviews['reviews.text'].iloc[1]

In [112]:
# Annotate the sample review with the sentiment annotator, asking for JSON output.
sample_properties = {
    'annotators': 'sentiment',
    'outputFormat': 'json',
    'timeout': 1000,
}
result = nlp.annotate(text, properties=sample_properties)

In [116]:
# Show the raw sentiment tree of the first sentence of the sample review.
print(result['sentences'][0]['sentimentTree'])

(ROOT|sentiment=4|prob=0,795
  (@NP|sentiment=4|prob=0,978 (JJ|sentiment=4|prob=0,933 Amazing) (NN|sentiment=2|prob=0,995 place))
  (.|sentiment=2|prob=1,000 .))
