In [4]:
import spacy
import json
from spacy import displacy
import pandas as pd
from tqdm import tqdm
from nltk import word_tokenize

spaCy tutorial (linguistic features): https://spacy.io/usage/linguistic-features

In [5]:
# Load the review table from a tab-separated file. Later cells read its
# 'id' and 'reviews.text' columns.
reviews = pd.read_csv('reviews.csv', sep='\t')

In [6]:
# Map every hotel id to the list of its review texts.
#
# A single groupby pass replaces the previous per-id boolean filter, which
# rescanned the whole frame once for each unique id.
# sort=False keeps the ids in order of first appearance (the same order
# unique() produced), and rows keep their original order within each group.
# NOTE: rows whose 'id' is missing are skipped by groupby; the previous loop
# stored an empty list under a NaN key for them.
hotels = {
    id_: texts.tolist()
    for id_, texts in reviews.groupby('id', sort=False)['reviews.text']
}

In [7]:
# Load the `en_core_web_sm` spaCy pipeline; the next cell calls it on the
# review text and reads `noun_chunks` from the resulting document.
nlp = spacy.load("en_core_web_sm")

In [8]:
# For every hotel, merge all of its reviews into one space-separated string,
# run it through the spaCy pipeline and keep the noun chunks as plain strings.
noun_groups = {}
for id_ in tqdm(hotels.keys()):
    merged_reviews = ' '.join(hotels[id_])
    doc = nlp(merged_reviews)
    noun_groups[id_] = [str(chunk) for chunk in doc.noun_chunks]

100%|██████████████████████████████████████████████████████████████████████████████| 2197/2197 [02:58<00:00, 12.31it/s]


In [12]:
#with open('noun_groups_spacy.json', 'w', encoding='utf-8') as f:
#    f.write(json.dumps(noun_groups))

In [9]:
# Parse keywords.json; the next cell reads a 'pos' and a 'neg' entry for
# every key of the resulting mapping.
with open('keywords.json', 'r', encoding='utf-8') as f:
    keywords = json.load(f)

In [22]:
# For every key in `keywords`, keep only those noun chunks that also appear in
# its 'pos' / 'neg' keyword collections. A chunk that occurs several times in
# `noun_groups[key]` is kept once per occurrence.
clean_keywords = {}
for key in tqdm(keywords.keys()):
    entry = keywords[key]
    positive, negative = entry['pos'], entry['neg']
    chunks = noun_groups[key]
    clean_keywords[key] = {
        'pos': [chunk for chunk in chunks if chunk in positive],
        'neg': [chunk for chunk in chunks if chunk in negative],
    }

100%|█████████████████████████████████████████████████████████████████████████████| 2197/2197 [00:14<00:00, 156.63it/s]


In [23]:
stop_words = ['a', 'an', 'the', 'this', 'that', 'our', 'your', 'my']
black_list = {'review', 'feedback'}


def _keep_phrase(phrase):
    """Decide whether a keyword phrase survives the filter.

    The phrase is lower-cased and tokenized, then one occurrence of each
    entry of ``stop_words`` is removed from the tokens. The phrase is
    rejected when any remaining token is in ``black_list`` or when exactly
    one token remains.

    Args:
        phrase: keyword phrase (str).

    Returns:
        True if the phrase should be kept, False otherwise.
    """
    tokens = word_tokenize(phrase.lower())
    for stop_word in stop_words:
        if stop_word in tokens:
            tokens.remove(stop_word)
    if not black_list.isdisjoint(tokens):
        return False
    # NOTE(review): a phrase made up only of stop words leaves zero tokens and
    # is therefore kept, exactly as before -- confirm this is intended.
    return len(tokens) != 1


# Apply the same filter to both polarities. Previously the 'neg' loop never
# tokenized its words and tested the stale `length` left over from the last
# 'pos' word, so negative keywords were kept or dropped wholesale.
for key in tqdm(clean_keywords.keys()):
    clean_keywords[key] = {
        polarity: [
            phrase
            for phrase in clean_keywords[key][polarity]
            if _keep_phrase(phrase)
        ]
        for polarity in ('pos', 'neg')
    }

100%|█████████████████████████████████████████████████████████████████████████████| 2197/2197 [00:02<00:00, 811.63it/s]


In [25]:
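# NOTE: the next cell reads 'filtered_keywords.json'. Uncomment and run this
# cell to (re)generate that file from the current `clean_keywords`.
#with open('filtered_keywords.json', 'w', encoding='utf-8') as f:
#    f.write(json.dumps(clean_keywords))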

In [26]:
# Load the two keyword dumps that are merged in the next cell.
with open('filtered_keywords.json', 'r', encoding='utf-8') as np_file:
    np_keywords = json.load(np_file)
with open('non-NP_extracted_keywords.json', 'r', encoding='utf-8') as non_np_file:
    non_np_keywords = json.load(non_np_file)

In [27]:
# Merge both keyword sources: for every key of `np_keywords`, concatenate its
# 'pos' and 'neg' entries with those stored under the same key in
# `non_np_keywords` (noun-phrase entries first).
all_keywords = {}
for key in tqdm(np_keywords.keys()):
    np_entry = np_keywords[key]
    non_np_entry = non_np_keywords[key]
    all_keywords[key] = {
        polarity: np_entry[polarity] + non_np_entry[polarity]
        for polarity in ('pos', 'neg')
    }

100%|██████████████████████████████████████████████████████████████████████████| 2197/2197 [00:00<00:00, 439410.90it/s]


In [29]:
#with open('all_keywords.json', 'w', encoding='utf-8') as f:
#    f.write(json.dumps(all_keywords))