In [1]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
import os
import ast
import spacy
from tqdm import tqdm
import time
# pd.set_option('display.max_colwidth', None)

# Read ABSA-processed Reviews

In [2]:
# Load the ABSA-processed reviews, normalise the column names and drop neutral rows.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = (
    pd.read_pickle("../../data/space/reviews_absa_processed.pkl")
    # Strip the 'prompt_' prefix and pluralise the aspect / sentiment column names.
    .rename(columns=lambda name: name.replace('prompt_', '')
                                     .replace('aspect', 'aspects')
                                     .replace('sentiment', 'sentiments'))
    # Keep only extractions whose sentiment is not 'neutral'.
    .loc[lambda frame: frame['sentiments'] != 'neutral']
    .rename(columns={'entity_id': 'business_id', 'entity_name': 'business_name'})
)
# Shape after de-duplicating on (category, review, sentence); displayed only, `df` is unchanged.
df.drop_duplicates(subset=['category', 'review_id', 'sentences']).shape

(5837, 9)

In [3]:
# Display the filtered frame (pandas truncates long frames in the rendered output).
df

Unnamed: 0,review_id,sentences,rating,business_id,business_name,category,aspects,sentiments,aspects_lemm
0,UR70384224,"- Downstairs bathroom is super clean, with mod...",5,1510471,UNA Hotel Roma,Hotels,bathroom,positive,bathroom
1,UR70384224,"- Downstairs bathroom is super clean, with mod...",5,1510471,UNA Hotel Roma,Hotels,room,positive,room
4,UR70384224,- Bed was comfortable - Bathroom was modern an...,5,1510471,UNA Hotel Roma,Hotels,bed,positive,bed
5,UR70384224,- Bed was comfortable - Bathroom was modern an...,5,1510471,UNA Hotel Roma,Hotels,bathroom,positive,bathroom
10,UR70384224,"- When getting off Termini, from the Airport, ...",5,1510471,UNA Hotel Roma,Hotels,tram,positive,tram
...,...,...,...,...,...,...,...,...,...
11307,UR82736454,The best part of the hotel is the 7th floor ro...,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,hotel,positive,hotel
11309,UR82736454,Grab a corner couch at sunset and enjoy the bo...,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,sunset,positive,sunset
11311,UR82736454,Grab a corner couch at sunset and enjoy the bo...,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,cocktail,positive,cocktail
11312,UR82736454,"The food was not very good, so sip your drinks...",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,food,negative,food


# Preprocessing

In [4]:
# Aggregation spec used when collapsing rows into one row per
# (review_id, business_id, sentences) group:
#   * every remaining scalar column keeps its first value;
#   * the aspect / sentiment columns are gathered into lists.
# The exclusion list names the list columns as they are actually called in `df`
# ('aspects', 'sentiments', 'aspects_lemm'), so they no longer receive a
# throw-away "first value" aggregator that only `update()` would overwrite.
col_agg = {col: lambda x: x.iloc[0] for col in df.columns if col not in ['review_id', 'user_id', 'business_id',
                                                                         'text', 'sentences',
                                                                         'aspects', 'sentiments', 'aspects_lemm']}
sent_list_agg = {col: lambda x: x.tolist() for col in df.columns if col in ['aspects', 'sentiments', 'aspects_lemm']}
col_agg.update(sent_list_agg)

In [5]:
# Aggregating: collapse the rows of each (review, business, sentence) group into a
# single row using the per-column functions in `col_agg`.
df = (
    df.groupby(['review_id', 'business_id', 'sentences'], sort=False, as_index=False)
      .agg(col_agg)
      .reset_index(drop=True)
)

In [6]:
# Indexing: restart the row index inside every (review, business) group, then
# flatten the resulting index into columns. `level_1` is used below as the
# position of the row within its review (presumably the sentence position --
# TODO confirm against the upstream sentence order).
df = (
    df.groupby(['review_id', 'business_id'], sort=False, as_index=False)
      .apply(lambda grp: grp.reset_index(drop=True))
      .reset_index()
)
# Row identifier of the form "<review_id>######<level_1>".
df['id'] = df['review_id'].astype(str).str.cat(df['level_1'].astype(str), sep="######")

In [7]:
# NOTE: this is an alias, not a copy -- columns added to `df_scored` afterwards
# (e.g. `num_of_aspects`) also appear on `df`, which later cells read via `df.columns`.
df_scored = df

In [8]:
# Number of entries in each row's `aspects_lemm` list.
df_scored['num_of_aspects'] = df_scored['aspects_lemm'].map(len)

# Aspect Sentiment Clustering

In [9]:
from functools import lru_cache

import spacy
nlp = spacy.load('en_core_web_lg')


@lru_cache(maxsize=100_000)
def _parse_text(text):
    """Run the spaCy pipeline on `text` once and reuse the resulting Doc.

    The greedy clustering compares the same aspect terms against each other
    many times; memoising the parse avoids re-running the pipeline for every
    comparison. Each worker process keeps its own cache.
    """
    return nlp(text)


def cal_spacy_similarity(text1, text2):
    """Return the spaCy `Doc.similarity` score between two texts.

    Args:
        text1: first text (a string accepted by `nlp`).
        text2: second text.
    Returns:
        The value of `nlp(text1).similarity(nlp(text2))`.
    """
    doc1 = _parse_text(text1)
    doc2 = _parse_text(text2)
    return doc1.similarity(doc2)

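Each aspect is merged into the existing cluster with the highest average similarity score, provided that score reaches the merging threshold; otherwise it starts a new cluster.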

In [10]:
def deduplicate(inputs):
    """Greedily cluster the aspect terms of one business by similarity.

    Terms are visited in the order given. Each term joins the existing cluster
    with the highest average similarity if that score is at least the
    module-level `threshold`; otherwise it opens a new cluster.

    Args:
        inputs: a pair ``(buss_aspects, sent_buss_df)`` where ``buss_aspects``
            is the ordered list of aspect terms and ``sent_buss_df`` is a
            DataFrame with an ``aspects_lemm`` column.
    Returns:
        ``(clusters, clustered_df)``: the list of clusters (each a list of
        terms) and ``sent_buss_df`` merged with a ``cluster_id`` column on
        ``aspects_lemm``.
    """
    aspect_terms = inputs[0]
    sent_buss_df = inputs[1]

    clusters = []
    for term in aspect_terms:
        # Score the term against every cluster built so far.
        cluster_scores = [
            average_similarity_to_cluster(term, members, sent_buss_df)
            for members in clusters
        ]
        ranking = np.argsort(cluster_scores)[::-1]

        # Join the best-scoring cluster when it is similar enough ...
        if len(ranking) > 0 and cluster_scores[ranking[0]] >= threshold:
            clusters[ranking[0]].append(term)
            continue

        # ... otherwise the term becomes the seed of a new cluster.
        clusters.append([term])

    # One row per (cluster_id, aspect term); cluster_id is the cluster's position.
    cluster_table = pd.DataFrame()
    cluster_table['aspects_lemm'] = clusters
    cluster_table = cluster_table.reset_index()
    cluster_table = cluster_table.explode(['aspects_lemm'])
    cluster_table = cluster_table.rename(columns={'index': 'cluster_id'})

    return clusters, sent_buss_df.merge(cluster_table, on=['aspects_lemm'])

In [11]:
import statistics
def average_similarity_to_cluster(kp, kps_other, sent_buss_df):
    """Return the mean similarity between `kp` and every member of `kps_other`.

    Args:
        kp: the term being scored.
        kps_other: the terms already in the cluster.
        sent_buss_df: forwarded unchanged to `calculate_similarity`.
    Returns:
        `statistics.mean` of the pairwise similarity scores.
    """
    pairwise_scores = [
        calculate_similarity(kp, member, sent_buss_df)
        for member in kps_other
    ]
    return statistics.mean(pairwise_scores)

In [12]:
def calculate_similarity(text1, text2, sent_buss_df):
    """Return the similarity score between two texts.

    Args:
        text1: first text.
        text2: second text.
        sent_buss_df: accepted for interface compatibility; not used here.
    Returns:
        The score produced by `cal_spacy_similarity(text1, text2)`. Callers
        compare it against a threshold to decide whether the two texts
        belong together.
    """
    return cal_spacy_similarity(text1, text2)

Apply the clustering to each business and sentiment polarity

In [13]:
# The merging threshold: `deduplicate` adds an aspect to its best-scoring existing
# cluster only when the average similarity to that cluster is >= this value;
# otherwise the aspect starts a new cluster.
threshold = 0.55

In [14]:
# Unpack the parallel list columns so each row again holds a single
# (aspect, sentiment, lemmatised aspect) triple.
sent_df = df_scored.explode(
    column=['aspects', 'sentiments', 'aspects_lemm'],
)

In [15]:
# Per-cluster aggregation spec: business-level columns keep their first value,
# every other column of `df` is collected into a list. The two dicts are built
# separately so the business-level columns come first in the result.
col_agg = {
    name: (lambda values: values.iloc[0])
    for name in df.columns
    if name in ('business_name', 'business_id', 'categories', 'categories_list', 'category')
}
sent_list_agg = {
    name: (lambda values: values.tolist())
    for name in df.columns
    if name not in ('cluster_id', 'business_name', 'business_id', 'categories', 'categories_list', 'category')
}
col_agg.update(sent_list_agg)

In [16]:
from tqdm.contrib.concurrent import process_map  # or thread_map
# Passed as `max_workers` to `process_map` when clustering the per-business inputs.
num_workers = 5

In [17]:
# Build one clustering job per (business, sentiment): the business's aspect terms
# ordered by frequency, plus the rows they came from.
# NOTE(review): the row selection below filters on business and sentiment only,
# not on `category` -- confirm that is intended if a business can span categories.
inputs = []
for category in sorted(df_scored['category'].unique()):
    for business_id in sorted(df_scored[df_scored['category'] == category]['business_id'].unique()):
        for sentiment in ['positive', 'negative']:
            sent_buss_df = sent_df[(sent_df['business_id'] == business_id) & (sent_df['sentiments'] == sentiment)]

            # Keep only rows whose aspect term literally occurs in the sentence
            # (case-insensitive). An explicit boolean array avoids the slow
            # row-wise `apply` and stays a well-defined mask when the selection
            # is empty.
            aspect_in_sentence = np.array(
                [aspect.lower() in sentence.lower()
                 for aspect, sentence in zip(sent_buss_df['aspects'], sent_buss_df['sentences'])],
                dtype=bool,
            )
            sent_buss_df = sent_buss_df.loc[aspect_in_sentence]

            # Sort aspects by their occurrences in the particular business
            sorted_aspects_index = sent_buss_df['aspects_lemm'].value_counts()
            buss_aspects = sorted_aspects_index.index.tolist()

            inputs.append((buss_aspects, sent_buss_df))

In [18]:
# Run `deduplicate` over every (business, sentiment) job with up to `num_workers`
# workers and report the wall-clock time in seconds.
start_time = time.time()
clusters_info = process_map(deduplicate, inputs, max_workers=num_workers)
print("TIME ELAPSED", time.time() - start_time)

  0%|          | 0/20 [00:00<?, ?it/s]

  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)
  return doc1.similarity(doc2)


TIME ELAPSED 254.19433498382568


In [19]:
# Turn every job's clustered rows into one row per cluster.
dfs = []

for business_sentiment_cluster_info in clusters_info:
    # Each result is (clusters, clustered_df); only the DataFrame is needed here.
    sent_buss_clustered_df = business_sentiment_cluster_info[1]

    # Number of sentences in a cluster must be > the number of distinct aspects
    sent_buss_clustered_df = sent_buss_clustered_df.groupby(['cluster_id']).filter(lambda grp: len(grp) > len(grp['aspects_lemm'].unique()))

    # Every cluster may have been filtered out; skip the job, otherwise the
    # `.iloc[0]` lookup below raises IndexError on the empty frame.
    if sent_buss_clustered_df.empty:
        continue

    # Get the final clustered df of comments by aspects
    aspect_clusters_df = sent_buss_clustered_df.groupby(['cluster_id']).agg(col_agg)

    # Each job was built from rows of a single sentiment, so the first sentiment
    # of the first cluster labels every cluster of the job.
    aspect_clusters_df['cluster_sentiment'] = aspect_clusters_df['sentiments'].iloc[0][0]

    dfs.append(aspect_clusters_df)

In [20]:
# Stack the per-job cluster frames and persist them. The `cluster_id` index
# restarts for every job, so it is not unique across the combined frame.
summ_df = pd.concat(dfs)
summ_df.to_pickle("../../data/space/aspect_sentiment_clusters.pkl")

In [21]:
# Display the combined cluster frame (one row per cluster, list-valued columns).
summ_df

Unnamed: 0_level_0,business_id,business_name,category,level_0,level_1,review_id,sentences,rating,aspects,sentiments,aspects_lemm,id,num_of_aspects,cluster_sentiment
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,156564,Wedgewood Hotel & Spa,Hotels,"[94, 95, 96, 101, 101, 106, 107, 108, 111, 116...","[6, 0, 0, 0, 5, 0, 0, 4, 2, 0, 4, 1, 4, 0, 8, ...","[UR123881946, UR87832081, UR8232951, UR1155085...","[Can't say enough good about this hotel., The ...","[5, 5, 5, 4, 4, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, ...","[hotel, hotel, hotel, hotel, hotel, hotel, hot...","[positive, positive, positive, positive, posit...","[hotel, hotel, hotel, hotel, hotel, hotel, hot...","[UR123881946######6, UR87832081######0, UR8232...","[1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",positive
1,156564,Wedgewood Hotel & Spa,Hotels,"[93, 94, 94, 98, 100, 100, 101, 101, 104, 105,...","[6, 1, 5, 1, 9, 11, 7, 8, 1, 1, 8, 6, 4, 1, 2,...","[UR2292083, UR123881946, UR123881946, UR755517...","[One great aspect which stood out in my mind, ...","[2, 5, 5, 5, 5, 5, 4, 4, 5, 5, 5, 4, 5, 4, 5, ...","[housekeeping supervisor, staff, staff, staff,...","[positive, positive, positive, positive, posit...","[housekeeping supervisor, staff, staff, staff,...","[UR2292083######6, UR123881946######1, UR12388...","[1, 1, 2, 1, 1, 1, 1, 1, 4, 5, 1, 1, 1, 2, 1, ...",positive
2,156564,Wedgewood Hotel & Spa,Hotels,"[94, 100, 127, 129, 133, 134, 139, 143, 145, 1...","[4, 3, 1, 4, 8, 3, 3, 4, 4, 1, 2, 2, 1, 3, 3, ...","[UR123881946, UR29267072, UR3424000, UR5317158...",[The complimentary chocolates and cookies were...,"[5, 5, 4, 5, 5, 5, 5, 4, 3, 5, 5, 5, 5, 5, 5, ...","[bathroom, bathroom, bathroom, bathroom, bathr...","[positive, positive, positive, positive, posit...","[bathroom, bathroom, bathroom, bathroom, bathr...","[UR123881946######4, UR29267072######3, UR3424...","[2, 1, 4, 2, 1, 1, 1, 2, 4, 5, 1, 1, 2, 6, 1, ...",positive
3,156564,Wedgewood Hotel & Spa,Hotels,"[93, 95, 101, 102, 105, 105, 106, 107, 109, 11...","[8, 6, 9, 2, 1, 3, 2, 2, 3, 2, 3, 4, 0, 1, 5, ...","[UR2292083, UR87832081, UR115508525, UR1156430...",[We spent half the money we did in Bellevue an...,"[2, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 3, 4, 4, 4, ...","[service, service, service, service, service, ...","[positive, positive, positive, positive, posit...","[service, service, service, service, service, ...","[UR2292083######8, UR87832081######6, UR115508...","[3, 1, 1, 1, 5, 2, 2, 1, 3, 5, 1, 1, 2, 1, 2, ...",positive
4,156564,Wedgewood Hotel & Spa,Hotels,"[93, 95, 98, 101, 116, 129, 133, 139, 142, 148...","[8, 9, 0, 10, 4, 9, 13, 0, 13, 0, 6, 5, 0, 0, ...","[UR2292083, UR87832081, UR7555175, UR115508525...",[We spent half the money we did in Bellevue an...,"[2, 5, 5, 4, 4, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, ...","[comfortable night stay, stay, stay, stay, sta...","[positive, positive, positive, positive, posit...","[comfortable night stay, stay, stay, stay, sta...","[UR2292083######8, UR87832081######9, UR755517...","[3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, ...",positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,"[854, 864, 875, 895]","[0, 3, 5, 6]","[UR113200345, UR128442797, UR59371749, UR11587...",[This is a great place for a young couple or s...,"[4, 3, 5, 4]","[price, price, parking prices, rack rate]","[negative, negative, negative, negative]","[price, price, parking price, rack rate]","[UR113200345######0, UR128442797######3, UR593...","[3, 1, 2, 2]",negative
6,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,"[876, 907]","[4, 5]","[UR87533553, UR39331673]",[Plus we were right under their rooftop bar an...,"[3, 4]","[rooftop bar, rooftop bar]","[negative, negative]","[rooftop bar, rooftop bar]","[UR87533553######4, UR39331673######5]","[2, 2]",negative
7,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,"[907, 907]","[5, 6]","[UR39331673, UR39331673]",[The down side is there is a rooftop bar which...,"[4, 4]","[music, music]","[negative, negative]","[music, music]","[UR39331673######5, UR39331673######6]","[2, 1]",negative
8,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotels,"[885, 898, 915, 915, 915]","[3, 3, 6, 2, 5]","[UR38977558, UR60492805, UR121717923, UR121717...",[No guest information in rooms and no informat...,"[3, 4, 3, 3, 3]","[guest information, service, service, guest se...","[negative, negative, negative, negative, negat...","[guest information, service, service, guest se...","[UR38977558######3, UR60492805######3, UR12171...","[1, 1, 1, 1, 2]",negative
