# Import Libraries

In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from jinja2 import Template
from lxml import etree as ET
from os.path import join
import ast

In [3]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
pd.set_option('display.max_colwidth', None)

# Read Dataset

Read preprocessed comment-level Yelp dataset

In [4]:
df = pd.read_pickle("../data/yelp/snippext_yelp_sent_df_single_aspect_5_domains_top_50.pkl")

In [5]:
df['aspects'] = df['aspects'].apply(lambda x: [e.lower() for e in x])
df['opinions'] = df['opinions'].apply(lambda x: [e.lower() for e in x])
df['opinion_aspect_pairs'] = df['opinion_aspect_pairs'].apply(lambda x: [e.lower() for e in x])
df = df.reset_index()
df = df.drop(columns=['review_content'])

In [6]:
df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments
0,487383,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],[well located],2.0,[location],[positive]
1,487383,2,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],[free wifi],4.0,[wait-time],[positive]
2,487383,3,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],[hefty charge for],5.0,[value-for-money],[negative]
3,487385,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],[enjoy place],0.0,[recommendation],[positive]
4,487385,1,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,It was renovated a few years ago and has a great location .,[great],[location],[great location],1.0,[location],[positive]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17563,1038117,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],[always perfect tomato soup],4.0,[food -> quality],[positive]
17564,1038117,4,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"My favorite location is the Boca Park just because of seating , but this one is suffice !",[favorite],[location],[favorite location],5.0,[recommendation],[positive]
17565,1038121,2,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Place felt dirty and disorganized .,[dirty],[place],[dirty place],4.0,[restaurant -> atmosphere],[negative]
17566,1038121,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],[deserted place],5.0,[restaurant -> atmosphere],[negative]


# ABSA Predictions Post-Processing

## Fix or ignore wrong sentiment 

Fix on wrong sentiment labels for some aspect with "not" in the opinion, might be impacted by the overall sentiment

In [7]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

2023-07-09 13:16:49.238803: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-07-09 13:16:50.775451: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-07-09 13:16:50.776117: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-09 13:16:50.776367: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3080 Ti computeCapability: 8.6
coreClock: 1.77GHz coreCount: 80 deviceMemorySize: 12.00GiB deviceMemoryBandwidth: 849.46GiB/s
2023-07-09 13:16:50.776430: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-07-09 13:16:50.776462: I tensorflow/stream_ex

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7fe269975e80>

In [8]:
mask = (df['opinions'].apply(lambda x: "not" in x[0].split()))
mask &= (df['opinions'].apply(lambda x: len(x[0].split()) <= 3))
mask &= (df['opinions'].str.len() == 1)

In [9]:
df[mask]

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments
117,487453,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,The food was not bad .,[not bad],[food],[not bad food],3.0,[food -> quality],[negative]
283,520927,6,Hotels,"[Hotels & Travel, Event Planning & Services, Hotels]","Hotels & Travel, Event Planning & Services, Hotels",Hyatt Regency Cleveland,_U7btRgwidF6mYsexgVClQ,It was not a positive experience in any respect .,[not a positive],[experience],[not a positive experience],11.0,[restaurant -> atmosphere],[negative]
350,520973,0,Hotels,"[Hotels & Travel, Event Planning & Services, Hotels]","Hotels & Travel, Event Planning & Services, Hotels",Hyatt Regency Cleveland,_U7btRgwidF6mYsexgVClQ,The rooms are not worth the price !,[not worth],[rooms],[not worth rooms],0.0,[value-for-money],[negative]
462,528813,0,Hotels,"[Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services]","Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services",Renaissance Phoenix Glendale Hotel & Spa,Wgj8dI74HETaldI3dIZ7vg,Not a clean hotel .,[not a clean],[hotel],[not a clean hotel],0.0,[restaurant -> atmosphere],[negative]
496,528829,2,Hotels,"[Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services]","Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services",Renaissance Phoenix Glendale Hotel & Spa,Wgj8dI74HETaldI3dIZ7vg,Each time I stayed there was a convention and the double I had asked for was not available,[not available],[double],[not available double],2.0,[food -> variety],[negative]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17120,1037703,3,Restaurants,"[Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars]","Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars",No Què No Mexican Grill,GETVAz5SCk8rFziexV54Aw,My boyfriend ordered the chicken burrito and was not as disappointed as I was but he also mentioned that it did not have a lot of flavor .,[not as disappointed],[chicken burrito],[not as disappointed chicken burrito],6.0,[food -> quality],[negative]
17198,1037785,0,Restaurants,"[Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars]","Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars",No Què No Mexican Grill,GETVAz5SCk8rFziexV54Aw,One thing I think people might complain about is that the servings are not huge .,[not huge],[servings],[not huge servings],2.0,[food-quantity],[negative]
17224,1037793,2,Restaurants,"[Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars]","Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars",Pacino's Italian,_myTPlWa8QRN5eavs-hNPg,"However , there are a couple of servers who are not the greatest ... slow , forgetful , and inattentive .",[not the greatest],[servers],[not the greatest servers],2.0,[staff],[negative]
17291,1037871,3,Restaurants,"[Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars]","Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars",Pacino's Italian,_myTPlWa8QRN5eavs-hNPg,I 'd rated another star but service not so great .,[not so great],[service],[not so great service],4.0,[staff],[negative]


In [10]:
def recompute_sentiment(row):
    doc = nlp(row['sentences'])
    sentiment = doc._.blob.polarity
    sentiment = round(sentiment,2)
    
    if sentiment > 0.2:
        row['sentiments'] = ['positive']
    else:
        row['sentiments'] = ['negative']
    
    return row

In [11]:
df[mask] = df[mask].apply(recompute_sentiment, axis=1)
df = df[~df['aspects'].apply(lambda x: x[0].strip() == "")]

In [12]:
df[mask]

  df[mask]


Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments
117,487453,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,The food was not bad .,[not bad],[food],[not bad food],3.0,[food -> quality],[positive]
283,520927,6,Hotels,"[Hotels & Travel, Event Planning & Services, Hotels]","Hotels & Travel, Event Planning & Services, Hotels",Hyatt Regency Cleveland,_U7btRgwidF6mYsexgVClQ,It was not a positive experience in any respect .,[not a positive],[experience],[not a positive experience],11.0,[restaurant -> atmosphere],[negative]
350,520973,0,Hotels,"[Hotels & Travel, Event Planning & Services, Hotels]","Hotels & Travel, Event Planning & Services, Hotels",Hyatt Regency Cleveland,_U7btRgwidF6mYsexgVClQ,The rooms are not worth the price !,[not worth],[rooms],[not worth rooms],0.0,[value-for-money],[negative]
462,528813,0,Hotels,"[Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services]","Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services",Renaissance Phoenix Glendale Hotel & Spa,Wgj8dI74HETaldI3dIZ7vg,Not a clean hotel .,[not a clean],[hotel],[not a clean hotel],0.0,[restaurant -> atmosphere],[negative]
496,528829,2,Hotels,"[Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services]","Venues & Event Spaces, Hotels & Travel, Hotels, Event Planning & Services",Renaissance Phoenix Glendale Hotel & Spa,Wgj8dI74HETaldI3dIZ7vg,Each time I stayed there was a convention and the double I had asked for was not available,[not available],[double],[not available double],2.0,[food -> variety],[negative]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17120,1037703,3,Restaurants,"[Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars]","Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars",No Què No Mexican Grill,GETVAz5SCk8rFziexV54Aw,My boyfriend ordered the chicken burrito and was not as disappointed as I was but he also mentioned that it did not have a lot of flavor .,[not as disappointed],[chicken burrito],[not as disappointed chicken burrito],6.0,[food -> quality],[negative]
17198,1037785,0,Restaurants,"[Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars]","Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars",No Què No Mexican Grill,GETVAz5SCk8rFziexV54Aw,One thing I think people might complain about is that the servings are not huge .,[not huge],[servings],[not huge servings],2.0,[food-quantity],[negative]
17224,1037793,2,Restaurants,"[Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars]","Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars",Pacino's Italian,_myTPlWa8QRN5eavs-hNPg,"However , there are a couple of servers who are not the greatest ... slow , forgetful , and inattentive .",[not the greatest],[servers],[not the greatest servers],2.0,[staff],[negative]
17291,1037871,3,Restaurants,"[Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars]","Nightlife, Restaurants, Italian, Pizza, Wine Bars, Bars",Pacino's Italian,_myTPlWa8QRN5eavs-hNPg,I 'd rated another star but service not so great .,[not so great],[service],[not so great service],4.0,[staff],[positive]


In [13]:
df = df[~mask]

  df = df[~mask]


In [14]:
df.head()

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments
0,487383,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],[well located],2.0,[location],[positive]
1,487383,2,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],[free wifi],4.0,[wait-time],[positive]
2,487383,3,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],[hefty charge for],5.0,[value-for-money],[negative]
3,487385,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],[enjoy place],0.0,[recommendation],[positive]
4,487385,1,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,It was renovated a few years ago and has a great location .,[great],[location],[great location],1.0,[location],[positive]


## Remove 'and', preposition or irrelevant terms in the aspect

In [15]:
df['opinion_aspect_pos'] = df['opinion_aspect_pairs'].apply(lambda x: [token.tag_ for token in nlp(x[0])])

In [16]:
df['aspect_pos'] = df['aspects'].apply(lambda x: [token.tag_ for token in nlp(x[0])])

In [17]:
df['opinion_aspect_pos'].apply(lambda x: x[-1]).value_counts()

NN      12703
NNS      3388
NNP       259
VB        214
IN        142
JJ        137
VBZ        72
VBN        67
VBG        65
VBD        65
VBP        60
RB         58
DT         17
RP         14
FW         11
CC          8
_SP         6
.           4
UH          3
NNPS        2
JJR         2
CD          2
MD          1
''          1
Name: opinion_aspect_pos, dtype: int64

In [18]:
df[df['opinion_aspect_pos'].apply(lambda x: x[-1] == 'CC')]

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos
1678,670590,3,Hotels,"[Hotels, Professional Services, Event Planning & Services, Caterers, Hotels & Travel]","Hotels, Professional Services, Event Planning & Services, Caterers, Hotels & Travel",Embassy Suites by Hilton Scottsdale Resort,Kz7q1Q_dfghuRM2S9IOvog,I enjoyed the Shrimp and Steak .,[enjoyed],[shrimp and],[enjoyed shrimp and],3.0,[food -> quality],[positive],"[VBN, NN, CC]","[VB, CC]"
6191,938730,1,Beauty & Spas,"[Beauty & Spas, Hair Removal, Waxing, Nail Salons]","Beauty & Spas, Hair Removal, Waxing, Nail Salons",Beautiful Nails & Spa,UcwWt3BmRUGgZ2-T55J9MA,Always friendly and clean .,[always],[friendly and],[always friendly and],1.0,[staff],[positive],"[RB, JJ, CC]","[JJ, CC]"
8757,972834,2,Beauty & Spas,"[Skin Care, Medical Spas, Hair Salons, Health & Medical, Hair Stylists, Day Spas, Beauty & Spas]","Skin Care, Medical Spas, Hair Salons, Health & Medical, Hair Stylists, Day Spas, Beauty & Spas",Advanced Aesthetics,wNRRF-OaXo_U2qZFHaM1rA,The wine and cheese was a bonus .,[bonus],[wine and],[bonus wine and],2.0,[drink -> quality],[positive],"[NN, NN, CC]","[NN, CC]"
9996,993775,5,Beauty & Spas,"[Day Spas, Massage, Beauty & Spas, Health & Medical, Massage Therapy, Reflexology]","Day Spas, Massage, Beauty & Spas, Health & Medical, Massage Therapy, Reflexology",Sam's Spa,qy6f94ehot4zom1xpcBQfg,They also provide free bottled water or soda .,[free],[bottled water or],[free bottled water or],6.0,[drink -> quality],[positive],"[JJ, JJ, NN, CC]","[JJ, NN, CC]"
12858,1031667,2,Restaurants,"[Barbeque, Restaurants, Bars, Nightlife, American (Traditional), Sports Bars]","Barbeque, Restaurants, Bars, Nightlife, American (Traditional), Sports Bars",Dillons KC BBQ,yqqXDVl7ZJaiDDbuCFXqMg,"The only thing is the Mac & Cheese could have used a bit more salt and pepper , but otherwise they were amazing .",[amazing],[mac &],[amazing mac &],2.0,[food -> quality],[negative],"[NNP, NNP, CC]","[NNP, CC]"
13551,1032270,4,Restaurants,"[Restaurants, Mexican]","Restaurants, Mexican",El Torito Cafe,y3RlbCN0UrDdLnjSF9NkXA,I do admire the free chips and dip though ...,[free],[chips and],[free chips and],5.0,[food -> healthiness],[positive],"[JJ, NNS, CC]","[NNS, CC]"
15157,1035240,0,Restaurants,"[Hot Dogs, Burgers, Sandwiches, Restaurants, Italian]","Hot Dogs, Burgers, Sandwiches, Restaurants, Italian",Luke's of Chicago's,-raUM9T-1dKPiB5_vRmFYA,Best beef and hot dog joint in town,[best],[beef and],[best beef and],1.0,[food -> quality],[positive],"[JJS, NN, CC]","[NN, CC]"
16481,1036991,3,Restaurants,"[Dim Sum, Chinese, Restaurants, Vegan, Vegetarian]","Dim Sum, Chinese, Restaurants, Vegan, Vegetarian",DumplingHaus,VfWX3UCKvUnVktdOuR8TwA,"I do however , recommend the Black Bean and Pork Noodles !",[recommend],[black bean and],[recommend black bean and],4.0,[recommendation],[positive],"[VB, JJ, NN, CC]","[JJ, NN, CC]"


**Preprocessing Starts Here**

- Remove comments with aspects ending with ['CC', '.'] POS
- Remove comments with aspects consisting with ['JJR'] POS
- Remove '-' in the last character of comment's aspects

In [19]:
ignore_last_pos = ['CC', '.']

In [20]:
def preprocess_aspects(row):
    tokens = row['aspects'][0].split(" ")
    opinion_aspect_pos = row['opinion_aspect_pos']
    
    tokens[-1] = tokens[-1].replace('-', '')
    tokens = tokens[:-1] if opinion_aspect_pos[-1] in ignore_last_pos else tokens
    
    row['aspects'] = [" ".join(tokens)]
    row['opinion_aspect_pairs'] = [row['opinions'][0] + " " + " ".join(tokens)]
    
    return row

In [21]:
df = df.apply(preprocess_aspects, axis=1)

In [22]:
df = df[~df['opinion_aspect_pos'].apply(lambda x: x[-1] == 'JJR')]

# KP Extraction

## Calculate number of words

In [23]:
df['num_of_token'] = df['sentences'].apply(lambda x: [token for token in nlp(x) if token.pos_ not in ['PUNCT', 'SPACE']])

In [24]:
df['num_of_token'] = df['num_of_token'].apply(lambda x: len(x))

In [25]:
df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos,num_of_token
0,487383,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],[well located],2.0,[location],[positive],"[RB, VBN]",[VBN],8
1,487383,2,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],[free wifi],4.0,[wait-time],[positive],"[JJ, NN]",[NN],4
2,487383,3,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],[hefty charge for],5.0,[value-for-money],[negative],"[JJ, NN, IN]","[NN, IN]",5
3,487385,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],[enjoy place],0.0,[recommendation],[positive],"[VB, NN]",[NN],13
4,487385,1,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,It was renovated a few years ago and has a great location .,[great],[location],[great location],1.0,[location],[positive],"[JJ, NN]",[NN],12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17563,1038117,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],[always perfect tomato soup],4.0,[food -> quality],[positive],"[RB, VB, NN, NN]","[NN, NN]",5
17564,1038117,4,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"My favorite location is the Boca Park just because of seating , but this one is suffice !",[favorite],[location],[favorite location],5.0,[recommendation],[positive],"[JJ, NN]",[NN],16
17565,1038121,2,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Place felt dirty and disorganized .,[dirty],[place],[dirty place],4.0,[restaurant -> atmosphere],[negative],"[JJ, NN]",[NN],5
17566,1038121,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],[deserted place],5.0,[restaurant -> atmosphere],[negative],"[JJ, NN]",[NN],10


In [26]:
dataset = Dataset.from_pandas(df[['index', 'level_1', 'sentences', 'num_of_token']])
dataset

Dataset({
    features: ['index', 'level_1', 'sentences', 'num_of_token', '__index_level_0__'],
    num_rows: 17299
})

## Argument Quality Ranking

In [27]:
root_dir = "../argmining-21-review-sentiment-keypoint-analysis/code/src-ipynb/"

### Load Model

In [28]:
import torch.nn as nn
# from transformers import AutoModelForSequenceClassification
from transformers import RobertaModel

class RoBERTaRegressor(nn.Module):
    
    def __init__(self, drop_rate=0.2, freeze_camembert=False):
        
        super(RoBERTaRegressor, self).__init__()
#         D_in, D_out = 768, 1
        D_in, D_out = 1024, 1
        
#         self.roberta = AutoModelForSequenceClassification.from_pretrained('./model/roberta-large-finetuned-yelp/checkpoint-134060/')
        self.roberta = RobertaModel.from_pretrained('./model/roberta-large-pretrained-yelp/checkpoint-134060/')
        self.regressor = nn.Sequential(
            nn.Linear(D_in, D_out),
            nn.Sigmoid()
        )
        
    def forward(self, input_ids, attention_masks):
        
        outputs = self.roberta(input_ids, attention_masks)
#         class_label_output = outputs[1]
        sequence_output = outputs['last_hidden_state']
        outputs = self.regressor(sequence_output[:,0,:].reshape(-1,1024))
        return outputs
# model = RoBERTaRegressor(drop_rate=0.9)
model = RoBERTaRegressor()

Some weights of the model checkpoint at ./model/roberta-large-pretrained-yelp/checkpoint-134060/ were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ./model/roberta-large-pretrained-yelp/checkpoint-134060/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream t

In [29]:
model.load_state_dict(torch.load('./model/roberta-large-finetuned-yelp-argument-quality-WA/model.pth'))

<All keys matched successfully>

In [30]:
import torch
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print("Using GPU.")
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
# device = torch.device("cpu")
model.to(device)

Using GPU.


RoBERTaRegressor(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm)

In [31]:
model

RoBERTaRegressor(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm)

### Tokenization

In [32]:
model_checkpoint = "roberta-large"

In [33]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

In [34]:
tokenizer.model_max_length

512

In [35]:
def tokenize_function(examples):
    return tokenizer(examples["sentences"], padding="max_length", truncation=True)

In [36]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

  0%|          | 0/18 [00:00<?, ?ba/s]

In [37]:
tokenized_dataset

Dataset({
    features: ['index', 'level_1', 'sentences', 'num_of_token', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 17299
})

### Inference

In [38]:
tokenized_dataset

Dataset({
    features: ['index', 'level_1', 'sentences', 'num_of_token', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 17299
})

In [39]:
remove_cols = tokenized_dataset.column_names
remove_cols = list(set(remove_cols) - set(['input_ids', 'attention_mask']))
remove_cols

['num_of_token', 'sentences', 'level_1', '__index_level_0__', 'index']

In [40]:
tokenized_dataset = tokenized_dataset.remove_columns(remove_cols)

In [41]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 17299
})

In [42]:
tokenized_dataset.set_format('torch')

In [43]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

eval_dataloader = DataLoader(tokenized_dataset, batch_size=16)
model.eval()
output = []
for batch in tqdm(eval_dataloader):
#     batch_inputs, batch_masks, _ = tuple(b.to(device) for b in batch)
    batch_inputs = batch['input_ids'].to(device)
    batch_masks = batch['attention_mask'].to(device)
    with torch.no_grad():
        output += model(batch_inputs, batch_masks).view(1,-1).tolist()[0]

  0%|          | 0/1082 [00:00<?, ?it/s]

In [44]:
output

[0.4904603362083435,
 0.2828855514526367,
 0.30174127221107483,
 0.4462526738643646,
 0.5950721502304077,
 0.3060912489891052,
 0.5152139067649841,
 0.5053565502166748,
 0.42443177103996277,
 0.31573936343193054,
 0.29124918580055237,
 0.41299575567245483,
 0.5641128420829773,
 0.6517132520675659,
 0.6277260184288025,
 0.36480987071990967,
 0.23659738898277283,
 0.4798718988895416,
 0.38053473830223083,
 0.4867674708366394,
 0.4839179813861847,
 0.49165695905685425,
 0.6727054119110107,
 0.16477106511592865,
 0.26153257489204407,
 0.5782346725463867,
 0.8306098580360413,
 0.6618695259094238,
 0.46929696202278137,
 0.7947989702224731,
 0.5220296382904053,
 0.2585894465446472,
 0.31123587489128113,
 0.438240110874176,
 0.279512882232666,
 0.5354700684547424,
 0.4362970292568207,
 0.5022480487823486,
 0.4031458795070648,
 0.7346140742301941,
 0.5764976739883423,
 0.5360527038574219,
 0.44553565979003906,
 0.3736654818058014,
 0.3921043276786804,
 0.402079313993454,
 0.3461608290672302,
 0

In [45]:
dataset = dataset.add_column("predicted_WA", output)

In [46]:
dataset

Dataset({
    features: ['index', 'level_1', 'sentences', 'num_of_token', '__index_level_0__', 'predicted_WA'],
    num_rows: 17299
})

In [47]:
dataset.set_format("pandas")
dataset = dataset[:]
dataset.to_pickle("../data/yelp/snippext_top_5_domains_sample_dataset_arg_quality_save.pkl")

## Extract KP

In [48]:
dataset = pd.read_pickle("../data/yelp/snippext_top_5_domains_sample_dataset_arg_quality_save.pkl")

In [49]:
dataset['predicted_WA'].describe()

count    17299.000000
mean         0.444898
std          0.160999
min          0.114525
25%          0.316376
50%          0.424024
75%          0.557130
max          0.960261
Name: predicted_WA, dtype: float64

In [50]:
min_tokens = 3
max_tokens = 5

In [51]:
def filter_kp_by_arg_quality_and_tokens(example):
    return example['predicted_WA'] >= 0.42 and example['num_of_token'] >= min_tokens and example['num_of_token'] <= max_tokens

In [52]:
filtered_dataset = dataset[dataset.apply(filter_kp_by_arg_quality_and_tokens, axis=1)]

In [53]:
filtered_dataset

Unnamed: 0,index,level_1,sentences,num_of_token,__index_level_0__,predicted_WA
8,487386,2,Clientele was mixed .,3,8,0.424432
17,487389,2,Great customer service .,3,17,0.479872
28,487392,1,Restaurant is much too expensive .,5,28,0.469297
47,487403,3,Great for business travel .,4,47,0.439564
57,487408,0,Slept well in comfy beds .,5,57,0.427684
...,...,...,...,...,...,...
17174,1037997,2,Salads are fresh and healthy .,5,17442,0.494856
17195,1038021,0,Great location in southwest vegas .,5,17464,0.459289
17199,1038023,1,Great and friendly service here .,5,17468,0.432246
17263,1038086,1,"Friendly , down to earth staff .",5,17532,0.527126


In [54]:
kp_df = filtered_dataset[['index', 'level_1', 'predicted_WA']]
kp_df = kp_df.merge(df, on=['index', 'level_1'])

kp_df = kp_df.drop_duplicates(subset=['domain', 'sentences'])

# kp_df[kp_df['sentences'].str.contains("(?i)street|city")]
kp_df = kp_df[~kp_df['sentences'].str.contains("(?i)street|city")]

kp_df = kp_df.rename(columns={'sentences': 'key_point'})

In [55]:
kp_df

Unnamed: 0,index,level_1,predicted_WA,domain,categories_list,categories,name,business_id,key_point,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos,num_of_token
0,487386,2,0.424432,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Clientele was mixed .,[mixed],[clientele],[mixed clientele],2.0,[staff],[negative],"[JJ, NNS]",[NNP],3
1,487389,2,0.479872,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Great customer service .,[great],[customer service],[great customer service],3.0,[staff],[positive],"[JJ, NN, NN]","[NN, NN]",3
2,487392,1,0.469297,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Restaurant is much too expensive .,[too expensive],[restaurant],[too expensive restaurant],2.0,[value-for-money],[negative],"[RB, JJ, NN]",[NN],5
3,487403,3,0.439564,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Great for business travel .,[great for],[business travel],[great for business travel],3.0,[restaurant -> atmosphere],[positive],"[JJ, IN, NN, NN]","[NN, NN]",4
4,487408,0,0.427684,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Slept well in comfy beds .,[comfy],[beds],[comfy beds],0.0,[restaurant -> comfort],[positive],"[JJ, NNS]",[NNS],5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,1037997,2,0.494856,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Salads are fresh and healthy .,[fresh],[salads],[fresh salads],2.0,[food -> quality],[positive],"[JJ, NNS]",[VBZ],5
395,1038021,0,0.459289,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Great location in southwest vegas .,[great],[location],[great location],0.0,[location],[positive],"[JJ, NN]",[NN],5
396,1038023,1,0.432246,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Great and friendly service here .,[great and friendly],[service],[great and friendly service],1.0,[staff],[positive],"[JJ, CC, JJ, NN]",[NN],5
397,1038086,1,0.527126,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Friendly , down to earth staff .",[down to earth],[staff],[down to earth staff],2.0,[staff],[positive],"[RB, IN, NN, NN]",[NN],5


# Extra Processing of KPs

##### Ignore NULL aspect terms

In [56]:
kp_df = kp_df[kp_df['aspects'].apply(lambda x: x != ['null'] and len(x) > 0)]
kp_df = kp_df[kp_df['key_point'].apply(lambda x: x[-1] != '?')]
kp_df['id'] = kp_df['index'].astype(str) + "_" + kp_df['level_1'].astype(str)

In [57]:
kp_df

Unnamed: 0,index,level_1,predicted_WA,domain,categories_list,categories,name,business_id,key_point,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos,num_of_token,id
0,487386,2,0.424432,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Clientele was mixed .,[mixed],[clientele],[mixed clientele],2.0,[staff],[negative],"[JJ, NNS]",[NNP],3,487386_2
1,487389,2,0.479872,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Great customer service .,[great],[customer service],[great customer service],3.0,[staff],[positive],"[JJ, NN, NN]","[NN, NN]",3,487389_2
2,487392,1,0.469297,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Restaurant is much too expensive .,[too expensive],[restaurant],[too expensive restaurant],2.0,[value-for-money],[negative],"[RB, JJ, NN]",[NN],5,487392_1
3,487403,3,0.439564,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Great for business travel .,[great for],[business travel],[great for business travel],3.0,[restaurant -> atmosphere],[positive],"[JJ, IN, NN, NN]","[NN, NN]",4,487403_3
4,487408,0,0.427684,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Slept well in comfy beds .,[comfy],[beds],[comfy beds],0.0,[restaurant -> comfort],[positive],"[JJ, NNS]",[NNS],5,487408_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,1037997,2,0.494856,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Salads are fresh and healthy .,[fresh],[salads],[fresh salads],2.0,[food -> quality],[positive],"[JJ, NNS]",[VBZ],5,1037997_2
395,1038021,0,0.459289,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Great location in southwest vegas .,[great],[location],[great location],0.0,[location],[positive],"[JJ, NN]",[NN],5,1038021_0
396,1038023,1,0.432246,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Great and friendly service here .,[great and friendly],[service],[great and friendly service],1.0,[staff],[positive],"[JJ, CC, JJ, NN]",[NN],5,1038023_1
397,1038086,1,0.527126,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Friendly , down to earth staff .",[down to earth],[staff],[down to earth staff],2.0,[staff],[positive],"[RB, IN, NN, NN]",[NN],5,1038086_1


##### One aspects KP only

In [58]:
kp_df = kp_df[kp_df['aspects'].str.len() == 1]

##### Fill missing adj after 'and' in some opinions

In [59]:
def find_sub_list(sl,l):
    results=[]
    sll=len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            results.append((ind,ind+sll-1))

    return results

In [60]:
def complete_the_opinion(row):
    opinion_words = row['opinions'][0].split(" ")
    results = find_sub_list(opinion_words, row['key_point'].split())
    index = results[0][1] + 1
    opinion_words += [row['key_point'].split()[index].strip()]
    row['opinions'] = [" ".join(opinion_words)]
    row['opinion_aspect_pairs'] = [" ".join([row['opinions'][0].strip(), row['aspects'][0].strip()])]
    
    return row

In [61]:
mask = kp_df['opinions'].apply(lambda x: x[0].split(" ")[-1] == 'and')
kp_df[mask] = kp_df[mask].apply(complete_the_opinion, axis=1)

In [62]:
import ast

kp_df['opinion_aspect_pairs'] = kp_df['opinion_aspect_pairs'].astype(str)
kp_df = kp_df.sort_values(by=['opinion_aspect_pairs', 'predicted_WA'], ascending=[True, False]).drop_duplicates(subset=['domain', 'opinion_aspect_pairs'])
kp_df['opinion_aspect_pairs'] = kp_df['opinion_aspect_pairs'].apply(lambda x: ast.literal_eval(x))
kp_df = kp_df.reset_index(drop=True)

In [63]:
kp_df

Unnamed: 0,index,level_1,predicted_WA,domain,categories_list,categories,name,business_id,key_point,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos,num_of_token,id
0,552112,4,0.550564,Hotels,"[Event Planning & Services, Caterers, Hotels, Hotels & Travel]","Event Planning & Services, Caterers, Hotels, Hotels & Travel",Renaissance Pittsburgh Hotel,XAM8bZWsEKBTzOeTOz2qEA,Breakfast is a bit overpriced .,[a bit],[breakfast],[a bit breakfast],5.0,[food -> quality],[negative],"[DT, NN, NN]",[NN],5,552112_4
1,536415,2,0.526827,Hotels,"[Hotels, Hotels & Travel, Event Planning & Services]","Hotels, Hotels & Travel, Event Planning & Services",Omni Charlotte Hotel,IrHAhdmL9QH9g3NscAIsWA,Rooms are a little outdated .,[a little outdated],[rooms],[a little outdated rooms],2.0,[restaurant -> atmosphere],[negative],"[DT, JJ, JJ, NNS]",[NNS],5,536415_2
2,1037067,6,0.474484,Restaurants,"[Dim Sum, Chinese, Restaurants, Vegan, Vegetarian]","Dim Sum, Chinese, Restaurants, Vegan, Vegetarian",DumplingHaus,VfWX3UCKvUnVktdOuR8TwA,Customer Service was A+,[a+],[customer service],[a+ customer service],8.0,[staff],[positive],"[-LRB-, NN, NN]","[NN, NN]",4,1037067_6
3,922097,1,0.506006,Hotels,"[Hotels, Professional Services, Venues & Event Spaces, Hotels & Travel, Resorts, Event Planning & Services]","Hotels, Professional Services, Venues & Event Spaces, Hotels & Travel, Resorts, Event Planning & Services",Sanctuary on Camelback Mountain Resort and Spa,PsTzoIERiCjq6QHrOnk2Lg,Staff goes above and beyond .,[above and beyond],[staff],[above and beyond staff],1.0,[staff],[positive],"[RB, CC, IN, NN]",[NN],5,922097_1
4,1037698,3,0.439098,Restaurants,"[Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars]","Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars",No Què No Mexican Grill,GETVAz5SCk8rFziexV54Aw,The service was absolutely terrible .,[absolutely terrible],[service],[absolutely terrible service],8.0,[staff],[negative],"[RB, JJ, NN]",[NN],5,1037698_3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,988231,0,0.584501,Automotive,"[Used Car Dealers, Automotive, Car Dealers, Auto Parts & Supplies, Auto Repair]","Used Car Dealers, Automotive, Car Dealers, Auto Parts & Supplies, Auto Repair",AutoNation Honda East Las Vegas,ali4IoPkincXzwHgWSk_rA,Worst customer service .,[worst],[customer service],[worst customer service],0.0,[staff],[negative],"[JJS, NN, NN]","[NN, NN]",3,988231_0
363,970187,0,0.482031,Hotels,"[Hotels & Travel, Hotels, Event Planning & Services]","Hotels & Travel, Hotels, Event Planning & Services",Polo Towers by Diamond Resorts,Ccjo5j4A5-gVaBbuHAqzVA,Worst experience of my life .,[worst],[experience],[worst experience],0.0,[restaurant -> atmosphere],[negative],"[JJS, NN]",[NN],5,970187_0
364,1031366,0,0.431492,Restaurants,"[Restaurants, Italian]","Restaurants, Italian",The Old Spaghetti Factory,J1qzIVBt3lGpiz-8UdjhXg,Worst place to eat,[worst],[place],[worst place],0.0,[recommendation],[negative],"[JJS, NN]",[NN],4,1031366_0
365,981111,2,0.534063,Automotive,"[Towing, Automotive]","Towing, Automotive",777 Towing,iw56fh9lBAIseMOKDcu9_w,WORST SERVICE EVER RECEIVED,[worst],[service],[worst service],5.0,[staff],[negative],"[JJS, NN]",[NN],4,981111_2


In [64]:
len(kp_df['attributes'].astype(str).unique())

13

# Construct contrastive examples

In [65]:
from datasets import load_from_disk

dataset = pd.read_pickle("../data/yelp/snippext_top_5_domains_sample_dataset_arg_quality_save.pkl")
df['predicted_WA'] = dataset[:]['predicted_WA']
df['opinion_aspect_pairs'] = df['opinion_aspect_pairs'].astype(str)
df['id'] = df['index'].astype(str) + "_" + df['level_1'].astype(str)
df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos,num_of_token,predicted_WA,id
0,487383,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],['well located'],2.0,[location],[positive],"[RB, VBN]",[VBN],8,0.490460,487383_0
1,487383,2,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],['free wifi'],4.0,[wait-time],[positive],"[JJ, NN]",[NN],4,0.282886,487383_2
2,487383,3,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],['hefty charge for'],5.0,[value-for-money],[negative],"[JJ, NN, IN]","[NN, IN]",5,0.301741,487383_3
3,487385,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],['enjoy place'],0.0,[recommendation],[positive],"[VB, NN]",[NN],13,0.446253,487385_0
4,487385,1,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,It was renovated a few years ago and has a great location .,[great],[location],['great location'],1.0,[location],[positive],"[JJ, NN]",[NN],12,0.595072,487385_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17563,1038117,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],['always perfect tomato soup'],4.0,[food -> quality],[positive],"[RB, VB, NN, NN]","[NN, NN]",5,,1038117_3
17564,1038117,4,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"My favorite location is the Boca Park just because of seating , but this one is suffice !",[favorite],[location],['favorite location'],5.0,[recommendation],[positive],"[JJ, NN]",[NN],16,,1038117_4
17565,1038121,2,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Place felt dirty and disorganized .,[dirty],[place],['dirty place'],4.0,[restaurant -> atmosphere],[negative],"[JJ, NN]",[NN],5,,1038121_2
17566,1038121,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],['deserted place'],5.0,[restaurant -> atmosphere],[negative],"[JJ, NN]",[NN],10,,1038121_3


## Extract Comments

In [66]:
sent_df = df[~df['id'].isin(kp_df['id'].tolist())]
sent_df = sent_df.drop_duplicates(subset=['domain', 'opinion_aspect_pairs'])

In [67]:
sent_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos,num_of_token,predicted_WA,id
0,487383,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],['well located'],2.0,[location],[positive],"[RB, VBN]",[VBN],8,0.490460,487383_0
1,487383,2,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],['free wifi'],4.0,[wait-time],[positive],"[JJ, NN]",[NN],4,0.282886,487383_2
2,487383,3,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],['hefty charge for'],5.0,[value-for-money],[negative],"[JJ, NN, IN]","[NN, IN]",5,0.301741,487383_3
3,487385,0,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],['enjoy place'],0.0,[recommendation],[positive],"[VB, NN]",[NN],13,0.446253,487385_0
4,487385,1,Hotels,"[Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel]","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,It was renovated a few years ago and has a great location .,[great],[location],['great location'],1.0,[location],[positive],"[JJ, NN]",[NN],12,0.595072,487385_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17561,1038117,1,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Fuji apple salad is my favorite !,[favorite],[fuji apple salad],['favorite fuji apple salad'],2.0,[food -> quality],[positive],"[JJ, NNP, NN, NN]","[NNP, NNP, NN]",6,,1038117_1
17562,1038117,2,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Perfect combo of sweet and tangy , and the fruits / veggie combo is really refreshing .",[really refreshing],[fruits / veggie combo],['really refreshing fruits / veggie combo'],3.0,[food -> vegetarian option],[positive],"[RB, JJ, NNS, SYM, NN, NN]","[NNS, SYM, NN, NN]",15,,1038117_2
17563,1038117,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],['always perfect tomato soup'],4.0,[food -> quality],[positive],"[RB, VB, NN, NN]","[NN, NN]",5,,1038117_3
17566,1038121,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches]","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],['deserted place'],5.0,[restaurant -> atmosphere],[negative],"[JJ, NN]",[NN],10,,1038121_3


## Extract KPs

In [68]:
kp_df

Unnamed: 0,index,level_1,predicted_WA,domain,categories_list,categories,name,business_id,key_point,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,opinion_aspect_pos,aspect_pos,num_of_token,id
0,552112,4,0.550564,Hotels,"[Event Planning & Services, Caterers, Hotels, Hotels & Travel]","Event Planning & Services, Caterers, Hotels, Hotels & Travel",Renaissance Pittsburgh Hotel,XAM8bZWsEKBTzOeTOz2qEA,Breakfast is a bit overpriced .,[a bit],[breakfast],[a bit breakfast],5.0,[food -> quality],[negative],"[DT, NN, NN]",[NN],5,552112_4
1,536415,2,0.526827,Hotels,"[Hotels, Hotels & Travel, Event Planning & Services]","Hotels, Hotels & Travel, Event Planning & Services",Omni Charlotte Hotel,IrHAhdmL9QH9g3NscAIsWA,Rooms are a little outdated .,[a little outdated],[rooms],[a little outdated rooms],2.0,[restaurant -> atmosphere],[negative],"[DT, JJ, JJ, NNS]",[NNS],5,536415_2
2,1037067,6,0.474484,Restaurants,"[Dim Sum, Chinese, Restaurants, Vegan, Vegetarian]","Dim Sum, Chinese, Restaurants, Vegan, Vegetarian",DumplingHaus,VfWX3UCKvUnVktdOuR8TwA,Customer Service was A+,[a+],[customer service],[a+ customer service],8.0,[staff],[positive],"[-LRB-, NN, NN]","[NN, NN]",4,1037067_6
3,922097,1,0.506006,Hotels,"[Hotels, Professional Services, Venues & Event Spaces, Hotels & Travel, Resorts, Event Planning & Services]","Hotels, Professional Services, Venues & Event Spaces, Hotels & Travel, Resorts, Event Planning & Services",Sanctuary on Camelback Mountain Resort and Spa,PsTzoIERiCjq6QHrOnk2Lg,Staff goes above and beyond .,[above and beyond],[staff],[above and beyond staff],1.0,[staff],[positive],"[RB, CC, IN, NN]",[NN],5,922097_1
4,1037698,3,0.439098,Restaurants,"[Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars]","Restaurants, Mexican, Cocktail Bars, Pizza, Bars, Nightlife, Breakfast & Brunch, Sports Bars",No Què No Mexican Grill,GETVAz5SCk8rFziexV54Aw,The service was absolutely terrible .,[absolutely terrible],[service],[absolutely terrible service],8.0,[staff],[negative],"[RB, JJ, NN]",[NN],5,1037698_3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,988231,0,0.584501,Automotive,"[Used Car Dealers, Automotive, Car Dealers, Auto Parts & Supplies, Auto Repair]","Used Car Dealers, Automotive, Car Dealers, Auto Parts & Supplies, Auto Repair",AutoNation Honda East Las Vegas,ali4IoPkincXzwHgWSk_rA,Worst customer service .,[worst],[customer service],[worst customer service],0.0,[staff],[negative],"[JJS, NN, NN]","[NN, NN]",3,988231_0
363,970187,0,0.482031,Hotels,"[Hotels & Travel, Hotels, Event Planning & Services]","Hotels & Travel, Hotels, Event Planning & Services",Polo Towers by Diamond Resorts,Ccjo5j4A5-gVaBbuHAqzVA,Worst experience of my life .,[worst],[experience],[worst experience],0.0,[restaurant -> atmosphere],[negative],"[JJS, NN]",[NN],5,970187_0
364,1031366,0,0.431492,Restaurants,"[Restaurants, Italian]","Restaurants, Italian",The Old Spaghetti Factory,J1qzIVBt3lGpiz-8UdjhXg,Worst place to eat,[worst],[place],[worst place],0.0,[recommendation],[negative],"[JJS, NN]",[NN],4,1031366_0
365,981111,2,0.534063,Automotive,"[Towing, Automotive]","Towing, Automotive",777 Towing,iw56fh9lBAIseMOKDcu9_w,WORST SERVICE EVER RECEIVED,[worst],[service],[worst service],5.0,[staff],[negative],"[JJS, NN]",[NN],4,981111_2


## Construct matching examples

### Exact match

In [69]:
# Full match on target
sent_kp_df = sent_df.astype(str).merge(kp_df[['id', 'aspects', 'opinions', 'attributes', 'sentiments', 'opinion_aspect_pairs', 'key_point', 'predicted_WA', 'domain']].astype(str), how='left', 
                                       on=['domain', 'attributes', 'sentiments'])\
    .dropna(subset=['key_point'])
sent_kp_df = sent_kp_df.reset_index(drop=True)
sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,aspect_pos,num_of_token,predicted_WA_x,id_x,id_y,aspects_y,opinions_y,opinion_aspect_pairs_y,key_point,predicted_WA_y
0,487383,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,['well'],['located'],...,['VBN'],8,0.4904603362083435,487383_0,487422_1,['location'],['great'],['great location'],Great location and amenities .,0.42586058378219604
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,['free'],['wifi'],...,['NN'],4,0.2828855514526367,487383_2,702558_5,['wifi'],['very fast'],['very fast wifi'],Wifi was very fast ~ 20MBps .,0.5944222211837769
2,487383,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",['hefty'],['charge for'],...,"['NN', 'IN']",5,0.30174127221107483,487383_3,487392_1,['restaurant'],['too expensive'],['too expensive restaurant'],Restaurant is much too expensive .,0.46929696202278137
3,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,['enjoy'],['place'],...,['NN'],13,0.4462526738643646,487385_0,777430_1,['place'],['family friendly'],['family friendly place'],"Relaxed , quiet , family friendly place .",0.5604457855224609
4,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,['enjoy'],['place'],...,['NN'],13,0.4462526738643646,487385_0,511294_4,['place'],['family oriented'],['family oriented place'],Definitely a family oriented place .,0.43907630443573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157255,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,['always perfect'],['tomato soup'],...,"['NN', 'NN']",5,,1038117_3,1033117_2,['rice'],['seasoned well'],['seasoned well rice'],The rice was seasoned well,0.4282238781452179
157256,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,['always perfect'],['tomato soup'],...,"['NN', 'NN']",5,,1038117_3,1032527_3,['food'],['wide selection of'],['wide selection of food'],Wide selection of food,0.48513397574424744
157257,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,['always perfect'],['tomato soup'],...,"['NN', 'NN']",5,,1038117_3,1031927_0,['restaurant'],['wonderful'],['wonderful restaurant'],Wonderful restaurant in Old Montreal .,0.42443251609802246
157258,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",['deserted'],['place'],...,['NN'],10,,1038121_3,1038121_2,['place'],['dirty'],['dirty place'],Place felt dirty and disorganized .,0.5058704018592834


In [70]:
import ast

# for col in ['opinions_x', 'opinions_y', 'aspects', 'attributes', 'sentiments', 'opinion_aspect_pairs_x', 'opinion_aspect_pairs_y']:
for col in ['opinions_x', 'opinions_y', 'aspects_x', 'aspects_y', 'attributes', 'opinion_aspect_pairs_x', 'opinion_aspect_pairs_y']:
    sent_kp_df[col] = sent_kp_df[col].apply(lambda x: ast.literal_eval(x))

In [71]:
sent_kp_df = sent_kp_df[sent_kp_df['opinion_aspect_pairs_y'].str.len() < 2]

sent_kp_df['opinion_aspect_pairs_x'] = sent_kp_df['opinion_aspect_pairs_x'].astype(str)
sent_kp_df['opinion_aspect_pairs_y'] = sent_kp_df['opinion_aspect_pairs_y'].astype(str)
sent_kp_df = sent_kp_df.drop_duplicates(subset=['opinion_aspect_pairs_x', 'opinion_aspect_pairs_y'])

# Important
sent_kp_df = sent_kp_df[sent_kp_df['opinion_aspect_pairs_x'] != sent_kp_df['opinion_aspect_pairs_y']]

In [72]:
matched_sent_kp_df = sent_kp_df

In [73]:
matched_sent_kp_df.loc[matched_sent_kp_df['aspects_x'] == matched_sent_kp_df['aspects_y'], 'label'] = 1

In [74]:
matched_sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,num_of_token,predicted_WA_x,id_x,id_y,aspects_y,opinions_y,opinion_aspect_pairs_y,key_point,predicted_WA_y,label
0,487383,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],...,8,0.4904603362083435,487383_0,487422_1,[location],[great],['great location'],Great location and amenities .,0.42586058378219604,
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,4,0.2828855514526367,487383_2,702558_5,[wifi],[very fast],['very fast wifi'],Wifi was very fast ~ 20MBps .,0.5944222211837769,1.0
2,487383,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],...,5,0.30174127221107483,487383_3,487392_1,[restaurant],[too expensive],['too expensive restaurant'],Restaurant is much too expensive .,0.46929696202278137,
3,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,13,0.4462526738643646,487385_0,777430_1,[place],[family friendly],['family friendly place'],"Relaxed , quiet , family friendly place .",0.5604457855224609,1.0
4,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,13,0.4462526738643646,487385_0,511294_4,[place],[family oriented],['family oriented place'],Definitely a family oriented place .,0.43907630443573,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157255,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],...,5,,1038117_3,1033117_2,[rice],[seasoned well],['seasoned well rice'],The rice was seasoned well,0.4282238781452179,
157256,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],...,5,,1038117_3,1032527_3,[food],[wide selection of],['wide selection of food'],Wide selection of food,0.48513397574424744,
157257,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],...,5,,1038117_3,1031927_0,[restaurant],[wonderful],['wonderful restaurant'],Wonderful restaurant in Old Montreal .,0.42443251609802246,
157258,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,10,,1038121_3,1038121_2,[place],[dirty],['dirty place'],Place felt dirty and disorganized .,0.5058704018592834,1.0


### Handle implicit match

In [75]:
import spacy
nlp = spacy.load('en_core_web_md')



In [76]:
import warnings
warnings.filterwarnings("error")

In [77]:
unfound_tokens = []
def calculate_aspects_semantic_similarity(row):
    tokens = nlp(row['aspects_x'][0] + " | " + row['aspects_y'][0])
    sep_index = [token.i for token in tokens if token.text == '|'][0]
    token1, token2 = tokens[:sep_index], tokens[sep_index+1:]
    row['aspects_x_len'] = len(token1)
    row['aspects_y_len'] = len(token2)
#     print(token1, token2)
    try:
        row['aspects_similarity'] = token2.similarity(token1)
    except:
#         display(row)
#         unfound_tokens += [token1]
        row['aspects_similarity'] = -1
    
    return row

In [78]:
matched_sent_kp_df = matched_sent_kp_df[~matched_sent_kp_df['aspects_x'].apply(lambda x: x[0].strip() == "")]

In [79]:
implicit_match_analyze_mask = matched_sent_kp_df['aspects_x'] != matched_sent_kp_df['aspects_y']
matched_sent_kp_df_implicit = matched_sent_kp_df[implicit_match_analyze_mask].apply(calculate_aspects_semantic_similarity, axis=1)

In [80]:
implicit_match_analyze_mask = matched_sent_kp_df['aspects_x'] != matched_sent_kp_df['aspects_y']
matched_sent_kp_df[implicit_match_analyze_mask] = matched_sent_kp_df_implicit

In [81]:
matched_sent_kp_df.loc[implicit_match_analyze_mask, 'aspects_similarity'] = matched_sent_kp_df_implicit['aspects_similarity']
matched_sent_kp_df.loc[implicit_match_analyze_mask, 'aspects_x_len'] = matched_sent_kp_df_implicit['aspects_x_len']
matched_sent_kp_df.loc[implicit_match_analyze_mask, 'aspects_y_len'] = matched_sent_kp_df_implicit['aspects_y_len']

In [82]:
matched_sent_kp_df.to_pickle("../data/yelp/snippext_yelp_top_5_domains_match_full_pair_semantic_calculated_full.pkl")

In [83]:
import pandas as pd
matched_sent_kp_df = pd.read_pickle("../data/yelp/snippext_yelp_top_5_domains_match_full_pair_semantic_calculated_full.pkl")

In [84]:
matched_sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,id_y,aspects_y,opinions_y,opinion_aspect_pairs_y,key_point,predicted_WA_y,label,aspects_similarity,aspects_x_len,aspects_y_len
0,487383,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],...,487422_1,[location],[great],['great location'],Great location and amenities .,0.42586058378219604,,0.653039,1.0,1.0
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,702558_5,[wifi],[very fast],['very fast wifi'],Wifi was very fast ~ 20MBps .,0.5944222211837769,1.0,,,
2,487383,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],...,487392_1,[restaurant],[too expensive],['too expensive restaurant'],Restaurant is much too expensive .,0.46929696202278137,,0.230903,2.0,1.0
3,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,777430_1,[place],[family friendly],['family friendly place'],"Relaxed , quiet , family friendly place .",0.5604457855224609,1.0,,,
4,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,511294_4,[place],[family oriented],['family oriented place'],Definitely a family oriented place .,0.43907630443573,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157255,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],...,1033117_2,[rice],[seasoned well],['seasoned well rice'],The rice was seasoned well,0.4282238781452179,,0.683708,2.0,1.0
157256,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],...,1032527_3,[food],[wide selection of],['wide selection of food'],Wide selection of food,0.48513397574424744,,0.483831,2.0,1.0
157257,1038117,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[tomato soup],...,1031927_0,[restaurant],[wonderful],['wonderful restaurant'],Wonderful restaurant in Old Montreal .,0.42443251609802246,,0.359310,2.0,1.0
157258,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,1038121_2,[place],[dirty],['dirty place'],Place felt dirty and disorganized .,0.5058704018592834,1.0,,,


In [85]:
# Below is the unfound tokens of spacy while doing the comparison (reason, could be typo, food name)
matched_sent_kp_df[matched_sent_kp_df['aspects_similarity'] == -1]['aspects_x'].apply(lambda x: x[0]).unique()

array(['lounge', 'spa', 'rooms appearance', 'valet parking', 'hotels',
       'wifi', 'room', 'walkways', 'hotel', 'walls', 'experience',
       'bathrooms', 'lobby', 'parking', 'locationa', 'rooms', 'day',
       'windows', 'room interior', 'lighting', 'carpet', 'view', 'arcade',
       'music', 'bed', 'regency club', 'place', 'mall', 'indoor pool',
       'bar lobby area', 'bass', 'room floors', 'self parking area',
       'slides', 'park', 'crowds', 'sunsplash', 'pools', 'curtains',
       'parking situation', 'nights rest', 'wireless connection',
       'nascar event', 'guests', 'smell', 'bathroom', 'venue',
       'nightlife', 'street side room', 'drapes', 'noise', 'parking lot',
       'beds', 'activity', 'outdoor water park', 'motel room',
       'accommodations', 'hallway carpet', 'room cleaning', 'hallways',
       'lamps', 'tv', 'tub', 'furniture', 'floors', 'gym', 'air',
       'master bed', 'light bulbs', 'stereos', 'entering the property',
       'closet space', 'motorcycl

In [86]:
import nltk
def label_implicit_matching_pair(row):
    token1 = row['aspects_x'][0]
    token2 = row['aspects_y'][0]
    
    # Best
    if row['aspects_x_len'] >= row['aspects_y_len']:
        if token2 in token1 and len(token1) > len(token2) and row['aspects_x_len'] > row['aspects_y_len']: # e.g. mexican food vs food
            row['label'] = 1
    return row

In [87]:
matched_sent_kp_df[matched_sent_kp_df['aspects_similarity'] > 0.8] = matched_sent_kp_df[matched_sent_kp_df['aspects_similarity'] > 0.8].apply(label_implicit_matching_pair, axis=1)

In [88]:
# Show some pairing results
matched_sent_kp_df[(pd.notnull(matched_sent_kp_df['aspects_similarity'])) & (matched_sent_kp_df['label'] == 1)]\
    [['sentences', 'opinions_x', 'aspects_x', 'key_point', 'opinions_y', 'aspects_y', 'label']]

Unnamed: 0,sentences,opinions_x,aspects_x,key_point,opinions_y,aspects_y,label
49,I was very disappointed by the rooms appearance ( cleanliness and aesthetics ) .,[disappointed],[rooms appearance],Rooms are a little outdated .,[a little outdated],[rooms],1.0
206,Valet parking at $ 50/day is steep so transit in if possible .,[steep],[valet parking],Parking was horrible .,[horrible],[parking],1.0
658,They upgraded me to the new room renos which was great .,[great],[room renos],Room was clean and comfortable .,[clean],[room],1.0
676,They upgraded me to the new room renos which was great .,[great],[room renos],Room was spotless and clean .,[spotless],[room],1.0
2143,"Maintenance , housekeeping , and customer service ( with exception ) are non - existent .",[non - existent],[customer service],Service is poor at best .,[poor at best],[service],1.0
...,...,...,...,...,...,...,...
156945,And I have to say the customer service has been outstanding my emails of complaints helped .,[outstanding],[customer service],The service here is impeccable .,[impeccable],[service],1.0
156947,And I have to say the customer service has been outstanding my emails of complaints helped .,[outstanding],[customer service],"Terrific variety , lovely service !",[lovely],[service],1.0
156952,And I have to say the customer service has been outstanding my emails of complaints helped .,[outstanding],[customer service],Service was prompt and efficient .,[prompt and efficient],[service],1.0
156954,And I have to say the customer service has been outstanding my emails of complaints helped .,[outstanding],[customer service],Prices and service are superior .,[superior],[service],1.0


In [91]:
matched_sent_kp_df = matched_sent_kp_df[matched_sent_kp_df['label'] == 1]

## Construct non-matching pairs

In [93]:
sent_kp_df = sent_df.astype(str).merge(kp_df[['id', 'aspects', 'opinions', 'attributes', 'sentiments', 'opinion_aspect_pairs', 'key_point', 'predicted_WA', 'domain']].astype(str), how='left', 
                                       on=['domain', 'attributes'])\
    .dropna(subset=['key_point'])
sent_kp_df = sent_kp_df.reset_index(drop=True)
sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,num_of_token,predicted_WA_x,id_x,id_y,aspects_y,opinions_y,sentiments_y,opinion_aspect_pairs_y,key_point,predicted_WA_y
0,487383,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,['well'],['located'],...,8,0.4904603362083435,487383_0,487422_1,['location'],['great'],['positive'],['great location'],Great location and amenities .,0.42586058378219604
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,['free'],['wifi'],...,4,0.2828855514526367,487383_2,697122_2,['elevators'],['heavily congested'],['negative'],['heavily congested elevators'],Elevators were heavily congested .,0.4851902723312378
2,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,['free'],['wifi'],...,4,0.2828855514526367,487383_2,520943_6,['elevators'],['painfully slow'],['negative'],['painfully slow elevators'],Elevators are painfully slow .,0.47212493419647217
3,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,['free'],['wifi'],...,4,0.2828855514526367,487383_2,702558_5,['wifi'],['very fast'],['positive'],['very fast wifi'],Wifi was very fast ~ 20MBps .,0.5944222211837769
4,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,['free'],['wifi'],...,4,0.2828855514526367,487383_2,737955_5,['wifi'],['very poor'],['negative'],['very poor wifi'],WiFi was very poor also .,0.437238872051239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231408,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",['deserted'],['place'],...,10,,1038121_3,1031639_2,['neighborhood'],['great'],['positive'],['great neighborhood'],Great neighborhood burger joint .,0.42043861746788025
231409,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",['deserted'],['place'],...,10,,1038121_3,1031179_0,['parking'],['minor'],['negative'],['minor parking'],Parking is a minor annoyance .,0.4353095293045044
231410,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",['deserted'],['place'],...,10,,1038121_3,1036542_4,['gem'],['perfect'],['positive'],['perfect gem'],Perfect hidden Gem .,0.4555741846561432
231411,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",['deserted'],['place'],...,10,,1038121_3,1037995_1,['outdoor seating'],['sunny'],['positive'],['sunny outdoor seating'],"Great , sunny , outdoor seating .",0.47357580065727234


In [94]:
import ast

for col in ['opinions_x', 'opinions_y', 'aspects_x', 'sentiments_x', 'sentiments_y', 'aspects_y', 'attributes', 'opinion_aspect_pairs_x', 'opinion_aspect_pairs_y']:
    sent_kp_df[col] = sent_kp_df[col].apply(lambda x: ast.literal_eval(x))

In [95]:
sent_kp_df = sent_kp_df[sent_kp_df['opinion_aspect_pairs_y'].str.len() < 2]
sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,num_of_token,predicted_WA_x,id_x,id_y,aspects_y,opinions_y,sentiments_y,opinion_aspect_pairs_y,key_point,predicted_WA_y
0,487383,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],...,8,0.4904603362083435,487383_0,487422_1,[location],[great],[positive],[great location],Great location and amenities .,0.42586058378219604
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,4,0.2828855514526367,487383_2,697122_2,[elevators],[heavily congested],[negative],[heavily congested elevators],Elevators were heavily congested .,0.4851902723312378
2,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,4,0.2828855514526367,487383_2,520943_6,[elevators],[painfully slow],[negative],[painfully slow elevators],Elevators are painfully slow .,0.47212493419647217
3,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,4,0.2828855514526367,487383_2,702558_5,[wifi],[very fast],[positive],[very fast wifi],Wifi was very fast ~ 20MBps .,0.5944222211837769
4,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,4,0.2828855514526367,487383_2,737955_5,[wifi],[very poor],[negative],[very poor wifi],WiFi was very poor also .,0.437238872051239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231408,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,10,,1038121_3,1031639_2,[neighborhood],[great],[positive],[great neighborhood],Great neighborhood burger joint .,0.42043861746788025
231409,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,10,,1038121_3,1031179_0,[parking],[minor],[negative],[minor parking],Parking is a minor annoyance .,0.4353095293045044
231410,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,10,,1038121_3,1036542_4,[gem],[perfect],[positive],[perfect gem],Perfect hidden Gem .,0.4555741846561432
231411,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,10,,1038121_3,1037995_1,[outdoor seating],[sunny],[positive],[sunny outdoor seating],"Great , sunny , outdoor seating .",0.47357580065727234


In [96]:
import warnings
warnings.filterwarnings("default")

In [97]:
sent_kp_df['opinion_aspect_pairs_x'] = sent_kp_df['opinion_aspect_pairs_x'].astype(str)
sent_kp_df['opinion_aspect_pairs_y'] = sent_kp_df['opinion_aspect_pairs_y'].astype(str)
sent_kp_df = sent_kp_df.drop_duplicates(subset=['opinion_aspect_pairs_x', 'opinion_aspect_pairs_y'])

# Important
sent_kp_df = sent_kp_df[sent_kp_df['opinion_aspect_pairs_x'] != sent_kp_df['opinion_aspect_pairs_y']]

In [98]:
matched_sent_kp_df['id_pair'] = matched_sent_kp_df['id_x'] + "##" + matched_sent_kp_df['id_y']
matched_pair_id = matched_sent_kp_df['id_pair'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_sent_kp_df['id_pair'] = matched_sent_kp_df['id_x'] + "##" + matched_sent_kp_df['id_y']


In [99]:
nonmatched_sent_kp_df = sent_kp_df
nonmatched_sent_kp_df['id_pair'] = nonmatched_sent_kp_df['id_x'] + "##" + nonmatched_sent_kp_df['id_y']
nonmatched_sent_kp_df = nonmatched_sent_kp_df[~nonmatched_sent_kp_df['id_pair'].isin(matched_pair_id)]
nonmatched_sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,predicted_WA_x,id_x,id_y,aspects_y,opinions_y,sentiments_y,opinion_aspect_pairs_y,key_point,predicted_WA_y,id_pair
0,487383,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Well located to most of downtown by foot .,[well],[located],...,0.4904603362083435,487383_0,487422_1,[location],[great],[positive],['great location'],Great location and amenities .,0.42586058378219604,487383_0##487422_1
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,0.2828855514526367,487383_2,697122_2,[elevators],[heavily congested],[negative],['heavily congested elevators'],Elevators were heavily congested .,0.4851902723312378,487383_2##697122_2
2,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,0.2828855514526367,487383_2,520943_6,[elevators],[painfully slow],[negative],['painfully slow elevators'],Elevators are painfully slow .,0.47212493419647217,487383_2##520943_6
4,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,0.2828855514526367,487383_2,737955_5,[wifi],[very poor],[negative],['very poor wifi'],WiFi was very poor also .,0.437238872051239,487383_2##737955_5
5,487383,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,", hefty charge for in room .",[hefty],[charge for],...,0.30174127221107483,487383_3,697142_6,[valet parking],[pricey],[positive],['pricey valet parking'],The valet parking is pricey .,0.4353298544883728,487383_3##697142_6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231408,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,,1038121_3,1031639_2,[neighborhood],[great],[positive],['great neighborhood'],Great neighborhood burger joint .,0.42043861746788025,1038121_3##1031639_2
231409,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,,1038121_3,1031179_0,[parking],[minor],[negative],['minor parking'],Parking is a minor annoyance .,0.4353095293045044,1038121_3##1031179_0
231410,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,,1038121_3,1036542_4,[gem],[perfect],[positive],['perfect gem'],Perfect hidden Gem .,0.4555741846561432,1038121_3##1036542_4
231411,1038121,3,Restaurants,"['Breakfast & Brunch', 'Soup', 'Food', 'Bagels', 'Salad', 'Restaurants', 'Sandwiches']","Breakfast & Brunch, Soup, Food, Bagels, Salad, Restaurants, Sandwiches",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now I get why .",[deserted],[place],...,,1038121_3,1037995_1,[outdoor seating],[sunny],[positive],['sunny outdoor seating'],"Great , sunny , outdoor seating .",0.47357580065727234,1038121_3##1037995_1


In [100]:
nonmatched_sent_kp_df['sentiments_x'] = nonmatched_sent_kp_df['sentiments_x'].astype(str)
nonmatched_sent_kp_df['sentiments_y'] = nonmatched_sent_kp_df['sentiments_y'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonmatched_sent_kp_df['sentiments_x'] = nonmatched_sent_kp_df['sentiments_x'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonmatched_sent_kp_df['sentiments_y'] = nonmatched_sent_kp_df['sentiments_y'].astype(str)


**For every KP, we select only top 150 high-quality arguments to avoid redundancy and duplication during training**

In [101]:
matched_sent_kp_df['id_y'].value_counts()

1032483_3    180
1032527_3    180
1036359_0    180
1037643_0    180
1032500_2    179
            ... 
972368_1       1
1032681_2      1
1033470_0      1
866616_2       1
1031382_2      1
Name: id_y, Length: 273, dtype: int64

In [102]:
nonmatched_sent_kp_df = nonmatched_sent_kp_df.groupby(['id_y'])\
    .apply(lambda grp: grp.sort_values(by=['predicted_WA_x'], ascending=False).head(150)).reset_index(drop=True)

In [103]:
nonmatched_sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,predicted_WA_x,id_x,id_y,aspects_y,opinions_y,sentiments_y,opinion_aspect_pairs_y,key_point,predicted_WA_y,id_pair
0,1023298,2,Arts & Entertainment,"['Cinema', 'Arts & Entertainment']","Cinema, Arts & Entertainment",Pollack Tempe Cinemas,Z96TEJDXyUhCerS-9QllXw,The kettle corn is awesome ! !,[awesome],[kettle corn],...,0.8965851068496704,1023298_2,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,1023298_2##1000002_2
1,1000054,4,Arts & Entertainment,"['Baseball Fields', 'Professional Sports Teams', 'Active Life', 'Arts & Entertainment', 'Stadiums & Arenas']","Baseball Fields, Professional Sports Teams, Active Life, Arts & Entertainment, Stadiums & Arenas",Progressive Field,cHwgtVdvZVn0AQFtWtUXXg,The food options were also top notch .,[top notch],[food options],...,0.8783519268035889,1000054_4,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,1000054_4##1000002_2
2,914735,3,Arts & Entertainment,"['Arts & Entertainment', 'Cinema']","Arts & Entertainment, Cinema",Harkins Camelview at Fashion Square,bCh7vrbbxSKx0mtZnOvQoQ,Keep sniffing that glue Harkins and rest assured I will never PAY DOUBLE for anything especially a lousy movie .,[lousy],[movie],...,0.873248279094696,914735_3,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,914735_3##1000002_2
3,826256,5,Arts & Entertainment,"['Arts & Entertainment', 'Museums']","Arts & Entertainment, Museums",Heard Museum,d9wSnfW2kaJR-_auyK9G4A,However the katina doll collection was awesome .,[awesome],[katina doll collection],...,0.8593790531158447,826256_5,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,826256_5##1000002_2
4,826270,1,Arts & Entertainment,"['Arts & Entertainment', 'Museums']","Arts & Entertainment, Museums",Heard Museum,d9wSnfW2kaJR-_auyK9G4A,A must try for the adventurous foodie .,[adventurous],[foodie],...,0.8460561633110046,826270_1,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,826270_1##1000002_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50051,880945,1,Arts & Entertainment,"['Performing Arts', 'Arts & Entertainment']","Performing Arts, Arts & Entertainment",Rock of Ages,_F3AMoo_zdl-he384ISQbw,Ushers are all standby to guide you to the seats .,[standby],[ushers],...,0.38839608430862427,880945_1,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,880945_1##999990_2
50052,929246,2,Arts & Entertainment,"['Cinema', 'Arts & Entertainment']","Cinema, Arts & Entertainment",Cinemark Mesa 16,lqCJiHlxzRpuYt-u_XQUvg,Management was horrible and I 'm beyond disappointed with how rude and unprofessional they were !,[horrible],[management],...,0.3881244957447052,929246_2,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,929246_2##999990_2
50053,901588,0,Arts & Entertainment,"['Casinos', 'Nightlife', 'Arts & Entertainment', 'Bars', 'Adult Entertainment']","Casinos, Nightlife, Arts & Entertainment, Bars, Adult Entertainment",Playboy Club,3g8zzg_5__kXMUM-8F6V1w,Sexy dealers .,[sexy],[dealers],...,0.3878975212574005,901588_0,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,901588_0##999990_2
50054,842226,3,Arts & Entertainment,"['Arts & Entertainment', 'Arcades', 'Active Life', 'Venues & Event Spaces', 'Party & Event Planning', 'Amusement Parks', 'Event Planning & Services', 'Mini Golf', 'Laser Tag']","Arts & Entertainment, Arcades, Active Life, Venues & Event Spaces, Party & Event Planning, Amusement Parks, Event Planning & Services, Mini Golf, Laser Tag",King Putt Mini,EH3yMY64M0AQNpPj3EERWQ,Their staff is freaking awesome .,[freaking awesome],[staff],...,0.38782769441604614,842226_3,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,842226_3##999990_2


In [104]:
nonmatched_sent_kp_df['label'] = 0
nonmatched_sent_kp_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,id_x,id_y,aspects_y,opinions_y,sentiments_y,opinion_aspect_pairs_y,key_point,predicted_WA_y,id_pair,label
0,1023298,2,Arts & Entertainment,"['Cinema', 'Arts & Entertainment']","Cinema, Arts & Entertainment",Pollack Tempe Cinemas,Z96TEJDXyUhCerS-9QllXw,The kettle corn is awesome ! !,[awesome],[kettle corn],...,1023298_2,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,1023298_2##1000002_2,0
1,1000054,4,Arts & Entertainment,"['Baseball Fields', 'Professional Sports Teams', 'Active Life', 'Arts & Entertainment', 'Stadiums & Arenas']","Baseball Fields, Professional Sports Teams, Active Life, Arts & Entertainment, Stadiums & Arenas",Progressive Field,cHwgtVdvZVn0AQFtWtUXXg,The food options were also top notch .,[top notch],[food options],...,1000054_4,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,1000054_4##1000002_2,0
2,914735,3,Arts & Entertainment,"['Arts & Entertainment', 'Cinema']","Arts & Entertainment, Cinema",Harkins Camelview at Fashion Square,bCh7vrbbxSKx0mtZnOvQoQ,Keep sniffing that glue Harkins and rest assured I will never PAY DOUBLE for anything especially a lousy movie .,[lousy],[movie],...,914735_3,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,914735_3##1000002_2,0
3,826256,5,Arts & Entertainment,"['Arts & Entertainment', 'Museums']","Arts & Entertainment, Museums",Heard Museum,d9wSnfW2kaJR-_auyK9G4A,However the katina doll collection was awesome .,[awesome],[katina doll collection],...,826256_5,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,826256_5##1000002_2,0
4,826270,1,Arts & Entertainment,"['Arts & Entertainment', 'Museums']","Arts & Entertainment, Museums",Heard Museum,d9wSnfW2kaJR-_auyK9G4A,A must try for the adventurous foodie .,[adventurous],[foodie],...,826270_1,1000002_2,[family fun],[great],['positive'],['great family fun'],But overall great family fun .,0.4203305244445801,826270_1##1000002_2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50051,880945,1,Arts & Entertainment,"['Performing Arts', 'Arts & Entertainment']","Performing Arts, Arts & Entertainment",Rock of Ages,_F3AMoo_zdl-he384ISQbw,Ushers are all standby to guide you to the seats .,[standby],[ushers],...,880945_1,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,880945_1##999990_2,0
50052,929246,2,Arts & Entertainment,"['Cinema', 'Arts & Entertainment']","Cinema, Arts & Entertainment",Cinemark Mesa 16,lqCJiHlxzRpuYt-u_XQUvg,Management was horrible and I 'm beyond disappointed with how rude and unprofessional they were !,[horrible],[management],...,929246_2,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,929246_2##999990_2,0
50053,901588,0,Arts & Entertainment,"['Casinos', 'Nightlife', 'Arts & Entertainment', 'Bars', 'Adult Entertainment']","Casinos, Nightlife, Arts & Entertainment, Bars, Adult Entertainment",Playboy Club,3g8zzg_5__kXMUM-8F6V1w,Sexy dealers .,[sexy],[dealers],...,901588_0,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,901588_0##999990_2,0
50054,842226,3,Arts & Entertainment,"['Arts & Entertainment', 'Arcades', 'Active Life', 'Venues & Event Spaces', 'Party & Event Planning', 'Amusement Parks', 'Event Planning & Services', 'Mini Golf', 'Laser Tag']","Arts & Entertainment, Arcades, Active Life, Venues & Event Spaces, Party & Event Planning, Amusement Parks, Event Planning & Services, Mini Golf, Laser Tag",King Putt Mini,EH3yMY64M0AQNpPj3EERWQ,Their staff is freaking awesome .,[freaking awesome],[staff],...,842226_3,999990_2,[concessions],[friendly and attentive],['positive'],['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,842226_3##999990_2,0


In [105]:
nonmatched_sent_kp_df['domain']

0        Arts & Entertainment
1        Arts & Entertainment
2        Arts & Entertainment
3        Arts & Entertainment
4        Arts & Entertainment
                 ...         
50051    Arts & Entertainment
50052    Arts & Entertainment
50053    Arts & Entertainment
50054    Arts & Entertainment
50055    Arts & Entertainment
Name: domain, Length: 50056, dtype: object

### Create Dataset

In [106]:
sent_kp_dataset_df = pd.concat([matched_sent_kp_df, nonmatched_sent_kp_df])
sent_kp_dataset_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,opinion_aspect_pairs_y,key_point,predicted_WA_y,label,aspects_similarity,aspects_x_len,aspects_y_len,id_pair,sentiments_x,sentiments_y
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,['very fast wifi'],Wifi was very fast ~ 20MBps .,0.5944222211837769,1.0,,,,487383_2##702558_5,,
3,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,['family friendly place'],"Relaxed , quiet , family friendly place .",0.5604457855224609,1.0,,,,487385_0##777430_1,,
4,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,['family oriented place'],Definitely a family oriented place .,0.43907630443573,1.0,,,,487385_0##511294_4,,
27,487386,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Staff was friendly .,[friendly],[staff],...,['above and beyond staff'],Staff goes above and beyond .,0.5060063004493713,1.0,,,,487386_3##922097_1,,
28,487386,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Staff was friendly .,[friendly],[staff],...,['amazing staff'],Amazing staff & customer service .,0.5119611024856567,1.0,,,,487386_3##884704_1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50051,880945,1,Arts & Entertainment,"['Performing Arts', 'Arts & Entertainment']","Performing Arts, Arts & Entertainment",Rock of Ages,_F3AMoo_zdl-he384ISQbw,Ushers are all standby to guide you to the seats .,[standby],[ushers],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,880945_1##999990_2,['negative'],['positive']
50052,929246,2,Arts & Entertainment,"['Cinema', 'Arts & Entertainment']","Cinema, Arts & Entertainment",Cinemark Mesa 16,lqCJiHlxzRpuYt-u_XQUvg,Management was horrible and I 'm beyond disappointed with how rude and unprofessional they were !,[horrible],[management],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,929246_2##999990_2,['negative'],['positive']
50053,901588,0,Arts & Entertainment,"['Casinos', 'Nightlife', 'Arts & Entertainment', 'Bars', 'Adult Entertainment']","Casinos, Nightlife, Arts & Entertainment, Bars, Adult Entertainment",Playboy Club,3g8zzg_5__kXMUM-8F6V1w,Sexy dealers .,[sexy],[dealers],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,901588_0##999990_2,['positive'],['positive']
50054,842226,3,Arts & Entertainment,"['Arts & Entertainment', 'Arcades', 'Active Life', 'Venues & Event Spaces', 'Party & Event Planning', 'Amusement Parks', 'Event Planning & Services', 'Mini Golf', 'Laser Tag']","Arts & Entertainment, Arcades, Active Life, Venues & Event Spaces, Party & Event Planning, Amusement Parks, Event Planning & Services, Mini Golf, Laser Tag",King Putt Mini,EH3yMY64M0AQNpPj3EERWQ,Their staff is freaking awesome .,[freaking awesome],[staff],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,842226_3##999990_2,['positive'],['positive']


In [107]:
sent_kp_dataset_df.loc[pd.isnull(sent_kp_dataset_df['sentiments_x']), 'sentiments_x'] = sent_kp_dataset_df.loc[pd.isnull(sent_kp_dataset_df['sentiments_x']), 'sentiments']
sent_kp_dataset_df.loc[pd.isnull(sent_kp_dataset_df['sentiments_y']), 'sentiments_y'] = sent_kp_dataset_df.loc[pd.isnull(sent_kp_dataset_df['sentiments_y']), 'sentiments']

In [108]:
sent_kp_dataset_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,opinion_aspect_pairs_y,key_point,predicted_WA_y,label,aspects_similarity,aspects_x_len,aspects_y_len,id_pair,sentiments_x,sentiments_y
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,['very fast wifi'],Wifi was very fast ~ 20MBps .,0.5944222211837769,1.0,,,,487383_2##702558_5,['positive'],['positive']
3,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,['family friendly place'],"Relaxed , quiet , family friendly place .",0.5604457855224609,1.0,,,,487385_0##777430_1,['positive'],['positive']
4,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,['family oriented place'],Definitely a family oriented place .,0.43907630443573,1.0,,,,487385_0##511294_4,['positive'],['positive']
27,487386,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Staff was friendly .,[friendly],[staff],...,['above and beyond staff'],Staff goes above and beyond .,0.5060063004493713,1.0,,,,487386_3##922097_1,['positive'],['positive']
28,487386,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Staff was friendly .,[friendly],[staff],...,['amazing staff'],Amazing staff & customer service .,0.5119611024856567,1.0,,,,487386_3##884704_1,['positive'],['positive']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50051,880945,1,Arts & Entertainment,"['Performing Arts', 'Arts & Entertainment']","Performing Arts, Arts & Entertainment",Rock of Ages,_F3AMoo_zdl-he384ISQbw,Ushers are all standby to guide you to the seats .,[standby],[ushers],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,880945_1##999990_2,['negative'],['positive']
50052,929246,2,Arts & Entertainment,"['Cinema', 'Arts & Entertainment']","Cinema, Arts & Entertainment",Cinemark Mesa 16,lqCJiHlxzRpuYt-u_XQUvg,Management was horrible and I 'm beyond disappointed with how rude and unprofessional they were !,[horrible],[management],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,929246_2##999990_2,['negative'],['positive']
50053,901588,0,Arts & Entertainment,"['Casinos', 'Nightlife', 'Arts & Entertainment', 'Bars', 'Adult Entertainment']","Casinos, Nightlife, Arts & Entertainment, Bars, Adult Entertainment",Playboy Club,3g8zzg_5__kXMUM-8F6V1w,Sexy dealers .,[sexy],[dealers],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,901588_0##999990_2,['positive'],['positive']
50054,842226,3,Arts & Entertainment,"['Arts & Entertainment', 'Arcades', 'Active Life', 'Venues & Event Spaces', 'Party & Event Planning', 'Amusement Parks', 'Event Planning & Services', 'Mini Golf', 'Laser Tag']","Arts & Entertainment, Arcades, Active Life, Venues & Event Spaces, Party & Event Planning, Amusement Parks, Event Planning & Services, Mini Golf, Laser Tag",King Putt Mini,EH3yMY64M0AQNpPj3EERWQ,Their staff is freaking awesome .,[freaking awesome],[staff],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,842226_3##999990_2,['positive'],['positive']


In [109]:
sent_kp_dataset_df = sent_kp_dataset_df.drop(columns=['sentiments'])
sent_kp_dataset_df

Unnamed: 0,index,level_1,domain,categories_list,categories,name,business_id,sentences,opinions_x,aspects_x,...,opinion_aspect_pairs_y,key_point,predicted_WA_y,label,aspects_similarity,aspects_x_len,aspects_y_len,id_pair,sentiments_x,sentiments_y
1,487383,2,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Free WiFi in lobby,[free],[wifi],...,['very fast wifi'],Wifi was very fast ~ 20MBps .,0.5944222211837769,1.0,,,,487383_2##702558_5,['positive'],['positive']
3,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,['family friendly place'],"Relaxed , quiet , family friendly place .",0.5604457855224609,1.0,,,,487385_0##777430_1,['positive'],['positive']
4,487385,0,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Been staying at this hotel for years now and still enjoy the place .,[enjoy],[place],...,['family oriented place'],Definitely a family oriented place .,0.43907630443573,1.0,,,,487385_0##511294_4,['positive'],['positive']
27,487386,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Staff was friendly .,[friendly],[staff],...,['above and beyond staff'],Staff goes above and beyond .,0.5060063004493713,1.0,,,,487386_3##922097_1,['positive'],['positive']
28,487386,3,Hotels,"['Event Planning & Services', 'Venues & Event Spaces', 'Hotels', 'Hotels & Travel']","Event Planning & Services, Venues & Event Spaces, Hotels, Hotels & Travel",Sheraton Centre Toronto Hotel,M7FOXsuEuRbwEfxUrNKv5w,Staff was friendly .,[friendly],[staff],...,['amazing staff'],Amazing staff & customer service .,0.5119611024856567,1.0,,,,487386_3##884704_1,['positive'],['positive']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50051,880945,1,Arts & Entertainment,"['Performing Arts', 'Arts & Entertainment']","Performing Arts, Arts & Entertainment",Rock of Ages,_F3AMoo_zdl-he384ISQbw,Ushers are all standby to guide you to the seats .,[standby],[ushers],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,880945_1##999990_2,['negative'],['positive']
50052,929246,2,Arts & Entertainment,"['Cinema', 'Arts & Entertainment']","Cinema, Arts & Entertainment",Cinemark Mesa 16,lqCJiHlxzRpuYt-u_XQUvg,Management was horrible and I 'm beyond disappointed with how rude and unprofessional they were !,[horrible],[management],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,929246_2##999990_2,['negative'],['positive']
50053,901588,0,Arts & Entertainment,"['Casinos', 'Nightlife', 'Arts & Entertainment', 'Bars', 'Adult Entertainment']","Casinos, Nightlife, Arts & Entertainment, Bars, Adult Entertainment",Playboy Club,3g8zzg_5__kXMUM-8F6V1w,Sexy dealers .,[sexy],[dealers],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,901588_0##999990_2,['positive'],['positive']
50054,842226,3,Arts & Entertainment,"['Arts & Entertainment', 'Arcades', 'Active Life', 'Venues & Event Spaces', 'Party & Event Planning', 'Amusement Parks', 'Event Planning & Services', 'Mini Golf', 'Laser Tag']","Arts & Entertainment, Arcades, Active Life, Venues & Event Spaces, Party & Event Planning, Amusement Parks, Event Planning & Services, Mini Golf, Laser Tag",King Putt Mini,EH3yMY64M0AQNpPj3EERWQ,Their staff is freaking awesome .,[freaking awesome],[staff],...,['friendly and attentive concessions'],Concessions are friendly and attentive .,0.48531198501586914,0.0,,,,842226_3##999990_2,['positive'],['positive']


In [110]:
sent_kp_dataset_df = sent_kp_dataset_df.rename(columns={'opinion_aspect_pairs_x': 'opinion_phrases_x',
                                  'opinion_aspect_pairs_y': 'opinion_phrases_y'})
for col in ['opinion_phrases_x', 'opinion_phrases_y', 'sentiments_x', 'sentiments_y']:
    sent_kp_dataset_df[col] = sent_kp_dataset_df[col].apply(lambda x: ast.literal_eval(x))

In [112]:
sent_kp_dataset_df.to_csv("../data/training/train_data.csv", index=False)

In [113]:
sent_kp_dataset_df.to_pickle("../data/training/train_data.pkl")