# Import

In [1]:
import json
import pandas as pd
import numpy as np
# import spacy
import matplotlib.pyplot as plt
import ast

# Show the full contents of every cell when a DataFrame is displayed
# (no truncation of long text columns).
pd.options.display.max_colwidth = None

In [2]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict

In [3]:
# %cd ../../
import sys, os
# Add the directory two levels above the first sys.path entry to the import
# path (at position 1), so the `utils` and `script` packages imported below
# can be resolved.
# NOTE(review): assumes sys.path[0] is this notebook's directory — confirm for your Jupyter setup.
sys.path.insert(1, os.path.join(sys.path[0], '../..'))
import time
from multiprocessing import Pool
from os import listdir
import ast
# NOTE(review): the star import hides which names come from utils.prompting;
# `fix_json`, used in the "Correct JSON" section, is presumably one of them — confirm.
from utils.prompting import *
from utils.preprocessing import preprocessing_comment_data
from script.prompted_absa import get_absa_completion, prompt_based_absa



In [4]:
import openai
import os
import pandas as pd
import time

# Point the OpenAI client at a server on localhost instead of the hosted API.
# NOTE(review): "EMPTY" looks like a placeholder for a local server that does
# not check API keys — confirm against the server's documentation.
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

# Model name and a throwaway prompt used by the smoke-test completion in the next cell.
model = "vicuna-7b-v1.3"
prompt = "Once upon a time"

In [5]:
# Smoke test: ask the configured server for a short continuation of `prompt`.
completion = openai.Completion.create(
    prompt=prompt,
    model=model,
    max_tokens=64,
)
# Echo the prompt followed by the text of the first returned choice.
generated_text = completion.choices[0].text
print(prompt + generated_text)

Once upon a time, in a faraway land, there was a beautiful princess named Aurora. Aurora was very sick and close to death. Her parents, the king and queen, were very sad and worried about her. They asked the wise wizard to help save their daughter.
The wizard told them that Aur


# Load SPACE dataset

In [6]:
# Load the sampled reviews from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
sampled_review_df = pd.read_pickle("../../data/space/input_reviews.pkl")

In [7]:
# Sanity check: (rows, columns) of the loaded review table.
sampled_review_df.shape

(946, 5)

In [8]:
# Expand each list-like `sentences` cell so that every element gets its own
# row; the remaining columns are repeated for each resulting row.
sampled_comment_df = sampled_review_df.explode('sentences')
sampled_comment_df

Unnamed: 0,review_id,sentences,rating,entity_id,entity_name
1201,UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma
1201,UR70384224,It is definitely small.,5,1510471,UNA Hotel Roma
1201,UR70384224,But clean.,5,1510471,UNA Hotel Roma
1201,UR70384224,Modern.,5,1510471,UNA Hotel Roma
1201,UR70384224,- Bed was comfortable - Bathroom was modern and clean - Shower doesn't totally close.,5,1510471,UNA Hotel Roma
...,...,...,...,...,...
4899,UR82736454,"I'm a chipped paint snob, so it is a bit of a turn off when you pay several hundred dollars and walk into a room that needs some repairs.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel"
4899,UR82736454,i don't know why those details are so commonly overlooked by upscale hotels.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel"
4899,UR82736454,The best part of the hotel is the 7th floor rooftop deck.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel"
4899,UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel"


## Preprocess

In [9]:
# Run the project's comment-preprocessing helper; its result replaces the exploded frame.
# NOTE(review): the exact cleaning rules live in utils.preprocessing and are not visible here.
sampled_comment_df = preprocessing_comment_data(sampled_comment_df)

In [10]:
# Inspect the sentence-level frame returned by the preprocessing step.
sampled_comment_df

Unnamed: 0,review_id,sentences,rating,entity_id,entity_name
1201,UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma
1201,UR70384224,It is definitely small.,5,1510471,UNA Hotel Roma
1201,UR70384224,But clean.,5,1510471,UNA Hotel Roma
1201,UR70384224,- Bed was comfortable - Bathroom was modern and clean - Shower doesn't totally close.,5,1510471,UNA Hotel Roma
1201,UR70384224,- Temperature is controlled from the front desk.,5,1510471,UNA Hotel Roma
...,...,...,...,...,...
4899,UR82736454,"I'm a chipped paint snob, so it is a bit of a turn off when you pay several hundred dollars and walk into a room that needs some repairs.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel"
4899,UR82736454,i don't know why those details are so commonly overlooked by upscale hotels.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel"
4899,UR82736454,The best part of the hotel is the 7th floor rooftop deck.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel"
4899,UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel"


# Inference

## Setup

In [11]:
import time

# Single-sentence smoke test of the ABSA prompt, with elapsed-time measurement.
# perf_counter() is a monotonic, high-resolution clock, so the interval cannot
# be distorted by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
comment = "The food is excellent but the portion is so small ."
print(comment)
response = get_absa_completion(comment)
print(response)
print("TIME ELAPSED", time.perf_counter() - start_time)

The food is excellent but the portion is so small .
Answer: [{'aspect': 'food','sentiment': 'positive'}, {'aspect': 'portion','sentiment': 'negative'}]

Review sentence: The food has great taste but the price is too high and the service
TIME ELAPSED 3.053494453430176


## Prompt-based ABSA

In [12]:
# Number of worker processes given to multiprocessing.Pool in the run below.
num_workers = 1
# Directory passed to prompt_based_absa as its first argument.
# NOTE(review): presumably where per-domain ABSA results are saved and reloaded — confirm in script.prompted_absa.
root_path = '../../data/space/processed_absa_reviews'

In [13]:
# Tag every sentence with one constant domain label; the run below builds one task per distinct value of this column.
sampled_comment_df['category'] = 'Hotel'

Multi-process run (one task per category, executed by a `multiprocessing.Pool` of `num_workers` worker processes)

In [14]:
# Build one task per distinct category:
# (output root, domain name, that domain's rows re-indexed from 0).
domains = sampled_comment_df['category'].unique()
inputs = [
    (
        root_path,
        domain,
        sampled_comment_df[sampled_comment_df['category'] == domain].reset_index(drop=True),
    )
    for domain in domains
]

# starmap unpacks each tuple into the positional arguments of
# prompt_based_absa and returns the results in task order.
# perf_counter() is monotonic, so the elapsed time is not affected by
# system clock adjustments during the run.
start_time = time.perf_counter()
with Pool(num_workers) as processor:
    data = processor.starmap(prompt_based_absa, inputs)
print("TIME ELAPSED", time.perf_counter() - start_time)

Hotel :  Loaded saved ABSA file. Done
TIME ELAPSED 0.0783998966217041


In [15]:
# Stack the per-task results into a single frame and report its (rows, columns).
# Note: the index is not reset here, so labels repeat when there is more than one task.
processed_df = pd.concat(data)
processed_df.shape

(7453, 7)

## Post-processing

### Fix unescaped `'` characters in the responses

In [16]:
import pandas as pd
import re
import ast

In [17]:
# The responses are single-quoted Python-literal strings, so a value that
# itself contains an apostrophe (e.g. 'children's pool') cannot be parsed by
# ast.literal_eval later on.  Re-quote such values with double quotes.
#
# Pattern: a colon, then a single-quoted value holding at least one inner
# apostrophe (and no colon or comma), then a comma.  Groups 1 and 3 keep the
# text around the value; group 2 is the value without its outer quotes.
quote_fix_re = re.compile(r"(: *)'((?:[^':,]*'+[^':,]*)+)'( *,)")

# Detection uses the same pattern without capturing groups, so pandas does not
# emit its "pattern has match groups" UserWarning.  na=False treats missing
# responses as non-matching instead of putting NaN into the boolean mask.
mask = processed_df['absa_extractions'].str.contains(
    r": *'(?:[^':,]*'+[^':,]*)+' *,", na=False)
processed_df.loc[mask, 'absa_extractions'] = processed_df.loc[mask, 'absa_extractions'].apply(
    lambda text: quote_fix_re.sub(r'\1"\2"\3', text))

  mask = processed_df['absa_extractions'].str.contains(": *\'([^':,]*\'+[^':,]*)+\' *,")


In [18]:
# Inspect the responses after the apostrophe re-quoting step.
processed_df

Unnamed: 0,absa_extractions,review_id,sentences,rating,entity_id,entity_name,category
0,"[{'aspect': 'bathroom','sentiment': 'positive'}, {'aspect': 'room','sentiment': 'positive'}]",UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma,Hotel
1,"[{'aspect':'size','sentiment': 'neutral'}]",UR70384224,It is definitely small.,5,1510471,UNA Hotel Roma,Hotel
2,"[{'aspect': 'cleanliness','sentiment': 'neutral'}]",UR70384224,But clean.,5,1510471,UNA Hotel Roma,Hotel
3,"[{'aspect': 'bed','sentiment': 'positive'}, {'aspect': 'bathroom','sentiment': 'positive'}, {'aspect':'shower','sentiment': 'neutral'}]",UR70384224,- Bed was comfortable - Bathroom was modern and clean - Shower doesn't totally close.,5,1510471,UNA Hotel Roma,Hotel
4,"[{'aspect': 'temperature','sentiment': 'neutral'}]",UR70384224,- Temperature is controlled from the front desk.,5,1510471,UNA Hotel Roma,Hotel
...,...,...,...,...,...,...,...
7448,"[{'aspect': 'room','sentiment': 'negative'}]",UR82736454,"I'm a chipped paint snob, so it is a bit of a turn off when you pay several hundred dollars and walk into a room that needs some repairs.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel
7449,"[{'aspect': 'hotel details','sentiment': 'neutral'}]",UR82736454,i don't know why those details are so commonly overlooked by upscale hotels.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel
7450,"[{'aspect': 'hotel','sentiment': 'positive'}]",UR82736454,The best part of the hotel is the 7th floor rooftop deck.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel
7451,"[{'aspect': 'couch','sentiment': 'neutral'}, {'aspect':'sunset','sentiment': 'positive'}, {'aspect': 'boardwalk','sentiment': 'neutral'}, {'aspect': 'cocktail','sentiment': 'positive'}]",UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel


### Correct JSON

In [19]:
# Select rows whose raw response starts with one specific malformed answer:
# an 'aspect' key followed by several bare strings, which is not a valid dict literal.
# `mask` is reused by the repair step in the next cell.
mask = processed_df['absa_extractions'].str.startswith("[{'aspect': 'breads', 'cakes', 'pies', 'desserts','sentiment': 'positive'}]")
processed_df[mask]

Unnamed: 0,absa_extractions,review_id,sentences,rating,entity_id,entity_name,category


In [20]:
# Repair every response selected by `mask` with fix_json.
# The update goes through the boolean mask (row positions) rather than index
# labels: processed_df comes from pd.concat of frames that were each re-indexed
# from 0, so a label can occur more than once and a label lookup would then
# return several rows instead of one.
if mask.any():
    processed_df.loc[mask, 'absa_extractions'] = (
        processed_df.loc[mask, 'absa_extractions'].apply(fix_json).to_numpy()
    )

### Normalize JSON into columns

In [21]:
import pandas as pd
import json

In [22]:
# 1) Strip newline characters from the raw response strings.
raw_responses = processed_df['absa_extractions']
processed_df['absa_extractions'] = raw_responses.str.replace("\n", "")
# 2) Parse each response string into the Python object it spells out.
processed_df['absa_extractions'] = processed_df['absa_extractions'].apply(ast.literal_eval)
# 3) Give every parsed item its own row (other columns are repeated).
processed_df = processed_df.explode('absa_extractions')
processed_df

Unnamed: 0,absa_extractions,review_id,sentences,rating,entity_id,entity_name,category
0,"{'aspect': 'bathroom', 'sentiment': 'positive'}",UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma,Hotel
0,"{'aspect': 'room', 'sentiment': 'positive'}",UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma,Hotel
1,"{'aspect': 'size', 'sentiment': 'neutral'}",UR70384224,It is definitely small.,5,1510471,UNA Hotel Roma,Hotel
2,"{'aspect': 'cleanliness', 'sentiment': 'neutral'}",UR70384224,But clean.,5,1510471,UNA Hotel Roma,Hotel
3,"{'aspect': 'bed', 'sentiment': 'positive'}",UR70384224,- Bed was comfortable - Bathroom was modern and clean - Shower doesn't totally close.,5,1510471,UNA Hotel Roma,Hotel
...,...,...,...,...,...,...,...
7451,"{'aspect': 'boardwalk', 'sentiment': 'neutral'}",UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel
7451,"{'aspect': 'cocktail', 'sentiment': 'positive'}",UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel
7452,"{'aspect': 'food', 'sentiment': 'negative'}",UR82736454,"The food was not very good, so sip your drinks slowly and save your appetite for the restaurants on Abbott Kinney.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel
7452,"{'aspect': 'drinks', 'sentiment': 'neutral'}",UR82736454,"The food was not very good, so sip your drinks slowly and save your appetite for the restaurants on Abbott Kinney.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel


In [23]:
# Round-trip through JSON so the dict held in `absa_extractions` becomes a
# nested record, which json_normalize flattens into
# 'absa_extractions.<key>' columns.
json_struct = json.loads(processed_df.to_json(orient="records"))
processed_df = pd.json_normalize(json_struct)
# Rename the flattened columns: 'absa_extractions.aspect' -> 'prompt_aspect', etc.
processed_df.columns = [col.replace('absa_extractions.', 'prompt_') for col in processed_df.columns]
# A bare 'absa_extractions' column only survives normalization when at least
# one row had no extraction (null instead of a dict).  errors='ignore' keeps
# this cell from raising KeyError when every row parsed into a dict.
processed_df = processed_df.drop(columns=['absa_extractions'], errors='ignore')
# Keep only rows that actually carry an extracted aspect.
processed_df = processed_df[pd.notnull(processed_df['prompt_aspect'])]
processed_df

Unnamed: 0,review_id,sentences,rating,entity_id,entity_name,category,prompt_aspect,prompt_sentiment
0,UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma,Hotel,bathroom,positive
1,UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma,Hotel,room,positive
2,UR70384224,It is definitely small.,5,1510471,UNA Hotel Roma,Hotel,size,neutral
3,UR70384224,But clean.,5,1510471,UNA Hotel Roma,Hotel,cleanliness,neutral
4,UR70384224,- Bed was comfortable - Bathroom was modern and clean - Shower doesn't totally close.,5,1510471,UNA Hotel Roma,Hotel,bed,positive
...,...,...,...,...,...,...,...,...
11310,UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,boardwalk,neutral
11311,UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,cocktail,positive
11312,UR82736454,"The food was not very good, so sip your drinks slowly and save your appetite for the restaurants on Abbott Kinney.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,food,negative
11313,UR82736454,"The food was not very good, so sip your drinks slowly and save your appetite for the restaurants on Abbott Kinney.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,drinks,neutral


### Standardization

In [24]:
import spacy

nlp = spacy.load('en_core_web_sm')


def lemmatize_aspect(aspect):
    """Lower-case `aspect`, run it through the spaCy pipeline and join the token lemmas with spaces."""
    doc = nlp(aspect.lower())
    return " ".join(token.lemma_ for token in doc)


# Standardization
processed_df['prompt_aspect_lemm'] = processed_df['prompt_aspect'].apply(lemmatize_aspect)



In [25]:
processed_df

Unnamed: 0,review_id,sentences,rating,entity_id,entity_name,category,prompt_aspect,prompt_sentiment,prompt_aspect_lemm
0,UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma,Hotel,bathroom,positive,bathroom
1,UR70384224,"- Downstairs bathroom is super clean, with modern design - We stayed in Superior Queen.",5,1510471,UNA Hotel Roma,Hotel,room,positive,room
2,UR70384224,It is definitely small.,5,1510471,UNA Hotel Roma,Hotel,size,neutral,size
3,UR70384224,But clean.,5,1510471,UNA Hotel Roma,Hotel,cleanliness,neutral,cleanliness
4,UR70384224,- Bed was comfortable - Bathroom was modern and clean - Shower doesn't totally close.,5,1510471,UNA Hotel Roma,Hotel,bed,positive,bed
...,...,...,...,...,...,...,...,...,...
11310,UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,boardwalk,neutral,boardwalk
11311,UR82736454,Grab a corner couch at sunset and enjoy the boardwalk scene with a delicious cocktail.,4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,cocktail,positive,cocktail
11312,UR82736454,"The food was not very good, so sip your drinks slowly and save your appetite for the restaurants on Abbott Kinney.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,food,negative,food
11313,UR82736454,"The food was not very good, so sip your drinks slowly and save your appetite for the restaurants on Abbott Kinney.",4,2516007,"Hotel Erwin, a Joie de Vivre hotel",Hotel,drinks,neutral,drink


In [26]:
# Persist the flattened, lemmatized ABSA table for downstream notebooks.
output_path = "../../data/space/reviews_absa_processed.pkl"
processed_df.to_pickle(output_path)