In [1]:
import torch
# Clear PyTorch's CUDA cache up front, before any model is loaded in this session.
torch.cuda.empty_cache()

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch



In [4]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
from sklearn.metrics import recall_score, precision_score

In [5]:
# Do not truncate long text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Load Test dataset

In [6]:
# NOTE(review): unpickling can execute arbitrary code — only load a trusted file here.
df = pd.read_pickle("../data/Evaluation/test_data.pkl")

In [7]:
# Preview the loaded test frame (one row per comment / key point pair with its label).
df

Unnamed: 0,HITId,comment_id,label,full_key_point,key_point_id,comment,full_comment,attributes,topic,isMultiAspect,...,old_key_point_id,key_point,aspects_x,opinions_x,opinion_aspect_pairs_x,sentiments_x,aspects_y,opinions_y,opinion_aspect_pairs_y,sentiments_y
0,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,1,Terrible administration and management .,kp_0_2,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,...,kp_0_2,terrible administration <SEP> negative,"[experience, employees]","[uninviting, rude]","[uninviting experience, rude employees]","[negative, negative]",administration,terrible,['terrible administration'],['negative']
1,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,1,Rude and Unprofessional staff .,kp_0_24,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,...,kp_0_24,unprofessional staff <SEP> negative,"[experience, employees]","[uninviting, rude]","[uninviting experience, rude employees]","[negative, negative]",staff,unprofessional,['unprofessional staff'],['negative']
2,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,0,Great for conventions and concerts ! !,kp_0_4,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,...,kp_0_4,great for conventions <SEP> positive,"[experience, employees]","[uninviting, rude]","[uninviting experience, rude employees]","[negative, negative]",conventions,great for,['great for conventions'],['positive']
3,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,0,Horrible customer service .,kp_0_5,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,...,kp_0_5,horrible customer service <SEP> negative,"[experience, employees]","[uninviting, rude]","[uninviting experience, rude employees]","[negative, negative]",customer service,horrible,['horrible customer service'],['negative']
4,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,0,It had wonderful acoustics .,kp_0_0,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,...,kp_0_0,wonderful acoustics <SEP> positive,"[experience, employees]","[uninviting, rude]","[uninviting experience, rude employees]","[negative, negative]",acoustics,wonderful,['wonderful acoustics'],['positive']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6794,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Great family owned restaurant .,kp_4_25,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,...,kp_4_25,great family owned restaurant <SEP> positive,[staff],[friendly],[friendly staff],[positive],restaurant,great family owned,['great family owned restaurant'],['positive']
6795,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Worst service in the world !,kp_4_27,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,...,kp_4_27,worst service <SEP> negative,[staff],[friendly],[friendly staff],[positive],service,worst,['worst service'],['negative']
6796,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Rude and inattentive staff .,kp_4_30,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,...,kp_4_30,inattentive staff <SEP> negative,[staff],[friendly],[friendly staff],[positive],staff,inattentive,['inattentive staff'],['negative']
6797,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Great sushi and sashimi .,kp_4_31,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,...,kp_4_31,great sushi <SEP> positive,[staff],[friendly],[friendly staff],[positive],sushi,great,['great sushi'],['positive']


In [8]:
# Count rows per value of the match label to see how imbalanced the test set is.
df['label'].value_counts()

0    6380
1     419
Name: label, dtype: int64

# Evaluation

In [9]:
from utils.KeyPointEvaluator import *
from utils.KeyPointEvaluatorRev import *

In [10]:
# NOTE(review): bare name left over from exploration — it only echoes the function
# repr to confirm the star imports above resolved; it has no effect on the analysis.
load_predictions

<function utils.KeyPointEvaluator.load_predictions(preds)>

In [11]:
def get_model(base_model_dir, category):
    """Load the last checkpoint saved for ``category`` as a SentenceTransformer.

    Args:
        base_model_dir: Directory containing one sub-directory per category.
        category: Name of the category sub-directory to look in.

    Returns:
        The ``SentenceTransformer`` built from
        ``<base_model_dir>/<category>/<checkpoint>/``, where ``<checkpoint>`` is
        the last non-hidden sub-directory in natural (number-aware) name order.
        This assumes checkpoint names sort in training order.

    Raises:
        FileNotFoundError: If the category directory is missing or holds no
            checkpoint sub-directories.
    """
    # Local imports: this notebook never imports `os` / `re` explicitly.
    import os
    import re

    models_dir = f'{base_model_dir}/{category}'

    def natural_key(name):
        # re.split with a capturing group alternates text / digit chunks, so odd
        # positions are numbers: "checkpoint-1000" sorts after "checkpoint-500".
        chunks = re.split(r'(\d+)', name)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    # os.listdir() returns entries in arbitrary order and may include stray
    # entries (e.g. ".ipynb_checkpoints"), so filter and sort before picking.
    checkpoints = sorted(
        (
            entry for entry in os.listdir(models_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(models_dir, entry))
        ),
        key=natural_key,
    )
    if not checkpoints:
        raise FileNotFoundError(f'No checkpoint directories found in {models_dir}')

    models_path = f'{models_dir}/{checkpoints[-1]}/'
    return SentenceTransformer(models_path)

In [12]:
def do_eval(df, model, in_category=True):
    """Score ``model`` on ``df`` and return the matches with their precision metric.

    Args:
        df: Test rows to evaluate; passed to ``prepare_comment_kp_label_input``.
        model: Model handed to ``perform_preds``.
        in_category: Forwarded unchanged to ``perform_preds``.

    Returns:
        ``(merged_df, precisions)``: the output of ``get_predictions`` with
        duplicate ``(comment_id, key_point_id)`` pairs dropped, and the value of
        ``calc_mean_average_precision`` computed on it against ``"label"``.
    """
    comments, key_points, gold_labels = prepare_comment_kp_label_input(df)

    # Run the model over the prepared comment / key point frames.
    raw_preds = perform_preds(model, comments, key_points, in_category)

    # Keep the best predicted KP per review sentence, one row per (comment, KP) pair.
    matches = get_predictions(raw_preds, gold_labels, comments).drop_duplicates(
        subset=['comment_id', 'key_point_id']
    )

    return matches, calc_mean_average_precision(matches, "label")

## In-category Evaluation

### All Comments in test data

In [13]:
# Evaluate each category's in-category model on all test comments of that category.
print("########## IN-CATEGORY EVALUATION ##########")
base_model_dir = './model/ASKPA/in-category/'
perf = []
for category in sorted(df['topic'].unique()):
    model = get_model(base_model_dir, category)

    merged_df, precisions = do_eval(df[df['topic'] == category], model)
    # Drop the model and clear PyTorch's CUDA cache before loading the next one.
    del model
    torch.cuda.empty_cache()

    perf.append({'Business Category': category, 'Average Precision': precisions})

# Building from records keeps 'Average Precision' numeric instead of object dtype.
perf_df = pd.DataFrame(perf)
perf_df

########## IN-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,1.0
1,Automotive,0.774513
2,Beauty & Spas,0.977106
3,Hotels,0.988942
4,Restaurants,0.865774


### Comments containing multiple opinions in test data

In [14]:
# Evaluate each category's in-category model on multi-aspect comments only.
print("########## IN-CATEGORY EVALUATION ##########")
base_model_dir = './model/ASKPA/in-category/'
perf = []
for category in sorted(df['topic'].unique()):
    model = get_model(base_model_dir, category)

    merged_df, precisions = do_eval(
        df[(df['topic'] == category) & (df['isMultiAspect'] == True)], model
    )
    # Drop the model and clear PyTorch's CUDA cache before loading the next one.
    del model
    torch.cuda.empty_cache()

    perf.append({'Business Category': category, 'Average Precision': precisions})

# Building from records keeps 'Average Precision' numeric instead of object dtype.
perf_df = pd.DataFrame(perf)
perf_df

########## IN-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,1.0
1,Automotive,0.79619
2,Beauty & Spas,0.942857
3,Hotels,0.926429
4,Restaurants,0.827243


## Out-of-category Evaluation

### All Comments in test data

In [15]:
# Evaluate each category's out-of-category model on all test comments of that category.
print("########## OUT-OF-CATEGORY EVALUATION ##########")
base_model_dir = './model/ASKPA/out-of-category/'
perf = []
for category in sorted(df['topic'].unique()):
    model = get_model(base_model_dir, category)

    merged_df, precisions = do_eval(df[df['topic'] == category], model, in_category=False)
    # Drop the model and clear PyTorch's CUDA cache before loading the next one.
    del model
    torch.cuda.empty_cache()

    perf.append({'Business Category': category, 'Average Precision': precisions})

# Building from records keeps 'Average Precision' numeric instead of object dtype.
perf_df = pd.DataFrame(perf)
perf_df

########## OUT-OF-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,0.979641
1,Automotive,0.763807
2,Beauty & Spas,0.939028
3,Hotels,0.983829
4,Restaurants,0.892744


### Comments containing multiple opinions in test data

In [16]:
# Evaluate each category's out-of-category model on multi-aspect comments only.
print("########## OUT-OF-CATEGORY EVALUATION ##########")
base_model_dir = './model/ASKPA/out-of-category/'
perf = []
for category in sorted(df['topic'].unique()):
    model = get_model(base_model_dir, category)

    merged_df, precisions = do_eval(
        df[(df['topic'] == category) & (df['isMultiAspect'] == True)],
        model,
        in_category=False,
    )
    # Drop the model and clear PyTorch's CUDA cache before loading the next one.
    del model
    torch.cuda.empty_cache()

    perf.append({'Business Category': category, 'Average Precision': precisions})

# Building from records keeps 'Average Precision' numeric instead of object dtype.
perf_df = pd.DataFrame(perf)
perf_df

########## OUT-OF-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,1.0
1,Automotive,0.642857
2,Beauty & Spas,0.770833
3,Hotels,0.917582
4,Restaurants,0.77234


In [17]:
# NOTE(review): `merged_df` here is whatever the last iteration of the loop above
# left behind (the final category in sorted order), not a combined result.
merged_df

Unnamed: 0,comment_id_new,comment_id,topic,key_point_id,score,label
0,arg_4_1456_0,arg_4_1456,Restaurants,kp_4_29,0.999401,1
1,arg_4_1456_1,arg_4_1456,Restaurants,kp_4_6,1.0,1
2,arg_4_1766_0,arg_4_1766,Restaurants,kp_4_25,0.992791,0
4,arg_4_1766_1,arg_4_1766,Restaurants,kp_4_12,0.995837,0
6,arg_4_1829_0,arg_4_1829,Restaurants,kp_4_26,0.45731,0
7,arg_4_1829_1,arg_4_1829,Restaurants,kp_4_8,0.648316,0
8,arg_4_1973_0,arg_4_1973,Restaurants,kp_4_3,0.510202,0
9,arg_4_1973_1,arg_4_1973,Restaurants,kp_4_17,0.461653,1
10,arg_4_2015_0,arg_4_2015,Restaurants,kp_4_17,0.625933,0
11,arg_4_2015_1,arg_4_2015,Restaurants,kp_4_10,0.941599,1


# Quantitative KP Coverage and Precision/Recall

In [18]:
from sklearn.metrics import recall_score, precision_score

def get_top_kp_coverages(merged_df, df, category, top=None, selected_kps=()):
    """Select the annotated and predicted matches for a category's key points.

    Args:
        merged_df: Predicted matches; needs ``topic`` and ``key_point_id`` columns.
        df: Annotated frame with ``label``, ``topic`` and ``full_key_point`` columns.
        category: Value of ``topic`` to restrict both frames to.
        top: If given, keep only the ``top`` key points with the most positive
            annotations in this category. Takes precedence over ``selected_kps``.
        selected_kps: Key point texts to keep when ``top`` is ``None``. An empty
            collection means "no filtering".

    Returns:
        ``(valid_df, pred_df)``: the positively labelled rows of ``df`` for the
        category, and the category's rows of ``merged_df`` joined with the key
        point frame on ``key_point_id``, both restricted to the chosen key points.
    """
    _, kp_df, _ = prepare_comment_kp_label_input(df)

    valid_df = df[(df['label'] == 1) & (df['topic'] == category)]
    pred_df = merged_df[merged_df['topic'] == category].merge(kp_df, on=['key_point_id'])

    if top is not None:
        wanted = valid_df['full_key_point'].value_counts().head(top).index
    elif len(selected_kps) > 0:
        wanted = list(selected_kps)
    else:
        return valid_df, pred_df

    # Filter the already label/topic-restricted frame. The previous `top` branch
    # re-filtered the raw `df`, which pulled negatives and other topics back in.
    return (
        valid_df[valid_df['full_key_point'].isin(wanted)],
        pred_df[pred_df['full_key_point'].isin(wanted)],
    )

In [19]:
def calculate_precision_recall(valid_df, pred_df):
    """Compute precision and recall of predicted matches against annotated ones.

    Args:
        valid_df: Annotated rows with ``comment_id``, ``key_point_id`` and ``label``.
        pred_df: Predicted matches with ``comment_id`` and ``key_point_id``.
            It is not modified.

    Returns:
        ``(precision, recall)`` as returned by ``precision_score`` and
        ``recall_score``. Every row of ``pred_df`` counts as a positive
        prediction; predictions with no annotated row count as label 0, and
        annotated rows with no prediction count as predicted 0.
    """
    keys = ['comment_id', 'key_point_id']
    # Work on a fresh frame so the caller's `pred_df` does not gain a column.
    predicted_pairs = pred_df[keys].assign(pred_label=1)

    # Precision: one row per prediction; unmatched predictions get label 0.
    per_prediction = valid_df.merge(predicted_pairs, on=keys, how='right')
    per_prediction['label'] = per_prediction['label'].fillna(0)
    precision = precision_score(per_prediction['label'], per_prediction['pred_label'])

    # Recall: one row per annotation; unmatched annotations get pred_label 0.
    per_annotation = valid_df.merge(predicted_pairs, on=keys, how='left')
    per_annotation['pred_label'] = per_annotation['pred_label'].fillna(0)
    recall = recall_score(per_annotation['label'], per_annotation['pred_label'])

    return precision, recall

In [20]:
def generate_coverage_table(valid_df, pred_df):
    """Tabulate predicted vs. annotated match counts per key point.

    Args:
        valid_df: Annotated matches with a ``full_key_point`` column.
        pred_df: Predicted matches with a ``full_key_point`` column.

    Returns:
        DataFrame with columns ``Key Points``, ``Predicted Coverage`` and
        ``Actual Coverage``. Rows are ordered by predicted count (descending);
        key points that were annotated but never predicted follow with a
        predicted count of 0.
    """
    predicted = pred_df['full_key_point'].value_counts()
    actual = valid_df['full_key_point'].value_counts()

    # Align both counts on the key point text. The two value_counts() results are
    # sorted independently, so attaching them by position can pair a count with
    # the wrong key point (and fails when one side is missing a key point).
    never_predicted = [kp for kp in actual.index if kp not in predicted.index]
    key_points = list(predicted.index) + never_predicted

    return pd.DataFrame({
        'Key Points': key_points,
        'Predicted Coverage': predicted.reindex(key_points, fill_value=0).to_numpy(),
        'Actual Coverage': actual.reindex(key_points, fill_value=0).to_numpy(),
    })

**Only the in-category results are used for this evaluation.**

## Arts & Entertainment

In [21]:
# Category analysed in this section; the next cell reads `category` to load and score its model.
category = "Arts & Entertainment"

In [22]:
# Score the in-category model for `category`; `merged_df` feeds the coverage cells below.
base_model_dir = './model/ASKPA/in-category/'
model = get_model(base_model_dir, category)
merged_df, precisions = do_eval(df.loc[df['topic'] == category], model)
# Drop the model reference and clear PyTorch's CUDA cache.
del model
torch.cuda.empty_cache()

Calculate Precision/Recall of model performance on important KPs (top 3 KPs from the human annotation)

In [23]:
# Key point texts are matched exactly against `full_key_point`; the trailing spaces
# presumably mirror the raw annotation text — do not strip without checking the data.
selected_kps = ['Friendly and helpful staff .', 'Seats are adequately comfortable .   ', 'Horrible customer service .']
# Reuse `category` (which produced `merged_df`) rather than repeating the literal.
valid_df, pred_df = get_top_kp_coverages(merged_df, df, category, top=None, selected_kps=selected_kps)

In [24]:
# Compare predicted vs. annotated match counts for the selected key points.
generate_coverage_table(valid_df, pred_df)

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,Friendly and helpful staff .,10,14
1,Seats are adequately comfortable .,4,4
2,Horrible customer service .,2,3


In [25]:
# Precision / recall of the predictions restricted to the selected key points.
precision, recall = calculate_precision_recall(valid_df, pred_df)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 1.00
Recall: 0.76


## Automotive

In [26]:
# Category analysed in this section; the next cell reads `category` to load and score its model.
category = "Automotive"

In [27]:
# Score the in-category model for `category`; `merged_df` feeds the coverage cells below.
base_model_dir = './model/ASKPA/in-category/'
model = get_model(base_model_dir, category)
merged_df, precisions = do_eval(df.loc[df['topic'] == category], model)
# Drop the model reference and clear PyTorch's CUDA cache.
del model
torch.cuda.empty_cache()

Calculate Precision/Recall of model performance on important KPs (top 3 KPs from the human annotation)

In [28]:
# Key point texts are matched exactly against `full_key_point`.
selected_kps = ['They have excellent customer service .', 'The employees here are wonderful !', 'Very professional staff .']
# Reuse `category` (which produced `merged_df`) rather than repeating the literal.
valid_df, pred_df = get_top_kp_coverages(merged_df, df, category, top=None, selected_kps=selected_kps)

In [29]:
# Compare predicted vs. annotated match counts for the selected key points.
generate_coverage_table(valid_df, pred_df)

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,They have excellent customer service .,6,29
1,Very professional staff .,4,13
2,The employees here are wonderful !,3,13


In [30]:
# Precision / recall of the predictions restricted to the selected key points.
precision, recall = calculate_precision_recall(valid_df, pred_df)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.92
Recall: 0.22


## Beauty & Spas

In [31]:
# Category analysed in this section; the next cell reads `category` to load and score its model.
category = "Beauty & Spas"

In [32]:
# Score the in-category model for `category`; `merged_df` feeds the coverage cells below.
base_model_dir = './model/ASKPA/in-category/'
model = get_model(base_model_dir, category)
merged_df, precisions = do_eval(df.loc[df['topic'] == category], model)
# Drop the model reference and clear PyTorch's CUDA cache.
del model
torch.cuda.empty_cache()

Calculate Precision/Recall of model performance on important KPs (top 3 KPs from the human annotation)

In [33]:
# Key point texts are matched exactly against `full_key_point`.
selected_kps = ['Staff is friendly and accommodating .', 'Customer service- EXCELLENT !', 'Amazing & professional service .']
# Reuse `category` (which produced `merged_df`) rather than repeating the literal.
valid_df, pred_df = get_top_kp_coverages(merged_df, df, category, top=None, selected_kps=selected_kps)

In [34]:
# Compare predicted vs. annotated match counts for the selected key points.
generate_coverage_table(valid_df, pred_df)

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,Staff is friendly and accommodating .,14,18
1,Customer service- EXCELLENT !,5,14
2,Amazing & professional service .,3,13


In [35]:
# Precision / recall of the predictions restricted to the selected key points.
precision, recall = calculate_precision_recall(valid_df, pred_df)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.91
Recall: 0.44


## Hotels

In [36]:
# Category analysed in this section; the next cell reads `category` to load and score its model.
category = "Hotels"

In [37]:
# Score the in-category model for `category`; `merged_df` feeds the coverage cells below.
base_model_dir = './model/ASKPA/in-category/'
model = get_model(base_model_dir, category)
merged_df, precisions = do_eval(df.loc[df['topic'] == category], model)
# Drop the model reference and clear PyTorch's CUDA cache.
del model
torch.cuda.empty_cache()

Calculate Precision/Recall of model performance on important KPs (top 3 KPs from the human annotation)

In [38]:
# Key point texts are matched exactly against `full_key_point`.
selected_kps = ['Friendly and helpful staff .', 'Clean and comfortable rooms .', 'The ambience is wonderfully peaceful .']
# Reuse `category` (which produced `merged_df`) rather than repeating the literal.
valid_df, pred_df = get_top_kp_coverages(merged_df, df, category, top=None, selected_kps=selected_kps)

In [39]:
# Compare predicted vs. annotated match counts for the selected key points.
generate_coverage_table(valid_df, pred_df)

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,Friendly and helpful staff .,19,21
1,Clean and comfortable rooms .,9,13
2,The ambience is wonderfully peaceful .,1,1


In [40]:
# Precision / recall of the predictions restricted to the selected key points.
precision, recall = calculate_precision_recall(valid_df, pred_df)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.93
Recall: 0.77


## Restaurants

In [41]:
# Category analysed in this section; the next cell reads `category` to load and score its model.
category = "Restaurants"

In [42]:
# Score the in-category model for `category`; `merged_df` feeds the coverage cells below.
base_model_dir = './model/ASKPA/in-category/'
model = get_model(base_model_dir, category)
merged_df, precisions = do_eval(df.loc[df['topic'] == category], model)
# Drop the model reference and clear PyTorch's CUDA cache.
del model
torch.cuda.empty_cache()

Calculate Precision/Recall of model performance on important KPs (top 3 KPs from the human annotation)

In [43]:
# Key point texts are matched exactly against `full_key_point`; the trailing spaces
# presumably mirror the raw annotation text — do not strip without checking the data.
selected_kps = ['Staff was courteous and accommodating .', 
                'Fresh food , using local produce .', 'The service here was exceptional .  ']
# Reuse `category` (which produced `merged_df`) rather than repeating the literal.
valid_df, pred_df = get_top_kp_coverages(merged_df, df, category, top=None, selected_kps=selected_kps)

In [44]:
# Compare predicted vs. annotated match counts for the selected key points.
generate_coverage_table(valid_df, pred_df)

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,Staff was courteous and accommodating .,10,19
1,The service here was exceptional .,5,5
2,"Fresh food , using local produce .",2,5


In [45]:
# Precision / recall of the predictions restricted to the selected key points.
precision, recall = calculate_precision_recall(valid_df, pred_df)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.88
Recall: 0.52
