In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch



In [4]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
from sklearn.metrics import recall_score, precision_score

In [5]:
pd.set_option('display.max_colwidth', None)

# Load Test dataset

In [6]:
df = pd.read_pickle(f"../data/Evaluation/test_data.pkl")

In [7]:
df

Unnamed: 0,HITId,comment_id,label,full_key_point,key_point_id,comment,full_comment,attributes,topic,isMultiAspect,predicted_WA,num_of_token,old_key_point_id,key_point
0,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,1,Terrible administration and management .,kp_0_2,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,0.786076,8,kp_0_2,terrible administration <SEP> negative
1,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,1,Rude and Unprofessional staff .,kp_0_24,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,0.786076,8,kp_0_24,unprofessional staff <SEP> negative
2,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,0,Great for conventions and concerts ! !,kp_0_4,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,0.786076,8,kp_0_4,great for conventions <SEP> positive
3,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,0,Horrible customer service .,kp_0_5,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,0.786076,8,kp_0_5,horrible customer service <SEP> negative
4,302U8RURJZ1WH0D09XZ8R7HSOP5NVW,arg_0_537,0,It had wonderful acoustics .,kp_0_0,"[uninviting experience <SEP> negative, rude employees <SEP> negative]",Employees are rude and make the experience uninviting .,"[restaurant -> atmosphere, staff]",Arts & Entertainment,True,0.786076,8,kp_0_0,wonderful acoustics <SEP> positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6794,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Great family owned restaurant .,kp_4_25,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,0.656356,6,kp_4_25,great family owned restaurant <SEP> positive
6795,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Worst service in the world !,kp_4_27,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,0.656356,6,kp_4_27,worst service <SEP> negative
6796,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Rude and inattentive staff .,kp_4_30,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,0.656356,6,kp_4_30,inattentive staff <SEP> negative
6797,3ZXV7Q5FJBOOUZUJKSLQTAKU1UIFC3,arg_4_6440,0,Great sushi and sashimi .,kp_4_31,[friendly staff <SEP> positive],The staff are friendly and attentive .,[staff],Restaurants,False,0.656356,6,kp_4_31,great sushi <SEP> positive


In [8]:
df['label'].value_counts()

0    6380
1     419
Name: label, dtype: int64

# Evaluation

In [9]:
def match_comment_with_keypoints(result, kp_dict, comment_dict):
    
    for comment, comment_embedding in comment_dict.items():
        result[comment] = {}
        for kp, kp_embedding in kp_dict.items():
            result[comment][kp] = util.pytorch_cos_sim(comment_embedding, kp_embedding).item()
        
        #Applying softmax
        kp_scores = list(result[comment].items())
        kp_ids, kp_scores = zip(*kp_scores)
        result[comment] = {kp_id:score for kp_id, score in zip(kp_ids, kp_scores)}
        

    return result

def predict(model, comment_df, keypoint_df, output_path, append_topic=False):
    comment_keypoints = {}
    for topic in comment_df.topic.unique():
        for stance in [-1, 1]:
            topic_keypoints_ids = keypoint_df[(keypoint_df.topic==topic) & (keypoint_df.stance==stance)]['key_point_id'].tolist()
            topic_keypoints = keypoint_df[(keypoint_df.topic==topic) & (keypoint_df.stance==stance)]['key_point'].tolist()
            if append_topic:
                topic_keypoints = [topic + ' <SEP> ' + x for x in topic_keypoints]
                
            topic_keypoints_embeddings = model.encode(topic_keypoints)
            topic_kp_embed = dict(zip(topic_keypoints_ids, topic_keypoints_embeddings))

            topic_comments_ids = comment_df[(comment_df.topic==topic) & (comment_df.stance==stance)]['comment_id'].tolist()
            topic_comments = comment_df[(comment_df.topic==topic) & (comment_df.stance==stance)]['comment'].tolist()
            topic_comments_embeddings = model.encode(topic_comments)
            topic_comment_embed= dict(zip(topic_comments_ids, topic_comments_embeddings))

            comment_keypoints = match_comment_with_keypoints(comment_keypoints, topic_kp_embed, topic_comment_embed)
    
    json.dump(comment_keypoints, open(output_path, 'w'))
    
    return comment_keypoints

In [10]:
from KeyPointEvaluator import *

In [11]:
def get_predictions(preds, labels_df, comment_df):
    comment_df = comment_df[["comment_id", "comment_id_sent", "topic"]]
    predictions_df = load_predictions(preds)
    #make sure each comment_id has a prediction
    predictions_df = pd.merge(comment_df, predictions_df, how="left", on="comment_id")
    predictions_df = predictions_df.rename(columns={'comment_id': 'comment_id_new', 'comment_id_sent': 'comment_id'})

    #handle comments with no matching key point
    predictions_df["key_point_id"] = predictions_df["key_point_id"].fillna("dummy_id")
    predictions_df["score"] = predictions_df["score"].fillna(0)

    #merge predicted comment-KP pair with the gold labels
    merged_df = pd.merge(predictions_df, labels_df, how="left", on=["comment_id", "key_point_id"])

    merged_df.loc[merged_df['key_point_id'] == "dummy_id", 'label'] = 0
    
    return merged_df

In [12]:
def prepare_comment_kp_label_input(df):
    comment_df = df[['comment_id', 'topic', 'comment', 'full_comment', 'isMultiAspect']]\
        .drop_duplicates(subset=['comment_id']).reset_index(drop=True)
    comment_df = comment_df.explode(['comment'])
    comment_df = comment_df.groupby(['comment_id']).apply(lambda x: x.reset_index(drop=True).reset_index()).reset_index(drop=True)
    comment_df = comment_df.rename(columns={'comment_id': 'comment_id_sent'})
    comment_df['comment_id'] = comment_df['comment_id_sent'] + "_" + comment_df['index'].astype(str)
    
    kp_df = df[['key_point_id', 'topic', 'key_point', 'full_key_point']].drop_duplicates(subset=['key_point_id']).reset_index(drop=True)        
    
    labels_df = df[['comment_id', 'key_point_id', 'label']]
    
    return comment_df, kp_df, labels_df

In [13]:
def calc_all_ap(merged_df):
    precisions = [(topic.capitalize(), get_ap(group, 'label')) for topic, group in merged_df.groupby(["topic"])]
    return precisions

In [23]:
def get_model(base_model_dir, category):
    models_dir = f'{base_model_dir}/{category}'
    latest_checkpoint = os.listdir(models_dir)[-1]
    models_path = f'{models_dir}/{latest_checkpoint}/'

    return SentenceTransformer(models_path)

In [14]:
def do_eval(df, model, verbose=False):
    comment_df, kp_df, labels_df = prepare_comment_kp_label_input(df)
    
    #Perform prediction on the validation/test dataframes
    preds = perform_preds(model, comment_df, kp_df)

    # Get the best predicted KP for every review sentence
    merged_df = get_predictions(preds, labels_df, comment_df)
    merged_df = merged_df.drop_duplicates(subset=['comment_id', 'key_point_id'])
    
    precisions = calc_all_ap(merged_df)
    
    if verbose:
        for category, precision in precisions:
            print(f"{category}: Average Precision = {precision}")
    
    return merged_df, precisions

## In-category Evaluation

### All Comments in test data

In [26]:
print(f"########## IN-CATEGORY EVALUATION ##########")
perf = []
for category in sorted(df['topic'].unique()):
#     models_path = 'quangantang/roberta-large-askpa-kp-matching'
#     pred_output_path = './training_material/inference-results/'
    base_model_dir = f'./model/ASKPA/in-category/'
    model = get_model(base_model_dir, category)
    
    merged_df, precisions = do_eval(df[df['topic'] == category], model)
    del model
    torch.cuda.empty_cache()
    
    perf += [pd.Series({'Business Category': category, 'Average Precision': precisions[0][1]})]

perf_df = pd.concat(perf, axis=1).T
perf_df

########## IN-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,1.0
1,Automotive,0.774513
2,Beauty & Spas,0.977106
3,Hotels,0.997619
4,Restaurants,0.865774


### Comments containing multiple opinions in test data

In [27]:
print(f"########## IN-CATEGORY EVALUATION ##########")
perf = []
for category in sorted(df['topic'].unique()):    
    base_model_dir = f'./model/ASKPA/in-category/'
    model = get_model(base_model_dir, category)
    
    merged_df, precisions = do_eval(df[(df['topic'] == category) & (df['isMultiAspect'] == True)], model)
    del model
    torch.cuda.empty_cache()
    
    perf += [pd.Series({'Business Category': category, 'Average Precision': precisions[0][1]})]

perf_df = pd.concat(perf, axis=1).T
perf_df

########## IN-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,1.0
1,Automotive,0.79619
2,Beauty & Spas,0.942857
3,Hotels,0.980519
4,Restaurants,0.827243


## Out-of-category Evaluation

### All Comments in test data

In [28]:
print(f"########## OUT-OF-CATEGORY EVALUATION ##########")
perf = []
for category in sorted(df['topic'].unique()):    
    base_model_dir = f'./model/ASKPA/out-of-category/'
    model = get_model(base_model_dir, category)
    
    merged_df, precisions = do_eval(df[df['topic'] == category], model)
    del model
    torch.cuda.empty_cache()
    
    perf += [pd.Series({'Business Category': category, 'Average Precision': precisions[0][1]})]

perf_df = pd.concat(perf, axis=1).T
perf_df

########## OUT-OF-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,0.979641
1,Automotive,0.763807
2,Beauty & Spas,0.939028
3,Hotels,0.983829
4,Restaurants,0.892744


### Comments containing multiple opinions in test data

In [29]:
print(f"########## OUT-OF-CATEGORY EVALUATION ##########")
perf = []
for category in sorted(df['topic'].unique()):    
    base_model_dir = f'./model/ASKPA/out-of-category/'
    model = get_model(base_model_dir, category)
    
    merged_df, precisions = do_eval(df[(df['topic'] == category) & (df['isMultiAspect'] == True)], model)
    del model
    torch.cuda.empty_cache()
    
    perf += [pd.Series({'Business Category': category, 'Average Precision': precisions[0][1]})]

perf_df = pd.concat(perf, axis=1).T
perf_df

########## OUT-OF-CATEGORY EVALUATION ##########


Unnamed: 0,Business Category,Average Precision
0,Arts & Entertainment,1.0
1,Automotive,0.642857
2,Beauty & Spas,0.770833
3,Hotels,0.917582
4,Restaurants,0.77234


# Quantitative Coverage and Precision/Recall

In [None]:
def generate_coverage_table()

**Use only in-catgeory result to evaluate**

## Hotel

In [30]:
category = "Hotels"

In [34]:
base_model_dir = f'./model/ASKPA/in-category/'
model = get_model(base_model_dir, category)
comment_df, kp_df, labels_df = prepare_comment_kp_label_input(df)
merged_df, precisions = do_eval(df[(df['topic'] == category) & (df['isMultiAspect'] == True)], model)
del model
torch.cuda.empty_cache()

Calculate Precision/Recall of model performance on important KPs (top 3 KPs from the human annotation)

In [44]:
actual_pair_df = df[(df['label'] == 1) & (df['topic'] == category)]
actual_kp_df = actual_pair_df['full_key_point'].value_counts()
actual_pair_df = actual_pair_df[actual_pair_df['full_key_point'].isin(actual_kp_df.head(3).index)]

In [45]:
hotel_pred_df = merged_df[merged_df['topic'] == category].merge(kp_df, on=['key_point_id'])
hotel_pred_df = hotel_pred_df[hotel_pred_df['full_key_point'].isin(actual_kp_df.head(3).index)]
hotel_pred_df['pred_label'] = 1

In [48]:
coverage_compare_table = hotel_pred_df['full_key_point'].value_counts().reset_index()
coverage_compare_table.columns = ['Key Points', 'Predicted Coverage']
# coverage_compare_table['Actual Coverage'] = actual_pair_df['full_key_point'].value_counts().tolist()
coverage_compare_table

Unnamed: 0,Key Points,Predicted Coverage
0,Friendly and helpful staff .,5
1,Clean and comfortable rooms .,4


#### Precision

In [23]:
merged_pair_df = actual_pair_df.merge(hotel_pred_df[['comment_id', 'key_point_id', 'pred_label']], how='right')
merged_pair_df['label'] = merged_pair_df['label'].fillna(0)
precision_score(merged_pair_df['label'], merged_pair_df['pred_label'])

0.8709677419354839

#### Recall

In [24]:
merged_pair_df = actual_pair_df.merge(hotel_pred_df[['comment_id', 'key_point_id', 'pred_label']], how='left')
merged_pair_df['pred_label'] = merged_pair_df['pred_label'].fillna(0)
recall_score(merged_pair_df['label'], merged_pair_df['pred_label'])

0.7714285714285715

### Restaurant

In [25]:
actual_pair_df = df[(df['label'] == 1) & (df['comment_id'].str.startswith('comment_1'))]
actual_kp_df = actual_pair_df['full_key_point'].value_counts()
actual_pair_df = actual_pair_df[actual_pair_df['full_key_point'].isin(actual_kp_df.head(3).index)]

In [26]:
restaurant_pred_df = merged_df[merged_df['comment_id'].str.startswith('comment_1')].merge(kp_df, on=['key_point_id'])
restaurant_pred_df = restaurant_pred_df[restaurant_pred_df['full_key_point'].isin(actual_kp_df.head(3).index)]
restaurant_pred_df['pred_label'] = 1

In [27]:
coverage_compare_table = restaurant_pred_df['full_key_point'].value_counts().reset_index()
coverage_compare_table.columns = ['Key Points', 'Predicted Coverage']
coverage_compare_table['Actual Coverage'] = actual_pair_df['full_key_point'].value_counts().tolist()
coverage_compare_table

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,: The food quality was excellent,12,24
1,Service is friendly and attentive,10,16
2,Extremely polite and knowledgeable staff !,6,10


#### Precision

In [28]:
merged_pair_df = actual_pair_df.merge(restaurant_pred_df[['comment_id', 'key_point_id', 'pred_label']], how='right')
merged_pair_df['label'] = merged_pair_df['label'].fillna(0)
precision_score(merged_pair_df['label'], merged_pair_df['pred_label'])

0.8571428571428571

#### Recall

In [29]:
merged_pair_df = actual_pair_df.merge(restaurant_pred_df[['comment_id', 'key_point_id', 'pred_label']], how='left')
merged_pair_df['pred_label'] = merged_pair_df['pred_label'].fillna(0)
recall_score(merged_pair_df['label'], merged_pair_df['pred_label'])

0.48