In [1]:
import torch
# Ask PyTorch to release cached GPU memory that is currently unused.
# NOTE(review): on a freshly started kernel nothing has been cached yet, so this is
# presumably only useful when re-running the notebook in a live kernel — confirm.
torch.cuda.empty_cache()

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch

In [4]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict

In [5]:
# Disable column-width truncation so full comment / key point texts are shown in tables.
pd.set_option('display.max_colwidth', None)

# Load Test dataset

Load the test data for each agreement-score threshold used to assign positive labels.

In [6]:
# Agreement-score thresholds; one pickled test set is loaded per value in the loop below.
agreement_scores = [0.3, 0.4, 0.5, 0.6]

In [7]:
# Maps agreement score -> test DataFrame loaded for that threshold.
dfs = {}

In [8]:
# Read one pickled test set per agreement threshold and report its label balance.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
for score in agreement_scores:
    dfs[score] = df = pd.read_pickle(f"../data/Evaluation/test_data_agreement_{score}.pkl")

    print(
        "##########",
        f"AGREEMENT SCORE FOR POSITIVE LABEL: {score}",
        df['label'].value_counts(),
        "",
        sep="\n",
    )

##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.3
0    4850
1     190
Name: label, dtype: int64

##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.4
0    4912
1     128
Name: label, dtype: int64

##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.5
0    4969
1      71
Name: label, dtype: int64

##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.6
0    4969
1      71
Name: label, dtype: int64



In [9]:
# Preview the test set loaded for the 0.3 agreement threshold.
dfs[0.3]

Unnamed: 0,comment_id,label,key_point_id,comment,full_comment,attributes,topic,isMultiAspect,key_point,full_key_point
0,comment_1_12626,1,kp_1_11,[quick and courteous service <SEP> positive],The service was quick and courteous .,[staff],restaurant,False,generous servings <SEP> positive,Servings were generous .
1,comment_1_13411,0,kp_1_11,[always courteous and friendly waitstaff <SEP> positive],Their waitstaff are always courteous and friendly .,[staff],restaurant,False,generous servings <SEP> positive,Servings were generous .
2,comment_1_4972,0,kp_1_11,[overwhelmed waiters <SEP> negative],I just wish they would be better staffed because the waiters seem overwhelmed .,[staff],restaurant,False,generous servings <SEP> positive,Servings were generous .
3,comment_1_10784,0,kp_1_11,[fresh food <SEP> positive],The food is fresh and innovative without being pretentious .,[food -> quality],restaurant,False,generous servings <SEP> positive,Servings were generous .
4,comment_1_6045,0,kp_1_11,"[lovely setting <SEP> positive, attentive staff <SEP> positive]",It is a lovely setting with friendly and attentive staff .,"[restaurant -> atmosphere, staff]",restaurant,True,generous servings <SEP> positive,Servings were generous .
...,...,...,...,...,...,...,...,...,...,...
5035,comment_0_5900,0,kp_0_1,[damp pool area <SEP> negative],The pool area is damp and suffocating .,[restaurant -> atmosphere],hotel,False,clean washrooms <SEP> positive,Washrooms clean and modern .
5036,comment_0_4308,0,kp_0_1,[well appointed rooms <SEP> positive],Rooms are well appointed and well maintained .,[restaurant -> atmosphere],hotel,False,clean washrooms <SEP> positive,Washrooms clean and modern .
5037,comment_0_7934,0,kp_0_1,[worst customer service <SEP> negative],Worst customer service I 've ever experienced .,[staff],hotel,False,clean washrooms <SEP> positive,Washrooms clean and modern .
5038,comment_0_4495,0,kp_0_1,"[slow service <SEP> negative, mediocre food <SEP> negative]",Slow service and mediocre food .,"[wait-time, food -> quality]",hotel,True,clean washrooms <SEP> positive,Washrooms clean and modern .


# Evaluation

## Mean Average Precision

In [10]:
def match_comment_with_keypoints(result, kp_dict, comment_dict):
    """Score every comment against every key point by cosine similarity.

    Args:
        result: Dict that is updated in place; maps comment id ->
            {key point id: similarity}. An existing entry for a comment id
            is overwritten.
        kp_dict: Maps key point id -> embedding.
        comment_dict: Maps comment id -> embedding.

    Returns:
        The same ``result`` object that was passed in.
    """
    for comment, comment_embedding in comment_dict.items():
        # Raw cosine similarities are stored as plain Python floats; no
        # normalisation (e.g. softmax) is applied. With an empty kp_dict the
        # comment simply gets an empty mapping instead of raising.
        result[comment] = {
            kp: util.pytorch_cos_sim(comment_embedding, kp_embedding).item()
            for kp, kp_embedding in kp_dict.items()
        }

    return result

def predict(model, comment_df, keypoint_df, output_path, append_topic=False):
    """Embed comments and key points per (topic, stance) and score every pair.

    Args:
        model: Object exposing ``encode(list_of_texts)`` that returns one
            embedding per text.
        comment_df: Frame with ``comment_id``, ``comment``, ``topic`` and
            ``stance`` columns.
        keypoint_df: Frame with ``key_point_id``, ``key_point``, ``topic`` and
            ``stance`` columns.
        output_path: File the resulting scores are written to as JSON.
        append_topic: When True, each key point text is prefixed with
            ``"<topic> <SEP> "`` before encoding.

    Returns:
        Dict mapping comment id -> {key point id: similarity score}.
    """
    comment_keypoints = {}
    for topic in comment_df.topic.unique():
        for stance in [-1, 1]:
            # Filter each frame once and reuse the slice for ids and texts.
            kp_rows = keypoint_df[(keypoint_df.topic == topic) & (keypoint_df.stance == stance)]
            topic_keypoints_ids = kp_rows['key_point_id'].tolist()
            topic_keypoints = kp_rows['key_point'].tolist()
            if append_topic:
                topic_keypoints = [topic + ' <SEP> ' + x for x in topic_keypoints]

            topic_keypoints_embeddings = model.encode(topic_keypoints)
            topic_kp_embed = dict(zip(topic_keypoints_ids, topic_keypoints_embeddings))

            comment_rows = comment_df[(comment_df.topic == topic) & (comment_df.stance == stance)]
            topic_comments_ids = comment_rows['comment_id'].tolist()
            topic_comments = comment_rows['comment'].tolist()
            topic_comments_embeddings = model.encode(topic_comments)
            topic_comment_embed = dict(zip(topic_comments_ids, topic_comments_embeddings))

            comment_keypoints = match_comment_with_keypoints(comment_keypoints, topic_kp_embed, topic_comment_embed)

    # Context manager guarantees the file is flushed and closed before returning.
    with open(output_path, 'w') as output_file:
        json.dump(comment_keypoints, output_file)

    return comment_keypoints

In [11]:
# NOTE(review): the star import hides where names come from. `load_predictions`,
# `perform_preds`, `evaluate_predictions` and `logger` are used below but never defined
# in this notebook, so they presumably come from this module — confirm and import explicitly.
from KeyPointEvaluator import *

In [12]:
def get_predictions(preds, labels_df, comment_df):
    """Join predictions with gold labels, giving every comment exactly one usable row.

    Args:
        preds: Passed straight to ``load_predictions``; the loaded frame is
            expected to carry ``comment_id``, ``key_point_id`` and ``score``.
        labels_df: Gold labels keyed by ``comment_id`` and ``key_point_id``.
        comment_df: Frame with ``comment_id``, ``comment_id_sent`` and ``topic``.

    Returns:
        Merged frame with ``label`` plus ``label_strict`` (missing label -> 0)
        and ``label_relaxed`` (missing label -> 1).
    """
    id_columns = comment_df[["comment_id", "comment_id_sent", "topic"]]

    # Left join keeps comments that received no prediction; afterwards the
    # sentence-level id takes over the `comment_id` name used by the labels.
    predictions_df = (
        id_columns
        .merge(load_predictions(preds), how="left", on="comment_id")
        .rename(columns={'comment_id': 'comment_id_new', 'comment_id_sent': 'comment_id'})
    )

    # Comments without a prediction get a placeholder key point with score 0.
    predictions_df = predictions_df.assign(
        key_point_id=predictions_df["key_point_id"].fillna("dummy_id"),
        score=predictions_df["score"].fillna(0),
    )

    # Attach the gold label of each predicted comment / key point pair.
    merged_df = predictions_df.merge(labels_df, how="left", on=["comment_id", "key_point_id"])

    # A placeholder prediction is always counted as wrong.
    unmatched = merged_df['key_point_id'] == "dummy_id"
    merged_df.loc[unmatched, 'label'] = 0

    gold = merged_df["label"]
    merged_df["label_strict"] = gold.fillna(0)
    merged_df["label_relaxed"] = gold.fillna(1)
    return merged_df


In [13]:
def prepare_comment_kp_label_input(df):
    """Split the flat evaluation frame into comment, key point and label tables.

    Args:
        df: Frame with ``comment_id``, ``topic``, ``comment`` (list of aspect
            strings per row), ``full_comment``, ``isMultiAspect``,
            ``key_point_id``, ``key_point``, ``full_key_point`` and ``label``.

    Returns:
        Tuple ``(comment_df, kp_df, labels_df)``:
        * comment_df: one row per aspect of each unique comment. The original
          id is kept in ``comment_id_sent``, ``index`` is the aspect's position
          within its comment, and ``comment_id`` is ``"<comment_id_sent>_<index>"``.
          Rows are ordered by ``comment_id_sent``.
        * kp_df: one row per unique ``key_point_id``.
        * labels_df: the ``comment_id`` / ``key_point_id`` / ``label`` columns of ``df``.
    """
    comment_df = (
        df[['comment_id', 'topic', 'comment', 'full_comment', 'isMultiAspect']]
        .drop_duplicates(subset=['comment_id'])
        # Rows without an id were silently dropped by the former groupby; keep that.
        .dropna(subset=['comment_id'])
        .explode('comment')
        # A stable sort groups rows by id while preserving the aspect order
        # inside each comment.
        .sort_values('comment_id', kind='mergesort')
        .reset_index(drop=True)
    )
    # Number the aspects of each comment 0, 1, 2, ... cumcount() replaces a
    # per-group apply() that depended on the grouping column being passed to
    # the applied function (deprecated in recent pandas) and was much slower.
    comment_df.insert(0, 'index', comment_df.groupby('comment_id').cumcount())
    comment_df = comment_df.rename(columns={'comment_id': 'comment_id_sent'})
    comment_df['comment_id'] = comment_df['comment_id_sent'] + "_" + comment_df['index'].astype(str)

    kp_df = (
        df[['key_point_id', 'topic', 'key_point', 'full_key_point']]
        .drop_duplicates(subset=['key_point_id'])
        .reset_index(drop=True)
    )

    labels_df = df[['comment_id', 'key_point_id', 'label']]

    return comment_df, kp_df, labels_df

Perform Evaluation

In [14]:
# Directory of the fine-tuned SentenceTransformer checkpoint loaded in the next cell.
models_path = './training_material/siamese-models/roberta-large-yelp-pretrained-contrastive-10-epochs-2023-02-15_23-31-51/'
# NOTE(review): pred_output_path is not referenced anywhere else in the visible notebook —
# presumably consumed inside KeyPointEvaluator or left over; confirm.
pred_output_path = './training_material/inference-results/'

In [15]:
# Load the checkpoint once; do_eval reads this module-level `model`.
model = SentenceTransformer(models_path)

In [16]:
def do_eval(df, append_topic=False):
    """Run prediction and mAP evaluation for one labelled test frame.

    Uses the module-level ``model`` together with ``perform_preds``,
    ``evaluate_predictions`` and ``logger``, which must already be in scope.

    Args:
        df: Labelled test frame accepted by ``prepare_comment_kp_label_input``.
        append_topic: Forwarded to ``perform_preds``. Defaults to False, the
            value that was previously hard-coded.

    Returns:
        The merged prediction / label frame, de-duplicated on
        (``comment_id``, ``key_point_id``).
    """
    comment_df, kp_df, labels_df = prepare_comment_kp_label_input(df)

    # Perform prediction on the validation/test dataframes
    preds = perform_preds(model, comment_df, kp_df, append_topic)

    # Join the predictions with the gold labels; keep one row per comment / key point pair
    merged_df = get_predictions(preds, labels_df, comment_df)
    merged_df = merged_df.drop_duplicates(subset=['comment_id', 'key_point_id'])

    # Perform evaluation
    mAP_strict, mAP_relaxed = evaluate_predictions(merged_df)

    print(f"mAP strict= {mAP_strict} ; mAP relaxed = {mAP_relaxed}")

    logger.info("mAP strict:   \t{:.2f}".format(mAP_strict*100))
    logger.info("mAP relaxed:   \t{:.2f}".format(mAP_relaxed*100))

    return merged_df

In [17]:
# Score the model on every agreement-threshold variant of the test set.
print("########## EVALUATION RESULTS ##########")
for score in agreement_scores:
    print("##########", f"AGREEMENT SCORE FOR POSITIVE LABEL: {score}", sep="\n")
    do_eval(dfs[score])  # prints its own metrics; the returned frame is not needed here
    print()

########## EVALUATION RESULTS ##########
##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.3
[('Hotel AP', 0.9570323447167297), ('Restaurant AP', 0.7685450784655687)]
[('Hotel AP', 0.9570323447167297), ('Restaurant AP', 0.7685450784655687)]
mAP strict= 0.8627887115911492 ; mAP relaxed = 0.8627887115911492

##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.4
[('Hotel AP', 0.937393025297412), ('Restaurant AP', 0.7279593165613089)]
[('Hotel AP', 0.937393025297412), ('Restaurant AP', 0.7279593165613089)]
mAP strict= 0.8326761709293604 ; mAP relaxed = 0.8326761709293604

##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.5
[('Hotel AP', 0.7587330655913238), ('Restaurant AP', 0.4725415968537123)]
[('Hotel AP', 0.7587330655913238), ('Restaurant AP', 0.4725415968537123)]
mAP strict= 0.6156373312225181 ; mAP relaxed = 0.6156373312225181

##########
AGREEMENT SCORE FOR POSITIVE LABEL: 0.6
[('Hotel AP', 0.7587330655913238), ('Restaurant AP', 0.4725415968537123)]
[('Hotel AP', 0.7587330655913238), ('R

## Quantitative Coverage and Accuracy

In [18]:
# The coverage / accuracy analysis below works on the 0.3-threshold test set.
df = dfs[0.3]
comment_df, kp_df, labels_df = prepare_comment_kp_label_input(df)
merged_df = do_eval(df)

[('Hotel AP', 0.9570323447167297), ('Restaurant AP', 0.7685450784655687)]
[('Hotel AP', 0.9570323447167297), ('Restaurant AP', 0.7685450784655687)]
mAP strict= 0.8627887115911492 ; mAP relaxed = 0.8627887115911492


### Hotel

Select the key points that appear in both the ground truth and the prediction results.

In [19]:
# Hotel key points to compare. The strings are matched against `full_key_point` with
# `isin` below, so they must be exact — including the trailing spaces in the first entry.
selected_kps = ["Friendly and courteous staff .  ",  
    "Clean and comfortable rooms .",
    "Outstanding service .",
    "Great front desk customer service .",
    "Parking is easy .",
    "Beds were comfortable ."
]

In [20]:
# Gold pairs: positively labelled rows whose comment id starts with "comment_0".
actual_pair_df = df.loc[df['label'].eq(1) & df['comment_id'].str.startswith('comment_0')]
# Gold frequency of every key point, before narrowing to the selected ones.
actual_kp_df = actual_pair_df['full_key_point'].value_counts()
actual_pair_df = actual_pair_df.loc[actual_pair_df['full_key_point'].isin(selected_kps)]

In [21]:
# Predicted pairs for "comment_0" ids, enriched with the key point text from kp_df.
hotel_pred_df = merged_df[merged_df['comment_id'].str.startswith('comment_0')].merge(kp_df, on=['key_point_id'])
# .copy() makes the filtered result an independent frame, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
hotel_pred_df = hotel_pred_df[hotel_pred_df['full_key_point'].isin(selected_kps)].copy()
# Every remaining row is treated as a positive prediction.
hotel_pred_df['pred_label'] = 1

In [22]:
# Per key point: number of predicted pairs vs. number of gold pairs.
predicted_counts = hotel_pred_df['full_key_point'].value_counts()
actual_counts = actual_pair_df['full_key_point'].value_counts()
# Align the two counts on the key point text. Copying value_counts().tolist() over
# positionally pairs counts with the wrong key point whenever the two rankings differ,
# and raises when a key point is missing on one side.
coverage_compare_table = (
    pd.concat(
        [predicted_counts.rename('Predicted Coverage'), actual_counts.rename('Actual Coverage')],
        axis=1,
    )
    .fillna(0)  # key point absent on one side -> coverage 0
    .astype(int)
    .sort_values('Predicted Coverage', ascending=False, kind='mergesort')
    .rename_axis('Key Points')
    .reset_index()
)
coverage_compare_table

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,Friendly and courteous staff .,15,18
1,Clean and comfortable rooms .,8,9
2,Outstanding service .,6,8
3,Great front desk customer service .,6,4
4,Parking is easy .,3,3
5,Beds were comfortable .,3,2


In [23]:
# Attach predictions to the gold pairs (joined on the shared id columns);
# gold pairs that were not predicted get pred_label 0.
merged_pair_df = (
    actual_pair_df
    .merge(hotel_pred_df[['comment_id', 'key_point_id', 'pred_label']], how='left')
    .assign(pred_label=lambda pairs: pairs['pred_label'].fillna(0))
)

In [24]:
# NOTE(review): merged_pair_df is built from gold-positive pairs only (label == 1), so this
# "accuracy" is the share of gold pairs the model also predicted, i.e. recall on the
# selected key points.
from sklearn.metrics import accuracy_score
accuracy_score(merged_pair_df['label'], merged_pair_df['pred_label'])

0.7045454545454546

### Restaurant

In [25]:
# Restaurant key points to compare. The strings are matched against `full_key_point` with
# `isin` below, so they must be exact — including the trailing spaces in some entries.
selected_kps = [
    "Ambiance is casual and comfortable .",
    "Extremely polite and knowledgeable staff !",
    "Prices were extremely reasonable .  ",
    "Great beer selection",
    "Over priced for the quality .  ",
    "Good selection of coffee choices ."
]

In [26]:
# Gold pairs: positively labelled rows whose comment id starts with "comment_1".
actual_pair_df = df.loc[df['label'].eq(1) & df['comment_id'].str.startswith('comment_1')]
# Gold frequency of every key point, before narrowing to the selected ones.
actual_kp_df = actual_pair_df['full_key_point'].value_counts()
actual_pair_df = actual_pair_df.loc[actual_pair_df['full_key_point'].isin(selected_kps)]

In [27]:
# Predicted pairs for "comment_1" ids, enriched with the key point text from kp_df.
# NOTE(review): the variable is still called hotel_pred_df although it now holds the
# restaurant predictions; the name is kept because the following cells read it.
hotel_pred_df = merged_df[merged_df['comment_id'].str.startswith('comment_1')].merge(kp_df, on=['key_point_id'])
# .copy() makes the filtered result an independent frame, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
hotel_pred_df = hotel_pred_df[hotel_pred_df['full_key_point'].isin(selected_kps)].copy()
# Every remaining row is treated as a positive prediction.
hotel_pred_df['pred_label'] = 1

In [28]:
# Per key point: number of predicted pairs vs. number of gold pairs.
predicted_counts = hotel_pred_df['full_key_point'].value_counts()
actual_counts = actual_pair_df['full_key_point'].value_counts()
# Align the two counts on the key point text. Copying value_counts().tolist() over
# positionally pairs counts with the wrong key point whenever the two rankings differ,
# and raises when a key point is missing on one side.
coverage_compare_table = (
    pd.concat(
        [predicted_counts.rename('Predicted Coverage'), actual_counts.rename('Actual Coverage')],
        axis=1,
    )
    .fillna(0)  # key point absent on one side -> coverage 0
    .astype(int)
    .sort_values('Predicted Coverage', ascending=False, kind='mergesort')
    .rename_axis('Key Points')
    .reset_index()
)
coverage_compare_table

Unnamed: 0,Key Points,Predicted Coverage,Actual Coverage
0,Ambiance is casual and comfortable .,13,10
1,Extremely polite and knowledgeable staff !,6,10
2,Prices were extremely reasonable .,3,4
3,Great beer selection,2,1
4,Over priced for the quality .,1,1
5,Good selection of coffee choices .,1,1


In [29]:
# Attach predictions to the gold pairs (joined on the shared id columns);
# gold pairs that were not predicted get pred_label 0.
merged_pair_df = (
    actual_pair_df
    .merge(hotel_pred_df[['comment_id', 'key_point_id', 'pred_label']], how='left')
    .assign(pred_label=lambda pairs: pairs['pred_label'].fillna(0))
)

In [30]:
 from sklearn.metrics import accuracy_score
accuracy_score(merged_pair_df['label'], merged_pair_df['pred_label'])

0.5925925925925926