We compare our classification results against the ground_truth file, whose labels were manually annotated by us. We assume this file to represent the correct ground truth.

In [1]:
import pandas as pd
from sklearn.metrics import classification_report

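We load the model's predictions (`test_file`) and the manually annotated labels (`ground_truth`). For the predictions, a review is flagged as an ad (or as irrelevant) if either its text or its image is flagged.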

In [2]:
# Model predictions. The review-level flags are the logical OR of the
# corresponding text-level and image-level flags.
test_file = pd.read_csv('pipeline_output.csv').assign(
    is_review_ad=lambda preds: preds["is_text_ad"] | preds["is_image_ad"],
    is_review_irrelevant=lambda preds: (
        preds["is_image_irrelevant"] | preds["is_text_irrelevant"]
    ),
)

# Reference labels the predictions are compared against.
ground_truth = pd.read_csv('vt_merged_validation.csv')

In [3]:
# Print the column index of each frame (predictions first, then ground truth)
# to see which label columns are available in both.
for frame in (test_file, ground_truth):
    print(frame.columns)

Index(['Unnamed: 0', 'review_id', 'user_id', 'time', 'rating', 'text',
       'pics_collapsed', 'resp_collapsed', 'name', 'description', 'category',
       'url', 'image', 'is_image_ad', 'is_image_irrelevant', 'is_text_ad',
       'is_text_irrelevant', 'is_text_rant', 'is_review_ad',
       'is_review_irrelevant', 'helpfulness', 'sensibility'],
      dtype='object')
Index(['Unnamed: 0', 'review_id', 'user_id', 'time', 'rating', 'text',
       'pics_collapsed', 'resp_collapsed', 'name', 'description', 'category',
       'url', 'image', 'is_text_rant', 'is_review_ad', 'is_review_irrelevant',
       'helpfulness', 'sensibility'],
      dtype='object')


We evaluate precision, recall, and F1-score for each class (True/False), along with the overall macro and weighted averages.

In [4]:
# Label columns that are scored with a classification report below.
targets = ["is_review_irrelevant", "is_review_ad", "is_text_rant"]

# Predictions and ground truth use the same column names, so a single list
# describes both selections. `pred_cols` and `gt_cols` are kept as separate
# lists so any other cell that references them keeps working.
eval_cols = ["review_id", *targets, "sensibility", "helpfulness"]
pred_cols = list(eval_cols)
gt_cols = list(eval_cols)

test_subset = test_file[pred_cols]
gt_subset = ground_truth[gt_cols]

# Inner join on review_id; every shared column gets a `_pred` / `_true` suffix.
# validate="one_to_one" makes pandas raise a MergeError if a review_id is
# repeated in either file; without it, duplicates would silently multiply
# rows and inflate the support counts in the reports.
df = test_subset.merge(
    gt_subset,
    on="review_id",
    suffixes=("_pred", "_true"),
    validate="one_to_one",
)

# An inner join drops ground-truth rows that have no prediction. Surface that
# instead of silently computing the metrics on fewer reviews.
n_unmatched = len(gt_subset) - len(df)
if n_unmatched:
    print(
        f"WARNING: {n_unmatched} ground-truth review(s) have no matching "
        "prediction and are excluded from the reports below."
    )

# One report per target: per-class precision/recall/F1 plus the averages.
# zero_division=0 reports 0 (instead of warning) for undefined metrics,
# e.g. when a class never occurs.
for col in targets:
    y_true = df[f"{col}_true"]
    y_pred = df[f"{col}_pred"]
    print(f"=== {col} ===")
    print(classification_report(y_true, y_pred, digits=3, zero_division=0))


=== is_review_irrelevant ===
              precision    recall  f1-score   support

       False      1.000     0.857     0.923        98
        True      0.125     1.000     0.222         2

    accuracy                          0.860       100
   macro avg      0.562     0.929     0.573       100
weighted avg      0.983     0.860     0.909       100

=== is_review_ad ===
              precision    recall  f1-score   support

       False      1.000     0.980     0.990       100
        True      0.000     0.000     0.000         0

    accuracy                          0.980       100
   macro avg      0.500     0.490     0.495       100
weighted avg      1.000     0.980     0.990       100

=== is_text_rant ===
              precision    recall  f1-score   support

       False      1.000     0.810     0.895       100
        True      0.000     0.000     0.000         0

    accuracy                          0.810       100
   macro avg      0.500     0.405     0.448       100
wei

In [5]:
# Show merged rows 10-19 (positional slice) to compare predictions and labels side by side.
df.iloc[10:20]

Unnamed: 0,review_id,is_review_irrelevant_pred,is_review_ad_pred,is_text_rant_pred,sensibility_pred,helpfulness_pred,is_review_irrelevant_true,is_review_ad_true,is_text_rant_true,sensibility_true,helpfulness_true
10,195832,False,False,True,True,very_helpful,False,False,False,True,very_helpful
11,49532,False,False,False,True,helpful,False,False,False,True,helpful
12,222641,False,False,False,True,helpful,False,False,False,True,helpful
13,86238,False,False,False,True,not_helpful,False,False,False,True,not_helpful
14,188701,True,False,False,True,not_helpful,False,False,False,True,helpful
15,206144,False,False,False,True,helpful,False,False,False,True,helpful
16,47760,False,False,True,True,helpful,False,False,False,True,very_helpful
17,171679,False,False,False,True,not_helpful,False,False,False,True,not_helpful
18,203732,True,False,True,True,not_helpful,True,False,False,True,not_helpful
19,268597,False,False,False,True,not_helpful,False,False,False,True,not_helpful


We subsequently iterated the process with alternative prompts. The prompts listed below were tested but yielded inferior performance.