In [None]:
import os
import ast
import json
import nltk
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from utils import *

# Review comparison

This corresponds to **Section 5.2 Review comparison** in the paper.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Select the set-similarity function applied to pairs of review aspect sets.
# Supported names: 'jaccard', 'dice', 'overlap', 'kulczynski'.
# NOTE(review): the four functions are presumably provided by
# `from utils import *` -- confirm.
metric = 'jaccard'
if metric == 'jaccard':
    similarity_metric = jaccard_similarity
elif metric == 'dice':
    similarity_metric = dice_coefficient
elif metric == 'overlap':
    similarity_metric = overlap_coefficient
elif metric == 'kulczynski':
    similarity_metric = kulczynski_similarity
else:
    # Fail loudly: without this branch a misspelled name would leave
    # `similarity_metric` unbound, or silently stale from an earlier run.
    raise ValueError(f'unknown similarity metric: {metric!r}')

In [None]:
# Pick the run to visualise: the first config row with type_of_labels == 'fine'
# and source == 'llm_generated_reviews_iclr24_ours'.
config = pd.read_csv('config-inference.txt', sep='\t')
wanted = (config['type_of_labels'] == 'fine') & (config['source'] == 'llm_generated_reviews_iclr24_ours')
run_id = config.loc[wanted, 'run_id'].to_list()[0]

# Metadata of that run, taken from the first row carrying its run_id.
run_rows = config[config['run_id'] == run_id]
source = run_rows['source'].to_list()[0]
number_of_data = run_rows['number_of_data'].to_list()[0]
type_of_labels = run_rows['type_of_labels'].to_list()[0]

with open(f'results/agr_detection-{run_id}.json') as file:
    results = json.load(file)

# Collapse every review into the set of all labels stored under it,
# dropping the '-' and 'O' labels.
review_aspects = defaultdict()
for paper_id, reviews in results.items():
    review_aspects[paper_id] = defaultdict(list)
    for reviewer_id, labelled in reviews.items():
        aspects = set()
        for values in labelled.values():
            aspects.update(values)
        aspects -= {'-', 'O'}
        review_aspects[paper_id][reviewer_id] = aspects

# Cross-paper similarities, computed separately for human-written and
# LLM-generated reviews: every review of a kind is compared with every review
# of the same kind across all papers (itself included).
# Layout: similarities_inter[review_type][paper_id][reviewer_id] -> list of scores.
similarities_inter = defaultdict()
for review_type in ('human_review', 'llm_review'):
    want_llm = review_type == 'llm_review'
    per_paper = defaultdict()
    similarities_inter[review_type] = per_paper
    for paper_id_i, reviews_i in review_aspects.items():
        per_paper[paper_id_i] = defaultdict(list)
        for i, aspects_i in reviews_i.items():
            # Skip reviews that are not of the kind currently being collected.
            if (i == 'llm_review') != want_llm:
                continue
            for paper_id_j, reviews_j in review_aspects.items():
                for j, aspects_j in reviews_j.items():
                    if (j == 'llm_review') == want_llm:
                        per_paper[paper_id_i][i].append(similarity_metric(aspects_i, aspects_j))


# Side-by-side heatmaps of the cross-paper review similarities:
# human-written reviews (left) and LLM-generated reviews (right),
# both on a 0..1 colour scale with a single colour bar on the far right.
plt.rcdefaults()
plt.rcParams['font.size'] = 12
fig = plt.figure(figsize=(5.6, 2))
gs = fig.add_gridspec(1, 3, width_ratios=[1, 1, 0.05])
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
cbar_ax = fig.add_subplot(gs[0, 2])
titles = {'human_review': 'human-written', 'llm_review': 'LLM-generated'}
for ax, review_type in zip([ax1, ax2], ['human_review', 'llm_review']):
    is_human = review_type == 'human_review'

    # One heatmap row per review: its scores against every review of the same kind.
    similarities = [
        scores
        for per_reviewer in similarities_inter[review_type].values()
        for scores in per_reviewer.values()
    ]
    df = pd.DataFrame(similarities)

    # Tick positions are hard-coded per dataset.
    # NOTE(review): presumably tuned to the number of reviews in each source;
    # revisit when the data changes.
    if is_human:
        sns.heatmap(df, cmap='coolwarm', vmin=0, vmax=1.0, cbar=False, ax=ax)
        ticks = [0, 38, 76] if source == 'ReviewCritique' else [0, 160, 315]
    else:
        # Only the right-hand panel draws the (shared) colour bar.
        sns.heatmap(df, cmap='coolwarm', vmin=0, vmax=1.0, cbar_ax=cbar_ax, cbar_kws={'ticks': [0, 0.5, 1.0]}, ax=ax)
        ticks = [0, 10, 20] if source == 'ReviewCritique' else list(range(0, len(similarities) + 50, 50))

    ax.set_xticks(ticks)
    ax.set_xticklabels(ticks, rotation=0)
    # The mean over all cells of the panel is reported as the x-axis label.
    avg = round(df.values.mean(), 4)
    ax.set_xlabel(f'average={avg:.4f}')

    ax.invert_yaxis()
    ax.set_yticks(ticks)
    if is_human:
        ax.set_ylabel('paper')
        ax.set_yticklabels(ticks, rotation=0)
    else:
        # Tick labels on the y-axis are shown on the left panel only.
        ax.set_yticklabels([])

    ax.set_title(titles[review_type], fontweight='bold')

fig.subplots_adjust(wspace=0.3)
# Nudge the colour bar towards the right-hand heatmap.
pos = cbar_ax.get_position()
cbar_ax.set_position([pos.x0-0.025, pos.y0, pos.width, pos.height])
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('plots', exist_ok=True)
plt.savefig(f'plots/heatmap-comparison-{source}-{type_of_labels}.png', format='png', bbox_inches='tight', dpi=300)
plt.show()

# LLM-generated review detection

This corresponds to **Section 5.3 LLM-generated review detection** in the paper.

In [None]:
# Select the set-similarity function used for LLM-generated review detection.
# Supported names: 'jaccard', 'dice', 'overlap', 'kulczynski'.
# NOTE(review): the four functions are presumably provided by
# `from utils import *` -- confirm.
metric = 'jaccard'
if metric == 'jaccard':
    similarity_metric = jaccard_similarity
elif metric == 'dice':
    similarity_metric = dice_coefficient
elif metric == 'overlap':
    similarity_metric = overlap_coefficient
elif metric == 'kulczynski':
    similarity_metric = kulczynski_similarity
else:
    # Fail loudly: without this branch a misspelled name would leave
    # `similarity_metric` unbound, or silently stale from an earlier run.
    raise ValueError(f'unknown similarity metric: {metric!r}')

In [None]:
# random baseline
# For every paper, draw two review ids at random (one if the paper has fewer
# than two reviews) and count a hit when 'llm_review' is the first pick (top-1)
# or among the picks (top-2). Accuracies are averaged over three fixed seeds.
sources = ['ReviewCritique']
seeds = [2266, 105, 86379]
random_baselines_1, random_baselines_2 = defaultdict(list), defaultdict(list)
for source in sources:
    # The data does not depend on the seed, so read it once per source
    # instead of once per seed.
    with open(f'preprocessed/preprocessed-{source}.json') as file:
        data = json.load(file)

    for seed in seeds:
        random.seed(seed)

        detection = []
        for paper_id in data:
            reviewer_ids = list(data[paper_id].keys())
            n_picks = 2 if len(reviewer_ids) >= 2 else 1
            detection.append(random.sample(reviewer_ids, n_picks))

        correct_at_1 = sum(picks[0] == 'llm_review' for picks in detection)
        correct_at_2 = sum('llm_review' in picks for picks in detection)

        correctness_at_1 = correct_at_1 / len(detection)
        correctness_at_2 = correct_at_2 / len(detection)

        random_baselines_1[source].append(correctness_at_1)
        random_baselines_2[source].append(correctness_at_2)

# Replace each per-seed list of accuracies by its mean.
for source in sources:
    random_baselines_1[source] = sum(random_baselines_1[source]) / len(random_baselines_1[source])
    random_baselines_2[source] = sum(random_baselines_2[source]) / len(random_baselines_2[source])

In [None]:
# Show the random baseline's top-1 accuracy per source (mean over the seeds).
random_baselines_1

In [None]:
# LLM-generated review detection for every run listed in the config.
# Each review gets the score (mean similarity to the other reviews of its own
# paper) - (mean similarity to the reviews of all other papers); reviews are
# ranked by ascending score and the top-ranked ones are the predicted LLM review.
config = pd.read_csv('config-inference.txt', sep='\t')
for run_id in config['run_id'].to_list():

    run_rows = config[config['run_id'] == run_id]
    source = run_rows['source'].to_list()[0]
    number_of_data = run_rows['number_of_data'].to_list()[0]
    type_of_labels = run_rows['type_of_labels'].to_list()[0]

    with open(f'results/agr_detection-{run_id}.json') as file:
        results = json.load(file)

    # Collapse every review into the set of all labels stored under it,
    # dropping the '-' and 'O' labels.
    review_aspects = defaultdict()
    for paper_id in results:
        review_aspects[paper_id] = defaultdict(list)
        for reviewer_id in results[paper_id]:
            aspects = set()
            for values in results[paper_id][reviewer_id].values():
                aspects.update(values)
            aspects.discard('-')
            aspects.discard('O')
            review_aspects[paper_id][reviewer_id] = aspects

    # Mean similarity of each review to the other reviews of the same paper
    # (0 when the paper has no other review).
    similarities_intra = defaultdict()
    for paper_id in review_aspects:
        similarities_intra[paper_id] = defaultdict(list)
        for i in review_aspects[paper_id]:
            scores = [
                similarity_metric(review_aspects[paper_id][i], review_aspects[paper_id][j])
                for j in review_aspects[paper_id]
                if j != i
            ]
            similarities_intra[paper_id][i] = sum(scores) / len(scores) if scores else 0

    # Mean similarity of each review to every review of every other paper.
    similarities_inter = defaultdict()
    for paper_id_i in review_aspects:
        similarities_inter[paper_id_i] = defaultdict(list)
        for i in review_aspects[paper_id_i]:
            scores = [
                similarity_metric(review_aspects[paper_id_i][i], review_aspects[paper_id_j][j])
                for paper_id_j in review_aspects
                if paper_id_j != paper_id_i
                for j in review_aspects[paper_id_j]
            ]
            # Same empty-case convention as the intra score; avoids a
            # ZeroDivisionError when no other paper has any review.
            similarities_inter[paper_id_i][i] = sum(scores) / len(scores) if scores else 0

    # Per paper: reviewer ids ordered by ascending (intra - inter) difference.
    detection = defaultdict()
    for paper_id in similarities_intra:
        diffs = {
            reviewer_id: similarities_intra[paper_id][reviewer_id] - similarities_inter[paper_id][reviewer_id]
            for reviewer_id in similarities_intra[paper_id]
        }
        detection[paper_id] = dict(sorted(diffs.items(), key=lambda x: x[1]))

    correct_at_1, correct_at_2 = 0, 0
    for paper_id, ranked in detection.items():
        # `ranked` is already sorted, so no re-sorting is needed. Slicing
        # (instead of indexing [1]) keeps papers with a single review from
        # raising IndexError; papers without reviews simply score no hit.
        top_2 = list(ranked)[:2]
        if top_2[:1] == ['llm_review']:
            correct_at_1 += 1
        if 'llm_review' in top_2:
            correct_at_2 += 1

    correctness_at_1 = correct_at_1 / len(detection)
    correctness_at_2 = correct_at_2 / len(detection)

    # Opened in append mode: re-running this cell adds another row per run.
    with open('evaluation_scores-llm_generated_review_detection.txt', 'a') as file:
        file.write(f'{run_id}\t{source}\t{number_of_data}\t{type_of_labels}\t{metric}\t{correctness_at_1}\t{correctness_at_2}\n')