In [None]:
import os
import ast
import json
import nltk
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from utils import *

## preprocess

Segment each review into sentences and save them, indexed per paper and per reviewer, to `preprocessed/preprocessed-{venue}.json`.

In [None]:
# Segment every review of each venue into sentences and write the result to
# preprocessed/preprocessed-{venue}.json as
# {paper_id: {reviewer_id: {sentence_index: sentence}}}.
for venue in ['emnlp23', 'iclr20', 'iclr21', 'iclr22', 'iclr23', 'iclr24']:

    # Explicit UTF-8 everywhere: the output is dumped with ensure_ascii=False,
    # so relying on the platform default encoding can fail on non-ASCII text.
    with open(f'data/{venue}.json', encoding='utf-8') as file:
        data = json.load(file)
    with open(f'selected_papers/{venue}.txt', encoding='utf-8') as file:
        selected_papers = file.readlines()

    # Index titles once instead of rescanning `data` for every selected title.
    # setdefault keeps the first paper when several papers share a title.
    title_to_paper_id = {}
    for paper_id in data:
        title_to_paper_id.setdefault(data[paper_id]['Title'], paper_id)

    # NOTE(review): selected_paper_ids is computed but not used below -- every
    # paper in `data` is preprocessed for these venues. Confirm this is intended.
    selected_paper_ids = []
    for title in selected_papers:
        clean_title = title.rstrip('\n')
        if clean_title in title_to_paper_id:
            selected_paper_ids.append(title_to_paper_id[clean_title])

    output = {}
    for paper_id in data:
        # Every paper gets an entry, even when it has no reviews.
        output[paper_id] = {}
        for reviewer_id in data[paper_id]['Reviews']:
            review_fields = data[paper_id]['Reviews'][reviewer_id]
            if venue not in ['emnlp23', 'iclr24']:
                review = review_fields['Reasons']
            else:
                # These two venues store the review text in two separate fields.
                review = review_fields['Reasons_to_accept'] + '\n\n' + review_fields['Reasons_to_reject']
            sentences = sent_tokenize(review, language='english')
            # Drop fragments of two characters or fewer.
            kept = [sentence for sentence in sentences if len(sentence) > 2]
            # A reviewer only appears in the output when at least one sentence survives.
            if kept:
                output[paper_id][reviewer_id] = {
                    str(index): sentence for index, sentence in enumerate(kept)
                }

    with open(f'preprocessed/preprocessed-{venue}.json', 'w', encoding='utf-8') as file:
        json.dump(output, file, indent=4, ensure_ascii=False)

In [None]:
# Segment the reviews of the selected nlpeer papers into sentences and write
# them to preprocessed/preprocessed-{venue}.json as
# {paper_id: {reviewer_id: {sentence_index: sentence}}}.
for venue in ['nlpeer']:

    # Explicit UTF-8 everywhere: the output is dumped with ensure_ascii=False,
    # so relying on the platform default encoding can fail on non-ASCII text.
    with open(f'data/{venue}.json', encoding='utf-8') as file:
        data = json.load(file)
    with open(f'selected_papers/{venue}.txt', encoding='utf-8') as file:
        selected_papers = file.readlines()

    # Index titles once instead of rescanning `data` for every selected title.
    # setdefault keeps the first paper when several papers share a title.
    title_to_paper_id = {}
    for paper_id in data:
        title_to_paper_id.setdefault(data[paper_id]['Title'], paper_id)

    # Titles that match no paper are skipped silently.
    selected_paper_ids = []
    for title in selected_papers:
        clean_title = title.rstrip('\n')
        if clean_title in title_to_paper_id:
            selected_paper_ids.append(title_to_paper_id[clean_title])

    output = {}
    for paper_id in selected_paper_ids:
        # Every selected paper gets an entry, even when it has no reviews.
        output[paper_id] = {}
        for reviewer_id in data[paper_id]['Reviews']:
            # Here each review is stored directly as a string.
            review = data[paper_id]['Reviews'][reviewer_id]
            sentences = sent_tokenize(review, language='english')
            # No minimum-length filter is applied in this cell; a reviewer only
            # appears in the output when the tokenizer returns at least one sentence.
            if sentences:
                output[paper_id][reviewer_id] = {
                    str(index): sentence for index, sentence in enumerate(sentences)
                }

    with open(f'preprocessed/preprocessed-{venue}.json', 'w', encoding='utf-8') as file:
        json.dump(output, file, indent=4, ensure_ascii=False)

## postprocess

Post-process the LLM annotations: merge synonymous labels and count them to find the most frequent aspects.

In [None]:
# Load the synonym table from synonyms.json, lower-casing every key.
# If two keys differ only by case, the one that appears later in the file wins.
with open('synonyms.json') as file:
    synonyms = {}
    for term, mapped in json.load(file).items():
        synonyms[term.lower()] = mapped

In [None]:
# Flatten the per-row labels of the `annotation_1` column into one list
# (`annotation_1`) and tally them (`counter`).
annotation_1, counter = [], Counter()
annotation = pd.read_csv('annotation - llm.csv')
# Materialise the column once; calling .tolist() inside the loop rebuilt the
# entire list on every row.
annotation_1_values = annotation['annotation_1'].tolist()
for i in range(len(annotation)):
    # Treat ' and ' as another ', ' separator, then split the cell into labels.
    # merge_synonyms comes from utils -- presumably it maps labels to a
    # canonical form; confirm against its definition.
    item = merge_synonyms(str(annotation_1_values[i]).replace(' and ', ', ').split(', '))
    annotation_1.extend(item)
    counter.update(item)