In [None]:
import os
import json
import time
import PyPDF2
import random
import requests
import numpy as np
import pandas as pd

from tqdm import tqdm
from openai import AzureOpenAI
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from utils import *

In [None]:
# Azure OpenAI credentials are read from the environment so that secrets are
# never stored in the notebook. The variable names are the ones the openai SDK
# itself looks up; the empty-string fallbacks keep the previous placeholder
# values when a variable is not set.
client = AzureOpenAI(
    api_key=os.environ.get('AZURE_OPENAI_API_KEY', ''),
    api_version=os.environ.get('OPENAI_API_VERSION', ''),
    azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT', '')
    )

# aspect identification

Here we leverage GPT-4o to identify aspects from reviews. This corresponds to **Section 3 Aspect set construction** in the paper.

In [None]:
# System prompts for aspect identification. Both variants share the same
# instructions and worked example; they differ only in the noun used for the
# target concept ('aspect' for key 'i', 'facet' for key 'i2').
prompts = {
    key: (
        'Identify the ' + noun + '(s) that each of the given sentences focuses on. '
        'Format the output in a json dictionary. '
        'For example, given a dictionary as follows:\n\n'
        '{"1": "The methodology is convincing, and the improvement is noticeable.", '
        '"2": "A dataset is assembled."}\n\n'
        'The output should be:\n\n'
        '{"1": "Methodology, Improvement", "2": "Dataset"}'
    )
    for key, noun in (('i', 'aspect'), ('i2', 'facet'))
    }

In [None]:
# Run-level settings. They are the same for every venue, so they are set once
# here; each of them is also appended to the config log after a venue finishes.
note = 'sentence'
prompt_type = 'i2'
model = 'gpt-4o'
temperature = 1
max_tokens = 2048

for venue in ['emnlp23', 'iclr20', 'iclr21', 'iclr22', 'iclr23', 'iclr24', 'nlpeer']:
    # JSON files are read and written as UTF-8 explicitly: the results are dumped
    # with ensure_ascii=False, which can raise UnicodeEncodeError on non-ASCII
    # text when the platform's default encoding is not UTF-8.
    with open(f'preprocessed/preprocessed-{venue}.json', encoding='utf-8') as file:
        data = json.load(file)

    # A fresh timestamp per venue names that venue's results file and its
    # entry in the config log.
    experiment_id = time.strftime('%Y%m%d_%H%M%S', time.localtime())

    output = defaultdict(dict)
    with tqdm(total=len(data)) as t:
        for paper_id in data:
            for reviewer_id in data[paper_id]:
                # The user message is the str() rendering of the stored entry,
                # not a json.dumps() of it; kept as-is so that model inputs do
                # not change.
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {'role': 'system', 'content': prompts[prompt_type]},
                        {'role': 'user', 'content': str(data[paper_id][reviewer_id])}
                        ],
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=1,
                    frequency_penalty=0,
                    presence_penalty=0,
                    response_format={'type': 'json_object'},
                    seed=2266
                    )
                # The raw message content (a string) is stored without parsing.
                output[paper_id][reviewer_id] = {'aspects': response.choices[0].message.content}

            # Checkpoint after every paper so an interrupted run keeps the
            # responses collected so far.
            with open(f'results-{experiment_id}.json', 'w', encoding='utf-8') as file:
                json.dump(output, file, ensure_ascii=False, indent=4)

            t.update(1)
            time.sleep(0.1)

    # Append one tab-separated line per venue describing the run.
    number_of_data = len(output)
    with open('config-aspect_identification.txt', 'a', encoding='utf-8') as file:
        file.write(f'{experiment_id}\t{venue}\t{number_of_data}\t{prompt_type}\t{model}\t{temperature}\t{max_tokens}\t{note}\n')

# aspect prediction

Here we consider two tasks: predicting the aspects that should be focused on given a paper (paper aspect prediction, **PAP**), and identifying the aspects that are covered in the review (review aspect prediction, **RAP**). This corresponds to **Section 4 Aspect prediction** in the paper.

## paper aspect prediction (PAP)

In [None]:
# Label granularity for paper aspect prediction: 'fine' or 'coarse'. It selects
# both the mapping file and the column that holds the category names.
type_of_labels = 'fine'
column_name = 'FINE' if type_of_labels == 'fine' else 'COARSE'

# Categories that are left out of the label space, per granularity.
excluded_categories = {
    'fine': {'Contribution', 'Definition', 'Description', 'Detail', 'Discussion', 'Explanation', 'Interpretation', 'Intuition', 'Justification', 'Motivation', 'Validation', 'Novelty', 'Clarity', 'Confusion', 'Figure', 'Grammar', 'Notation', 'Presentation', 'Table', 'Terminology', 'Typo', 'Related Work', 'Impact', 'Importance', 'Significance'},
    'coarse': {'Contribution', 'Definition/Description/Detail/Discussion/Explanation/Interpretation', 'Intuition/Justification/Motivation/Validation', 'Novelty', 'Presentation', 'Related Work', 'Significance'},
    }
# Any other value would silently produce an empty mapping (and therefore
# all-zero labels downstream), so fail loudly instead.
if type_of_labels not in excluded_categories:
    raise ValueError(f"type_of_labels must be 'fine' or 'coarse', got {type_of_labels!r}")

# Map each LLM-annotated aspect to the set of categories it belongs to.
category_to_aspect = pd.read_csv(f'aspects - {type_of_labels}.csv')
aspect_to_category = defaultdict(set)
# Both columns are converted to lists once, outside the loop.
for llm_aspect, category in zip(category_to_aspect['LLM annotation'].to_list(), category_to_aspect[column_name].to_list()):
    if category not in excluded_categories[type_of_labels]:
        aspect_to_category[llm_aspect].add(category)

iclr_venues = ['iclr20', 'iclr21', 'iclr22', 'iclr23', 'iclr24']

# Per-venue paper metadata, indexed as data[venue][paper_id].
data = {}
for venue in iclr_venues:
    with open(f'data/{venue}.json', encoding='utf-8') as file:
        data[venue] = json.load(file)

# Build column-wise lists with one entry per paper.
annotation = pd.read_csv('annotation - llm.csv')
result = defaultdict(list)
for venue in iclr_venues:
    with open(f'preprocessed/preprocessed-{venue}.json', encoding='utf-8') as file:
        preprocessed = json.load(file)
    # Restrict the annotations to this venue once rather than once per paper.
    venue_annotation = annotation[annotation['venue'] == venue]
    for paper_id in preprocessed:
        with open(f'data/papers/{paper_id}.txt', encoding='utf-8') as file:
            paper = file.read()
        aspects = []
        for item in venue_annotation.loc[venue_annotation['paper_id'] == paper_id, 'annotation_1'].tolist():
            # Each annotation is split on ', ' after turning ' and ' into ', ';
            # merge_synonyms is not defined in this notebook (presumably it
            # comes from the `utils` star import - confirm).
            aspects.extend(merge_synonyms(str(item).replace(' and ', ', ').split(', ')))
        metadata = data[venue][paper_id]
        result['venue'].append(venue)
        result['paper_id'].append(paper_id)
        result['abstract'].append(metadata['Abstract'])
        result['keywords'].append(', '.join(metadata['Keywords']))
        result['title'].append(metadata['Title'])
        # Keep only the text before the first standalone 'REFERENCES' line.
        result['paper'].append(paper.split('\nREFERENCES\n')[0])
        # Deduplicated aspects; the order of this list is arbitrary.
        result['aspects'].append(list(set(aspects)))

In [None]:
# Which paper field is used as model input, and the matching gold aspects.
part = 'title'
texts, labels = result[part], result['aspects']

# Sorted vocabulary of every category reachable from any aspect, plus a lookup
# from category name to its column in the label vectors.
categories = sorted({category for mapped in aspect_to_category.values() for category in mapped})
column_of = {category: column for column, category in enumerate(categories)}

# One multi-hot vector per paper: a column is set to 1 when any of the paper's
# aspects maps to that category; aspects without a mapping are ignored.
labels_one_hot = []
for paper_aspects in labels:
    vector = np.zeros(len(categories))
    for aspect in paper_aspects:
        for category in aspect_to_category.get(aspect, ()):
            vector[column_of[category]] = 1
    labels_one_hot.append(vector)

# The last 10% of the papers (in their existing order) form the evaluation set.
number_of_data = int(len(texts) * 0.9)
eval_texts, eval_labels = texts[number_of_data:], labels_one_hot[number_of_data:]

In [None]:
prompts = {
    'zero_coarse_paper': 'Identify the aspect(s) of the given paper that should be focused on during a review. Aspects: Ablation, Analysis, Comparison, Contribution, Data/Task, Definition/Description/Detail/Discussion/Explanation/Interpretation, Evaluation, Experiment, Intuition/Justification/Motivation/Validation, Methodology, Novelty, Presentation, Related Work, Result, Significance, Theory. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}.',
    'zero_fine_paper': 'Identify the aspect(s) of the given paper that should be focused on during a review. Aspects: AI, Ablation, Accuracy, Adaptation, Adversarial, Agent, Algorithm, Analysis, Annotation, Application, Approach, Architecture, Assumption, Attention, Baseline, Benchmark, Clarity, Comparison, Complexity, Confusion, Contribution, Data, Definition, Description, Detail, Discussion, Effectiveness, Efficiency, Embeddings, Evaluation, Evidence, Experiment, Explanation, Figure, Findings, Fine-tuning, Framework, Generalization, Grammar, Hypothesis, Impact, Implementation, Importance, Improvement, Interpretation, Intuition, Justification, Method, Metric, Model, Motivation, Notation, Novelty, Parameter, Performance, Presentation, Prompt, Related Work, Result, Robustness, Significance, Statistical Significance, Table, Task, Technique, Terminology, Theory, Training, Transformer, Typo, Validation. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}.',
    'zero_coarse_abstract': 'Identify the aspect(s) that should be focused on during a review given a paper abstract. Aspects: Ablation, Analysis, Comparison, Contribution, Data/Task, Definition/Description/Detail/Discussion/Explanation/Interpretation, Evaluation, Experiment, Intuition/Justification/Motivation/Validation, Methodology, Novelty, Presentation, Related Work, Result, Significance, Theory. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}.',
    'zero_fine_abstract': 'Identify the aspect(s) that should be focused on during a review given a paper abstract. Aspects: AI, Ablation, Accuracy, Adaptation, Adversarial, Agent, Algorithm, Analysis, Annotation, Application, Approach, Architecture, Assumption, Attention, Baseline, Benchmark, Clarity, Comparison, Complexity, Confusion, Contribution, Data, Definition, Description, Detail, Discussion, Effectiveness, Efficiency, Embeddings, Evaluation, Evidence, Experiment, Explanation, Figure, Findings, Fine-tuning, Framework, Generalization, Grammar, Hypothesis, Impact, Implementation, Importance, Improvement, Interpretation, Intuition, Justification, Method, Metric, Model, Motivation, Notation, Novelty, Parameter, Performance, Presentation, Prompt, Related Work, Result, Robustness, Significance, Statistical Significance, Table, Task, Technique, Terminology, Theory, Training, Transformer, Typo, Validation. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}.',
    'few_coarse_keywords': 'Identify the aspect(s) that should be focused on during a review given paper keywords. Aspects: Ablation, Analysis, Comparison, Contribution, Data/Task, Definition/Description/Detail/Discussion/Explanation/Interpretation, Evaluation, Experiment, Intuition/Justification/Motivation/Validation, Methodology, Novelty, Presentation, Related Work, Result, Significance, Theory. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}. Here are some examples:\n\nINPUT: large language models, robotic manipulation, code generation\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: Reinforcement Learning, Language Models, Reward Maximization, Distribution Matching, Energy Based Models, Controlled Text Generation\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Analysis", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: Privacy and Security, Large Language Model, Lattice, Generative Models, NLP\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Result", "Experiment"]}\n\nINPUT: Invariants, Software Engineering, Programming Languages\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Analysis", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: hybrid AI systems, retrieval augmentation, natural language generation, efficient AI\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: multi-agent communication, self-play, emergent languages\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: Language Models, Controllability, Prompting, Zero-Shot Learning, Editing\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: Machine Translation, 
Multilingualism, Linguistic similarity, Dataset, African languages, Multi-task learning\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: Simultaneous machine translation, Machine translation, Natural language processing, Transformer\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: natural language processing, LSTM, timescale, hierarchy, temporal context\nOUTPUT: {"Aspects": ["Methodology", "Analysis", "Data/Task", "Comparison", "Result", "Experiment", "Theory", "Ablation"]}\n\nINPUT: multi-agent reinforcement learning, language acquisition, emergent communication, acoustic communication, continuous signalling\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: FP8, quantisation, low-precision training, low-precision inference, post-training quantisation, large language models, hardware\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: cross-lingual transfer, sentence embeddings, polyglot language models, knowledge distillation, natural language inference, embedding alignment, embedding mapping\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: Audio Generation, Audio Synthesis, Large Language Models (LLMs), AIGC, Computational Creativity\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: visually-grounded language modeling, visual commonsense reasoning, pre-trained visually-augmented language model\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Theory", "Ablation"]}\n\nINPUT: Transformer, Attention, Natural Language Processing, Language Model Pre-training, 
Position Encoding\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Result", "Experiment", "Ablation"]}\n\nINPUT: Language translation, image classification, transformer\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: Emergent Language, Emergent Communication, Transfer Learning\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: proteins, language modeling, structure prediction, unsupervised learning, explainable\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Explainability, hallucination, controllability, generative language models\nOUTPUT: {"Aspects": ["Methodology", "Analysis", "Data/Task", "Result", "Experiment", "Theory"]}\n\nINPUT: hybrid AI systems, retrieval augmentation, natural language generation, efficient AI\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: multi-agent communication, self-play, emergent languages\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: Language Models, Controllability, Prompting, Zero-Shot Learning, Editing\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: Machine Translation, Multilingualism, Linguistic similarity, Dataset, African languages, Multi-task learning\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: hybrid AI systems, retrieval augmentation, natural language generation, efficient AI\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: Language models, natural language processing, 
reasoning\nOUTPUT: {"Aspects": ["Data/Task", "Methodology", "Experiment", "Result"]}\n\nINPUT: curriculum learning, natural language processing\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Large Language Models, Transformers, Compression, Arithmetic Coding, Zip, Lossless Text Compression\nOUTPUT: {"Aspects": ["Methodology", "Data/Task", "Comparison", "Result", "Experiment"]}\n\nINPUT: dropout, language\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Analysis", "Data/Task", "Result", "Experiment", "Theory"]}\n\nINPUT: emergent language, reinforcement learning, neural networks\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Analysis", "Result", "Experiment", "Theory"]}\n\nINPUT: word embedding, natural language processing\nOUTPUT: {"Aspects": ["Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment", "Theory"]}\n\nINPUT: RNN, pushdown automata, nondeterminism, formal languages, language modeling\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Analysis", "Result", "Experiment", "Theory"]}\n\nINPUT: Explainable AI, Named Entity Recognition, Language Models, Natural Language Processing\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Analysis", "Data/Task", "Result", "Experiment", "Ablation"]}\n\nINPUT: Edit, Representation Learning, Source-code, natural language editing\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Theory", "Ablation"]}\n\nINPUT: knn, memory-augmented networks, language generation, dialogue\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Language Models, Knowledge Graphs, Information Extraction\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", 
"Ablation"]}\n\n',
    'few_fine_keywords': 'Identify the aspect(s) that should be focused on during a review given paper keywords. Aspects: AI, Ablation, Accuracy, Adaptation, Adversarial, Agent, Algorithm, Analysis, Annotation, Application, Approach, Architecture, Assumption, Attention, Baseline, Benchmark, Clarity, Comparison, Complexity, Confusion, Contribution, Data, Definition, Description, Detail, Discussion, Effectiveness, Efficiency, Embeddings, Evaluation, Evidence, Experiment, Explanation, Figure, Findings, Fine-tuning, Framework, Generalization, Grammar, Hypothesis, Impact, Implementation, Importance, Improvement, Interpretation, Intuition, Justification, Method, Metric, Model, Motivation, Notation, Novelty, Parameter, Performance, Presentation, Prompt, Related Work, Result, Robustness, Significance, Statistical Significance, Table, Task, Technique, Terminology, Theory, Training, Transformer, Typo, Validation. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}. 
Here are some examples:\n\nINPUT: word embeddings, natural language processing, model reduction\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Efficiency", "Experiment", "Training", "Model", "Transformer", "Application", "Benchmark", "Result", "Baseline", "Analysis", "Complexity", "Method", "Parameter", "Evaluation", "Task", "Performance", "Algorithm"]}\n\nINPUT: curriculum learning, natural language processing\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Training", "Model", "Result", "Baseline", "Ablation", "Approach", "Analysis", "Improvement", "Method", "Data", "Implementation", "Framework", "Parameter", "Evaluation", "Task", "Performance", "Algorithm"]}\n\nINPUT: Language-Guided Task Completion, Multimodal learning, Neural SLAM.\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Agent", "Model", "Application", "Result", "Benchmark", "Approach", "Analysis", "Improvement", "Robustness", "Complexity", "Method", "Architecture", "Parameter", "Evaluation", "Task", "Generalization", "Performance"]}\n\nINPUT: representation learning, natural language processing\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Efficiency", "Experiment", "Accuracy", "Training", "Model", "Result", "Benchmark", "Theory", "Ablation", "Baseline", "Improvement", "Complexity", "Method", "Data", "Effectiveness", "Implementation", "Parameter", "Task", "Hypothesis", "Generalization", "Performance", "Evidence", "Algorithm"]}\n\nINPUT: hybrid AI systems, retrieval augmentation, natural language generation, efficient AI\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Technique", "Accuracy", "Training", "Model", "Metric", "Application", "Fine-tuning", "Ablation", "Baseline", "Approach", "Improvement", "Complexity", "Method", "Data", "Effectiveness", "Framework", "Evaluation", "Task", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: Large Language Model, Tool Use, Tree Search, A* Search\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", 
"Agent", "Accuracy", "Model", "Prompt", "Result", "Ablation", "Baseline", "Approach", "Analysis", "Improvement", "Method", "Data", "Effectiveness", "Implementation", "Framework", "Parameter", "Evaluation", "Task", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: data augmentation, natural language understanding, consistency training, contrastive learning\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Training", "Model", "Transformer", "Application", "Benchmark", "Result", "Statistical Significance", "Ablation", "Approach", "Analysis", "Improvement", "Complexity", "Method", "Data", "Framework", "Task", "Adversarial", "Performance"]}\n\nINPUT: Regularization, Model Calibration, Adaptive Label Smoothing, Self-Knowledge Distillation, Overconfidence, Natural Language Generation\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Accuracy", "Training", "Model", "Metric", "Result", "Benchmark", "Theory", "Ablation", "Baseline", "Analysis", "Method", "Data", "Effectiveness", "Parameter", "Evaluation", "Performance"]}\n\nINPUT: Natural Language Processing, Text Classification, Information Geomtery, Sentiment Analysis\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Accuracy", "Training", "Model", "Metric", "Application", "Result", "Fine-tuning", "Theory", "Baseline", "Analysis", "Improvement", "Robustness", "Method", "Data", "Evaluation", "Task", "Generalization", "Adversarial", "Performance", "Evidence"]}\n\nINPUT: cognitive science, language, perception, representational similarity\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Performance", "Approach", "Findings", "Task", "Comparison", "AI", "Annotation", "Method", "Benchmark", "Data", "Implementation", "Experiment", "Technique"]}\n\nINPUT: large language models, safety, auditing, robustness\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Comparison", "Improvement", "Efficiency", "Result", "Effectiveness", "Experiment", "Algorithm"]}\n\nINPUT: representation 
learning, natural language processing, attention\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Model", "Transformer", "Application", "Result", "Theory", "Baseline", "Analysis", "Complexity", "Method", "Data", "Architecture", "Parameter", "Attention", "Evaluation", "Task", "Performance", "Algorithm"]}\n\nINPUT: Mixture-of-Experts, Neural Machine Translation, Multilingual, Multi-Task Learning, Conditional Computation, Natural Language Processing\nOUTPUT: {"Aspects": ["Model", "Approach", "Findings", "Task", "Adaptation", "Comparison", "Analysis", "Complexity", "Accuracy", "Result", "Efficiency", "Data", "Implementation", "Framework", "Experiment", "Parameter", "Performance", "Training"]}\n\nINPUT: Pre-trained model, language model, Document understanding, Document intelligence, OCR\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Experiment", "Training", "Model", "Metric", "Result", "Ablation", "Baseline", "Analysis", "Improvement", "Robustness", "Method", "Data", "Effectiveness", "Architecture", "Attention", "Task", "Performance"]}\n\nINPUT: quality diversity, large language models, derivative-free optimization, AI feedback\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Task", "Metric", "Analysis", "AI", "Comparison", "Application", "Method", "Data", "Generalization", "Experiment", "Ablation", "Baseline", "Performance", "Evidence", "Algorithm"]}\n\nINPUT: visually-grounded speech, self-supervised learning, discrete representation learning, vision and language, vision and speech, hierarchical representation learning\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Approach", "Task", "Metric", "Comparison", "Analysis", "Method", "Benchmark", "Result", "Hypothesis", "Experiment", "Architecture", "Performance", "Evidence", "Training"]}\n\nINPUT: NLP, self-supervised learning, language model pre-training, knowledge distillation, BERT, compact models\nOUTPUT: {"Aspects": ["Model", "Findings", "Task", "Comparison", "Complexity", "Method", "Result", 
"Benchmark", "Hypothesis", "Fine-tuning", "Experiment", "Baseline", "Ablation", "Technique", "Training"]}\n\nINPUT: pretrained language model, zero-shot transfer, parsing, natural language inference\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Embeddings", "Approach", "Task", "Adaptation", "Comparison", "Complexity", "Method", "Data", "Efficiency", "Fine-tuning", "Experiment", "Baseline", "Parameter", "Performance", "Training"]}\n\nINPUT: multilinguality, science for NLP, fundamental science in the era of AI/DL, representation learning for language, conditional language modeling, Transformer, Double Descent, non-monotonicity, fairness, meta evaluation, visualization or interpretation of learned representations\nOUTPUT: {"Aspects": ["Model", "Transformer", "Task", "Comparison", "Improvement", "Complexity", "Method", "Data", "Result", "Generalization", "Experiment", "Statistical Significance", "Performance"]}\n\nINPUT: dropout, language\nOUTPUT: {"Aspects": ["Model", "Task", "Metric", "Analysis", "Improvement", "Assumption", "Method", "Result", "Data", "Generalization", "Experiment", "Theory", "Parameter", "Performance", "Training"]}\n\nINPUT: large language model, evolutionary algorithm, prompt engineering, natural language processing\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Model", "Prompt", "Application", "Result", "Benchmark", "Baseline", "Ablation", "Approach", "Analysis", "Robustness", "Method", "Implementation", "Effectiveness", "Data", "Framework", "Parameter", "Evaluation", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: Explainable AI, Named Entity Recognition, Language Models, Natural Language Processing\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Approach", "Task", "Metric", "Comparison", "Analysis", "Method", "Data", "Effectiveness", "Fine-tuning", "Experiment", "Ablation", "Baseline", "Performance", "Accuracy", "Algorithm"]}\n\nINPUT: multimodal, few-shot learning, meta-learning, transformers, vision and language models\nOUTPUT: 
{"Aspects": ["Model", "Approach", "Task", "Comparison", "Improvement", "Analysis", "Application", "Efficiency", "Result", "Data", "Complexity", "Method", "Experiment", "Ablation", "Baseline", "Performance", "Training"]}\n\nINPUT: Retrieval Augmentation; Non-Knowledge-Intensive Task; Natural Language Understanding;\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Model", "Prompt", "Metric", "Application", "Result", "Fine-tuning", "Baseline", "Approach", "Adaptation", "Improvement", "Method", "Effectiveness", "Architecture", "Parameter", "Evaluation", "Task", "Performance", "Evidence"]}\n\nINPUT: Language translation, image classification, transformer\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Comparison", "Analysis", "Improvement", "Efficiency", "Method", "Accuracy", "Result", "Implementation", "Experiment", "Architecture", "Baseline", "Performance", "Evidence", "Training"]}\n\nINPUT: regularizers, vision, language, vqa, visual question answering\nOUTPUT: {"Aspects": ["Model", "Performance", "Embeddings", "Task", "Comparison", "Improvement", "Complexity", "Application", "Data", "Result", "Method", "Generalization", "Framework", "Experiment", "Architecture", "Parameter", "Technique", "Training"]}\n\nINPUT: Representation Learning, Natural Language Processing, Contrastive Learning, Set Operation, Querying Framework, Sentence Embedding, Deep Learning\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Experiment", "Training", "Model", "Application", "Result", "Theory", "Baseline", "Analysis", "Robustness", "Improvement", "Method", "Data", "Effectiveness", "Framework", "Parameter", "Evaluation", "Annotation", "Generalization", "Performance"]}\n\nINPUT: Perceiver, BERT, natural language processing, optical flow, computer vision, multimodal, GLUE, ImageNet, StarCraft\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Experiment", "Training", "Model", "Transformer", "Application", "Benchmark", "Result", "Ablation", "Baseline", "Adaptation", "Analysis", 
"Improvement", "Complexity", "Method", "Data", "Architecture", "Parameter", "Attention", "Evaluation", "Task", "Performance", "Evidence"]}\n\nINPUT: Language modeling ·Document embedding ·Natural language processing ·Machine learning\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Embeddings", "Comparison", "Analysis", "Complexity", "Method", "Data", "Effectiveness", "Result", "Framework", "Experiment", "Baseline", "Performance", "Training", "Algorithm"]}\n\nINPUT: language emergence\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Agent", "Training", "Model", "Findings", "Result", "Ablation", "Approach", "Analysis", "Improvement", "Robustness", "Complexity", "Method", "Implementation", "Parameter", "Task", "Generalization", "Performance", "Evidence"]}\n\nINPUT: Conditional language generation model, speech recognition, neural machine translation, calibration, exposure bias\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Attention", "Approach", "Task", "Metric", "Comparison", "Improvement", "Analysis", "Method", "Data", "Implementation", "Result", "Generalization", "Experiment", "Baseline", "Performance", "Training"]}\n\nINPUT: large language models, safety, auditing, robustness\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Comparison", "Improvement", "Efficiency", "Result", "Effectiveness", "Experiment", "Algorithm"]}\n\nINPUT: natural language processing, domain adaptation\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Model", "Metric", "Application", "Result", "Ablation", "Baseline", "Approach", "Adaptation", "Analysis", "Improvement", "Method", "Data", "Effectiveness", "Framework", "Parameter", "Attention", "Evaluation", "Task"]}\n\nINPUT: Language Model, LSTM, timescales\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Training", "Model", "Findings", "Result", "Benchmark", "Theory", "Baseline", "Approach", "Analysis", "Improvement", "Method", "Data", "Architecture", "Parameter", "Evaluation", "Task", "Hypothesis", 
"Generalization", "Performance"]}\n\nINPUT: Retrieval Augmentation; Non-Knowledge-Intensive Task; Natural Language Understanding;\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Model", "Prompt", "Metric", "Application", "Result", "Fine-tuning", "Baseline", "Approach", "Adaptation", "Improvement", "Method", "Effectiveness", "Architecture", "Parameter", "Evaluation", "Task", "Performance", "Evidence"]}\n\nINPUT: logical reasoning, machine reading comprehension, language understanding\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Accuracy", "Model", "Benchmark", "Result", "Ablation", "Baseline", "Approach", "Robustness", "Improvement", "Analysis", "Complexity", "Method", "Data", "Effectiveness", "Parameter", "Attention", "Task", "Performance"]}\n\nINPUT: Natural Language Processing, Representation Learning\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Training", "Model", "Transformer", "Application", "Result", "Benchmark", "Ablation", "Baseline", "Approach", "Analysis", "Improvement", "Complexity", "Method", "Data", "Effectiveness", "Framework", "Architecture", "Parameter", "Attention", "Evaluation", "Task", "Generalization", "Performance"]}\n\nINPUT: recurrent neural network, neural network, language modeling, minimum description length, genetic algorithm, semantics, syntax\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Accuracy", "Training", "Model", "Metric", "Application", "Result", "Baseline", "Approach", "Improvement", "Complexity", "Method", "Data", "Framework", "Architecture", "Evaluation", "Task", "Hypothesis", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: quality diversity, large language models, derivative-free optimization, AI feedback\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Task", "Metric", "Analysis", "AI", "Comparison", "Application", "Method", "Data", "Generalization", "Experiment", "Ablation", "Baseline", "Performance", "Evidence", "Algorithm"]}\n\nINPUT: Large 
Language Models, Red Teaming Language Models, Game Theory, Safety AI\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Technique", "Training", "Model", "Prompt", "Metric", "Application", "Result", "Benchmark", "Theory", "Ablation", "Baseline", "Approach", "Improvement", "Complexity", "Method", "Framework", "Parameter", "Evaluation", "Task", "Annotation", "Adversarial", "Performance", "Algorithm"]}\n\nINPUT: Emergent Language, Expressivity\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Efficiency", "Experiment", "Agent", "Training", "Accuracy", "Model", "Transformer", "Metric", "Application", "Result", "Statistical Significance", "Analysis", "Complexity", "Method", "Data", "Evaluation", "Task", "Hypothesis", "Generalization", "Performance"]}\n\nINPUT: prompt-tuning, pre-trained language model, few-shot learning\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Efficiency", "Experiment", "Training", "Prompt", "Model", "Application", "Result", "Fine-tuning", "Ablation", "Approach", "Analysis", "Improvement", "Method", "Data", "Framework", "Architecture", "Parameter", "Task", "Generalization", "Performance"]}\n\nINPUT: interpretation of learned representations, language and visual processing, language-biased image classification, cognitive science\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Findings", "Task", "Analysis", "Comparison", "Method", "Data", "Result", "Hypothesis", "Fine-tuning", "Adversarial", "Performance", "Evidence", "Training"]}\n\nINPUT: language models, pretraining, finetuning, GPU memory\nOUTPUT: {"Aspects": ["Model", "Performance", "Transformer", "Approach", "Task", "Comparison", "Improvement", "Assumption", "Efficiency", "Method", "Result", "Implementation", "Experiment", "Parameter", "Technique", "Training", "Algorithm"]}\n\nINPUT: language emergence\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Agent", "Training", "Model", "Findings", "Result", "Ablation", "Approach", "Analysis", "Improvement", 
"Robustness", "Complexity", "Method", "Implementation", "Parameter", "Task", "Generalization", "Performance", "Evidence"]}\n\nINPUT: nmt, nlp, neural machine translation, natural language processing, deep learning, machine learning, machine translation, mt\nOUTPUT: {"Aspects": ["Evaluation", "Task", "Metric", "Comparison", "Improvement", "Annotation", "Analysis", "Method", "Data", "Effectiveness", "Result", "Experiment", "Baseline", "Performance", "Training", "Algorithm"]}\n\n',
    'few_coarse_title': 'Identify the aspect(s) that should be focused on during a review given a paper title. Aspects: Ablation, Analysis, Comparison, Contribution, Data/Task, Definition/Description/Detail/Discussion/Explanation/Interpretation, Evaluation, Experiment, Intuition/Justification/Motivation/Validation, Methodology, Novelty, Presentation, Related Work, Result, Significance, Theory. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}. Here are some examples:\n\nINPUT: Instruct2Act: Mapping Multi-modality Instructions to Robotic Arm Actions with Large Language Model\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: On Reward Maximization and Distribution Matching for Fine-Tuning Language Models\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Analysis", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: LatticeGen: A Cooperative Framework Which Hides Generated Text in A Lattice For Privacy-Aware Generation on Cloud\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Result", "Experiment"]}\n\nINPUT: Learning to Infer Run-Time Invariants from Source code\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Analysis", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: Hybrid Retrieval-Augmented Generation for Real-time Composition Assistance\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: On the interaction between supervision and self-play in emergent communication\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: PEER: A Collaborative Language Model\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: On the use of linguistic 
similarities to improve Neural Machine Translation for African Languages\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: Hidden Markov Transformer for Simultaneous Machine Translation\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: Mapping the Timescale Organization of Neural Language Models\nOUTPUT: {"Aspects": ["Methodology", "Analysis", "Data/Task", "Comparison", "Result", "Experiment", "Theory", "Ablation"]}\n\nINPUT: Towards Learning to Speak and Hear Through Multi-Agent Communication over a Continuous Acoustic Channel\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Training and inference of large language models using 8-bit floating point\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: XD: Cross-lingual Knowledge Distillation for Polyglot Sentence Embeddings\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Result", "Experiment"]}\n\nINPUT: WavJourney: Compositional Audio Creation with Large Language Models\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Visually-Augmented Language Modeling\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Theory", "Ablation"]}\n\nINPUT: DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Result", "Experiment", "Ablation"]}\n\nINPUT: Rethinking skip connection model as a learnable Markov chain\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: Linking Emergent and Natural Languages via Corpus 
Transfer\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Transformer protein language models are unsupervised structure learners\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Taming AI Bots: Controllability of Neural States in Large Language Models\nOUTPUT: {"Aspects": ["Methodology", "Analysis", "Data/Task", "Result", "Experiment", "Theory"]}\n\nINPUT: Hybrid Retrieval-Augmented Generation for Real-time Composition Assistance\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: On the interaction between supervision and self-play in emergent communication\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: PEER: A Collaborative Language Model\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment"]}\n\nINPUT: On the use of linguistic similarities to improve Neural Machine Translation for African Languages\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment"]}\n\nINPUT: Hybrid Retrieval-Augmented Generation for Real-time Composition Assistance\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Ablation"]}\n\nINPUT: Self-Consistency Improves Chain of Thought Reasoning in Language Models\nOUTPUT: {"Aspects": ["Data/Task", "Methodology", "Experiment", "Result"]}\n\nINPUT: Curriculum Discovery through an Encompassing Curriculum Learning Framework\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: LLMZip: Lossless Text Compression using Large Language Models\nOUTPUT: {"Aspects": ["Methodology", 
"Data/Task", "Comparison", "Result", "Experiment"]}\n\nINPUT: Pushing the bounds of dropout\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Analysis", "Data/Task", "Result", "Experiment", "Theory"]}\n\nINPUT: Shaped Rewards Bias Emergent Language\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Analysis", "Result", "Experiment", "Theory"]}\n\nINPUT: Learn Interpretable Word Embeddings Efficiently with von Mises-Fisher Distribution\nOUTPUT: {"Aspects": ["Methodology", "Data/Task", "Analysis", "Comparison", "Result", "Experiment", "Theory"]}\n\nINPUT: Learning Hierarchical Structures with Differentiable Nondeterministic Stacks\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Data/Task", "Analysis", "Result", "Experiment", "Theory"]}\n\nINPUT: SAGE: Semantic-Aware Global Explanations for Named Entity Recognition\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Comparison", "Analysis", "Data/Task", "Result", "Experiment", "Ablation"]}\n\nINPUT: Learning to Model Editing Processes\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Result", "Experiment", "Theory", "Ablation"]}\n\nINPUT: Augmenting Transformers with KNN-Based Composite Memory\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\nINPUT: Language Models are Open Knowledge Graphs\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Data/Task", "Comparison", "Analysis", "Result", "Experiment", "Ablation"]}\n\n',
    'few_fine_title': 'Identify the aspect(s) that should be focused on during a review given a paper title. Aspects: AI, Ablation, Accuracy, Adaptation, Adversarial, Agent, Algorithm, Analysis, Annotation, Application, Approach, Architecture, Assumption, Attention, Baseline, Benchmark, Clarity, Comparison, Complexity, Confusion, Contribution, Data, Definition, Description, Detail, Discussion, Effectiveness, Efficiency, Embeddings, Evaluation, Evidence, Experiment, Explanation, Figure, Findings, Fine-tuning, Framework, Generalization, Grammar, Hypothesis, Impact, Implementation, Importance, Improvement, Interpretation, Intuition, Justification, Method, Metric, Model, Motivation, Notation, Novelty, Parameter, Performance, Presentation, Prompt, Related Work, Result, Robustness, Significance, Statistical Significance, Table, Task, Technique, Terminology, Theory, Training, Transformer, Typo, Validation. If there are multiple aspects that should be focused on, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}. 
Here are some examples:\n\nINPUT: word2ket: Space-efficient Word Embeddings inspired by Quantum Entanglement\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Efficiency", "Experiment", "Training", "Model", "Transformer", "Application", "Benchmark", "Result", "Baseline", "Analysis", "Complexity", "Method", "Parameter", "Evaluation", "Task", "Performance", "Algorithm"]}\n\nINPUT: Curriculum Discovery through an Encompassing Curriculum Learning Framework\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Training", "Model", "Result", "Baseline", "Ablation", "Approach", "Analysis", "Improvement", "Method", "Data", "Implementation", "Framework", "Parameter", "Evaluation", "Task", "Performance", "Algorithm"]}\n\nINPUT: Learning to Act with Affordance-Aware Multimodal Neural SLAM\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Agent", "Model", "Application", "Result", "Benchmark", "Approach", "Analysis", "Improvement", "Robustness", "Complexity", "Method", "Architecture", "Parameter", "Evaluation", "Task", "Generalization", "Performance"]}\n\nINPUT: Improving Self-supervised Pre-training via a Fully-Explored Masked Language Model\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Efficiency", "Experiment", "Accuracy", "Training", "Model", "Result", "Benchmark", "Theory", "Ablation", "Baseline", "Improvement", "Complexity", "Method", "Data", "Effectiveness", "Implementation", "Parameter", "Task", "Hypothesis", "Generalization", "Performance", "Evidence", "Algorithm"]}\n\nINPUT: Hybrid Retrieval-Augmented Generation for Real-time Composition Assistance\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Technique", "Accuracy", "Training", "Model", "Metric", "Application", "Fine-tuning", "Ablation", "Baseline", "Approach", "Improvement", "Complexity", "Method", "Data", "Effectiveness", "Framework", "Evaluation", "Task", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: ToolChain*: Efficient Action Space Navigation in Large Language 
Models with A* Search\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Agent", "Accuracy", "Model", "Prompt", "Result", "Ablation", "Baseline", "Approach", "Analysis", "Improvement", "Method", "Data", "Effectiveness", "Implementation", "Framework", "Parameter", "Evaluation", "Task", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: CoDA: Contrast-enhanced and Diversity-promoting Data Augmentation for Natural Language Understanding\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Training", "Model", "Transformer", "Application", "Benchmark", "Result", "Statistical Significance", "Ablation", "Approach", "Analysis", "Improvement", "Complexity", "Method", "Data", "Framework", "Task", "Adversarial", "Performance"]}\n\nINPUT: Adaptive Label Smoothing with Self-Knowledge\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Accuracy", "Training", "Model", "Metric", "Result", "Benchmark", "Theory", "Ablation", "Baseline", "Analysis", "Method", "Data", "Effectiveness", "Parameter", "Evaluation", "Performance"]}\n\nINPUT: Geometry matters: Exploring language examples at the decision boundary\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Accuracy", "Training", "Model", "Metric", "Application", "Result", "Fine-tuning", "Theory", "Baseline", "Analysis", "Improvement", "Robustness", "Method", "Data", "Evaluation", "Task", "Generalization", "Adversarial", "Performance", "Evidence"]}\n\nINPUT: Words are all you need? 
Language as an approximation for human similarity judgments\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Performance", "Approach", "Findings", "Task", "Comparison", "AI", "Annotation", "Method", "Benchmark", "Data", "Implementation", "Experiment", "Technique"]}\n\nINPUT: Automatically Auditing Large Language Models via Discrete Optimization\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Comparison", "Improvement", "Efficiency", "Result", "Effectiveness", "Experiment", "Algorithm"]}\n\nINPUT: Attention over Phrases\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Model", "Transformer", "Application", "Result", "Theory", "Baseline", "Analysis", "Complexity", "Method", "Data", "Architecture", "Parameter", "Attention", "Evaluation", "Task", "Performance", "Algorithm"]}\n\nINPUT: Exploring Routing Strategies for Multilingual Mixture-of-Experts Models\nOUTPUT: {"Aspects": ["Model", "Approach", "Findings", "Task", "Adaptation", "Comparison", "Analysis", "Complexity", "Accuracy", "Result", "Efficiency", "Data", "Implementation", "Framework", "Experiment", "Parameter", "Performance", "Training"]}\n\nINPUT: BROS: A Pre-trained Language Model for Understanding Texts in Document\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Experiment", "Training", "Model", "Metric", "Result", "Ablation", "Baseline", "Analysis", "Improvement", "Robustness", "Method", "Data", "Effectiveness", "Architecture", "Attention", "Task", "Performance"]}\n\nINPUT: Quality-Diversity through AI Feedback\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Task", "Metric", "Analysis", "AI", "Comparison", "Application", "Method", "Data", "Generalization", "Experiment", "Ablation", "Baseline", "Performance", "Evidence", "Algorithm"]}\n\nINPUT: Learning Hierarchical Discrete Linguistic Units from Visually-Grounded Speech\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Approach", "Task", "Metric", "Comparison", "Analysis", "Method", "Benchmark", "Result", "Hypothesis", "Experiment", 
"Architecture", "Performance", "Evidence", "Training"]}\n\nINPUT: Well-Read Students Learn Better: On the Importance of Pre-training Compact Models\nOUTPUT: {"Aspects": ["Model", "Findings", "Task", "Comparison", "Complexity", "Method", "Result", "Benchmark", "Hypothesis", "Fine-tuning", "Experiment", "Baseline", "Ablation", "Technique", "Training"]}\n\nINPUT: From English to Foreign Languages: Transferring Pre-trained Language Models\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Embeddings", "Approach", "Task", "Adaptation", "Comparison", "Complexity", "Method", "Data", "Efficiency", "Fine-tuning", "Experiment", "Baseline", "Parameter", "Performance", "Training"]}\n\nINPUT: Representation and Bias in Multilingual NLP: Insights from Controlled Experiments on Conditional Language Modeling\nOUTPUT: {"Aspects": ["Model", "Transformer", "Task", "Comparison", "Improvement", "Complexity", "Method", "Data", "Result", "Generalization", "Experiment", "Statistical Significance", "Performance"]}\n\nINPUT: Pushing the bounds of dropout\nOUTPUT: {"Aspects": ["Model", "Task", "Metric", "Analysis", "Improvement", "Assumption", "Method", "Result", "Data", "Generalization", "Experiment", "Theory", "Parameter", "Performance", "Training"]}\n\nINPUT: Connecting Large Language Models with Evolutionary Algorithms Yields Powerful Prompt Optimizers\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Model", "Prompt", "Application", "Result", "Benchmark", "Baseline", "Ablation", "Approach", "Analysis", "Robustness", "Method", "Implementation", "Effectiveness", "Data", "Framework", "Parameter", "Evaluation", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: SAGE: Semantic-Aware Global Explanations for Named Entity Recognition\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Approach", "Task", "Metric", "Comparison", "Analysis", "Method", "Data", "Effectiveness", "Fine-tuning", "Experiment", "Ablation", "Baseline", "Performance", "Accuracy", "Algorithm"]}\n\nINPUT: Meta Learning to Bridge 
Vision and Language Models for Multimodal Few-Shot Learning\nOUTPUT: {"Aspects": ["Model", "Approach", "Task", "Comparison", "Improvement", "Analysis", "Application", "Efficiency", "Result", "Data", "Complexity", "Method", "Experiment", "Ablation", "Baseline", "Performance", "Training"]}\n\nINPUT: ReFusion: Improving Natural Language Understanding with Computation-Efficient Retrieval Representation Fusion\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Model", "Prompt", "Metric", "Application", "Result", "Fine-tuning", "Baseline", "Approach", "Adaptation", "Improvement", "Method", "Effectiveness", "Architecture", "Parameter", "Evaluation", "Task", "Performance", "Evidence"]}\n\nINPUT: Rethinking skip connection model as a learnable Markov chain\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Comparison", "Analysis", "Improvement", "Efficiency", "Method", "Accuracy", "Result", "Implementation", "Experiment", "Architecture", "Baseline", "Performance", "Evidence", "Training"]}\n\nINPUT: On Incorporating Semantic Prior Knowlegde in Deep Learning Through Embedding-Space Constraints\nOUTPUT: {"Aspects": ["Model", "Performance", "Embeddings", "Task", "Comparison", "Improvement", "Complexity", "Application", "Data", "Result", "Method", "Generalization", "Framework", "Experiment", "Architecture", "Parameter", "Technique", "Training"]}\n\nINPUT: SetCSE: Set Operations using Contrastive Learning of Sentence Embeddings\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Experiment", "Training", "Model", "Application", "Result", "Theory", "Baseline", "Analysis", "Robustness", "Improvement", "Method", "Data", "Effectiveness", "Framework", "Parameter", "Evaluation", "Annotation", "Generalization", "Performance"]}\n\nINPUT: Perceiver IO: A General Architecture for Structured Inputs & Outputs\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Experiment", "Training", "Model", "Transformer", "Application", "Benchmark", "Result", "Ablation", "Baseline", "Adaptation", 
"Analysis", "Improvement", "Complexity", "Method", "Data", "Architecture", "Parameter", "Attention", "Evaluation", "Task", "Performance", "Evidence"]}\n\nINPUT: JOINTLY LEARNING TOPIC SPECIFIC WORD AND DOCUMENT EMBEDDING\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Embeddings", "Comparison", "Analysis", "Complexity", "Method", "Data", "Effectiveness", "Result", "Framework", "Experiment", "Baseline", "Performance", "Training", "Algorithm"]}\n\nINPUT: Entropy Minimization In Emergent Languages\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Agent", "Training", "Model", "Findings", "Result", "Ablation", "Approach", "Analysis", "Improvement", "Robustness", "Complexity", "Method", "Implementation", "Parameter", "Task", "Generalization", "Performance", "Evidence"]}\n\nINPUT: Minimum Edit Distance Training for Conditional Language Generation Models\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Attention", "Approach", "Task", "Metric", "Comparison", "Improvement", "Analysis", "Method", "Data", "Implementation", "Result", "Generalization", "Experiment", "Baseline", "Performance", "Training"]}\n\nINPUT: Automatically Auditing Large Language Models via Discrete Optimization\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Comparison", "Improvement", "Efficiency", "Result", "Effectiveness", "Experiment", "Algorithm"]}\n\nINPUT: Progressive Memory Banks for Incremental Domain Adaptation\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Model", "Metric", "Application", "Result", "Ablation", "Baseline", "Approach", "Adaptation", "Analysis", "Improvement", "Method", "Data", "Effectiveness", "Framework", "Parameter", "Attention", "Evaluation", "Task"]}\n\nINPUT: Multi-timescale Representation Learning in LSTM Language Models\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Training", "Model", "Findings", "Result", "Benchmark", "Theory", "Baseline", "Approach", "Analysis", "Improvement", "Method", "Data", "Architecture", "Parameter", 
"Evaluation", "Task", "Hypothesis", "Generalization", "Performance"]}\n\nINPUT: ReFusion: Improving Natural Language Understanding with Computation-Efficient Retrieval Representation Fusion\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Model", "Prompt", "Metric", "Application", "Result", "Fine-tuning", "Baseline", "Approach", "Adaptation", "Improvement", "Method", "Effectiveness", "Architecture", "Parameter", "Evaluation", "Task", "Performance", "Evidence"]}\n\nINPUT: Fact-driven Logical Reasoning\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Technique", "Accuracy", "Model", "Benchmark", "Result", "Ablation", "Baseline", "Approach", "Robustness", "Improvement", "Analysis", "Complexity", "Method", "Data", "Effectiveness", "Parameter", "Attention", "Task", "Performance"]}\n\nINPUT: Deepening Hidden Representations from Pre-trained Language Models\nOUTPUT: {"Aspects": ["Comparison", "Efficiency", "Experiment", "Training", "Model", "Transformer", "Application", "Result", "Benchmark", "Ablation", "Baseline", "Approach", "Analysis", "Improvement", "Complexity", "Method", "Data", "Effectiveness", "Framework", "Architecture", "Parameter", "Attention", "Evaluation", "Task", "Generalization", "Performance"]}\n\nINPUT: Minimum Description Length Recurrent Neural Networks\nOUTPUT: {"Aspects": ["Comparison", "Experiment", "Accuracy", "Training", "Model", "Metric", "Application", "Result", "Baseline", "Approach", "Improvement", "Complexity", "Method", "Data", "Framework", "Architecture", "Evaluation", "Task", "Hypothesis", "Generalization", "Performance", "Algorithm"]}\n\nINPUT: Quality-Diversity through AI Feedback\nOUTPUT: {"Aspects": ["Prompt", "Evaluation", "Approach", "Task", "Metric", "Analysis", "AI", "Comparison", "Application", "Method", "Data", "Generalization", "Experiment", "Ablation", "Baseline", "Performance", "Evidence", "Algorithm"]}\n\nINPUT: Red Teaming Game: A Game-Theoretic Framework for Red Teaming Language Models\nOUTPUT: {"Aspects": 
["Comparison", "Assumption", "Experiment", "Technique", "Training", "Model", "Prompt", "Metric", "Application", "Result", "Benchmark", "Theory", "Ablation", "Baseline", "Approach", "Improvement", "Complexity", "Method", "Framework", "Parameter", "Evaluation", "Task", "Annotation", "Adversarial", "Performance", "Algorithm"]}\n\nINPUT: Expressivity of Emergent Languages is a Trade-off between Contextual Complexity and Unpredictability\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Efficiency", "Experiment", "Agent", "Training", "Accuracy", "Model", "Transformer", "Metric", "Application", "Result", "Statistical Significance", "Analysis", "Complexity", "Method", "Data", "Evaluation", "Task", "Hypothesis", "Generalization", "Performance"]}\n\nINPUT: Differentiable Prompt Makes Pre-trained Language Models Better Few-shot Learners\nOUTPUT: {"Aspects": ["Embeddings", "Comparison", "Efficiency", "Experiment", "Training", "Prompt", "Model", "Application", "Result", "Fine-tuning", "Ablation", "Approach", "Analysis", "Improvement", "Method", "Data", "Framework", "Architecture", "Parameter", "Task", "Generalization", "Performance"]}\n\nINPUT: Language-biased image classification: evaluation based on semantic representations\nOUTPUT: {"Aspects": ["Model", "Evaluation", "Findings", "Task", "Analysis", "Comparison", "Method", "Data", "Result", "Hypothesis", "Fine-tuning", "Adversarial", "Performance", "Evidence", "Training"]}\n\nINPUT: 8-bit Optimizers via Block-wise Quantization\nOUTPUT: {"Aspects": ["Model", "Performance", "Transformer", "Approach", "Task", "Comparison", "Improvement", "Assumption", "Efficiency", "Method", "Result", "Implementation", "Experiment", "Parameter", "Technique", "Training", "Algorithm"]}\n\nINPUT: Entropy Minimization In Emergent Languages\nOUTPUT: {"Aspects": ["Comparison", "Assumption", "Experiment", "Agent", "Training", "Model", "Findings", "Result", "Ablation", "Approach", "Analysis", "Improvement", "Robustness", "Complexity", "Method", 
"Implementation", "Parameter", "Task", "Generalization", "Performance", "Evidence"]}\n\nINPUT: Crowd-sourced Phrase-Based Tokenization for Low-Resourced Neural Machine Translation: The case of Fon Language\nOUTPUT: {"Aspects": ["Evaluation", "Task", "Metric", "Comparison", "Improvement", "Annotation", "Analysis", "Method", "Data", "Effectiveness", "Result", "Experiment", "Baseline", "Performance", "Training", "Algorithm"]}\n\n'
}

In [None]:
# Few-shot inference run: send evaluation texts to the chat-completions
# endpoint one at a time and checkpoint the raw JSON replies to disk.
experiment_id = time.strftime('%Y%m%d_%H%M%S', time.localtime())
pretrained = 'gpt-4o'  # passed as `model=` to the API call below
prompt_type = f'few_{type_of_labels}_{part}'  # key into the `prompts` dict
temperature = 0
max_tokens = 2048
number_of_aspects = len(categories)
# The next four settings are not read by this cell's API call.
# NOTE(review): presumably kept as 'na' placeholders so the run metadata lines
# up with fine-tuning experiments logged elsewhere — confirm.
batch_size = 'na'
epochs = 'na'
learning_rate = 'na'
device = 'na'

# Only the first 100 evaluation texts are sent to the model. Slice once so the
# progress bar total matches the number of iterations actually performed
# (previously the bar was sized from the full list and could never complete).
# NOTE(review): downstream metrics compare against `eval_labels`; confirm it is
# restricted to the same 100 items.
texts_to_label = eval_texts[:100]

output = []
with tqdm(total=len(texts_to_label)) as t:
    for text in texts_to_label:
        response = client.chat.completions.create(
            model=pretrained,
            messages=[
                {"role": "system", "content": prompts[prompt_type]},
                {"role": "user", "content": text}
                ],
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={"type": "json_object"},
            seed=2266
            )
        output.append(response.choices[0].message.content)

        # Rewrite the results file after every reply so a crash or rate-limit
        # error part-way through does not lose the responses gathered so far.
        with open(f'results-{experiment_id}.json', 'w') as file:
            json.dump(output, file, ensure_ascii=False, indent=4)

        t.update(1)
        time.sleep(0.1)  # short pause between consecutive requests

In [None]:
# Load the raw model replies saved by an earlier inference run. The file holds
# a JSON list of strings, each of which is itself a JSON object with an
# "Aspects" entry.
with open('results-20241230_115354.json') as file:
    raw_responses = json.load(file)

predictions = []
for raw_response in raw_responses:
    aspects = json.loads(raw_response)['Aspects']
    # The model sometimes answers with a bare string (e.g. "-") instead of a
    # list. Wrap any string so later cells iterate over aspect names rather
    # than over individual characters (a stray "Fine-tuning" string would
    # otherwise match the "-" category through its hyphen).
    if isinstance(aspects, str):
        aspects = [aspects]
    predictions.append(aspects)

In [None]:
# Encode each predicted aspect list as a multi-hot vector over `categories`.
predictions_one_hot = []
for predicted_aspects in predictions:
    encoded = np.zeros(len(categories))
    # Labels that are not part of `categories` are ignored.
    recognised = [aspect for aspect in predicted_aspects if aspect in categories]
    for aspect in recognised:
        encoded[categories.index(aspect)] = 1
    # Nothing recognised at all: fall back to the '-' category.
    if not encoded.any():
        encoded[categories.index('-')] = 1
    predictions_one_hot.append(encoded)

In [None]:
# Per-category precision / recall / F1 table.
report_text = classification_report(
    eval_labels,
    predictions_one_hot,
    target_names=categories,
    zero_division=0,
)
print(report_text)

# Weighted-average precision, recall and F1, rounded to four decimals.
# The fourth returned element (support) is not needed here.
weighted_scores = precision_recall_fscore_support(
    eval_labels,
    predictions_one_hot,
    average='weighted',
    zero_division=0,
)
precision_weighted, recall_weighted, f1_weighted = weighted_scores[:3]
print(*(round(score, 4) for score in (precision_weighted, recall_weighted, f1_weighted)))

In [None]:
def _decode_one_hot(rows):
    """Map each multi-hot row back to the category names whose entry equals 1."""
    return [
        [categories[position] for position, flag in enumerate(row) if flag == 1]
        for row in rows
    ]


# Convert both the gold labels and the encoded predictions back to name lists.
actuals = _decode_one_hot(eval_labels)
predictions = _decode_one_hot(predictions_one_hot)

# Average the scores returned by the helper over all evaluated items.
similarity = calculate_jaccard_similarity_for_lists(actuals, predictions)
sum(similarity) / len(similarity)

## review aspect prediction (RAP)

In [None]:
# Build the review-aspect-prediction dataset: review texts plus multi-hot
# label vectors at the chosen label granularity ('coarse' or 'fine').
type_of_labels = 'coarse'
column_name = 'FINE' if type_of_labels == 'fine' else 'COARSE'

# Each row of the mapping table pairs a raw 'LLM annotation' aspect with the
# category it belongs to; one raw aspect may map to several categories.
category_to_aspect = pd.read_csv(f'aspects - {type_of_labels}.csv')
llm_annotation = category_to_aspect['LLM annotation'].to_list()
category = category_to_aspect[column_name].to_list()
aspect_to_category = defaultdict(set)
for raw_aspect, mapped_category in zip(llm_annotation, category):
    aspect_to_category[raw_aspect].add(mapped_category)

annotation = pd.read_csv('annotation - llm.csv')
annotation_1 = annotation['annotation_1'].tolist()
# Convert the review column once instead of rebuilding the whole list on
# every loop iteration.
texts = annotation['review'].tolist()
# An annotation cell holds aspects separated by ', ' or ' and '; split it and
# pass the pieces through `merge_synonyms`.
labels = [
    merge_synonyms(str(raw_label).replace(' and ', ', ').split(', '))
    for raw_label in annotation_1
]

# Label space: every category in the mapping table plus '-' for "no aspect",
# de-duplicated and sorted so vector positions are stable.
categories = sorted(set(['-'] + category))

# NOTE(review): `torch` is not imported in this notebook's import cell —
# presumably it arrives through `from utils import *`; confirm. The prediction
# vectors elsewhere in the notebook are built with `np.zeros` instead.
labels_one_hot = []
for aspects in labels:
    encoded = torch.zeros(len(categories))
    for aspect in aspects:
        # Membership test first: indexing the defaultdict directly would
        # insert an empty set for every unknown aspect.
        if aspect in aspect_to_category:
            for mapped_category in aspect_to_category[aspect]:
                encoded[categories.index(mapped_category)] = 1
    # No aspect could be mapped: mark the '-' category.
    if not encoded.any():
        encoded[categories.index('-')] = 1
    labels_one_hot.append(encoded)

In [None]:
# Everything after the first 90% of the examples (in file order, no shuffling)
# forms the evaluation split.
number_of_data = int(0.9 * len(texts))
eval_texts, eval_labels = texts[number_of_data:], labels_one_hot[number_of_data:]

In [None]:
prompts = {
    'zero_coarse': 'Identify the aspect(s) that the given review sentence focuses on. Aspects: Ablation, Analysis, Comparison, Contribution, Data/Task, Definition/Description/Detail/Discussion/Explanation/Interpretation, Evaluation, Experiment, Intuition/Justification/Motivation/Validation, Methodology, Novelty, Presentation, Related Work, Result, Significance, Theory. If a review sentence does not express any of the listed aspects, mark "-". If the review sentence contains multiple aspects, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}.',
    'zero_fine': 'Identify the aspect(s) that the given review sentence focuses on. Aspects: AI, Ablation, Accuracy, Adaptation, Adversarial, Agent, Algorithm, Analysis, Annotation, Application, Approach, Architecture, Assumption, Attention, Baseline, Benchmark, Clarity, Comparison, Complexity, Confusion, Contribution, Data, Definition, Description, Detail, Discussion, Effectiveness, Efficiency, Embeddings, Evaluation, Evidence, Experiment, Explanation, Figure, Findings, Fine-tuning, Framework, Generalization, Grammar, Hypothesis, Impact, Implementation, Importance, Improvement, Interpretation, Intuition, Justification, Method, Metric, Model, Motivation, Notation, Novelty, Parameter, Performance, Presentation, Prompt, Related Work, Result, Robustness, Significance, Statistical Significance, Table, Task, Technique, Terminology, Theory, Training, Transformer, Typo, Validation. If a review sentence does not express any of the listed aspects, mark "-". If the review sentence contains multiple aspects, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}.',
    'few_coarse': 'Identify the aspect(s) that the given review sentence focuses on. Aspects: Ablation, Analysis, Comparison, Contribution, Data/Task, Definition/Description/Detail/Discussion/Explanation/Interpretation, Evaluation, Experiment, Intuition/Justification/Motivation/Validation, Methodology, Novelty, Presentation, Related Work, Result, Significance, Theory. If a review sentence does not express any of the listed aspects, mark "-". If the review sentence contains multiple aspects, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}. Here are some examples:\n\nINPUT: The evaluation shows that the proposed method can improve the intrigue and coherence of a generated story.\nOUTPUT: {"Aspects": ["Evaluation", "Methodology"]}\n\nINPUT: So if I have language A that receives a perplexity of 10 according to LM0, and another language B that receives a perplexity of 3 (and thus much higher average probability per token), and LM0 assigns its own language a perplexity of 2, then we’ll get scores of (10-2)/2 = 4 for A and (3-2)/2 = 0.5 for B, making A look more similar.\nOUTPUT: {"Aspects": ["Evaluation"]}\n\nINPUT: On Abstractive Summarization BeamSearch is used and again outperform MLE and UT.\nOUTPUT: {"Aspects": ["Evaluation", "Result"]}\n\nINPUT: It is also strange that the multi-cluster approach, which discards inter-cluster (word and language) semantic information performs the best with respect to the extrinsic metrics.\nOUTPUT: {"Aspects": ["Evaluation", "Methodology", "Result"]}\n\nINPUT: The overall problem is framed as a judgment question, further enhancing the method\'s Uniqueness and Efficiency.\nOUTPUT: {"Aspects": ["Methodology"]}\n\nINPUT: One of the main novelties with respect to previous text-to-clip models is the use of co-attention schemes at the level of words and frames.\nOUTPUT: {"Aspects": ["Methodology", "Novelty"]}\n\nINPUT: In this sense, the title is a bit misleading, as the technique does not apply 
to all ‘autoregressive models.’  - **Only demonstrated on text data, i.e., no images or audio, etc.\nOUTPUT: {"Aspects": ["Methodology"]}\n\nINPUT: The proposed method shows promising privacy-utility trade-off empirically.\nOUTPUT: {"Aspects": ["Methodology"]}\n\nINPUT: - Significance: The paper makes a substantial contribution to the field of molecular discovery.\nOUTPUT: {"Aspects": ["Significance", "Contribution"]}\n\nINPUT: Many-to-many relationships / soft correspondences are generally important for vision-and-language learning since there is never really one unique and perfect correspondence between an image and text.\nOUTPUT: {"Aspects": ["Significance"]}\n\nINPUT: Also the importance of each reward component or the effect of each phrase-matching automatic metrics is missing.\nOUTPUT: {"Aspects": ["Evaluation", "Significance"]}\n\nINPUT: More languages pairs should be verified.\nOUTPUT: {"Aspects": ["Significance", "Experiment"]}\n\nINPUT: However, authors do not compare their methods against any of the previous works.\nOUTPUT: {"Aspects": ["Comparison", "Related Work"]}\n\nINPUT: Zhang, Q., Wang, Y., Gong, Y., & Huang, X.\nOUTPUT: {"Aspects": ["Related Work"]}\n\nINPUT: There are many related works on iterative retrieval in the open-domain QA that have not been mentioned and compared, such as:    [1] Guo, Xiaoxiao, et al. 
"Learning to query, reason, and answer questions on ambiguous texts."\nOUTPUT: {"Aspects": ["Related Work"]}\n\nINPUT: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP).\nOUTPUT: {"Aspects": ["Related Work"]}\n\nINPUT: I agree with all of your points about what is lacking, but in my mind, the novelty was enough to still give a 7.\nOUTPUT: {"Aspects": ["Novelty"]}\n\nINPUT: Overall, I continue to maintain that this is a good paper with novel ideas that are well-justified by experimental results and analyses, and would like to reiterate that I believe that this work is a clear accept.\nOUTPUT: {"Aspects": ["Analysis", "Experiment", "Result", "Novelty"]}\n\nINPUT: - The idea of MoE with sample routing to mitigate task interference for MLLMs is novel.\nOUTPUT: {"Aspects": ["Novelty"]}\n\nINPUT: Given the emergent of cutting-edge large language models and their decent in-context performance, the technique proposed in this paper seems to be redundant and outdated.\nOUTPUT: {"Aspects": ["Methodology", "Novelty"]}\n\nINPUT: - Causal intervention on the model\'s representations using the probes shows that the truth vectors are "active" and not "inert".\nOUTPUT: {"Aspects": ["Methodology", "Presentation"]}\n\nINPUT: Please bring the table closer to its reference.\nOUTPUT: {"Aspects": ["Related Work", "Presentation"]}\n\nINPUT: * (clarity) The "Return" metric is not defined or referenced in the results section.\nOUTPUT: {"Aspects": ["Evaluation", "Result", "Presentation"]}\n\nINPUT: p2 LTMS -> LSTM p3 For Self-attention -> Self-attention p5 following Vaswani -> follows Vaswani p6 we used has annotated -> we used has been annotated p7 but there is the principle a confound -> not clear what that means p9 our-word -> ?\nOUTPUT: {"Aspects": ["Presentation"]}\n\nINPUT: In fact, other example tasks should be selected to better highlight the advantages of the 
proposed method.\nOUTPUT: {"Aspects": ["Data/Task", "Methodology"]}\n\nINPUT: Why do the authors use the Quora dataset in particular?\nOUTPUT: {"Aspects": ["Data/Task"]}\n\nINPUT: - therefore the hyperparameters that work well on CC & WIT, which are of relatively high quality, might not be very transferable to more noisy datasets (like the one used in CLIP, for example).\nOUTPUT: {"Aspects": ["Data/Task", "Methodology"]}\n\nINPUT: The paired sentences from different languages are mined based on the sentence embeddings.\nOUTPUT: {"Aspects": ["Data/Task", "Methodology"]}\n\nINPUT: Mostly empirical analysis based on OpenAI playground.\nOUTPUT: {"Aspects": ["Analysis"]}\n\nINPUT: Furthermore, the paper lacks visual or interpretable analyses that incorporate concrete natural language statements.\nOUTPUT: {"Aspects": ["Analysis", "Definition/Description/Detail/Discussion/Explanation/Interpretation"]}\n\nINPUT: - Additionally, it would be beneficial to include an analysis of the model\'s scale.\nOUTPUT: {"Aspects": ["Analysis", "Methodology"]}\n\nINPUT: The insights of model design for more efficient fine-tuning are well supported by the analysis.\nOUTPUT: {"Aspects": ["Analysis", "Methodology"]}\n\nINPUT: The authors take time to implement and evaluate several prominent baselines.\nOUTPUT: {"Aspects": ["Comparison", "Evaluation", "Methodology"]}\n\nINPUT: Then it is reasonable to include the baseline suggest above, i.e. 
input additional features.\nOUTPUT: {"Aspects": ["Comparison"]}\n\nINPUT: - No actual comparisons to weight norm?\nOUTPUT: {"Aspects": ["Comparison"]}\n\nINPUT: It is not fair to discount this computational effort completely when comparing with the performance of other solvers that do not make LLM calls.\nOUTPUT: {"Aspects": ["Comparison"]}\n\nINPUT: I didn\'t see any significance measurements and it would be important to add them.\nOUTPUT: {"Aspects": ["Result"]}\n\nINPUT: * Obtains results comparable to strong rule-based data augmentation baseline GECA on two datasets, demonstrating that the combination of both resampling and recombination is effective.\nOUTPUT: {"Aspects": ["Data/Task", "Result"]}\n\nINPUT: I believe that the findings in this paper are going to be useful in practice to the general NLP community working on Transformers and Multilingual problems.\nOUTPUT: {"Aspects": ["Methodology", "Result"]}\n\nINPUT: * evidence for disproving alternative hypotheses is weak in several cases:     * Section 3.2.2: CCS Is Robust To Misleading Prompts         * goal: "discover latent knowledge in an LM even when its **training objective** causes the model to output false text"         * method: change **test time** prompts and observe difference         * method is not well suited for the goal, because you don\'t touch the training objective of the model at all.\nOUTPUT: {"Aspects": ["Methodology", "Result"]}\n\nINPUT: - The proposed method is intuitive and easy to understand.\nOUTPUT: {"Aspects": ["Intuition/Justification/Motivation/Validation", "Methodology"]}\n\nINPUT: Strengths: - This submission is well motivated.\nOUTPUT: {"Aspects": ["Intuition/Justification/Motivation/Validation"]}\n\nINPUT: Despite good empirical efforts, the motivation of this paper seems somewhat unclear.\nOUTPUT: {"Aspects": ["Intuition/Justification/Motivation/Validation"]}\n\nINPUT: Cons: Poor conceptual motivation and error analysis, with little evident understanding of the actual 
effect of the linguistic representations within the model.\nOUTPUT: {"Aspects": ["Intuition/Justification/Motivation/Validation", "Methodology", "Analysis"]}\n\nINPUT: Since this is not addressed at all in the paper, it makes the results in general a bit difficult to interpret.\nOUTPUT: {"Aspects": ["Definition/Description/Detail/Discussion/Explanation/Interpretation", "Result"]}\n\nINPUT: This is an interesting take to use VIP / sequential QA for image classification and the experiments demonstrate the effectiveness of this method for interpretability/explanations.\nOUTPUT: {"Aspects": ["Methodology", "Experiment", "Definition/Description/Detail/Discussion/Explanation/Interpretation"]}\n\nINPUT: It\'s important to discuss why these improvements are non-trivial and how they advance the field, considering the rapidly evolving landscape of both quantum computing and graph analysis.\nOUTPUT: {"Aspects": ["Analysis", "Definition/Description/Detail/Discussion/Explanation/Interpretation", "Result"]}\n\nINPUT: Also, the meaning of vector v in weight calculation is not clearly discussed.\nOUTPUT: {"Aspects": ["Definition/Description/Detail/Discussion/Explanation/Interpretation"]}\n\nINPUT: In general, I find Experiments II to be much weaker than Experiment I.\nOUTPUT: {"Aspects": ["Comparison", "Experiment"]}\n\nINPUT: Strength:  * the method is quite novel and experimental results seem strong  Weakness:  * The explanation and presentation of the methodology is complicated and can be reduced.\nOUTPUT: {"Aspects": ["Methodology", "Presentation", "Result", "Definition/Description/Detail/Discussion/Explanation/Interpretation", "Experiment"]}\n\nINPUT: However, I still have some concerns about both the experimental details and the paper writing, details are as follows.\nOUTPUT: {"Aspects": ["Definition/Description/Detail/Discussion/Explanation/Interpretation", "Experiment"]}\n\nINPUT: Testing for the true role of language will require many more experiments, which may be 
somewhat out of scope for this paper given the space constraints for a single paper.\nOUTPUT: {"Aspects": ["Experiment"]}\n\nINPUT: - How to find a discriminant for meaning is left out as the authors explicitly assume that “the mechanism is provided by human annotators and other providers of training data.” While the authors emphasize in the introduction that such information can be used in the LLM training without external reward model: “This observation shows that sentence-level annotations can be incorporated directly into the trained model without the need for any external reward model nor external policy model, simply by sentence-level feedback,” I do not see the advantage of this approach over using the very same data to train a reward model and use that either during the training (as in RLHF) or as an augmentation (as in Rectification method), the latter indeed provides quite strong theoretical guarantees.\nOUTPUT: {"Aspects": ["Data/Task", "Methodology", "Theory", "Presentation"]}\n\nINPUT: This is relevant because, by training end to end, that work effectively generates arbitrary amounts of training data through interaction the the HOL4 ITP system (intermediate theorems which are proven give some reward in that work).\nOUTPUT: {"Aspects": ["Data/Task", "Methodology", "Theory"]}\n\nINPUT: - I don\'t see why your theory does not generalize to a _masked_ language modeling (MLM).\nOUTPUT: {"Aspects": ["Methodology", "Theory"]}\n\nINPUT: Firstly, in the last paragraph of Section 2, the authors claim that the role matrix $R$ would be invertible such that there exists a matrix $U = R^{-1}$ such that the fillers would be recovered.\nOUTPUT: {"Aspects": ["Theory"]}\n\nINPUT: + The ablation studies are quite comprehensive, and most of my questions about the methods was addressed.\nOUTPUT: {"Aspects": ["Methodology", "Ablation"]}\n\nINPUT: Ablation studies on the varying parameter counts of these two components would be valuable, if possible.\nOUTPUT: {"Aspects": 
["Methodology", "Ablation"]}\n\nINPUT: The paper has a nice ablation study which shows that the learned importance scores, the complementary representations, and the fusion network are needed to reach the model\'s full performance.\nOUTPUT: {"Aspects": ["Methodology", "Result", "Ablation"]}\n\nINPUT: It would be better if a comprehensive ablation study was conducted, to understand which traditional features contribute the most (and the least), especially given the BERT pretrained models for encoding.\nOUTPUT: {"Aspects": ["Methodology", "Ablation"]}\n\nINPUT: Such gaps make the main contribution questionable and make it as a pure empirical paper on its value.\nOUTPUT: {"Aspects": ["Contribution"]}\n\nINPUT: Considering the contributions of this paper, it is more suitable as a technical report than a paper to be published.\nOUTPUT: {"Aspects": ["Contribution"]}\n\nINPUT: **Technical contribution** Codegen for program synthesis seems effective, especially when the user wants to generate pieces of code from input/output examples or natural language descriptions.\nOUTPUT: {"Aspects": ["Contribution"]}\n\nINPUT: As you don\'t get to claim the same contribution twice, this contribution should go all to the benefit of the other paper.\nOUTPUT: {"Aspects": ["Contribution"]}\n\n',
    'few_fine': 'Identify the aspect(s) that the given review sentence focuses on. Aspects: AI, Ablation, Accuracy, Adaptation, Adversarial, Agent, Algorithm, Analysis, Annotation, Application, Approach, Architecture, Assumption, Attention, Baseline, Benchmark, Clarity, Comparison, Complexity, Confusion, Contribution, Data, Definition, Description, Detail, Discussion, Effectiveness, Efficiency, Embeddings, Evaluation, Evidence, Experiment, Explanation, Figure, Findings, Fine-tuning, Framework, Generalization, Grammar, Hypothesis, Impact, Implementation, Importance, Improvement, Interpretation, Intuition, Justification, Method, Metric, Model, Motivation, Notation, Novelty, Parameter, Performance, Presentation, Prompt, Related Work, Result, Robustness, Significance, Statistical Significance, Table, Task, Technique, Terminology, Theory, Training, Transformer, Typo, Validation. If a review sentence does not express any of the listed aspects, mark "-". If the review sentence contains multiple aspects, select all applicable aspects. Format the output in a json dictionary: {\"Aspects\": [...]}. 
Here are some examples:\n\nINPUT: The results presented in the paper are significant.\nOUTPUT: {"Aspects": ["Significance", "Result"]}\n\nINPUT: For example, please see below works in matrix factorization approaches, sparse word representation learning, codebook learning and other quantization approaches for compressing word embeddings:  https://www.aclweb.org/anthology/P16-1022/ https://aaai.org/ojs/index.php/AAAI/article/download/4578/4456 http://web.cs.ucla.edu/~chohsieh/papers/Mulcode_Compressor.pdf https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/17042/16071 https://storage.googleapis.com/pub-tools-public-publication-data/pdf/f158f7c81ed8e985fd51a20d193103ce427cad51.pdf https://arxiv.org/pdf/1711.01068.pdf https://arxiv.org/abs/1510.00149  I would appreciate if comparisons with some of these approaches is provided in the next iteration of this work.\nOUTPUT: {"Aspects": ["Embeddings", "Related Work"]}\n\nINPUT: However, authors do not compare their methods against any of the previous works.\nOUTPUT: {"Aspects": ["Comparison", "Related Work"]}\n\nINPUT: If I understand correctly, both the word-word and event-event are just homogeneous networks and what the author did is adding an additional connection between these two homogeneous graphs.\nOUTPUT: {"Aspects": ["Model", "Detail"]}\n\nINPUT: - Comparison only with one previous work and then claiming that the method is capable of zero-shot, is slightly overstated.\nOUTPUT: {"Aspects": ["Comparison"]}\n\nINPUT: Strengths:  1) Collaborative language modeling is an interesting and promising direction in NLG, which increases the interpretability and controllability of traditional NLG models.\nOUTPUT: {"Aspects": ["Model", "Interpretation"]}\n\nINPUT: There are few places (see details) that authors have assumptions in mind but do not provide those assumptions until later.\nOUTPUT: {"Aspects": ["Assumption"]}\n\nINPUT: The latter is efficient, but the former enables interpretability.\nOUTPUT: {"Aspects": 
["Efficiency", "Interpretation"]}\n\nINPUT: However, if we considered this view, it seems like the problem they are solving should be more impactful than type recovery and AST generation.\nOUTPUT: {"Aspects": ["Impact"]}\n\nINPUT: In general, I find Experiments II to be much weaker than Experiment I.\nOUTPUT: {"Aspects": ["Comparison", "Experiment"]}\n\nINPUT: (IBR) Distilling an agent from an ensemble of 50 independently trained agents outperforms training single agents from scratch, but is still not as good as the whole ensemble.\nOUTPUT: {"Aspects": ["Training", "Related Work", "Agent"]}\n\nINPUT: (2) The paper does not explain why end-to-end training in the  entropy-regularization is necessary.\nOUTPUT: {"Aspects": ["Training", "Explanation"]}\n\nINPUT: In this way, the model is explicitly trained by decreasing the likelihood of generated samples, hopefully reducing the effect of “exposure bias.” Notably, this procedure can, in principle, be applied to any discrete autoregressive model, making this a rather general technique.\nOUTPUT: {"Aspects": ["Technique", "Training"]}\n\nINPUT: I think these earlier results should be explicitly discussed here and the authors should justify why they are interpreting the results differently (if they are).\nOUTPUT: {"Aspects": ["Discussion", "Justification", "Result"]}\n\nINPUT: Especially the performance of PEER-Document should be shown and analyzed, because in my view it’s hard for a pre-trained model with 3B model parameters to generate high-quality knowledge documents only given the texts before / after editing and plans.\nOUTPUT: {"Aspects": ["Model", "Training", "Performance"]}\n\nINPUT: In fact, the value of \\epsilon peaks early in training, but is not accompanied by peak accuracy?\nOUTPUT: {"Aspects": ["Training", "Accuracy"]}\n\nINPUT: It would be good to have a baseline comparison comparing what performance looks like with model scale.\nOUTPUT: {"Aspects": ["Comparison", "Model"]}\n\nINPUT: Then predicting the 
answer to this prompt is equivalent to performing the task, and a perfect LM is of course able to perform the task perfectly.\nOUTPUT: {"Aspects": ["Prompt", "Model", "Task"]}\n\nINPUT: Hence, it will be better to additionally include the results of other standard RL algorithms for better justifying this claim.\nOUTPUT: {"Aspects": ["Justification", "Algorithm"]}\n\nINPUT: Finally, the paper shows that compositional languages generalize better to the held-out validation set.\nOUTPUT: {"Aspects": ["Generalization", "Findings"]}\n\nINPUT: The problem of finding the prompt that causes certain behavior is interesting and important to study.\nOUTPUT: {"Aspects": ["Importance"]}\n\nINPUT: Strengths: - The unsupervised matching approach for extracting triples is quite novel and, to my knowledge, the first evidence that the attention maps of pretrained transformers contain paths which capture relational knowledge.\nOUTPUT: {"Aspects": ["Attention", "Training", "Transformer", "Novelty"]}\n\nINPUT: - §2: "study that implement" -> "study that implements" - §3: "adjectives has" -> "adjectives have" - §4: "analyzing sub-component" -> "analyzing sub-components" - §5: "Syllables features also generally performs" -> "Syllable features also generally perform"\nOUTPUT: {"Aspects": ["Grammar", "Presentation"]}\n\nINPUT: a) Why did self-check perform worse in the few-shot setting than the zero-shot setting with ChatGPT in Table 3?\nOUTPUT: {"Aspects": ["Table"]}\n\nINPUT: pplx & one-shot downstream performance.\nOUTPUT: {"Aspects": ["Performance", "Metric"]}\n\nINPUT: The method is applied to the standard Transformer and it is evaluated on several language pairs and different data sizes.\nOUTPUT: {"Aspects": ["Evaluation", "Application", "Method", "Data"]}\n\nINPUT: The evaluation metric is problematic (or at least unclear).\nOUTPUT: {"Aspects": ["Evaluation", "Clarity", "Metric"]}\n\nINPUT: Even within the reported experiments, the results suggest that systematicity is lacking in 
several places.\nOUTPUT: {"Aspects": ["Experiment", "Result"]}\n\nINPUT: The paper lacks technical novelty other than the training and test data generation approach, but having one available to the community with these apparently desirable characteristics as benchmark data for measuring complex, compositional generalization capabilities, and that could be invaluable to the research community.\nOUTPUT: {"Aspects": ["Approach", "Novelty", "Data", "Benchmark", "Generalization", "Training"]}\n\nINPUT: Typo:     - Figure 4 caption: "Our variable-resolution inputs prevent**s**..." 6.\nOUTPUT: {"Aspects": ["Typo"]}\n\nINPUT: In the more practical setting of finetuning the model, the attacks are not effective.\nOUTPUT: {"Aspects": ["Fine-tuning", "Effectiveness"]}\n\nINPUT: Yet, authors draw some conclusions on Page 4 based on this table, without reporting any variance or statistical significance tests.\nOUTPUT: {"Aspects": ["Statistical Significance", "Table"]}\n\nINPUT: - How to find a discriminant for meaning is left out as the authors explicitly assume that “the mechanism is provided by human annotators and other providers of training data.” While the authors emphasize in the introduction that such information can be used in the LLM training without external reward model: “This observation shows that sentence-level annotations can be incorporated directly into the trained model without the need for any external reward model nor external policy model, simply by sentence-level feedback,” I do not see the advantage of this approach over using the very same data to train a reward model and use that either during the training (as in RLHF) or as an augmentation (as in Rectification method), the latter indeed provides quite strong theoretical guarantees.\nOUTPUT: {"Aspects": ["Model", "Annotation", "Notation", "Method", "Data", "Theory", "Training"]}\n\nINPUT: + The ablation studies are quite comprehensive, and most of my questions about the methods was addressed.\nOUTPUT: 
{"Aspects": ["Method", "Ablation"]}\n\nINPUT: Two critical baselines are currently missing: the accuracy of the vanilla CLIP model (without continued pretraining), and the accuracy of $\\varepsilon=\\infty$ (non-private continued pretraining).\nOUTPUT: {"Aspects": ["Accuracy", "Baseline"]}\n\nINPUT: - The experimental results show that this generative approach even has superior performance while being much simpler than other task-specific classification models which require careful model architecture design for different tasks.\nOUTPUT: {"Aspects": ["Model", "Approach", "Task", "Complexity", "Result", "Experiment", "Architecture", "Performance"]}\n\nINPUT: I agree with all of your points about what is lacking, but in my mind, the novelty was enough to still give a 7.\nOUTPUT: {"Aspects": ["Novelty"]}\n\nINPUT: ### Novelty ###  This paper proposes incremental domain adaptation, which is inspired by Li & Hoiem\'s work.\nOUTPUT: {"Aspects": ["Novelty", "Adaptation"]}\n\nINPUT: Mostly empirical analysis based on OpenAI playground.\nOUTPUT: {"Aspects": ["Analysis"]}\n\nINPUT: Maybe the attention vector could be calculated in a more appropriate approach.\nOUTPUT: {"Aspects": ["Attention", "Improvement"]}\n\nINPUT: What they really do is use a set of contrastive prompts to mislead the model and the experimental results show that **CCS can improve the robustness of the model**.\nOUTPUT: {"Aspects": ["Prompt", "Model", "Robustness", "Result", "Experiment"]}\n\nINPUT: Both of them are quite straightforward.\nOUTPUT: {"Aspects": ["Complexity"]}\n\nINPUT: (4) Translation invariance : Uses a low rank decomposition of the word PMI matrix with an objective with includes bilingual alignment frequency components.\nOUTPUT: {"Aspects": ["Method"]}\n\nINPUT: They also use large scale unlabeled data to improve word representations.\nOUTPUT: {"Aspects": ["Data"]}\n\nINPUT: Admittedly, there is a chance that I have a bug in the code.\nOUTPUT: {"Aspects": ["Implementation"]}\n\nINPUT: 3) 
One-type of distillation, fine-tuning a smaller model from the generations of the 540B model, is effective.\nOUTPUT: {"Aspects": ["Fine-tuning", "Method", "Effectiveness"]}\n\nINPUT: As probably "lattice" in not meant in the purely mathematical sense (a partial order with least upper bound and greatest lower bound operators), more details are needed.\nOUTPUT: {"Aspects": ["Terminology"]}\n\nINPUT: Overall, this work contributes an interesting framework for analysis.\nOUTPUT: {"Aspects": ["Analysis", "Framework"]}\n\nINPUT: - Did you try other architectures like 5 layers (rather than 4) in Figure 2 - Figure 2 is a bit hard to interpret.\nOUTPUT: {"Aspects": ["Figure", "Architecture", "Interpretation"]}\n\nINPUT: Weaknesses: The main weakness is that the proposed extension to the baseline is relatively complex and rather heavy-weight in terms of new parameters (introducing ~25% more parameters compared to the baseline according to Table 2), yet only achieves a very marginal relative improvement of 0.2 percent over the baseline.\nOUTPUT: {"Aspects": ["Improvement", "Complexity", "Baseline", "Parameter"]}\n\nINPUT: Such gaps make the main contribution questionable and make it as a pure empirical paper on its value.\nOUTPUT: {"Aspects": ["Contribution"]}\n\nINPUT: Even something simple like the matrix norm of the difference between the original attention matrix and its transpose.\nOUTPUT: {"Aspects": ["Attention"]}\n\nINPUT: The task definition and evaluation methods are clear.\nOUTPUT: {"Aspects": ["Evaluation", "Definition", "Method", "Task"]}\n\nINPUT: Strengths - The proposed model is clearly defined and the ablation studies are carefully structured in Table 2.\nOUTPUT: {"Aspects": ["Model", "Definition", "Ablation"]}\n\nINPUT: - Strengths:  - the model if theoretically solid and motivated by formal semantics.\nOUTPUT: {"Aspects": ["Model", "Motivation"]}\n\nINPUT: I\'m confused as to why some of the visualizations in Fig 3 show white bands along the diagonal (d, f, 
and g).\nOUTPUT: {"Aspects": ["Figure", "Confusion"]}\n\nINPUT: One for validating the proposed method, they evaluated the proposed method on already established text2audio generation task.\nOUTPUT: {"Aspects": ["Validation", "Evaluation", "Task"]}\n\nINPUT: Is that the same method as LMX, fitness-only * It would be interesting to see ablation analysis to compare with the QD with and without AI feedback (not sure if LMX fitness-only or quality only serves the baseline)  * QD metric is used throughout, which is the “sum of highest quality value found in each bin” - it seems to only focus on quality rather than diversity.\nOUTPUT: {"Aspects": ["Metric", "Analysis", "Comparison", "AI", "Method", "Ablation", "Baseline"]}\n\nINPUT: This paper considers the challenging problem of learning to generate programs from natural language descriptions: the inputs are sequences of words describing a task and the outputs are corresponding programs solving the task.\nOUTPUT: {"Aspects": ["Description", "Task"]}\n\nINPUT: Additionally, there\'s the cost associated with extra data annotation, as outlined in Question 3.\nOUTPUT: {"Aspects": ["Annotation", "Data"]}\n\nINPUT: how many annotations have been collected for each narrative?\nOUTPUT: {"Aspects": ["Evaluation", "Annotation", "Notation"]}\n\nINPUT: ● In the results, it would have been nice to know how many samples in the test-bed are used, and how many are valid/atomic/misleading.\nOUTPUT: {"Aspects": ["Validation", "Result"]}\n\nINPUT: For instance, Claude shows, in general, insensitivity to the number of parameters, performing well or not regardless of this factor, while Flan-T5 aligns more closely with the initial hypothesis proposed by the authors.\nOUTPUT: {"Aspects": ["Model", "Performance", "Parameter", "Hypothesis"]}\n\nINPUT: First, the model\'s abilities for systematic generalization are overstated.\nOUTPUT: {"Aspects": ["Model", "Generalization"]}\n\nINPUT: The dataset is adversarially filtered using BERT and GPT, 
which gives deep learning model a huge disadvantage.\nOUTPUT: {"Aspects": ["Model", "Adversarial", "Data"]}\n\nINPUT: This paper presents a grammar-based generation approach for "slot-filling" style code generation tasks.\nOUTPUT: {"Aspects": ["Grammar"]}\n\nINPUT: * While I understand that space is limited, Section 2.1 might have benefited from explaining FiLM, its use here and some intuition, to help keep the paper more self-contained.\nOUTPUT: {"Aspects": ["Intuition", "Explanation"]}\n\nINPUT: For example, MVP_obj outperforms the best baseline fin by 0.52 while the std is 0.58 for MVP_obj and 0.61 for the best baseline.\nOUTPUT: {"Aspects": ["Performance", "Baseline"]}\n\nINPUT: For example, from the analysis of Fig (2), the conclusion that : “Lastly, note that fully-learnable RPEs also do not significantly distinguish far-distant RPEs (from -64 to -20 and from 20 to 64), suggesting that truncating RPEs into a distance of 64, like (Shaw et al., 2018), is reasonable.”  is a bit farfetched.\nOUTPUT: {"Aspects": ["Analysis", "Evidence"]}\n\nINPUT: - The details of the training steps expt in Figure 3 are a little underexplained, and I tried my best to reconstruct, though I invite the authors to clarify if I\'ve misunderstood anything.\nOUTPUT: {"Aspects": ["Training", "Clarity", "Figure"]}\n\nINPUT: There is a lot of content in this paper, including mathematical developments, algorithms, and experimental results.\nOUTPUT: {"Aspects": ["Experiment", "Result", "Algorithm"]}\n\n'
}

In [None]:
# Run configuration. The timestamp-based id names the results file below.
experiment_id = time.strftime('%Y%m%d_%H%M%S', time.localtime())
pretrained = 'gpt-4o'
prompt_type = f'few_{type_of_labels}'
temperature = 0
max_tokens = 2048
number_of_aspects = len(categories)
# NOTE(review): the 'na' values are not used in this cell; presumably they keep
# the run settings aligned with other (trained-model) experiments -- confirm.
batch_size = 'na'
epochs = 'na'
learning_rate = 'na'
device = 'na'

# Only the first 1000 evaluation texts are sent to the API, so the progress
# bar is sized to that subset rather than to the whole evaluation set.
eval_subset = eval_texts[:1000]

output = []
with tqdm(total=len(eval_subset)) as t:
    for text in eval_subset:
        # One request per text: the selected prompt is the system message and
        # the text itself is the user message; a JSON object is requested back.
        response = client.chat.completions.create(
            model=pretrained,
            messages=[
                {"role": "system", "content": prompts[prompt_type]},
                {"role": "user", "content": text}
                ],
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={"type": "json_object"},
            seed=2266
            )
        # Store the raw response string; it is parsed in a later cell.
        output.append(response.choices[0].message.content)

        # Rewrite the results file after every request so that an interrupted
        # run still leaves all responses collected so far on disk.
        with open(f'results-{experiment_id}.json', 'w') as file:
            json.dump(output, file, ensure_ascii=False, indent=4)

        t.update(1)
        # Short pause between consecutive requests.
        time.sleep(0.1)

In [None]:
# Load a saved results file (a JSON list of raw response strings) and pull the
# "Aspects" entry out of each response.
with open('results-20241229_131810.json') as file:
    predictions = [json.loads(raw_response)['Aspects'] for raw_response in json.load(file)]

# The prompt asks for a list, but a response may carry a bare string instead
# (e.g. "-" or a single aspect name). Wrap any bare string in a list so that
# later cells iterate over aspect names rather than over characters.
predictions = [[aspects] if isinstance(aspects, str) else aspects for aspects in predictions]

In [None]:
# Turn every predicted aspect list into a 0/1 vector over `categories`.
# Predicted names that are not in `categories` are ignored; a prediction with
# no recognised name is assigned to the '-' category.
predictions_one_hot = []
for predicted_aspects in predictions:
    vector = np.zeros(len(categories))
    recognised = [aspect for aspect in predicted_aspects if aspect in categories]
    if recognised:
        vector[[categories.index(aspect) for aspect in recognised]] = 1
    else:
        vector[categories.index('-')] = 1
    predictions_one_hot.append(vector)

In [None]:
# Score the predictions against the first 1000 gold label rows: a per-category
# report first, then weighted-average precision / recall / F1 rounded to 4 places.
gold_subset = eval_labels[:1000]

report_text = classification_report(
    gold_subset,
    predictions_one_hot,
    target_names=categories,
    zero_division=0,
)
print(report_text)

precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
    gold_subset,
    predictions_one_hot,
    average='weighted',
    zero_division=0,
)
print(*(round(score, 4) for score in (precision_weighted, recall_weighted, f1_weighted)))

In [None]:
# Convert the 0/1 rows back into lists of category names (gold labels and
# predictions alike), then score the two with the
# calculate_jaccard_similarity_for_lists helper and show the mean.
actuals = [
    [categories[i] for i in range(len(label_row)) if label_row[i] == 1]
    for label_row in eval_labels
]
predictions = [
    [categories[i] for i in range(len(predicted_row)) if predicted_row[i] == 1]
    for predicted_row in predictions_one_hot
]

# Only the first 1000 gold rows have a matching prediction.
similarity = calculate_jaccard_similarity_for_lists(actuals[:1000], predictions)
sum(similarity) / len(similarity)

# automatic review generation

Here we generate reviews using GPT-4o with the prompt used in [Liang et al. (2024)](https://ai.nejm.org/doi/10.1056/AIoa2400196) and a prompt of our own. The generated reviews are used in **Section 5.2 Review comparison** and **5.3 LLM-generated review detection** in the paper.

In [None]:
# Load the "_all" file and the regular preprocessed file for one venue, and
# keep the ids that appear only in the "_all" file.
venue = 'iclr24'

with open(f'preprocessed/preprocessed-{venue}_all.json') as all_file:
    data = json.load(all_file)

with open(f'preprocessed/preprocessed-{venue}.json') as preprocessed_file:
    data_preprocessed = json.load(preprocessed_file)

# Iteration follows the key order of `data`, so the resulting order is stable.
paper_ids = [
    candidate_id
    for candidate_id in data
    if candidate_id not in data_preprocessed
]

In [None]:
# Seed Python's global `random` generator so the draw below is repeatable.
random.seed(2)
# Replace `paper_ids` with 100 ids drawn from it without replacement
# (random.sample raises ValueError if fewer than 100 ids are available).
# NOTE(review): this cell overwrites its own input; re-running it without first
# re-running the cell that builds `paper_ids` samples from the already-reduced list.
paper_ids = random.sample(paper_ids, 100)

In [None]:
# Total number of entries stored under the sampled papers in `data`,
# followed by the average per paper (displayed as the cell output).
number_of_reviews = sum(len(data[sampled_id]) for sampled_id in paper_ids)

number_of_reviews / len(paper_ids)

In [None]:
# System prompts for review generation, keyed by prompt type.
# The long prompt is written as adjacent string literals, one per prompt line;
# Python concatenates them into a single string.
prompts = {
    'ours': (
        'Write a paper review for the following paper '
        'regarding its strengths and weaknesses.'
    ),
    'liang24': (
        'Your task now is to draft a high-quality review outline for the given submission.\n'
        '======\n'
        'Your task:\n'
        'Compose a high-quality peer review of a paper.\n'
        '\n'
        'Start by "Review outline:".\n'
        'And then:\n'
        '"1. Significance and novelty"\n'
        '"2. Potential reasons for acceptance"\n'
        '"3. Potential reasons for rejection", List multiple key reasons. For each key reason, use **>=2 sub bullet points** to further clarify and support your arguments in painstaking details. Be as specific and detailed as possible.\n'
        '"4. Suggestions for improvement", List multiple key suggestions. Be as specific and detailed as possible.\n'
        '\n'
        'Be thoughtful and constructive. Write Outlines only.'
    ),
}

In [None]:
# Generate one LLM review per sampled paper with the selected system prompt.
experiment_id = time.strftime('%Y%m%d_%H%M%S', time.localtime())
model = 'gpt-4o'
prompt_type = 'liang24'
temperature = 0
max_tokens = 2048

output = defaultdict(dict)
with tqdm(total=len(paper_ids)) as progress:
    for paper_id in paper_ids:
        # Use only the text that precedes the first '\nREFERENCES\n' marker
        # (the whole file if the marker is absent).
        with open(f'data/papers/{paper_id}.txt') as paper_file:
            full_text = paper_file.read()
        paper = full_text.split('\nREFERENCES\n')[0]

        messages = [
            {'role': 'system', 'content': prompts[prompt_type]},
            {'role': 'user', 'content': paper},
        ]
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            seed=2266,
        )
        output[paper_id]['llm_review'] = response.choices[0].message.content

        # The results file is rewritten in full after every paper.
        with open(f'llm_generated_reviews_{venue}_{prompt_type}.json', 'w') as results_file:
            json.dump(output, results_file, ensure_ascii=False, indent=4)

        progress.update(1)
        time.sleep(0.1)

# zero-shot llm generated review detection

This section performs zero-shot LLM-generated review detection using GPT-4o, which corresponds to Table 23 in the paper.

In [None]:
# Build the detection input: for every paper, shuffle the order of its reviews
# (seeded, so the order is repeatable) and flatten each review into one string
# by joining its values with spaces.
source = 'llm_generated_reviews_iclr24_ours'
with open(f'preprocessed/preprocessed-{source}.json') as preprocessed_file:
    preprocessed = json.load(preprocessed_file)

random.seed(2)
data = defaultdict()
for paper_id, paper_reviews in preprocessed.items():
    shuffled_ids = list(paper_reviews)
    random.shuffle(shuffled_ids)
    # Dict insertion order keeps the shuffled order for later cells.
    data[paper_id] = {
        reviewer_id: ' '.join(paper_reviews[reviewer_id].values())
        for reviewer_id in shuffled_ids
    }

In [None]:
# Zero-shot detection: show the model all reviews of a paper, labelled
# 'Review 1', 'Review 2', ... in their shuffled order, and store its raw JSON reply.
experiment_id = time.strftime('%Y%m%d_%H%M%S', time.localtime())
model = 'gpt-4o'
temperature = 0
max_tokens = 64

output = defaultdict()
with tqdm(total=len(data)) as progress:
    for paper_id, paper_reviews in data.items():
        reviews = {
            f'Review {position}': review_text
            for position, review_text in enumerate(paper_reviews.values(), start=1)
        }
        messages = [
            {'role': 'system', 'content': 'Which of the following reviews is generated by an LLM? Return a json file with the answer: {"answer": ...}.'},
            # The labelled reviews are sent as the string form of the dict.
            {'role': 'user', 'content': str(reviews)},
        ]
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={'type': 'json_object'},
            seed=2266,
        )
        output[paper_id] = response.choices[0].message.content

        # The results file is rewritten in full after every paper.
        with open(f'zero_shot_detection_{source}.json', 'w') as results_file:
            json.dump(output, results_file, ensure_ascii=False, indent=4)

        progress.update(1)
        time.sleep(0.1)

In [None]:
# Accuracy of the zero-shot detector: compare the review number the model
# picked with the 1-based position of the LLM review in the shuffled order.
with open(f'zero_shot_detection_{source}.json') as file:
    output = json.loads(file.read())

actuals, preds = [], []
for paper_id, paper_reviews in data.items():
    # Gold label: where 'llm_review' sits among this paper's shuffled reviews.
    actuals.append(list(paper_reviews).index('llm_review') + 1)

    # Look the reply up by paper id so gold labels and predictions stay paired;
    # a paper missing from the results file raises KeyError instead of
    # silently shifting the alignment.
    answer = json.loads(output[paper_id])['answer']
    # The answer may come back as 'Review 2', '2' or the integer 2;
    # going through str() handles all three.
    preds.append(int(str(answer).replace('Review ', '')))

accuracy_score(actuals, preds)