In [67]:
import json
import re
from bert_score import score
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [133]:
with open('instruction_finetuning_data_test.jsonl', 'r') as f:
    gold_standard = [json.loads(line) for line in f]

with open('instruction_finetuning_data_test.jsonl', 'r') as f:
    predictions = [json.loads(line) for line in f]

device = "mps" if torch.backends.mps.is_available() else "cpu"
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [134]:
predictions[0]['response'].replace('Explanation:\n', '',1)

"Soundness: 3\nPresentation: 3\nContribution: 4\nRating: 6\nConfidence: 3\n\nStrengths:\n1. Clear structure and presentation: The paper is well-organized, with a logical structure and smooth flow that improves readability and comprehension.\n2. Insightful motivation for OOD benchmarking: This dataset addresses the critical issue of data leakage in current benchmarks. This approach highlights the limitations of evaluating current models trained on web-scale data.\n3. Interesting distortion design: The authors designed six distinct distortion types that minimize overlap with common real-world corruptions, reducing the likelihood of data leakage. This approach improves the dataset’s utility as a truly challenging OOD benchmark.\n\nWeaknesses:\n1. Limited novel insights in model ranking comparisons: Figure 4 shows minimal divergence in the ranking of model performance between ImageNet-C and LAION-C. Similarly, Figure 3 demonstrates a strong linear correlation between model performance on I

In [135]:
def parse_review(text):
    text = text.replace('\nExplanation:\n', '',1)
    review_dict = {}
    
    # Split text into sections using double newlines as a separator
    sections = re.split(r"\n\n", text.strip())

    for section in sections:
        lines = section.split("\n")
        #lines = section
        
        # First part contains numerical ratings
        if ": " in lines[0]:
            for line in lines:
                if ": " in line:
                    key, value = line.split(": ", 1)
                    review_dict[key.strip()] = int(value) if value.isdigit() else value.strip()
        
        # Later parts contain explanations, strengths, weaknesses, questions
        else:
            key = lines[0].replace(":", "").strip()  # Remove colons and extra spaces
            review_dict[key] = "\n".join(lines[1:]).strip()  # Store the rest of the section

    return review_dict

In [136]:
import re

def parse_review1(text):
    # Define regex patterns for extracting different sections
    score_pattern = r"(\w+): (\d+)"
    section_pattern = r"(Strengths|Weaknesses|Questions):\n"

    # Initialize dictionary
    review_dict = {}

    # Extract numerical scores
    scores = re.findall(score_pattern, text)
    for key, value in scores:
        review_dict[key] = int(value)  # Convert scores to integers

    # Extract text sections
    sections = re.split(section_pattern, text)
    
    # Process sections
    for i in range(1, len(sections), 2):
        key = sections[i].strip()
        value = sections[i + 1].strip()
        review_dict[key] = value

    return review_dict


In [152]:
gold_standard = {item['paper_id']: parse_review1(item['response']) for item in gold_standard}
predictions = {item['paper_id']: parse_review1(item['response']) for item in predictions}


TypeError: string indices must be integers

In [215]:
with open('matched_papers_reviews_test.json', 'r') as f:
    papers_2025 = json.load(f)

In [216]:
def remove_citations(full_text):
    idx = full_text.find('\n\nREFERENCES')
    if idx == -1:
        idx = full_text.find('\n\nReferences')
    return full_text[:idx]

In [217]:
with open('instruction_finetuning_data.jsonl', 'r') as f:
    train_data = [json.loads(line) for line in f]
with open('instruction_finetuning_data_test.jsonl', 'r') as f:
    test_data = [json.loads(line) for line in f]

In [218]:
prompt_string = """
Categories (1–5):
- Soundness: The rigor of the methods for the stated problem
- Presentation: The clarity of writing and organization
- Contribution: The paper’s novelty or added value to the domain

Additionally:
- Rating (1–10): Overall recommendation for acceptance
- Confidence (1–5): How confident the reviewer is in their assessment

Please provide:
1) Soundness
2) Presentation
3) Contribution
4) Rating
5) Confidence
6) Explanation (strengths, weaknesses)
7) Questions
"""

In [219]:
json_txt = [{'paper_id': item['paper_id'], 
             'summary': item['prompt'].replace(prompt_string, ''), 
             'full text': remove_citations(papers_2025[item['paper_id']]['full_text']['value']), 
             'abstract': papers_2025[item['paper_id']]['abstract']['value'],
             'response': parse_review1(item['response'])} 
             for item in test_data]

In [220]:
with open('instruction_finetuning_test_data_reformatted.json', "w", encoding="utf-8") as json_file:
    json.dump(json_txt, json_file, indent=4)

In [81]:
def confusion_matrix_gen(num_pred, num_gold):
    num_pred = np.array(num_pred)
    num_gold = np.array(num_gold)

    soundness_conf = confusion_matrix(num_pred[:,0], num_gold[:,0], labels=[1,2,3,4,5])
    presentation_conf = confusion_matrix(num_pred[:,1], num_gold[:,1], labels=[1,2,3,4,5])
    contribution_conf = confusion_matrix(num_pred[:,2], num_gold[:,2], labels=[1,2,3,4,5])
    rating_conf = confusion_matrix(num_pred[:,3], num_gold[:,3], labels=[1,2,3,4,5,6,7,8,9,10])
    confidence_conf = confusion_matrix(num_pred[:,4], num_gold[:,4], labels=[1,2,3,4,5])

    return(soundness_conf, presentation_conf, contribution_conf, rating_conf, confidence_conf)

In [82]:
predictions = {k: predictions[k] for k in ('aAcOaJYbUg', 'dgR6i4TSng', 'UatDdAlr2x')}
predictions

{'aAcOaJYbUg': {'Soundness': 3,
  'Presentation': 3,
  'Contribution': 4,
  'Rating': 6,
  'Confidence': 3,
  'Strengths': '1. Clear structure and presentation: The paper is well-organized, with a logical structure and smooth flow that improves readability and comprehension.\n2. Insightful motivation for OOD benchmarking: This dataset addresses the critical issue of data leakage in current benchmarks. This approach highlights the limitations of evaluating current models trained on web-scale data.\n3. Interesting distortion design: The authors designed six distinct distortion types that minimize overlap with common real-world corruptions, reducing the likelihood of data leakage. This approach improves the dataset’s utility as a truly challenging OOD benchmark.',
  'Weaknesses': "1. Limited novel insights in model ranking comparisons: Figure 4 shows minimal divergence in the ranking of model performance between ImageNet-C and LAION-C. Similarly, Figure 3 demonstrates a strong linear corr

In [83]:
#def evaluate_review(predictions, gold_standard):
results = {}
numerical_scores = ["Soundness", "Presentation", "Contribution", "Rating", "Confidence"]
text_fields = ["Strengths", "Weaknesses", "Questions"]

num_pred = []
num_gold = []
for paper_id, response in predictions.items():
    gold = gold_standard[paper_id]
    results[paper_id] = {}

    num_pred = num_pred + [[response[category] for category in numerical_scores]]
    num_gold = num_gold + [[gold[category] for category in numerical_scores]]

    results[paper_id]["MAE"] = mean_absolute_error(num_gold, num_pred)
    results[paper_id]["MSE"] = mean_squared_error(num_gold, num_pred)

    text_results = {}
    #for text in text_fields:
    P, R, F1 = score([response[text_fields[0]], response[text_fields[1]], response[text_fields[2]]], 
                     [gold[text_fields[0]], gold[text_fields[1]], gold[text_fields[2]]], lang="en", rescale_with_baseline=True)
    text_results[text_fields[0] + "_BERTScore"] = F1[0]
    text_results[text_fields[1] + "_BERTScore"] = F1[1]
    text_results[text_fields[2] + "_BERTScore"] = F1[2]

    embeddings = model.encode([response[text_fields[0]], gold[text_fields[0]], response[text_fields[1]], gold[text_fields[1]], response[text_fields[1]], gold[text_fields[2]]])
    text_results[text_fields[0] + "_Embedding"] = (embeddings[0], embeddings[1])
    text_results[text_fields[1] + "_Embedding"] = (embeddings[2], embeddings[3])
    text_results[text_fields[2] + "_Embedding"] = (embeddings[4], embeddings[5])

    results[paper_id]['Text Comparisons'] = text_results
    
matrices = confusion_matrix_gen(num_pred, num_gold)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
model.encode([predictions['aAcOaJYbUg']['Strengths'], gold_standard['aAcOaJYbUg']['Strengths']]).shape

(2, 384)

In [8]:
score([predictions['aAcOaJYbUg']['Explanation']], [gold_standard['aAcOaJYbUg']['Explanation']], lang='en', device='mps')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(tensor([1.0000]), tensor([1.0000]), tensor([1.0000]))