In [None]:
import copy
import json
import time
import torch
import random
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter, defaultdict
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from utils import *

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Label granularity for this run: 'coarse' or 'fine'. It selects both the CSV
# file that is read and the column that supplies the category names.
type_of_labels = 'coarse'
column_name = 'FINE' if type_of_labels == 'fine' else 'COARSE'

# Categories that are skipped when building the aspect -> category mapping,
# keyed by label granularity.
# NOTE(review): the filter below keeps only categories that are NOT listed
# here -- confirm that excluding these names is the intended behaviour.
excluded_categories = {
    'fine': {
        'Contribution', 'Definition', 'Description', 'Detail', 'Discussion',
        'Explanation', 'Interpretation', 'Intuition', 'Justification',
        'Motivation', 'Validation', 'Novelty', 'Clarity', 'Confusion',
        'Figure', 'Grammar', 'Notation', 'Presentation', 'Table',
        'Terminology', 'Typo', 'Related Work', 'Impact', 'Importance',
        'Significance',
    },
    'coarse': {
        'Contribution',
        'Definition/Description/Detail/Discussion/Explanation/Interpretation',
        'Intuition/Justification/Motivation/Validation',
        'Novelty', 'Presentation', 'Related Work', 'Significance',
    },
}

category_to_aspect = pd.read_csv(f'aspects - {type_of_labels}.csv')

# Maps each 'LLM annotation' value to the set of categories (taken from
# `column_name`) that survive the exclusion filter.
aspect_to_category = defaultdict(set)
excluded = excluded_categories.get(type_of_labels)
if excluded is not None:  # any other granularity leaves the mapping empty
    # Materialise each column once; calling .to_list() inside the loop would
    # rebuild the whole column on every row (quadratic work).
    annotation_column = category_to_aspect['LLM annotation'].to_list()
    category_column = category_to_aspect[column_name].to_list()
    for aspect, category in zip(annotation_column, category_column):
        if category not in excluded:
            aspect_to_category[aspect].add(category)

In [None]:
# Venues processed, in the order their papers are appended to `result`.
venues = ['iclr20', 'iclr21', 'iclr22', 'iclr23', 'iclr24']

# Per-venue metadata, indexed below as data[venue][paper_id][field].
data = defaultdict()
for venue in venues:
    with open(f'data/{venue}.json') as file:
        data[venue] = json.load(file)

annotation = pd.read_csv('annotation - llm.csv')

# Column-oriented table: every key holds one list with an entry per paper.
result = defaultdict(list)
for venue in venues:
    with open(f'preprocessed/preprocessed-{venue}.json') as file:
        preprocessed = json.load(file)
    for paper_id in preprocessed:
        with open(f'data/papers/{paper_id}.txt') as file:
            paper = file.read()

        # Gather all annotation rows belonging to this (venue, paper) pair.
        is_this_paper = (annotation['venue'] == venue) & (annotation['paper_id'] == paper_id)
        aspects = []
        for item in annotation.loc[is_this_paper, 'annotation_1'].tolist():
            # Treat ' and ' like a comma, then split into single aspects.
            # merge_synonyms is not defined in this notebook -- presumably it
            # comes from the star-import of `utils`.
            pieces = str(item).replace(' and ', ', ').split(', ')
            aspects.extend(merge_synonyms(pieces))

        metadata = data[venue][paper_id]
        record = {
            'venue': venue,
            'paper_id': paper_id,
            'abstract': metadata['Abstract'],
            'keywords': ', '.join(metadata['Keywords']),
            'title': metadata['Title'],
            # Keep only the text before the first '\nREFERENCES\n' marker.
            'paper': paper.partition('\nREFERENCES\n')[0],
            # De-duplicate aspects; the resulting order is unspecified.
            'aspects': list(set(aspects)),
        }
        for field, value in record.items():
            result[field].append(value)

In [None]:
# Model inputs are the paper titles; targets are the per-paper aspect lists.
texts, labels = result['title'], result['aspects']

In [None]:
# Every category reachable from any aspect, in a deterministic order.
# Iterating a bare set of strings depends on hash randomisation, so without
# the sort the one-hot column layout (and the row order of any report that
# uses `categories`) could change between kernel restarts. key=str keeps the
# sort well-defined even if a non-string value (e.g. NaN from an empty CSV
# cell) ended up among the categories.
categories = sorted(
    {category for mapped in aspect_to_category.values() for category in mapped},
    key=str,
)

# Category -> column position, replacing a linear categories.index() scan
# for every label that is set.
category_position = {category: position for position, category in enumerate(categories)}

# One multi-hot float vector per paper; entry j is 1 when at least one of the
# paper's aspects maps to categories[j].
labels_one_hot = []
for item in labels:
    output = np.zeros(len(categories))
    for aspect in item:
        # Membership test first: indexing the defaultdict with an unknown
        # aspect would silently insert an empty entry.
        if aspect in aspect_to_category:
            for category in aspect_to_category[aspect]:
                output[category_position[category]] = 1
    labels_one_hot.append(output)

In [None]:
# Bag-of-words features over the titles, capped at 5000 vocabulary entries.
vectorizer = CountVectorizer(max_features=5000)

# Learn the vocabulary from the training portion only. Fitting on every text
# would let the held-out rows influence which terms make it into the
# vocabulary (data leakage into the evaluation).
# Must stay in sync with the 90/10 positional split used for train/eval.
number_of_data = int(len(texts) * 0.9)
vectorizer.fit(texts[:number_of_data])

# Transform all rows so X still has one row per text, in the original order.
X = vectorizer.transform(texts)

In [None]:
# Positional 90/10 split: the leading rows train the model and the trailing
# rows are held out for evaluation. Nothing is shuffled here.
number_of_data = int(len(texts) * 0.9)
train_texts, eval_texts = X[:number_of_data], X[number_of_data:]
train_labels, eval_labels = labels_one_hot[:number_of_data], labels_one_hot[number_of_data:]

In [None]:
# Fixed seed so the forests are built reproducibly.
seed = 2266

# One-vs-rest wrapper around a 100-tree random forest: one forest is fitted
# per label column.
base_forest = RandomForestClassifier(random_state=seed, n_estimators=100)
classifier = OneVsRestClassifier(base_forest)
classifier.fit(train_texts, train_labels)

In [None]:
# Predict label indicators for the held-out rows.
predictions_one_hot = classifier.predict(eval_texts)

# Per-category report; zero_division=0 scores undefined metrics as 0.
report = classification_report(
    eval_labels,
    predictions_one_hot,
    target_names=categories,
    zero_division=0,
)
print(report)

# Weighted-average precision / recall / F1, printed to four decimals.
precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
    eval_labels,
    predictions_one_hot,
    average='weighted',
    zero_division=0,
)
print(*(round(score, 4) for score in (precision_weighted, recall_weighted, f1_weighted)))

In [None]:
# Decode each indicator row back into the list of category names whose
# entry equals 1, for the gold labels and for the predictions.
actuals = [
    [categories[position] for position, flag in enumerate(item) if flag == 1]
    for item in eval_labels
]
predictions = [
    [categories[position] for position, flag in enumerate(item) if flag == 1]
    for item in predictions_one_hot
]

# calculate_jaccard_similarity_for_lists is not defined in this notebook --
# presumably it comes from `utils` and yields one score per
# (actual, predicted) pair; verify against its definition.
similarity = calculate_jaccard_similarity_for_lists(actuals, predictions)

# Mean of the returned scores, rounded to four decimals (cell output).
round(sum(similarity) / len(similarity), 4)