In [None]:
import os
import ast
import json
import nltk
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from matplotlib.ticker import MaxNLocator
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from statsmodels.stats.inter_rater import fleiss_kappa
from utils import *

In [None]:
# Short display labels for the coarse aspect categories.
# The bar-plot cells below look aspects up here to get compact x-axis tick labels.
aspects_rename = {
    'Ablation': 'Ablation',
    'Analysis': 'Analysis',
    'Comparison': 'Compar.',
    'Contribution': 'Contribution',
    'Data/Task': 'Data/Task',
    'Definition/Description/Detail/Discussion/Explanation/Interpretation': 'DDDDEI',
    'Evaluation': 'Eval.',
    'Experiment': 'Experi.',
    'Intuition/Justification/Motivation/Validation': 'IJMV',
    'Methodology': 'Method.',
    'Novelty': 'Novelty',
    'Presentation': 'Present.',
    'Related Work': 'Related Work',
    'Result': 'Result',
    'Significance': 'Significance',
    'Theory': 'Theory'
}

# Display titles for the submission tracks, used as subplot titles in the
# frequency plots. '\n' wraps long names over several lines and 'and' is
# shortened to '& ' (always followed by a space).
tracks_rename = {
    'Question Answering': 'Question Answering',
    # Fixed: the label previously read '&the', missing the space used everywhere else.
    'Theme Track: Large Language Models and the Future of NLP': 'Theme Track: LLMs\n& the Future of NLP',
    'Information Extraction': 'Information Extraction',
    'Resources and Evaluation': 'Resources & Evaluation',
    'Dialogue and Interactive Systems': 'Dialogue & Interactive\nSystems',
    'Machine Translation': 'Machine Translation',
    'Multilinguality and Linguistic Diversity': 'Multilinguality\n& Linguistic Diversity',
    'Linguistic Theories, Cognitive Modeling, and Psycholinguistics': 'Linguistic Theories,\nCognitive Modeling,\n& Psycholinguistics',
    'NLP Applications': 'NLP Applications',
    'Sentiment Analysis, Stylistic Analysis, and Argument Mining': 'Sentiment Analysis,\nStylistic Analysis,\n& Argument Mining',
    'Language Modeling and Analysis of Language Models': 'Language Modeling\n& Analysis of\nLanguage Models',
    'Computational Social Science and Cultural Analytics': 'Computational Social Science\n& Cultural Analytics',
    'Syntax, Parsing and their Applications': 'Syntax, Parsing\n& their Applications',
    'Interpretability, Interactivity, and Analysis of Models for NLP': 'Interpretability, Interactivity,\n& Analysis of Models for NLP',
    'Summarization': 'Summarization',
    'Speech and Multimodality': 'Speech & Multimodality',
    'Discourse and Pragmatics': 'Discourse & Pragmatics',
    'Natural Language Generation': 'Natural Language Generation',
    'Machine Learning for NLP': 'ML for NLP',
    'Human-Centered NLP': 'Human-Centered NLP',
    'Ethics in NLP': 'Ethics in NLP',
    'Phonology, Morphology, and Word Segmentation': 'Phonology, Morphology,\n& Word Segmentation',
    'Efficient Methods for NLP': 'Efficient Methods for NLP',
    'Semantics: Lexical, Sentence level, Document Level, Textual Inference, etc.': 'Semantics: Lexical,\nSentence level,\nDocument Level,\nTextual Inference, etc.',
    'Information Retrieval and Text Mining': 'Information Retrieval\n& Text Mining',
    'Commonsense Reasoning': 'Commonsense Reasoning',
    'Language Grounding to Vision, Robotics and Beyond': 'Language Grounding to Vision,\nRobotics & Beyond'
}

## validity check

This corresponds to **Section 3.4 Validity check** in the paper.

### LLM annotation consistency

In [None]:
# Table of LLM annotations; the next cell reads its columns annotation_1 … annotation_4.
annotation = pd.read_csv('annotation - llm.csv')

In [None]:
# Every entry is stringified up front because the comparison cells below call
# string methods (.lower(), .replace(), .split()) on each annotation.
annotation_i_t0 = list(map(str, annotation['annotation_1'].tolist()))
annotation_i_t1 = list(map(str, annotation['annotation_2'].tolist()))
annotation_i2_t0 = list(map(str, annotation['annotation_3'].tolist()))
annotation_i2_t1 = list(map(str, annotation['annotation_4'].tolist()))

In [None]:
# Pair of annotation runs compared by every consistency metric below.
ref, cand = annotation_i_t0, annotation_i_t1

#### exact match

In [None]:
def _label_key(text):
    """Lower-case a ', '-separated annotation and return its parts in sorted order."""
    return sorted(text.lower().split(', '))


# Number of rows whose two annotations carry the same labels, ignoring case and order.
em = sum(_label_key(ref[row]) == _label_key(cand[row]) for row in range(len(ref)))
em/len(annotation)

#### BERTScore

In [None]:
with open('results/bert_score_similarity_llm_annotation_1&2.json') as file:
    results = json.load(file)
f1 = [entry['f1_score'] for entry in results]

# Inclusive lower bounds of the F1 buckets, highest first; a score lands in the
# first bucket whose bound it reaches.
bucket_floors = [
    (0.99, '>=0.99'),
    (0.98, '[0.98,0.99)'),
    (0.97, '[0.97,0.98)'),
    (0.96, '[0.96,0.97)'),
    (0.95, '[0.95,0.96)'),
    (0.90, '[0.90,0.95)'),
]

count = Counter()
for score in f1:
    for floor, label in bucket_floors:
        if score >= floor:
            count[label] += 1
            break
    else:
        # Explicit test so a value that fails every comparison (e.g. NaN) stays uncounted.
        if score < 0.90:
            count['<0.90'] += 1

# Share of counted pairs with an F1 of at least 0.90.
1 - (count['<0.90'] / sum(count.values()))

#### aspect consistency

In [None]:
def match_aspect(annotation):
    """Map each free-text annotation to its set of coarse categories.

    Every annotation string is split on ', ' (after turning ' and ' into ', '),
    passed through ``merge_synonyms`` (imported helper; its exact behaviour is
    not visible here) and each resulting label is looked up in the global
    ``aspect_to_category`` mapping. Labels without a mapping contribute the
    placeholder '-'; the placeholder is dropped again whenever at least one
    real category was found.

    Args:
        annotation: sequence of annotation strings.

    Returns:
        A list with one set of category names (or ``{'-'}``) per annotation.
    """
    matched = []
    for raw in annotation:
        labels = merge_synonyms(raw.replace(' and ', ', ').split(', '))
        categories = set()
        for label in labels:
            if label not in aspect_to_category:
                categories.add('-')
                continue
            categories.update(aspect_to_category[label])
        # Keep '-' only when it is the sole entry.
        if '-' in categories and len(categories) != 1:
            categories.discard('-')
        matched.append(categories)
    return matched

In [None]:
# Build the lookup "LLM annotation label -> set of coarse categories".
category_to_aspect = pd.read_csv('aspects - coarse.csv')
aspect_to_category = defaultdict(set)
# Materialise each column once; the previous loop rebuilt both lists on every
# iteration, which made this step quadratic in the number of rows.
llm_labels = category_to_aspect['LLM annotation'].to_list()
coarse_labels = category_to_aspect['COARSE'].to_list()
for llm_label, coarse_label in zip(llm_labels, coarse_labels):
    aspect_to_category[llm_label].add(coarse_label)

ref_aspects, cand_aspects = match_aspect(ref), match_aspect(cand)

# Exact match: both runs map to exactly the same set of coarse categories.
em = 0
for i in range(len(ref)):
    if ref_aspects[i] == cand_aspects[i]:
        em += 1
print('exact match:', em/len(annotation))

# Mean Jaccard similarity between the two category sets of each row.
scores = [jaccard_similarity(ref_aspects[i], cand_aspects[i]) for i in range(len(ref))]
print('jaccard:', sum(scores)/len(scores))

### human annotation consistency

In [None]:
# One DataFrame per annotator file, keyed by file name.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the iteration order (and everything derived from "the first file") reproducible.
# A plain dict replaces the factory-less defaultdict(), which behaved like one anyway.
human_annotations = {
    filename: pd.read_csv(os.path.join('human annotations/', filename))
    for filename in sorted(os.listdir('human annotations/'))
    if 'annotation' in filename
}

In [None]:
# Only the first annotation file is read here: the loop stops after one pass.
for filename, df in human_annotations.items():
    review = df['review'].to_list()
    # Strip the question boilerplate so only the asked-about aspect remains.
    question = []
    for text in df['question'].to_list():
        question.append(text.replace('Does the review address ', '').replace('?', ''))
    break

In [None]:
# Per annotator file: sum of the 'yes' column divided by the number of rows.
percentage_yes = defaultdict()
percentage_yes.update(
    (name, frame['yes'].sum() / len(frame))
    for name, frame in human_annotations.items()
)

In [None]:
# Show the per-file 'yes' rate.
percentage_yes

In [None]:
# Unweighted mean of the per-file 'yes' rates.
sum(percentage_yes.values()) / len(percentage_yes)

In [None]:
# Tally, for every row index, how many annotator files answered yes and how many
# did not. Adding 0 on the other side makes sure both counters hold every index.
matrix = defaultdict(Counter)
for filename, df in human_annotations.items():
    for row, answer in enumerate(df['yes'].to_list()):
        voted_yes = answer == True
        matrix['yes'][row] += int(voted_yes)
        matrix['no'][row] += int(not voted_yes)

In [None]:
# Turn the nested 'yes'/'no' counters into a DataFrame; the cells below pass its
# .values to fleiss_kappa.
matrix = pd.DataFrame(matrix)

In [None]:
# Fleiss' kappa on growing prefixes of the vote table, in steps of n rows.
n = 200
x_ticks = list(range(n, len(matrix), n))
fleiss = [fleiss_kappa(matrix[:cutoff].values, method='fleiss') for cutoff in x_ticks]

In [None]:
plt.rcParams['font.size'] = 12
fig, ax = plt.subplots(figsize=(7, 3))
sns.lineplot(data=fleiss, marker='o', ax=ax)
ax.set_xticks(list(range(len(x_ticks))))
# Blank out every label that is a multiple of 400 to keep the axis readable.
ax.set_xticklabels(['' if tick % 400 == 0 else str(tick) for tick in x_ticks])
ax.set_ylim(0, 0.8)
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8])

# Freeze the x-limits so the shaded bands below do not rescale the axis.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)

# (lower bound, upper bound, shading alpha, label height, label) per agreement band.
agreement_bands = [
    (0.00, 0.20, 0.2, 0.1, 'slight'),
    (0.20, 0.40, 0.4, 0.3, 'fair'),
    (0.40, 0.60, 0.6, 0.5, 'moderate'),
    (0.60, 0.80, 0.8, 0.7, 'substantial'),
]

for lower, upper, shade, _, _ in agreement_bands:
    ax.fill_between([xmin, xmax], lower, upper, color='lightblue', edgecolor='lightblue', alpha=shade)

for _, _, _, height, label in agreement_bands:
    ax.text(x=14, y=height, s=label, ha='right', va='center', color='black', fontstyle='italic')

ax.set_xlabel('entry')
ax.set_ylabel('fleiss\' kappa')

plt.tight_layout()
plt.savefig('plots/fleiss_kappa.png', format='png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Fleiss' kappa over the full vote table (all rows).
fleiss_kappa(matrix.values, method='fleiss')

## aspect analysis

This corresponds to **Section 5.1 Aspect analysis** in the paper.

In [None]:
# Pick the inference run that labelled whole EMNLP'23 reviews with coarse aspects.
config = pd.read_csv('config_inference.txt', sep='\t')
run_id = config['run_id'][(config['source'] == 'emnlp23_all') & (config['type_of_labels'] == 'coarse')].to_list()[0]

with open('data/emnlp23.json') as file:
    data = json.loads(file.read())

# Restore the full name of the track that is stored as 'Semantics: Lexical'.
for paper_id in data:
    if data[paper_id]['Submission_Track'] == 'Semantics: Lexical':
        data[paper_id]['Submission_Track'] = 'Semantics: Lexical, Sentence level, Document Level, Textual Inference, etc.'

with open(f'results/inference-{run_id}.json') as file:
    results = json.loads(file.read())

# Per track / per excitement score: number of reviews, and number of reviews
# that mention each aspect at least once.
number_of_reviews_by_track, number_of_reviews_by_score, counter_aspects_by_track, counter_aspects_by_score = Counter(), Counter(), defaultdict(Counter), defaultdict(Counter)
for paper_id in results:
    track = data[paper_id]['Submission_Track']
    for reviewer_id, items in results[paper_id].items():
        score = data[paper_id]['Reviews'][reviewer_id]['Excitement'].split(': ')[0]
        number_of_reviews_by_track[track] += 1
        number_of_reviews_by_score[score] += 1

        # Fixed: the aspect set is rebuilt for every review. It used to be created
        # once per paper, so each later review of a paper was also credited with
        # the aspects of the earlier ones, inflating counts that are divided by
        # the number of reviews. NOTE(review): this changes the reported
        # frequencies compared with earlier runs of this notebook.
        aspects = set()
        for item in items.values():
            aspects.update(item)
        aspects.discard('-')

        counter_aspects_by_track[track].update(aspects)
        counter_aspects_by_score[score].update(aspects)

In [None]:
target_tracks = ['Machine Translation', 'Multilinguality and Linguistic Diversity', 'Question Answering', 'Resources and Evaluation']

# One record per (track, aspect) for the five most frequent aspects of each track,
# expressed as a percentage of that track's reviews.
data = [
    {
        'track': track_name,
        'aspect': aspects_rename[aspect_name],
        'percentage': hits / number_of_reviews_by_track[track_name] * 100,
    }
    for track_name in target_tracks
    for aspect_name, hits in counter_aspects_by_track[track_name].most_common(5)
]

df = pd.DataFrame(data)

fig, axes = plt.subplots(2, 2, figsize=(5.6, 5.8))
axes = axes.flatten()

plt.rcParams['font.size'] = 12

# One bar chart per track, all on the same 0-100 scale.
for panel, track_name in zip(axes, target_tracks):
    sns.barplot(data=df[df['track'] == track_name], x='aspect', y='percentage', ax=panel)
    panel.set_title(tracks_rename[track_name], fontsize=12.6, fontweight='bold')
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.set_yticks([0, 20, 40, 60, 80, 100])
    panel.set_ylim(0, 100)
    panel.tick_params(axis='x', rotation=45)

fig.supylabel('review (%)')
plt.tight_layout()
plt.savefig('plots/frequency-target_tracks.png', format='png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
target_tracks = list(counter_aspects_by_track.keys())
n_rows, n_cols = 6, 5

# One record per (track, aspect) for the five most frequent aspects of each track,
# expressed as a percentage of that track's reviews. Collected under its own name
# so the dataset loaded into `data` is not overwritten by a list.
records = []
for track in target_tracks:
    for aspect, count in counter_aspects_by_track[track].most_common(5):
        percentage = count / number_of_reviews_by_track[track] * 100
        records.append({'track': track, 'aspect': aspects_rename[aspect], 'percentage': percentage})

df = pd.DataFrame(records)

# Set the font size before the figure is created so it applies to all of its text.
plt.rcParams['font.size'] = 12

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 22))
axes = axes.flatten()

for position, (ax, track) in enumerate(zip(axes, target_tracks)):
    subset = df[df['track'] == track]
    sns.barplot(
        data=subset,
        x='aspect',
        y='percentage',
        ax=ax
    )
    ax.set_title(tracks_rename[track], fontsize=12.6, fontweight='bold')
    ax.set_xlabel('')
    # Label the y-axis on the first panel of every row only. This is derived from
    # the grid position instead of a hard-coded list of track names.
    ax.set_ylabel('review (%)' if position % n_cols == 0 else '')
    ax.set_yticks([0, 20, 40, 60, 80, 100])
    ax.set_ylim(0, 100)
    ax.tick_params(axis='x', rotation=45)

# Hide whatever panels are left over once every track has been drawn
# (previously hard-coded as axes 27-29).
for ax in axes[len(target_tracks):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('plots/frequency-all_tracks.png', format='png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
from sklearn.cluster import KMeans

In [None]:
def levenshtein_similarity(list1, list2):
    """Return the normalised Levenshtein similarity of two sequences.

    The edit distance (insertions, deletions and substitutions, each costing 1)
    is computed element-wise and scaled by the longer length:
    ``1 - distance / max(len(list1), len(list2))``.

    Args:
        list1: first sequence (anything supporting ``len`` and indexing).
        list2: second sequence.

    Returns:
        A float in [0, 1]; 1 means identical sequences. Two empty sequences are
        treated as identical and yield 1.0 (this case used to divide 0 by 0 and
        return NaN).
    """
    len1, len2 = len(list1), len(list2)
    max_length = max(len1, len2)
    if max_length == 0:
        return 1.0

    # dp[i][j] = edit distance between list1[:i] and list2[:j].
    dp = np.zeros((len1 + 1, len2 + 1), dtype=int)
    dp[:, 0] = np.arange(len1 + 1)
    dp[0, :] = np.arange(len2 + 1)

    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            if list1[i - 1] == list2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])

    edit_distance = dp[len1][len2]
    return 1 - edit_distance / max_length

In [None]:
# Ten most frequent aspects per track, with tracks in alphabetical order.
most_commons = defaultdict(list)
for track in sorted(list(counter_aspects_by_track.keys())):
    most_commons[track] = [_[0] for _ in counter_aspects_by_track[track].most_common(10)]

# Pairwise similarity of the ranked aspect lists; row/column i belongs to the
# i-th track of most_commons.
similarities = np.array([
    [levenshtein_similarity(most_commons[track], most_commons[other]) for other in most_commons]
    for track in most_commons
])

# Group similar tracks next to each other. Rows and columns were clustered by two
# identical KMeans fits (same data, same seed), so a single fit is reused for both.
kmeans_rows = KMeans(n_clusters=2, random_state=2266).fit(similarities)
kmeans_cols = kmeans_rows

row_order = np.argsort(kmeans_rows.labels_)
col_order = row_order

similarities = similarities[row_order, :][:, col_order]
n_tracks = len(similarities)

plt.rcParams['font.size'] = 12

fig, ax = plt.subplots(figsize=(8, 6))
ax = sns.heatmap(similarities, cmap='coolwarm', ax=ax)
# Label every second cell with its 1-based position; derived from the matrix
# size instead of a hard-coded track count of 27.
ticks = [position + 0.5 for position in range(0, n_tracks, 2)]
ticklabels = list(range(1, n_tracks + 1, 2))
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=0)
ax.set_xlabel('track', fontsize=14)
ax.invert_yaxis()
ax.set_yticks(ticks)
ax.set_yticklabels(ticklabels, rotation=0)
ax.set_ylabel('track', fontsize=14)

plt.savefig('plots/heatmap-track_similarity.png', format='png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Legend for the heatmap's numbered ticks.
# Fixed: row_order indexes the similarity matrix, whose rows follow the
# alphabetically sorted track names. Names were previously looked up in the
# unsorted key order, so the legend named the wrong tracks.
track_names = sorted(counter_aspects_by_track.keys())
for position, track_index in enumerate(row_order, start=1):
    print(f'({position})', track_names[track_index])

In [None]:
# Aspect counts per review score, computed separately for the strengths and the
# weaknesses sections of the reviews.
config = pd.read_csv('config_inference.txt', sep='\t')
counters, numbers = {}, {}
venue = 'emnlp23'
score_field = 'Excitement' if venue == 'emnlp23' else 'Rating'

# The dataset does not depend on the source, so it is loaded once, not per iteration.
with open(f'data/{venue}.json') as file:
    data = json.loads(file.read())

for source in [f'{venue}_strengths', f'{venue}_weaknesses']:
    run_id = config['run_id'][(config['source'] == source) & (config['type_of_labels'] == 'coarse')].to_list()[0]

    with open(f'results/inference-{run_id}.json') as file:
        results = json.loads(file.read())

    number_of_reviews_by_score, counter_aspects_by_score = Counter(), defaultdict(Counter)
    for paper_id in results:
        for reviewer_id, items in results[paper_id].items():
            score = data[paper_id]['Reviews'][reviewer_id][score_field].split(': ')[0]
            number_of_reviews_by_score[score] += 1

            # Fixed: the aspect set is rebuilt for every review. It used to be
            # created once per paper, so a later review inherited the aspects of
            # the earlier reviews of that paper and they were counted under the
            # wrong score. NOTE(review): this changes the resulting counts.
            aspects = set()
            for item in items.values():
                aspects.update(item)
            aspects.discard('-')

            counter_aspects_by_score[score].update(aspects)

    counters[source] = counter_aspects_by_score
    numbers[source] = number_of_reviews_by_score

In [None]:
target_aspects = ['Analysis', 'Definition/Description/Detail/Discussion/Explanation/Interpretation']

# Percentage of each track's reviews that mention each target aspect.
data = [
    {
        'track': track_name,
        'aspect': aspect_name,
        'percentage': round(track_counter.get(aspect_name, 0) / number_of_reviews_by_track[track_name] * 100, 2),
    }
    for aspect_name in target_aspects
    for track_name, track_counter in counter_aspects_by_track.items()
]

df = pd.DataFrame(data)

# One markdown table per aspect, tracks ordered from most to least frequent.
for aspect_name in target_aspects:
    subset = df[df['aspect'] == aspect_name].sort_values(by='percentage', ascending=False)
    print('|track|frequency (%)|')
    print('|--|--|')
    for track_name, frequency in zip(subset['track'].to_list(), subset['percentage'].to_list()):
        print(f'|{track_name}|{frequency}|')
    print('\n\n')

|track|frequency (%)|
|--|--|
|Computational Social Science and Cultural Analytics|69.23|
|Linguistic Theories, Cognitive Modeling, and Psycholinguistics|68.75|
|Commonsense Reasoning|62.62|
|Multilinguality and Linguistic Diversity|60.68|
|Machine Learning for NLP|60.0|
|Machine Translation|57.38|
|Discourse and Pragmatics|56.86|
|Phonology, Morphology, and Word Segmentation|56.67|
|Interpretability, Interactivity, and Analysis of Models for NLP|56.37|
|Sentiment Analysis, Stylistic Analysis, and Argument Mining|55.66|
|Theme Track: Large Language Models and the Future of NLP|55.56|
|Information Retrieval and Text Mining|55.5|
|NLP Applications|54.48|
|Summarization|54.44|
|Resources and Evaluation|54.35|
|Efficient Methods for NLP|53.07|
|Information Extraction|53.06|
|Syntax, Parsing and their Applications|52.94|
|Language Modeling and Analysis of Language Models|51.85|
|Dialogue and Interactive Systems|51.23|
|Speech and Multimodality|51.01|
|Ethics in NLP|50.96|
|Semantics: Lexical, Sentence level, Document Level, Textual Inference, etc.|48.15|
|Language Grounding to Vision, Robotics and Beyond|47.91|
|Human-Centered NLP|47.78|
|Question Answering|45.58|
|Natural Language Generation|42.34|



|track|frequency (%)|
|--|--|
|Ethics in NLP|65.38|
|Linguistic Theories, Cognitive Modeling, and Psycholinguistics|60.42|
|Interpretability, Interactivity, and Analysis of Models for NLP|59.46|
|NLP Applications|57.17|
|Information Extraction|56.85|
|Resources and Evaluation|56.09|
|Semantics: Lexical, Sentence level, Document Level, Textual Inference, etc.|56.02|
|Human-Centered NLP|54.44|
|Machine Learning for NLP|53.56|
|Information Retrieval and Text Mining|53.5|
|Speech and Multimodality|52.02|
|Sentiment Analysis, Stylistic Analysis, and Argument Mining|51.89|
|Language Modeling and Analysis of Language Models|51.85|
|Language Grounding to Vision, Robotics and Beyond|50.95|
|Natural Language Generation|50.9|
|Computational Social Science and Cultural Analytics|50.3|
|Theme Track: Large Language Models and the Future of NLP|49.9|
|Summarization|49.44|
|Multilinguality and Linguistic Diversity|49.03|
|Dialogue and Interactive Systems|48.77|
|Discourse and Pragmatics|47.06|
|Efficient Methods for NLP|46.93|
|Question Answering|45.94|
|Commonsense Reasoning|42.99|
|Machine Translation|39.34|
|Syntax, Parsing and their Applications|39.22|
|Phonology, Morphology, and Word Segmentation|33.33|