# Analysis of ICLR citation data

In [None]:
from pathlib import Path
import pandas as pd
import json
import numpy as np

In [None]:
data_dir = Path("/net/nfs.cirrascale/allennlp/davidw/proj/end-of-anonymity/data")

# ICLR submissions with their S2 IDs and publication dates.
df = pd.read_csv(data_dir / "baby_iclr_ids_and_dates.csv")

# Citations for all submitted ICLR papers, stored as one JSON record per line.
# The file is read inside a `with` block so the handle is closed as soon as
# parsing finishes. The encoding is explicit so the read does not depend on the
# machine's locale, and blank lines (e.g. a trailing newline at EOF) are skipped
# instead of crashing `json.loads`.
with open(data_dir / "baby_iclr_citations.jsonl", encoding="utf-8") as citations_file:
    citations = [json.loads(line) for line in citations_file if line.strip()]

Find out how many submissions we missed (i.e., those we couldn't find S2 IDs for).

NOTE: Nearly all of the submissions we can't get data on are rejected papers, since many of these were likely never posted to arXiv. We might need to account for this somehow.

In [None]:
# A submission counts as "missed" when its `s2_id` column holds -1.
missed = (df["s2_id"] == -1).sum()
print(f"{missed} / {len(df)} submissions missed.")

# Look at this for accepted vs. not.
# `regex=False` makes the match a plain substring test. `na=False` treats a
# missing decision as "no match"; without it the mask would contain NaN and
# boolean indexing would raise.
df_reject = df[df["full_decision"].str.contains("Reject", regex=False, na=False)]
df_accept = df[df["full_decision"].str.contains("Accept", regex=False, na=False)]

missed_reject = (df_reject["s2_id"] == -1).sum()
print(f"{missed_reject} / {len(df_reject)} rejected submissions missed.")

missed_accept = (df_accept["s2_id"] == -1).sum()
print(f"{missed_accept} / {len(df_accept)} accepted submissions missed.")

## Get citations within a year-long window for each paper

In [None]:
# Load the citation-count table from the project's results directory.
# NOTE(review): presumably one row per paper with a `cites_within_year`
# column, as the later cells use it -- confirm against the generating script.
result_dir = Path("/net/nfs.cirrascale/allennlp/davidw/proj/end-of-anonymity/results")
cite_counts = pd.read_csv(result_dir.joinpath("citations_within_year.csv"))

Look at the number of citations one year out, comparing papers that were accepted vs. rejected.
Accepted papers were cited more.

In [None]:
# Reduce each full decision string to its first space-separated word.
first_words = [decision.partition(" ")[0] for decision in cite_counts["full_decision"]]
cite_counts["short_decision"] = first_words

# Summary statistics of `cites_within_year` for each short decision.
(
    cite_counts
    .groupby("short_decision")["cites_within_year"]
    .describe()
)

Look at citations a year out, comparing papers that were arXiv'd vs. not.
It looks like arXiv'ing doesn't make a big difference, at least in the mean.

In [None]:
# Summary statistics of `cites_within_year`, split by the `arxiv_first` column.
(
    cite_counts
    .groupby("arxiv_first")["cites_within_year"]
    .describe()
)

Look at acceptance rate for papers that were arXiv'd vs. not.
Papers that were arXiv'd first are more likely to be accepted.

In [None]:
# Acceptance rate per `arxiv_first` group: the mean of a boolean
# "decision is Accept" indicator is the fraction of accepted papers.
# This uses the built-in groupby mean rather than `groupby(...).apply(lambda ...)`,
# which avoids the pandas >= 2.2 deprecation warning about `apply` operating on
# the grouping columns and skips a Python-level call per group.
(
    cite_counts["short_decision"]
    .eq("Accept")
    .groupby(cite_counts["arxiv_first"])
    .mean()
    .rename("accept_rate")
)