# Analysis of ICLR citation data

In [3]:
from pathlib import Path
import pandas as pd
import json
import numpy as np
from collections import Counter

In [4]:
data_dir = Path("/net/nfs.cirrascale/allennlp/davidw/proj/end-of-anonymity/data")

# ICLR submissions with their S2 IDs and publication dates.
df = pd.read_csv(data_dir / "baby_iclr_ids_and_dates.csv")

# Keep the original labels in `full_decision_with_workshop`, then treat
# workshop-invited papers as rejections in `full_decision`.
df["full_decision_with_workshop"] = df["full_decision"].copy()
df["full_decision"] = df["full_decision"].replace("Invite to Workshop Track", "Reject")

# Citations for all submitted ICLR papers, one JSON object per line.
# The context manager closes the file handle as soon as parsing is done.
with open(data_dir / "baby_iclr_citations.jsonl") as citations_file:
    citations = [json.loads(line) for line in citations_file]

## Data checks

Look at the distribution of decisions. "Invite to Workshop Track" is an edge case (136 submissions); we could probably just throw these out, but for now the loading cell above counts them as "Reject".

In [5]:
# Decision counts with workshop invitations kept as their own category.
df["full_decision_with_workshop"].value_counts()

full_decision_with_workshop
Reject                      6483
Accept (Poster)             3063
Accept (Spotlight)           398
Accept (Oral)                169
Invite to Workshop Track     136
Accept (Talk)                 48
Name: count, dtype: int64

In [6]:
# Decision counts after workshop invitations were relabeled as "Reject".
df["full_decision"].value_counts()

full_decision
Reject                6619
Accept (Poster)       3063
Accept (Spotlight)     398
Accept (Oral)          169
Accept (Talk)           48
Name: count, dtype: int64

In [7]:
# Number of rows in `df`.
df.shape[0]

10297

Distribution of conference years.

In [8]:
# Rows per conference year, ordered by year rather than by count.
year_counts = df["year"].value_counts()
year_counts.sort_index()

year
2017     490
2018     910
2019    1419
2020    2213
2021    2595
2022    2670
Name: count, dtype: int64

In [9]:
# For each year, how many rows have each `arxiv_first` value.
df.groupby("year")["arxiv_first"].value_counts()

year  arxiv_first
2017  False           490
2018  False           889
      True             21
2019  False          1359
      True             60
2020  False          1675
      True            538
2021  False          2062
      True            533
2022  False          2229
      True            441
Name: count, dtype: int64

Find out how many submissions we missed (i.e. couldn't find S2 IDs for).

**NOTE**: Nearly all the submissions we can't get data on are for rejected papers, since many of these were likely never arXiv'd. We might need to account for this somehow.

- For papers that were accepted, we're missing 1.1%.
- For papers that were rejected, we're missing 19.4%. These were likely withdrawn and then never released publicly.

In [10]:
# Submissions without an S2 match are the rows where s2_id == -1.
missed = (df["s2_id"] == -1).sum()
print(f"{missed} / {len(df)} submissions missed.")

# Look at this for accepted vs. not. `full_decision` is used here, so
# workshop invitations (relabeled when the data was loaded) count as rejections.
df_reject = df[df["full_decision"].str.contains("Reject")]
df_accept = df[df["full_decision"].str.contains("Accept")]

# One report for both subsets instead of two copy-pasted stanzas; only the
# label in the printed message differs.
for label, subset in [("rejected", df_reject), ("accepted", df_accept)]:
    num_missed = (subset["s2_id"] == -1).sum()
    num_found = len(subset) - num_missed
    pct_found = round((num_found / len(subset)) * 100, 1)
    print(f"{num_found} / {len(subset)} ({pct_found}%) {label} submissions found.")

1325 / 10297 submissions missed.
5336 / 6619 (80.6%) rejected submissions found.
3636 / 3678 (98.9%) accepted submissions found.


Look at publication date coverage for papers where we got the S2 ID.

- Nearly all papers have a publication year.
- Roughly 10% of papers don't have an exact publication date.
- Missingness isn't dramatically different for accepted vs. rejected papers.
- **Conclusion**: It should be fine to just toss out the papers with no exact publication date.

In [11]:
# Restrict to submissions that were matched to an S2 record (s2_id != -1).
df_s2 = df[df["s2_id"] != -1]
num_papers = len(df_s2)
no_year = df_s2["publication_year"].isna().sum()
no_date = df_s2["publication_date"].isna().sum()

print(f"{no_year} / {num_papers} papers have no year.")

pct_missing = round((no_date / num_papers) * 100, 1)
print(f"{no_date} / {num_papers} ({pct_missing}%) papers have no exact date.")

# Look at whether missing dates are biased toward accepted or rejected papers.
df_s2_reject = df_s2[df_s2["full_decision"].str.contains("Reject")]
df_s2_accept = df_s2[df_s2["full_decision"].str.contains("Accept")]

# The loop uses its own names so that `num_papers` and `no_date` above still
# describe all of `df_s2` after this cell has run.
for name, this_df in [("reject", df_s2_reject), ("accept", df_s2_accept)]:
    group_size = len(this_df)
    group_no_date = this_df["publication_date"].isna().sum()
    group_has_date = group_size - group_no_date
    pct_found = round((group_has_date / group_size) * 100, 1)
    print(f"{group_has_date} / {group_size} ({pct_found}%) `{name}` papers have an exact date.")


9 / 8972 papers have no year.
884 / 8972 (9.9%) papers have no exact date.
4879 / 5336 (91.4%) `reject` papers have an exact date.
3209 / 3636 (88.3%) `accept` papers have an exact date.


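Check whether publication date missingness correlates with preprinting. It does to some extent: in the output below, 10.8% of papers that were not on arXiv first lack an exact date, versus 5.5% of papers that were.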

In [16]:
# Percentage of papers lacking an exact publication date, per `arxiv_first` value.
for arxiv_flag, subset in df_s2.groupby("arxiv_first"):
    missing_mask = subset["publication_date"].isna()
    pct_na = round((missing_mask.sum() / len(subset)) * 100, 1)
    print(f"When arXiv is {arxiv_flag}, {pct_na}% of papers have no exact date.")

When arXiv is False, 10.8% of papers have no exact date.
When arXiv is True, 5.5% of papers have no exact date.


## Checks on citing papers

Get info on the papers that cite each submission. Roughly 15% of citing papers are missing publication dates, but this is virtually identical between accepted vs. rejected papers. So, I think it's probably OK to just throw these out as well.

In [None]:
# Load the citation records (one JSON object per line); the context manager
# closes the file handle once parsing is done.
with open(data_dir / "baby_iclr_citations.jsonl") as citations_file:
    citations = [json.loads(line) for line in citations_file]

In [None]:
counts = {"Accept": Counter(), "Reject": Counter()}

# Build the s2_id -> full_decision lookup once, instead of scanning `df` with
# `query` for every citation entry. `keep="first"` mirrors the previous
# `.iloc[0]` choice if an s2_id appears on more than one row.
# NOTE(review): assumes s2_id values are integer-like in both `df` and the
# citation records - confirm against the data files.
first_rows = df.dropna(subset=["s2_id"]).drop_duplicates("s2_id", keep="first")
decision_by_s2_id = dict(
    zip(first_rows["s2_id"].astype(int).tolist(), first_rows["full_decision"].tolist())
)

for entry in citations:
    # Only entries whose status is "Success" are counted.
    if entry["status"] != "Success":
        continue

    s2_id = int(entry["s2_id"])
    if s2_id not in decision_by_s2_id:
        # Fail with a clear message rather than an out-of-bounds IndexError.
        raise KeyError(f"No row in df for s2_id {s2_id}")

    # "Accept (Poster)" -> "Accept"; anything other than Accept/Reject is skipped.
    decision = decision_by_s2_id[s2_id].split(" ")[0]
    if decision not in counts:
        continue

    # Tally each citing paper by whether it carries a publication date.
    for cite in entry["citations"]:
        if cite["citingPaper"]["publicationDate"] is None:
            counts[decision]["no_date"] += 1
        else:
            counts[decision]["has_date"] += 1

for name in ["Accept", "Reject"]:
    counts_loop = counts[name]
    no_date = counts_loop["no_date"]
    total = counts_loop["has_date"] + no_date
    if total == 0:
        # Nothing to report for this group; avoid dividing by zero.
        print(f"No citations found for `{name}` papers.")
        continue
    pct_missing = round((no_date / total) * 100, 1)
    msg = f"{no_date} / {total} ({pct_missing}%) `{name}` cited papers have no exact date."
    print(msg)


In [None]:
# Spot-check the decision for whatever `entry` is currently bound to (left over
# from the loop over `citations`), matching on both title and S2 ID. A boolean
# mask is used instead of an interpolated `query` string so that a title
# containing a quote character cannot break the expression.
# NOTE(review): assumes s2_id is integer-like in the citation records - confirm.
is_entry = (df["title"] == entry["title"]) & (df["s2_id"] == int(entry["s2_id"]))
df[is_entry].iloc[0]["full_decision"]

## Citation analysis

In [None]:
# Load the citations-within-a-year table from the results directory.
result_dir = Path("/net/nfs.cirrascale/allennlp/davidw/proj/end-of-anonymity/results")
cite_counts_path = result_dir / "citations_within_year.csv"
cite_counts = pd.read_csv(cite_counts_path)

Look at number of citations a year out, comparing papers that were accepted vs. rejected.
Papers that were accepted were cited more.

In [None]:
# Keep only the text before the first space of each decision label
# (e.g. "Accept (Poster)" -> "Accept").
cite_counts["short_decision"] = [
    decision.partition(" ")[0] for decision in cite_counts["full_decision"]
]

# Summary statistics of `cites_within_year` for each short decision.
cites_by_decision = cite_counts.groupby("short_decision")["cites_within_year"]
cites_by_decision.describe()

Look at citations a year out, comparing papers that were arXiv'd vs. not.
It looks like arXiv'ing doesn't make a big difference, at least in mean.

In [None]:
# Summary statistics of `cites_within_year` for each `arxiv_first` value.
cites_by_arxiv = cite_counts.groupby("arxiv_first")["cites_within_year"]
cites_by_arxiv.describe()

Look at acceptance rate for papers that were arXiv'd vs. not.
Papers that were arXiv'd first are more likely to be accepted.

In [None]:
# Acceptance rate per `arxiv_first` value: the mean of a boolean "accepted"
# column is the fraction of accepted papers in each group. A plain column
# aggregation replaces `groupby(...).apply(lambda ...)`, which recent pandas
# versions warn about because the lambda also receives the grouping column.
(
    cite_counts
    .assign(accepted=cite_counts["short_decision"] == "Accept")
    .groupby("arxiv_first")["accepted"]
    .mean()
)