# Data Preparation For Analyses on ICLR 23 Public Comments

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Data Description

The features that we use to represent each blind submission are summarized as follows:

  - `id_forum`: the id or forum (the two should be identical to each other for the same paper) of a paper, which is unique by paper.

  - `title`: the title of the paper.

  - `authorids`: a list of author_ids. This feature is reserved for cross-referencing author profiles.

  - `authors`: a list of authors. This feature is reserved for cross-referencing external sources such as arxiv, google scholar, etc.

  - `decision`: acceptance.

  - `scores`: a list of scores given by paper's reviewers.

  - `avg_score`: the average score.

  - `confidences`: the confidence level of each reviewer when assigning the score, also being a list.

  - `avg_confidence`: the average confidence level.

  - `pub_cmtids`: a list of public comment ids. If a paper does not have any public comment, this field would be an empty list.

  - `cmt_responses`: a binary/bool list, each element indicates authors' response to the corresponding public comment. *e.g.*, `[True,False,True]` means the authors have replied to the 1st and 3rd comment, but not replied to the 2nd comment. **'replied' means an official comment from authors is created, replied to the public comment, and the creation time stamp is prior to decision time.** We notice that the authors may reply to some comments by a "general response" or reply to the entire thread with a single response, therefore  we first collect the items of this column by a rule-based algorithm then proofread the collected items.

  - `cmts_sentiment`: a 5-level integer list indicating whether each comment has a positive/neutral/negative sentiment. This feature requires human-labeling. Please refer to Section 2.2 for more details.
    - -2: negative,
    - -1: neutral-negative,
    - 0: neutral,
    - 1: neutral-positive,
    - 2: positive.

  - `arxiv_availability`: a binary/bool list, each element indicates the availability of an arxiv-preprint (of the paper under review) at the time that the corresponding comment is created. *e.g.*, for a submission whose preprint was submitted to arxiv on 2022-11-16, and two comments were created on 2022-11-15 and 2022-11-17, then the `arxiv_availability` is `[False, True]`.

In this study, we focus on public comments that start a new thread on their own, and leave other types of comments (*e.g.*, voluntary responses to official reviews) to future studies.

## 2. Data Collection

### 2.1 Collecting unprocessed submission information

In [None]:
!pip install openreview-py

import openreview
import pandas as pd
import numpy as np
from tqdm import tqdm

class RepliesParser(object):
  '''
    Collects review / decision / public-comment features of ONE submission
    from the reply dictionaries attached to its forum.

    Use a fresh instance per submission: the lists in `submission_info`
    are appended to, so re-using an instance accumulates entries.
  '''
  def __init__(self):
    self.submission_info = {
        'decision': None,
        'scores': [],
        'avg_score': None,
        'confidences': [],
        'avg_confidence': None,
        'pub_cmtids': [],
        'cmt_responses': []
    }
    # time stamp of the decision reply; stays None when no decision reply is found
    self._decision_time = None


  def get_submission_info(self, replies, directReplies):
    '''
      replies: a list containing all replies under a forum (direct and indirect)
      directReplies: a list containing all direct replies to a forum (a paper)
      return: the filled `submission_info` dictionary

      The order of the three steps matters: the decision time must be known
      before comments are filtered, and the public comment ids must be known
      before the responses to them are looked up.
    '''
    self._parse(directReplies)
    self._parse_pub_cmts(directReplies)
    self._parse_response_to_pubCmts(replies)
    return self.submission_info


  def _is_before_decision(self, timestamp):
    '''
      timestamp: a time stamp taken from a reply
      return: True if the time stamp is strictly earlier than the decision time

      When no decision has been recorded there is no cutoff, so every time
      stamp counts as "before the decision" (comparing against None would
      otherwise raise a TypeError).
    '''
    return self._decision_time is None or timestamp < self._decision_time


  def _parse(self, directReplies):
    '''
      directReplies: a list containing all direct replies to a forum (a paper)
      return: self

      In this function, the following field(s) in submission_info will be updated:
        {'decision', 'scores', 'avg_score', 'confidences', 'avg_confidence'}
      self._decision_time will also be updated.
    '''
    for reply in directReplies:
      invitation = reply['invitation']

      # 1. official reviews: record score (rating) and confidence
      if invitation.endswith('Official_Review'):
        content = reply['content']
        # both fields look like '<number>: <text>'; keep the number before the first ':'
        self.submission_info['scores'].append(float(content['recommendation'].split(':')[0]))
        self.submission_info['confidences'].append(float(content['confidence'].split(':')[0]))

      # 2. decision: record decision and the time that the final decision is made
      if invitation.endswith('Decision'):
        self.submission_info['decision'] = reply['content']['decision']
        # NOTE(review): 'tmdate' is used as the decision time while comments are
        # filtered by 'cdate' / 'tcdate' -- confirm these fields are comparable.
        self._decision_time = reply['tmdate']

    # update average score (left as None when there is no review):
    if self.submission_info['scores']:
      self.submission_info['avg_score'] = np.mean(self.submission_info['scores'])
    # update average confidence (left as None when there is no review):
    if self.submission_info['confidences']:
      self.submission_info['avg_confidence'] = np.mean(self.submission_info['confidences'])
    return self


  def _parse_pub_cmts(self, directReplies):
    '''
      directReplies: a list containing all direct replies to a forum (a paper)
      return: self

      In this function, the following field(s) in submission_info will be updated:
        {'pub_cmtids'}
    '''
    for reply in directReplies:
      if not reply['invitation'].endswith('Public_Comment'):
        continue
      # only record public comments created before final decision
      # ('cdate' is the comment creation date time)
      if self._is_before_decision(reply['cdate']):
        self.submission_info['pub_cmtids'].append(reply['id'])
    return self


  def _parse_response_to_pubCmts(self, replies):
    '''
      replies: a list containing all replies under a forum (including direct replies, which are directly posted under the forum, or "indirect replies", which are posted under other replies)
      return: self

      In this function, the following field(s) in submission_info will be updated:
        {'cmt_responses'}
    '''
    # ids of the notes that received an official comment before the decision;
    # a set keeps the membership test below cheap
    # NOTE(review): the signatures of the official comment are not inspected here,
    # so any official comment counts as a response -- confirm this is intended.
    repliedTos = {
        reply['replyto']
        for reply in replies
        if reply['invitation'].endswith('Official_Comment')
        and self._is_before_decision(reply['tcdate'])
    }

    self.submission_info['cmt_responses'] = [(cmt in repliedTos) for cmt in self.submission_info['pub_cmtids']]
    return self

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openreview-py
  Downloading openreview_py-1.19.5-py2.py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.8/480.8 KB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodome
  Downloading pycryptodome-3.17-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Deprecated
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting pyjwt
  Downloading PyJWT-2.6.0-py3-none-any.whl (20 kB)
Collecting pylatexenc
  Downloading pylatexenc-2.10.tar.gz (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 KB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setuptools==65.5.1
  Downloading

There are some 'outlier' papers that still haven't received the final decision.

In [None]:
# stream every ICLR 2023 blind submission together with its direct replies
myclient = openreview.Client(baseurl='https://api.openreview.net')
submissions_directReplies = openreview.tools.iterget_notes(
    myclient,
    invitation='ICLR.cc/2023/Conference/-/Blind_Submission',
    details='directReplies',
)

# ids of the submissions for which no (non-empty) decision could be found
odd_ids = []

for sd in submissions_directReplies:
  title = sd.content['title']
  # decisions carried by the direct replies, in the order they are listed
  decisions = [
      reply['content']['decision']
      for reply in sd.details['directReplies']
      if reply['invitation'].endswith('Decision')
  ]
  counter = len(decisions)
  # when several decision replies exist, the last one listed is the one kept
  decision = decisions[-1] if decisions else None
  if decision:
    continue
  print(f'for paper: {title}, we count {counter} decisions')
  odd_ids.append(sd.id)


In [None]:
# collect all the features of each submission, and save the collected features into a DataFrame

myclient = openreview.Client(baseurl='https://api.openreview.net')
# the same invitation is streamed twice: once with the full reply tree,
# once with only the replies posted directly under the forum
submissions_replies = openreview.tools.iterget_notes(
    myclient,
    invitation='ICLR.cc/2023/Conference/-/Blind_Submission',
    details='replies',
)
submissions_directReplies = openreview.tools.iterget_notes(
    myclient,
    invitation='ICLR.cc/2023/Conference/-/Blind_Submission',
    details='directReplies',
)

# one single-row DataFrame per kept submission
dfs = []

for sr, sd in tqdm(zip(submissions_replies, submissions_directReplies)):
  # the two streams are walked in lockstep, so they must yield the same submission
  assert sr.id == sd.id, 'two different submissions are retrieved!'

  # only keep the papers that already got a decision
  if sd.id not in odd_ids:
    row = {
        'id_forum': [sd.id],
        'title': [sd.content['title']],
        'authorids': [sd.content['authorids']],
        'authors': [sd.content['authors']]
    }

    # a fresh parser per submission, since the parser accumulates into its own lists
    parsed = RepliesParser().get_submission_info(sr.details['replies'], sd.details['directReplies'])
    # wrap every value in a list so that list-valued features end up in ONE cell
    row.update({key: [value] for key, value in parsed.items()})

    dfs.append(pd.DataFrame.from_dict(row))

df = pd.concat(dfs, ignore_index=True)

3842it [00:26, 144.10it/s]


In [None]:
df.head()

Unnamed: 0,id_forum,title,authorids,authors,decision,scores,avg_score,confidences,avg_confidence,pub_cmtids,cmt_responses
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,"[~Julius_Adebayo1, ~Melissa_Hall1, ~Bowen_Yu2,...","[Julius Adebayo, Melissa Hall, Bowen Yu, Bobbi...",Accept: poster,"[8.0, 6.0, 5.0]",6.333333,"[3.0, 3.0, 4.0]",3.333333,[],[]
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"[~Chengyuan_Zhuang1, ~Xiaohui_Yuan1, ~XUAN_GUO2]","[Chengyuan Zhuang, Xiaohui Yuan, XUAN GUO]",Reject,"[1.0, 3.0, 6.0, 3.0]",3.25,"[5.0, 5.0, 5.0, 5.0]",5.0,[],[]
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"[~Alasdair_Tran1, almath123@gmail.com, ~Lexing...","[Alasdair Tran, Alexander Mathews, Lexing Xie,...",Accept: poster,"[6.0, 8.0, 5.0, 6.0, 8.0]",6.6,"[3.0, 2.0, 4.0, 4.0, 5.0]",3.6,[],[]
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"[~Tanay_Narshana1, ~Chaitanya_Murti1, ~Chiranj...","[Tanay Narshana, Chaitanya Murti, Chiranjib Bh...",Accept: poster,"[6.0, 6.0, 8.0]",6.666667,"[3.0, 2.0, 3.0]",2.666667,[],[]
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"[~Chaitanya_Murti1, ~Tanay_Narshana1, ~Chiranj...","[Chaitanya Murti, Tanay Narshana, Chiranjib Bh...",Accept: poster,"[3.0, 8.0, 6.0, 8.0]",6.25,"[4.0, 4.0, 3.0, 3.0]",3.5,[],[]


We save the dataframe to google drive.

In [None]:
df.to_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv')

For the convenience of further analyses on each comment, we save the content of all public comments to a separate .csv file. We are specifically interested in the following aspects of a comment:

  - `paper_id`: the id of the paper with such a comment.
  - `paper_title`: the title of the corresponding paper, we use this information to search for the paper on web.
  - `cmt_id`: the id of the comment, we use this information to annotate the sentiment of each comment.
  - `cmt_title`: the title of the comment.
  - `cmt_signatures`: this information can be used for cross-referencing commenters' profiles.
  - `cmt_content`: the content of the comment.

<font color='red'>
CAVEAT: the lists in the original dataframe cells will be converted to strings in the csv file. When importing the csv file into a new dataframe, we need to parse these columns before treating their values as lists.
</font>


In [None]:
from google.colab import drive
drive.mount('drive')

import pandas as pd


import ast

# establish a sub dataframe to store papers with at least one public comment
df = pd.read_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv')
# lists were written to the csv as strings, so an empty list reads back as the literal '[]';
# .copy() makes the filtered frame independent before a column is re-assigned below
df = df.loc[df['pub_cmtids'] != '[]', ['id_forum', 'title', 'pub_cmtids']].copy()

# process the column 'pub_cmtids': turn the string form of each list back into a real list
# (literal_eval only evaluates Python literals, it does not execute arbitrary code)
df['pub_cmtids'] = df['pub_cmtids'].apply(ast.literal_eval)
df.head()


Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


Unnamed: 0,id_forum,title,pub_cmtids
7,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,[RSHAXYU0R1]
25,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,[MwBZ9qaqVj]
35,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,"[f_UYiyBMbbQ, 1zQPZtBK4X]"
94,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,"[eDYga8Xhm7, SFaE6o03E1x, iKfplf6OmL]"
156,pRCMXcfdihq,Protein Sequence and Structure Co-Design with ...,[uzB2zzYJBT]


In [None]:
# execute the following line when starting from this cell for the first time
# !pip install openreview-py
import openreview
from tqdm import tqdm

myclient = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(myclient, invitation='ICLR.cc/2023/Conference/-/Blind_Submission', details='directReplies')

# paper id -> list of public comment ids recorded for that paper
# (each cell of df['pub_cmtids'] is itself a list of ids)
cmtids_by_paper = dict(zip(df['id_forum'], df['pub_cmtids']))

# one single-row DataFrame per public comment
dfs = []

for submission in tqdm(submissions):
  # only the papers with at least one recorded public comment are of interest
  if submission.id not in cmtids_by_paper:
    continue

  # create a dictionary to store all public comments on this submission, keyed by comment id
  pubCmts = {
      reply['id']: reply
      for reply in submission.details['directReplies']
      if reply['invitation'].endswith('Public_Comment')
  }

  # iterate over the ids INSIDE the cell: taking the column with .tolist() would
  # yield a list that contains the list of ids, not the ids themselves
  for pub_cmtid in cmtids_by_paper[submission.id]:
    cmt = pubCmts[pub_cmtid]
    dfs.append(pd.DataFrame.from_dict({
        'paper_id': [submission.id],
        'paper_title': [submission.content['title']],
        'cmt_id': [pub_cmtid],
        'cmt_title': [cmt['content']['title']],
        'cmt_signatures': [cmt['signatures']],
        'cmt_content': [cmt['content']['comment']],
    }))

df_pubCmts = pd.concat(dfs, ignore_index=True)
df_pubCmts.head()

3849it [00:09, 398.65it/s]


Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content
0,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,RSHAXYU0R1,Probably missing citations and some questions,[~Kaiwen_Zheng2],I appreciate the author's idea of using score ...
1,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,MwBZ9qaqVj,Attribution of library used for experiments,[~Benedek_Andras_Rozemberczki1],It is reasonable to assume that the paper uses...
2,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,f_UYiyBMbbQ,Misattribution of datasets,[~Benedek_Andras_Rozemberczki1],The paper misattributes the Chameleons and Squ...
3,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,1zQPZtBK4X,Relevant Work,[~Sitao_Luan1],Thank the authors for having this interesting ...
4,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,eDYga8Xhm7,Unable to reproduce the experiments,[~Tianping_Zhang1],Thanks for your interesting work. We are attra...


In [None]:
len(df_pubCmts)

96

In [None]:
df_pubCmts.to_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023_pubCmts.csv')

### 2.2 Human annotation for comment sentiment

In [None]:
from google.colab import drive
drive.mount('drive')

import pandas as pd
import numpy as np

df_pubCmts = pd.read_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023_pubCmts_v2.csv', index_col=0)
df_pubCmts.head()

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content,cmt_category,cmt_sentiment
0,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,RSHAXYU0R1,Probably missing citations and some questions,['~Kaiwen_Zheng2'],I appreciate the author's idea of using score ...,questionable contribution,-1
1,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,MwBZ9qaqVj,Attribution of library used for experiments,['~Benedek_Andras_Rozemberczki1'],It is reasonable to assume that the paper uses...,missing or wrong reference,0
2,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,f_UYiyBMbbQ,Misattribution of datasets,['~Benedek_Andras_Rozemberczki1'],The paper misattributes the Chameleons and Squ...,missing or wrong reference,0
3,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,1zQPZtBK4X,Relevant Work,['~Sitao_Luan1'],Thank the authors for having this interesting ...,missing or wrong reference,0
4,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,eDYga8Xhm7,Unable to reproduce the experiments,['~Tianping_Zhang1'],Thanks for your interesting work. We are attra...,reproducibility issue,-1


In [None]:
df_pubCmts['cmt_category'].unique()

array(['questionable contribution', 'missing or wrong reference',
       'reproducibility issue', 'na', 'problematic empirical evaluation',
       'plagiarism', 'inaccurate description', 'crucial incorrectness',
       'compliment', 'general question'], dtype=object)

We label each comment by one of the following categories:

| Comment category | Sentiment score (5-scale) | Description |
| ----------- | :-----------: | :-----------: |
| 'compliment'                |  2            | commendation to the submission |
| 'general question'          |    0          | neutral questions, not implying potential errors of the paper |
| 'missing or wrong reference'|    0          | asking authors to add / correct citations or discussions on potentially related work |
| 'questionable contribution' |      -1       | casting doubts on the significance or novelty of the paper |
| 'reproducibility issue'     |      -1       | the supplementary material potentially misses crucial files or cannot reproduce the results reported in the paper |
| 'inaccurate description'    |      -1       | some misrepresentation of related work or any other less important conclusions  |
| 'problematic empirical evaluation' |   -2   | the empirical evaluation protocol is potentially unfair or problematic, hence the validity of some crucial conclusions might be damaged |
| 'plagiarism'                |          -2   | the submission is suspiciously similar with a prior / concurrent submission |
| 'crucial incorrectness'     |          -2   | incorrectness that destroys the logical chain |
| 'na'                        | case by case  | all other scenarios |


Note that these categories are not mutually exclusive, hence one comment potentially satisfies the descriptions of multiple categories. In such a case, we only label the comment with the most perceptually evident category.

In [None]:
df_iclr23 = pd.read_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv', index_col=0)
df_iclr23.head()

Unnamed: 0,id_forum,title,authorids,authors,decision,scores,avg_score,confidences,avg_confidence,pub_cmtids,cmt_responses
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,"['~Julius_Adebayo1', '~Melissa_Hall1', '~Bowen...","['Julius Adebayo', 'Melissa Hall', 'Bowen Yu',...",Accept: poster,"[8.0, 6.0, 5.0]",6.333333,"[3.0, 3.0, 4.0]",3.333333,[],[]
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"['~Chengyuan_Zhuang1', '~Xiaohui_Yuan1', '~XUA...","['Chengyuan Zhuang', 'Xiaohui Yuan', 'XUAN GUO']",Reject,"[1.0, 3.0, 6.0, 3.0]",3.25,"[5.0, 5.0, 5.0, 5.0]",5.0,[],[]
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"['~Alasdair_Tran1', 'almath123@gmail.com', '~L...","['Alasdair Tran', 'Alexander Mathews', 'Lexing...",Accept: poster,"[6.0, 8.0, 5.0, 6.0, 8.0]",6.6,"[3.0, 2.0, 4.0, 4.0, 5.0]",3.6,[],[]
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"['~Tanay_Narshana1', '~Chaitanya_Murti1', '~Ch...","['Tanay Narshana', 'Chaitanya Murti', 'Chiranj...",Accept: poster,"[6.0, 6.0, 8.0]",6.666667,"[3.0, 2.0, 3.0]",2.666667,[],[]
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"['~Chaitanya_Murti1', '~Tanay_Narshana1', '~Ch...","['Chaitanya Murti', 'Tanay Narshana', 'Chiranj...",Accept: poster,"[3.0, 8.0, 6.0, 8.0]",6.25,"[4.0, 4.0, 3.0, 3.0]",3.5,[],[]


In this part, we will append a new column to the dataframe `df_iclr23`, namely the `cmts_sentiment`. Recall that it will be a list of 5-scale scores, with 2 being the most positive and -2 being the most negative.

In [None]:
# every paper starts with the string form of an empty list in the new column 'cmts_sentiment'
df_iclr23['cmts_sentiment'] = '[]'

# 'pub_cmtids' holds the string form of a list, so '[]' marks a paper without public comments
has_pub_cmts = df_iclr23['pub_cmtids'] != '[]'
papers_with_cmts = df_iclr23.loc[has_pub_cmts, 'id_forum'].tolist()

for paper_id in papers_with_cmts:
  # sentiments of this paper's comments, in the row order of df_pubCmts
  sentiments = df_pubCmts.loc[df_pubCmts['paper_id'] == paper_id, 'cmt_sentiment'].tolist()
  # stored as a string, like the other list-valued columns read back from the csv
  is_this_paper = df_iclr23['id_forum'] == paper_id
  df_iclr23.loc[is_this_paper, 'cmts_sentiment'] = [str(sentiments)]

df_iclr23.loc[has_pub_cmts].head()

Unnamed: 0,id_forum,title,authorids,authors,decision,scores,avg_score,confidences,avg_confidence,pub_cmtids,cmt_responses,cmts_sentiment
7,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,"['~Shuangshuang_Chen1', '~Sihao_Ding1', '~Yian...","['Shuangshuang Chen', 'Sihao Ding', 'Yiannis K...",Accept: poster,"[6.0, 8.0, 6.0]",6.666667,"[4.0, 3.0, 3.0]",3.333333,['RSHAXYU0R1'],[True],[-1]
25,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,"['~Debargha_Ganguly1', '~Debayan_Gupta1']","['Debargha Ganguly', 'Debayan Gupta']",Reject,"[3.0, 3.0, 1.0, 5.0]",3.0,"[5.0, 3.0, 5.0, 3.0]",4.0,['MwBZ9qaqVj'],[False],[0]
35,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,"['~Hao_Zhu2', '~Piotr_Koniusz1']","['Hao Zhu', 'Piotr Koniusz']",Reject,"[6.0, 5.0, 5.0, 3.0]",4.75,"[4.0, 3.0, 2.0, 3.0]",3.0,"['f_UYiyBMbbQ', '1zQPZtBK4X']","[True, True]","[0, 0]"
94,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,"['~Liyao_Li1', '~Haobo_Wang1', '~Liangyu_Zha1'...","['Liyao Li', 'Haobo Wang', 'Liangyu Zha', 'Qin...",Accept: notable-top-25%,"[8.0, 8.0, 8.0]",8.0,"[3.0, 5.0, 4.0]",4.0,"['eDYga8Xhm7', 'SFaE6o03E1x', 'iKfplf6OmL']","[True, True, False]","[-1, -1, 0]"
156,pRCMXcfdihq,Protein Sequence and Structure Co-Design with ...,"['~Chence_Shi1', 'chuanrui.wang@mila.quebec', ...","['Chence Shi', 'Chuanrui Wang', 'Jiarui Lu', '...",Accept: poster,"[6.0, 6.0, 6.0, 6.0]",6.0,"[4.0, 3.0, 5.0, 4.0]",4.0,['uzB2zzYJBT'],[True],[-2]


In [None]:
df_iclr23.to_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv')

### 2.3 Arxiv availability

Arxiv preprints may reveal the true identities of the authors, which may introduce a bias (either positive or negative) towards the comment.

#### 2.3.1 Arxiv creation timestamp

In [None]:
!pip install arxiv
import arxiv

from google.colab import drive
drive.mount('drive')

import pandas as pd
import re

def arxiv_finder(title):
  '''
    title: the title of a paper
    return: the `published` attribute of the first arxiv search result for this
            title, or None when the search yields no result
  '''
  # the arxiv api does not index special characters, so every run of characters that is
  # neither alphanumerical nor whitespace is replaced by a single space
  # (the pattern is from this post: https://stackoverflow.com/a/55902074/18849124)
  title = re.sub(r'[^0-9a-zA-Z\s]+', ' ', title)
  # wrap the cleaned title in double quotes and restrict the query to the title field
  quoted_title = f'\"{title}\"'

  search = arxiv.Search(
    query = "ti:" + quoted_title,
    max_results = 1,
    sort_by = arxiv.SortCriterion.SubmittedDate,
    sort_order = arxiv.SortOrder.Descending
  )

  # take the first hit if there is one
  first_hit = next(search.results(), None)
  if first_hit is None:
    return None
  return first_hit.published

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting arxiv
  Downloading arxiv-1.4.3-py3-none-any.whl (12 kB)
Collecting feedparser
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6066 sha256=93ac7793ae12becfc252e18af097440d5d2d3b13294c511d79c78f55d97dd7e1
  Stored in directory: /root/.cache/pip/wheels/83/63/2f/117884c3b19d46b64d3d61690333aa80c88dc14050e269c546
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser, arxiv
Successfully installed arxiv-1.4.3 feedparse

In [None]:
df_iclr23 = pd.read_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv', index_col=0)
# only create (and save) the column when it does not exist yet, so that re-running
# this cell does not wipe the time stamps that were already scraped and saved
if 'arxiv_cdate' not in df_iclr23.columns:
  df_iclr23['arxiv_cdate'] = None
  df_iclr23.to_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv')
df_iclr23.head()

Unnamed: 0,id_forum,title,authorids,authors,decision,scores,avg_score,confidences,avg_confidence,pub_cmtids,cmt_responses,cmts_sentiment,arxiv_cdate
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,"['~Julius_Adebayo1', '~Melissa_Hall1', '~Bowen...","['Julius Adebayo', 'Melissa Hall', 'Bowen Yu',...",Accept: poster,"[8.0, 6.0, 5.0]",6.333333,"[3.0, 3.0, 4.0]",3.333333,[],[],[],
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"['~Chengyuan_Zhuang1', '~Xiaohui_Yuan1', '~XUA...","['Chengyuan Zhuang', 'Xiaohui Yuan', 'XUAN GUO']",Reject,"[1.0, 3.0, 6.0, 3.0]",3.25,"[5.0, 5.0, 5.0, 5.0]",5.0,[],[],[],
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"['~Alasdair_Tran1', 'almath123@gmail.com', '~L...","['Alasdair Tran', 'Alexander Mathews', 'Lexing...",Accept: poster,"[6.0, 8.0, 5.0, 6.0, 8.0]",6.6,"[3.0, 2.0, 4.0, 4.0, 5.0]",3.6,[],[],[],
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"['~Tanay_Narshana1', '~Chaitanya_Murti1', '~Ch...","['Tanay Narshana', 'Chaitanya Murti', 'Chiranj...",Accept: poster,"[6.0, 6.0, 8.0]",6.666667,"[3.0, 2.0, 3.0]",2.666667,[],[],[],
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"['~Chaitanya_Murti1', '~Tanay_Narshana1', '~Ch...","['Chaitanya Murti', 'Tanay Narshana', 'Chiranj...",Accept: poster,"[3.0, 8.0, 6.0, 8.0]",6.25,"[4.0, 4.0, 3.0, 3.0]",3.5,[],[],[],


Arxiv sets an upper limit of requests per time unit, hence we cannot scrape the arxiv publish time for all ICLR 2023 submissions at once. The strategy is to split the dataframe into several folds, and iteratively update the new column `arxiv_cdate` fold by fold. Each time we update a fold, we save the intermediate dataframe so that what has already been updated will not be lost.

In [None]:
from tqdm import tqdm
import numpy as np

def batch_query(batch_id, num_batches):
  '''
    scrape arxiv_cdate batch by batch

    batch_id: index of the batch (fold) whose papers are queried in this call
    num_batches: number of folds the saved dataframe is split into
    return: None

    The dataframe is read from the csv file, the column 'arxiv_cdate' of the
    selected fold is overwritten with the results of `arxiv_finder`, and the
    whole dataframe is written back to the same csv file.
  '''
  csv_path = 'drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv'
  df_iclr23 = pd.read_csv(csv_path, index_col=0)

  # split the ROW POSITIONS rather than the DataFrame itself: np.array_split on a
  # DataFrame goes through the deprecated DataFrame.swapaxes. The fold sizes are the
  # same either way. .copy() makes every fold an independent frame, so that the
  # column assignment below does not act on a view of the full dataframe.
  row_positions = np.array_split(np.arange(len(df_iclr23)), num_batches)
  dfs = [df_iclr23.iloc[positions].copy() for positions in row_positions]

  # one arxiv query per paper title of the selected fold
  papers_to_check = dfs[batch_id]['title'].tolist()
  arxiv_cdates = [arxiv_finder(paper) for paper in tqdm(papers_to_check)]

  dfs[batch_id]['arxiv_cdate'] = arxiv_cdates

  # concatenate the dfs and persist the intermediate result
  df_iclr23 = pd.concat(dfs, ignore_index=True)
  df_iclr23.to_csv(csv_path)


In [None]:
batch_query(0, 10)

100%|██████████| 385/385 [03:05<00:00,  2.08it/s]


In [None]:
batch_query(1, 10)

100%|██████████| 385/385 [03:23<00:00,  1.89it/s]


In [None]:
batch_query(2, 10)

100%|██████████| 384/384 [03:16<00:00,  1.96it/s]


In [None]:
batch_query(3, 10)

100%|██████████| 384/384 [03:18<00:00,  1.94it/s]


In [None]:
batch_query(4, 10)

100%|██████████| 384/384 [03:18<00:00,  1.94it/s]


In [None]:
batch_query(5, 10)

100%|██████████| 384/384 [03:04<00:00,  2.08it/s]


In [None]:
batch_query(6, 10)

100%|██████████| 384/384 [03:17<00:00,  1.95it/s]


In [None]:
batch_query(7, 10)

100%|██████████| 384/384 [03:04<00:00,  2.08it/s]


In [None]:
batch_query(8, 10)

100%|██████████| 384/384 [03:15<00:00,  1.97it/s]


In [None]:
batch_query(9, 10)

100%|██████████| 384/384 [03:16<00:00,  1.95it/s]


For the convenience of comparing the order of arxiv submission and public comments, we also create a column `'arxiv_cdate'` in the public comment dataframe.

In [None]:
df_pubCmts = pd.read_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023_pubCmts_v2.csv', index_col=0)
df_pubCmts.head()

Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content,cmt_category,cmt_sentiment
0,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,RSHAXYU0R1,Probably missing citations and some questions,['~Kaiwen_Zheng2'],I appreciate the author's idea of using score ...,questionable contribution,-1
1,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,MwBZ9qaqVj,Attribution of library used for experiments,['~Benedek_Andras_Rozemberczki1'],It is reasonable to assume that the paper uses...,missing or wrong reference,0
2,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,f_UYiyBMbbQ,Misattribution of datasets,['~Benedek_Andras_Rozemberczki1'],The paper misattributes the Chameleons and Squ...,missing or wrong reference,0
3,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,1zQPZtBK4X,Relevant Work,['~Sitao_Luan1'],Thank the authors for having this interesting ...,missing or wrong reference,0
4,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,eDYga8Xhm7,Unable to reproduce the experiments,['~Tianping_Zhang1'],Thanks for your interesting work. We are attra...,reproducibility issue,-1


In [None]:
df_iclr23 = pd.read_csv('drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv', index_col=0)
df_iclr23.head()


Unnamed: 0,id_forum,title,authorids,authors,decision,scores,avg_score,confidences,avg_confidence,pub_cmtids,cmt_responses,cmts_sentiment,arxiv_cdate
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,"['~Julius_Adebayo1', '~Melissa_Hall1', '~Bowen...","['Julius Adebayo', 'Melissa Hall', 'Bowen Yu',...",Accept: poster,"[8.0, 6.0, 5.0]",6.333333,"[3.0, 3.0, 4.0]",3.333333,[],[],[],
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"['~Chengyuan_Zhuang1', '~Xiaohui_Yuan1', '~XUA...","['Chengyuan Zhuang', 'Xiaohui Yuan', 'XUAN GUO']",Reject,"[1.0, 3.0, 6.0, 3.0]",3.25,"[5.0, 5.0, 5.0, 5.0]",5.0,[],[],[],
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"['~Alasdair_Tran1', 'almath123@gmail.com', '~L...","['Alasdair Tran', 'Alexander Mathews', 'Lexing...",Accept: poster,"[6.0, 8.0, 5.0, 6.0, 8.0]",6.6,"[3.0, 2.0, 4.0, 4.0, 5.0]",3.6,[],[],[],2021-11-27 03:34:13+00:00
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"['~Tanay_Narshana1', '~Chaitanya_Murti1', '~Ch...","['Tanay Narshana', 'Chaitanya Murti', 'Chiranj...",Accept: poster,"[6.0, 6.0, 8.0]",6.666667,"[3.0, 2.0, 3.0]",2.666667,[],[],[],
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"['~Chaitanya_Murti1', '~Tanay_Narshana1', '~Ch...","['Chaitanya Murti', 'Tanay Narshana', 'Chiranj...",Accept: poster,"[3.0, 8.0, 6.0, 8.0]",6.25,"[4.0, 4.0, 3.0, 3.0]",3.5,[],[],[],


In [None]:
def get_arxiv_cdate(paper_id):
  ''' look up the `arxiv_cdate` stored in df_iclr23 for the paper whose `id_forum` equals paper_id '''
  is_target_paper = df_iclr23['id_forum'] == paper_id
  matched_cdates = df_iclr23.loc[is_target_paper, 'arxiv_cdate']
  # `.item()` only succeeds when exactly one row matched, otherwise it raises ValueError
  return matched_cdates.item()

# copy the `arxiv_cdate` value of the commented paper onto every public comment row
commented_paper_ids = df_pubCmts['paper_id']
df_pubCmts['arxiv_cdate'] = commented_paper_ids.map(get_arxiv_cdate)
df_pubCmts.head()

Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content,cmt_category,cmt_sentiment,arxiv_cdate
0,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,RSHAXYU0R1,Probably missing citations and some questions,['~Kaiwen_Zheng2'],I appreciate the author's idea of using score ...,questionable contribution,-1,
1,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,MwBZ9qaqVj,Attribution of library used for experiments,['~Benedek_Andras_Rozemberczki1'],It is reasonable to assume that the paper uses...,missing or wrong reference,0,
2,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,f_UYiyBMbbQ,Misattribution of datasets,['~Benedek_Andras_Rozemberczki1'],The paper misattributes the Chameleons and Squ...,missing or wrong reference,0,
3,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,1zQPZtBK4X,Relevant Work,['~Sitao_Luan1'],Thank the authors for having this interesting ...,missing or wrong reference,0,
4,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,eDYga8Xhm7,Unable to reproduce the experiments,['~Tianping_Zhang1'],Thanks for your interesting work. We are attra...,reproducibility issue,-1,


In [None]:
# preview only the comments whose paper has a non-missing `arxiv_cdate`
df_pubCmts.dropna(subset=['arxiv_cdate']).head()

Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content,cmt_category,cmt_sentiment,arxiv_cdate
7,pRCMXcfdihq,Protein Sequence and Structure Co-Design with ...,uzB2zzYJBT,some confusion about your cdr design benchmark,['~Chentong_Wang1'],"I am very confused by your cdr benchmark, beca...",problematic empirical evaluation,-2,2022-10-17 06:00:12+00:00
8,6ruVLB727MC,UL2: Unifying Language Learning Paradigms,0FZGVuTXak,"""Successfully leverag[ing] CoT"" claim seems du...",['~Stella_Rose_Biderman1'],You write\n\n> Here we demonstrate that UL2 20...,problematic empirical evaluation,-2,2022-05-10 19:32:20+00:00
9,wKPmPBHSnT6,Ordered GNN: Ordering Message Passing to Deal ...,YUatcc8hzq,Misattribution of datasets,['~Benedek_Andras_Rozemberczki1'],The paper misattributed the authorship of the ...,missing or wrong reference,0,2023-02-03 03:38:50+00:00
10,PUIqjT4rzq7,Training-Free Structured Diffusion Guidance fo...,KX86whvifP,provided code is not working!,['~Mehmet_Ozgur_Turkoglu1'],Thanks for the interesting work. I spent quite...,reproducibility issue,-1,2022-12-09 18:30:24+00:00
13,4gc3MGZra1d,On Representing Mixed-Integer Linear Programs ...,hZcCh27efK,Two suspiciously similar submissions,['~Fanchen_Bu1'],https://openreview.net/forum?id=4gc3MGZra1d\nh...,plagiarism,-2,2022-10-19 17:56:07+00:00


Next, we collect the creation time of every comment stored in `df_pubCmts`. To do this, we iterate through all blind submissions whose direct replies contain public comments and collect the `'cdate'` of each corresponding comment.

In [None]:
myclient = openreview.Client(baseurl='https://api.openreview.net')
submissions = openreview.tools.iterget_notes(myclient, invitation='ICLR.cc/2023/Conference/-/Blind_Submission', details='directReplies')

pid_with_Cmts = df_pubCmts['paper_id'].unique().tolist()

# public comment ids expected under each paper; dict/set lookups replace the per-submission list scans
cmtids_by_paper = {pid: set(cmtids) for pid, cmtids in df_pubCmts.groupby('paper_id')['cmt_id']}
# (paper_id, cmt_id) -> creation time of that comment
cdate_by_cmt = {}

for submission in tqdm(submissions):
  wanted_cmtids = cmtids_by_paper.get(submission.id)
  if wanted_cmtids is None:
    # this submission has no public comment recorded in df_pubCmts
    continue

  # search the public comments by their ids among the direct replies
  for reply in submission.details['directReplies']:
    if reply['id'] in wanted_cmtids:
      # 'cdate' is integer-divided by 1000 and then read as seconds since the epoch
      cdate = reply['cdate'] // 1000
      cdate_by_cmt[(submission.id, reply['id'])] = pd.to_datetime(cdate, utc=True, unit='s')

# look every row up by key instead of relying on position, so the result does not depend on
# the order in which submissions are returned or on rows of the same paper being contiguous
cmt_keys = list(zip(df_pubCmts['paper_id'], df_pubCmts['cmt_id']))
missing_keys = [key for key in cmt_keys if key not in cdate_by_cmt]
if missing_keys:
  # fail loudly rather than attach creation times to the wrong comments
  raise ValueError(f'creation time not found for {len(missing_keys)} public comment(s), e.g. {missing_keys[:5]}')

cmt_cdates = [cdate_by_cmt[key] for key in cmt_keys]
df_pubCmts['cmt_cdate'] = cmt_cdates

3841it [00:08, 477.96it/s]


In [None]:
# preview the first rows, which now carry the `cmt_cdate` column
df_pubCmts.head()

Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content,cmt_category,cmt_sentiment,arxiv_cdate,cmt_cdate
0,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,RSHAXYU0R1,Probably missing citations and some questions,['~Kaiwen_Zheng2'],I appreciate the author's idea of using score ...,questionable contribution,-1,,2022-11-08 11:00:50+00:00
1,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,MwBZ9qaqVj,Attribution of library used for experiments,['~Benedek_Andras_Rozemberczki1'],It is reasonable to assume that the paper uses...,missing or wrong reference,0,,2022-11-05 20:28:16+00:00
2,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,f_UYiyBMbbQ,Misattribution of datasets,['~Benedek_Andras_Rozemberczki1'],The paper misattributes the Chameleons and Squ...,missing or wrong reference,0,,2022-11-05 19:54:25+00:00
3,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,1zQPZtBK4X,Relevant Work,['~Sitao_Luan1'],Thank the authors for having this interesting ...,missing or wrong reference,0,,2022-11-14 21:40:54+00:00
4,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,eDYga8Xhm7,Unable to reproduce the experiments,['~Tianping_Zhang1'],Thanks for your interesting work. We are attra...,reproducibility issue,-1,,2022-11-09 03:05:37+00:00


In [None]:
def compare_time(arxiv_cdate, cmt_cdate):
  ''' check whether arxiv_cdate is strictly earlier than cmt_cdate

  arxiv_cdate: a timestamp string formatted as '%Y-%m-%d %H:%M:%S%z', or a missing value (None / NaN)
  cmt_cdate: the timestamp that the parsed arxiv_cdate is compared against
  returns: False when arxiv_cdate is missing, otherwise the result of `arxiv_cdate < cmt_cdate`
  '''
  # cells left empty in the CSV are read back as NaN rather than None, so test for any missing value
  if pd.isna(arxiv_cdate):
    return False

  # convert arxiv_cdate to a pandas timestamp object
  # check this link for python3 datetime format: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
  arxiv_cdate = pd.to_datetime(arxiv_cdate, format='%Y-%m-%d %H:%M:%S%z')
  return arxiv_cdate < cmt_cdate

# flag, comment by comment, whether `arxiv_cdate` is earlier than `cmt_cdate`
df_pubCmts['arxiv_availability'] = [
  compare_time(arxiv_cdate, cmt_cdate)
  for arxiv_cdate, cmt_cdate in zip(df_pubCmts['arxiv_cdate'], df_pubCmts['cmt_cdate'])
]
df_pubCmts.head()

Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content,cmt_category,cmt_sentiment,arxiv_cdate,cmt_cdate,arxiv_availability
0,6iEoTr-jeB7,Learning Continuous Normalizing Flows For Fast...,RSHAXYU0R1,Probably missing citations and some questions,['~Kaiwen_Zheng2'],I appreciate the author's idea of using score ...,questionable contribution,-1,,2022-11-08 11:00:50+00:00,False
1,0OlEBibFa_g,Detecting Out-of-Distribution Data with Semi-s...,MwBZ9qaqVj,Attribution of library used for experiments,['~Benedek_Andras_Rozemberczki1'],It is reasonable to assume that the paper uses...,missing or wrong reference,0,,2022-11-05 20:28:16+00:00,False
2,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,f_UYiyBMbbQ,Misattribution of datasets,['~Benedek_Andras_Rozemberczki1'],The paper misattributes the Chameleons and Squ...,missing or wrong reference,0,,2022-11-05 19:54:25+00:00,False
3,cZM4iZmxzR7,Simple Spectral Graph Convolution from an Opti...,1zQPZtBK4X,Relevant Work,['~Sitao_Luan1'],Thank the authors for having this interesting ...,missing or wrong reference,0,,2022-11-14 21:40:54+00:00,False
4,688hNNMigVX,Learning a Data-Driven Policy Network for Pre-...,eDYga8Xhm7,Unable to reproduce the experiments,['~Tianping_Zhang1'],Thanks for your interesting work. We are attra...,reproducibility issue,-1,,2022-11-09 03:05:37+00:00,False


In [None]:
# preview the comments with a non-missing `arxiv_cdate` to inspect their `arxiv_availability`
df_pubCmts.dropna(subset=['arxiv_cdate']).head()

Unnamed: 0,paper_id,paper_title,cmt_id,cmt_title,cmt_signatures,cmt_content,cmt_category,cmt_sentiment,arxiv_cdate,cmt_cdate,arxiv_availability
7,pRCMXcfdihq,Protein Sequence and Structure Co-Design with ...,uzB2zzYJBT,some confusion about your cdr design benchmark,['~Chentong_Wang1'],"I am very confused by your cdr benchmark, beca...",problematic empirical evaluation,-2,2022-10-17 06:00:12+00:00,2022-11-14 08:39:36+00:00,True
8,6ruVLB727MC,UL2: Unifying Language Learning Paradigms,0FZGVuTXak,"""Successfully leverag[ing] CoT"" claim seems du...",['~Stella_Rose_Biderman1'],You write\n\n> Here we demonstrate that UL2 20...,problematic empirical evaluation,-2,2022-05-10 19:32:20+00:00,2022-11-07 02:24:45+00:00,True
9,wKPmPBHSnT6,Ordered GNN: Ordering Message Passing to Deal ...,YUatcc8hzq,Misattribution of datasets,['~Benedek_Andras_Rozemberczki1'],The paper misattributed the authorship of the ...,missing or wrong reference,0,2023-02-03 03:38:50+00:00,2022-11-05 20:30:55+00:00,False
10,PUIqjT4rzq7,Training-Free Structured Diffusion Guidance fo...,KX86whvifP,provided code is not working!,['~Mehmet_Ozgur_Turkoglu1'],Thanks for the interesting work. I spent quite...,reproducibility issue,-1,2022-12-09 18:30:24+00:00,2022-11-07 17:12:16+00:00,False
13,4gc3MGZra1d,On Representing Mixed-Integer Linear Programs ...,hZcCh27efK,Two suspiciously similar submissions,['~Fanchen_Bu1'],https://openreview.net/forum?id=4gc3MGZra1d\nh...,plagiarism,-2,2022-10-19 17:56:07+00:00,2022-11-07 00:40:38+00:00,True


In [None]:
# write the comment table to a separate v2 file
pubCmts_v2_csv = 'drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023_pubCmts_v2.csv'
df_pubCmts.to_csv(pubCmts_v2_csv)

Finally, we add an `arxiv_availability` column to `df_iclr23`, which stores, for each paper, the list of `arxiv_availability` values of its public comments.

In [None]:
# for every paper, gather the `arxiv_availability` flags of its public comments (in df_pubCmts row order)
# and keep them as the string form of a list, i.e. '[]' when no comment row matches the paper
submission_ids = df_iclr23['id_forum'].tolist()
AA = [
  str(df_pubCmts.loc[df_pubCmts['paper_id'] == submission_id, 'arxiv_availability'].tolist())
  for submission_id in submission_ids
]

df_iclr23['arxiv_availability'] = AA
df_iclr23.head()

Unnamed: 0,id_forum,title,authorids,authors,decision,scores,avg_score,confidences,avg_confidence,pub_cmtids,cmt_responses,cmts_sentiment,arxiv_cdate,arxiv_availability
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,"['~Julius_Adebayo1', '~Melissa_Hall1', '~Bowen...","['Julius Adebayo', 'Melissa Hall', 'Bowen Yu',...",Accept: poster,"[8.0, 6.0, 5.0]",6.333333,"[3.0, 3.0, 4.0]",3.333333,[],[],[],,[]
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"['~Chengyuan_Zhuang1', '~Xiaohui_Yuan1', '~XUA...","['Chengyuan Zhuang', 'Xiaohui Yuan', 'XUAN GUO']",Reject,"[1.0, 3.0, 6.0, 3.0]",3.25,"[5.0, 5.0, 5.0, 5.0]",5.0,[],[],[],,[]
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"['~Alasdair_Tran1', 'almath123@gmail.com', '~L...","['Alasdair Tran', 'Alexander Mathews', 'Lexing...",Accept: poster,"[6.0, 8.0, 5.0, 6.0, 8.0]",6.6,"[3.0, 2.0, 4.0, 4.0, 5.0]",3.6,[],[],[],2021-11-27 03:34:13+00:00,[]
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"['~Tanay_Narshana1', '~Chaitanya_Murti1', '~Ch...","['Tanay Narshana', 'Chaitanya Murti', 'Chiranj...",Accept: poster,"[6.0, 6.0, 8.0]",6.666667,"[3.0, 2.0, 3.0]",2.666667,[],[],[],,[]
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"['~Chaitanya_Murti1', '~Tanay_Narshana1', '~Ch...","['Chaitanya Murti', 'Tanay Narshana', 'Chiranj...",Accept: poster,"[3.0, 8.0, 6.0, 8.0]",6.25,"[4.0, 4.0, 3.0, 3.0]",3.5,[],[],[],,[]


In [None]:
# `pub_cmtids` holds the string form of a list, so '[]' marks a paper without public comments
has_pub_cmts = df_iclr23['pub_cmtids'].ne('[]')
df = df_iclr23[has_pub_cmts]
# among the commented papers, preview those with a non-missing `arxiv_cdate`
df.dropna(subset=['arxiv_cdate']).head()

Unnamed: 0,id_forum,title,authorids,authors,decision,scores,avg_score,confidences,avg_confidence,pub_cmtids,cmt_responses,cmts_sentiment,arxiv_cdate,arxiv_availability
156,pRCMXcfdihq,Protein Sequence and Structure Co-Design with ...,"['~Chence_Shi1', 'chuanrui.wang@mila.quebec', ...","['Chence Shi', 'Chuanrui Wang', 'Jiarui Lu', '...",Accept: poster,"[6.0, 6.0, 6.0, 6.0]",6.0,"[4.0, 3.0, 5.0, 4.0]",4.0,['uzB2zzYJBT'],[True],[-2],2022-10-17 06:00:12+00:00,[True]
197,6ruVLB727MC,UL2: Unifying Language Learning Paradigms,"['~Yi_Tay1', '~Mostafa_Dehghani1', '~Vinh_Q._T...","['Yi Tay', 'Mostafa Dehghani', 'Vinh Q. Tran',...",Accept: poster,"[8.0, 3.0, 8.0, 6.0]",6.25,"[4.0, 4.0, 4.0, 4.0]",4.0,['0FZGVuTXak'],[True],[-2],2022-05-10 19:32:20+00:00,[True]
222,wKPmPBHSnT6,Ordered GNN: Ordering Message Passing to Deal ...,"['~Yunchong_Song1', '~Chenghu_Zhou3', '~Xinbin...","['Yunchong Song', 'Chenghu Zhou', 'Xinbing Wan...",Accept: poster,"[6.0, 5.0, 8.0, 3.0]",5.5,"[3.0, 5.0, 3.0, 4.0]",3.75,['YUatcc8hzq'],[True],[0],2023-02-03 03:38:50+00:00,[False]
255,PUIqjT4rzq7,Training-Free Structured Diffusion Guidance fo...,"['~Weixi_Feng2', '~Xuehai_He1', '~Tsu-Jui_Fu2'...","['Weixi Feng', 'Xuehai He', 'Tsu-Jui Fu', 'Var...",Accept: poster,"[6.0, 6.0, 6.0, 6.0]",6.0,"[4.0, 4.0, 4.0, 5.0]",4.25,['KX86whvifP'],[True],[-1],2022-12-09 18:30:24+00:00,[False]
314,4gc3MGZra1d,On Representing Mixed-Integer Linear Programs ...,"['~Ziang_Chen1', '~Jialin_Liu1', '~Xinshang_Wa...","['Ziang Chen', 'Jialin Liu', 'Xinshang Wang', ...",Accept: poster,"[6.0, 8.0, 1.0, 6.0]",5.25,"[4.0, 5.0, 2.0, 5.0]",4.0,['hZcCh27efK'],[True],[-2],2022-10-19 17:56:07+00:00,[True]


In [None]:
# NOTE: this is the same path df_iclr23 was loaded from, so the existing CSV is overwritten
iclr23_out_csv = 'drive/MyDrive/openreview-project/cmt_v1/data/ICLR2023.csv'
df_iclr23.to_csv(iclr23_out_csv)