In [15]:
def extract_weknesses (review):
    lower_review_text = review.lower()
    weakness_pos = lower_review_text.find('weakness') or lower_review_text.find('weaknesses')
    if weakness_pos != -1:
        weaknses = review[weakness_pos:]
        return weaknses
    else:
        return ''


def extract_nips_review (raw_review_dict, year):
    focused_review = ''
    
    ## Year 2022 we have two fields
    if year =='2022':
        if not 'Strengths And Weaknesses:' in raw_review_dict.keys():
            return ''
        weakneses = raw_review_dict['Strengths And Weaknesses:']
        weakneses = extract_weknesses(weakneses)
        if len(weakneses.split()) > 10:
            focused_review += weakneses
        if len(raw_review_dict['Limitations:'].split()) > 10:
            focused_review += raw_review_dict['Limitations:']

    elif year == '2021' :
        if not 'Main Review:' in raw_review_dict.keys():
            return ''
        weakneses = raw_review_dict['Main Review:']
        weakneses = extract_weknesses(weakneses)
        if len(weakneses.split()) > 10:
            focused_review += weakneses
        if len(raw_review_dict['Limitations And Societal Impact:'].split()) > 10:
            focused_review += raw_review_dict['Limitations And Societal Impact:']        

    elif year == '2020':
        if not 'Weaknesses' in raw_review_dict.keys():
            return ''
        if len(raw_review_dict['Weaknesses'].split()) > 10:
            focused_review += raw_review_dict['Weaknesses']

    else:
        weakneses = raw_review_dict['review']
        weakneses = extract_weknesses(weakneses)
        if len(weakneses.split()) > 10:
            focused_review += weakneses   


    return focused_review

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
from tqdm import tqdm
# Load the data
raw_reviews_path =  '/fsx/hyperpod-input-datasets/AROA6GBMFKRI2VWQAUGYI:Abdelrahman.Sadallah@mbzuai.ac.ae/peerq-generation/NIPS'
save_path = '/fsx/homes/Abdelrahman.Sadallah@mbzuai.ac.ae/mbzuai/peerq-generation/data/processed'
filename = 'nips_reviews.csv'
reviews = []
years = set()
review_key = ['Strengths And Weaknesses:', 'Limitations And Societal Impact:', 'Main Review:','Weaknesses','review']
valid_cnt = 0
total_cnt = 0
## Iterate over files in the directory and load them into a pandas dataframe
for root, dirs, files in os.walk(raw_reviews_path):
    for dir in dirs:
        year = str(dir).split('_')[1]

        ## Only consider 2022 reviews now
        # if year != '2022':
        #     continue


        reviews_path = os.path.join(raw_reviews_path, dir, f'{dir}_review')
        for root, dirs, files in os.walk(reviews_path):
            for file in tqdm(files):
                file_path = os.path.join(reviews_path, file)
                if file.endswith('.json'):
                    with open(file_path, 'r') as f:
                        review_data = json.load(f)

                    review_id = review_data['id']
                    for review in review_data['reviews']:
                        total_cnt += 1
                        years.add(year)
                        review['review_id'] = review_id
                        review['year'] = year
                        review['focused_review'] = extract_nips_review(review, year)

                        reviews.append(review)
                        valid_cnt += 1

  4%|▎         | 91/2465 [00:00<00:02, 906.73it/s]

100%|██████████| 2465/2465 [00:02<00:00, 962.89it/s]
100%|██████████| 1386/1386 [00:01<00:00, 994.77it/s]
100%|██████████| 2823/2823 [00:03<00:00, 914.28it/s]
100%|██████████| 666/666 [00:00<00:00, 968.82it/s] 
100%|██████████| 1898/1898 [00:08<00:00, 220.11it/s]
100%|██████████| 986/986 [00:08<00:00, 114.63it/s]
100%|██████████| 554/554 [00:05<00:00, 101.92it/s]


In [17]:
print(f'Valid reviews: {valid_cnt}/{total_cnt}')
print(f'Years: {years}')    
reviews_df = pd.DataFrame(reviews)
reviews_df.to_csv(os.path.join(save_path, filename), index=False)

Valid reviews: 39684/39684
Years: {'2016', '2022', '2019', '2020', '2017', '2018', '2021'}


In [18]:
reviews_df

Unnamed: 0,Summary:,Main Review:,Limitations And Societal Impact:,Ethical Concerns:,Needs Ethics Review:,Time Spent Reviewing:,Rating:,Confidence:,Code Of Conduct:,review_id,...,Contribution:,Summary and Contributions,Strengths,Weaknesses,Correctness,Clarity,Relation to Prior Work,Reproducibility,Additional Feedback,confidence
0,The paper analyzes the plug-in approach to off...,(After author feedback) The authors' responses...,The authors do not address the societal impact...,The paper does not raise any ethical concerns ...,No,7,6: Marginally above the acceptance threshold,3: You are fairly confident in your assessment...,While performing my duties as a reviewer (incl...,NIPS_2021_1545,...,,,,,,,,,,
1,This paper provided horizon-free theoretical r...,Originality. The proof of their results is ori...,The author addressed their limitation adequate...,,No,6,6: Marginally above the acceptance threshold,3: You are fairly confident in your assessment...,While performing my duties as a reviewer (incl...,NIPS_2021_1545,...,,,,,,,,,,
2,The authors revisit offline RL on episodic MDP...,This is a remarkable theoretical paper. I took...,yes\n,none\n,No,3,"9: Top 15% of accepted NeurIPS papers, strong ...",3: You are fairly confident in your assessment...,While performing my duties as a reviewer (incl...,NIPS_2021_1545,...,,,,,,,,,,
3,"This paper provides near-matching, horizon-fre...",The presentation of the paper is clear and ill...,NA\n,,No,1,6: Marginally above the acceptance threshold,3: You are fairly confident in your assessment...,While performing my duties as a reviewer (incl...,NIPS_2021_1545,...,,,,,,,,,,
4,"This work proposes spike-element-wise ResNet, ...",Post Rebuttal\nI have seen the author's respon...,"My concerns are listed above. In my opinion, i...",,No,2,6: Marginally above the acceptance threshold,3: You are fairly confident in your assessment...,While performing my duties as a reviewer (incl...,NIPS_2021_2123,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39679,,,,,,,,,,NIPS_2016_144,...,,,,,,,,,,2-Confident (read it all; understood it all re...
39680,,,,,,,,,,NIPS_2016_144,...,,,,,,,,,,2-Confident (read it all; understood it all re...
39681,,,,,,,,,,NIPS_2016_144,...,,,,,,,,,,2-Confident (read it all; understood it all re...
39682,,,,,,,,,,NIPS_2016_144,...,,,,,,,,,,2-Confident (read it all; understood it all re...


In [19]:
df = pd.read_csv(os.path.join(save_path, filename))

print(df.head())

                                            Summary:  \
0  The paper analyzes the plug-in approach to off...   
1  This paper provided horizon-free theoretical r...   
2  The authors revisit offline RL on episodic MDP...   
3  This paper provides near-matching, horizon-fre...   
4  This work proposes spike-element-wise ResNet, ...   

                                        Main Review:  \
0  (After author feedback) The authors' responses...   
1  Originality. The proof of their results is ori...   
2  This is a remarkable theoretical paper. I took...   
3  The presentation of the paper is clear and ill...   
4  Post Rebuttal\nI have seen the author's respon...   

                    Limitations And Societal Impact:  \
0  The authors do not address the societal impact...   
1  The author addressed their limitation adequate...   
2                                              yes\n   
3                                               NA\n   
4  My concerns are listed above. In my opinion

  df = pd.read_csv(os.path.join(save_path, filename))
