In [1]:
from vllm import LLM, SamplingParams
import os
import sys
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

llm = LLM(
    model="google/gemma-2-9b-it",
    gpu_memory_utilization=0.9,
    max_model_len=4000
)

# %%
import os
import sys
from pathlib import Path
module_path = Path(os.path.abspath("")).parent
print(module_path)
sys.path.append(str(module_path))
from data_processing.filter_reviews import filter_reviews
from data_processing.semantic_segmentation import split_paragraph


import pandas as pd
from prompts import PROMPTS
from tqdm import tqdm
# reviews = pd.read_csv("../../data/reviewer2_ARR_2022_reviews.csv")
split_reviews = pd.read_csv("../../data/reviewer2_ARR_2022_manual_split_reviews.csv")
tokenizer = llm.get_tokenizer()
sampling_params = SamplingParams(
    temperature=0.0, top_p=1, max_tokens=32,
    stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
)


INFO 08-01 07:57:00 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='google/gemma-2-9b-it', speculative_config=None, tokenizer='google/gemma-2-9b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=google/gemma-2-9b-it, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 08-01 07:57:02 selector.py:80] Using Flashinfer backend.
INFO 08-01 07:57:03 model_runner.py:680] Starting to load model google/gemma-2-9b-

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 08-01 07:57:37 model_runner.py:692] Loading model weights took 17.3781 GB
INFO 08-01 07:57:38 gpu_executor.py:102] # GPU blocks: 9934, # CPU blocks: 780
INFO 08-01 07:57:42 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-01 07:57:42 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-01 07:57:59 model_runner.py:1181] Graph capturing finished in 18 secs.
/fsx/homes/Abdelrahman.Sadallah@mbzuai.ac.ae/mbzuai/review_rewrite/notebooks


In [2]:

aspects = ['Actionability','Constructiveness or Politeness','Credibility or Verifiability','Specificity']
cnt = 0
for i, row in tqdm(split_reviews.iterrows(), total=split_reviews.shape[0]):

    cur_split_review = row['split_review']
    cur_split_review = cur_split_review.split('$$$')    
    num_of_points = len(cur_split_review)
    for aspect in aspects:
        aspect_desc = PROMPTS[aspect]
        # print('evaluting aspect:', aspect)
        aspect_score = 0
        scores = []
        for j,review in enumerate(cur_split_review):
            # print(f'evaluting review:{j} out of {num_of_points}')


            prompt = PROMPTS['binary_score_prompt'].format(aspect=aspect, aspect_description =aspect_desc, review=review)

            conversation = [
                            {
                                'role': 'user', 'content': prompt
                            }
                        ]
            c = tokenizer.apply_chat_template(
                        conversation,
                        tokenize=False,
                        add_generation_prompt=True
                    )
            inputs = [c]
            outputs = llm.generate(inputs, sampling_params, use_tqdm= False)[0].outputs[0].text.strip()

            

            if  outputs != '1':
                outputs = '0'
            # print(outputs)
            aspect_score += int(outputs)
            scores.append(outputs)

        split_reviews.at[i, aspect + '_average'] = aspect_score / num_of_points
        split_reviews.at[i, aspect] = ','.join(scores)

    # print(reviews.iloc[i])
    cnt += 1
    # print(cnt)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 374/374 [13:36<00:00,  2.18s/it]


In [3]:
split_reviews.columns

Index(['Unnamed: 0', 'paped_id', 'paper_summary', 'summary_of_strengths',
       'summary_of_weaknesses', 'comments,_suggestions_and_typos',
       'score_overall', 'score_best_paper', 'score_datasets', 'score_software',
       'author_identity_guess', 'confidence', 'score_replicability',
       'ethical_concerns', 'focused_review', 'Actionability',
       'Constructiveness or Politeness', 'Credibility or Verifiability',
       'Specificity', 'split_review', 'Actionability_average',
       'Constructiveness or Politeness_average',
       'Credibility or Verifiability_average', 'Specificity_average'],
      dtype='object')

In [4]:
split_reviews.to_csv("../../data/reviewer2_ARR_2022_manual_split_reviews.csv", index=False)