In [None]:
# List the contents of the papers directory that later cells read PDFs from.
!ls ../../data/systematic_review_papers

# Prompts

In [None]:
import sys
sys.path.append('../..')

from src.index_files import *

from langchain_text_splitters import RecursiveCharacterTextSplitter
import pymupdf4llm

In [None]:
# Maps each attribute to extract from a paper to the natural-language question
# sent to the LLM for it. The keys double as the attribute names used in the
# result and evaluation dictionaries of later cells, so the spelling "adaption"
# is kept as is. The decision-tree questions are multi-line strings: their
# bullets and indentation are part of the prompt text.
attr2question = {
    "base_model": "Which foundation models are used in the proposed method? A foundation model is a machine learning or deep learning model that is trained on broad data such that it can be applied across a wide range of use cases through further training or direct use. Only generate the models that are used in the proposed method and do not generate the baseline models that are used to compare with the proposed method. Also, generate the specific models instead of general model types.", 
    
    "adaption": '''Extract all the functionality adaptations applied to the foundation models in the proposed method. Each adaptation enables the foundation model for a specific functionality in the system. Multiple adaptations could be applied to the same foundation model. For each adaptation, classify the adaptation type following the below steps:

1.  Identify the Type of Foundation Model:
    •	Discriminative Model (e.g., BERT): Focuses on language representation.
    •	Generative Model (e.g., GPT): Focuses on natural language generation.
2.	Classify Adaptation Based on Model Type:
    •	For Discriminative Models:
        •	If the model is adapted to downstream tasks by designing specific objective functions, classify as "fine-tuning".
        •	If the adaptation aligns downstream tasks' objectives with pre-trained loss using hard/soft prompts and a label word verbalizer, classify as "prompt-tuning".
    •	For Generative Models:
        •	Check if the adaptation method updates the model parameters:
            •	Tuning Method:
                •	If the model serves primarily as an encoder for extracting user or item representations and the parameters are fine-tuned using specific downstream task loss functions, classify as "fine-tuning."
	            •	If the output is textual and parameters are trained focusing on a specific task using language modeling loss, classify as "prompt-tuning."
	            •	If the parameters are trained for multiple tasks with varying instructions, classify as "instruction-tuning."
	        •	Non-Tuning Method:
	            •	If the adaptation involves designing suitable instructions and prompts for task understanding and solving, classify as "prompting."
	            •	If it involves adding demonstration examples in the prompt to enhance task understanding, classify as "in-context learning."

This decision tree should help you effectively categorize the adaptation discussed in the given paper. You must extract the necessary implementation details at each step before making any decision to ensure accurate classification.''', 
    
    # Asks for task types only; dataset names are explicitly excluded by the question.
    "task": "On which tasks is the proposed method evaluated? Only return the general tasks and do not return the datasets.", 
    
    # The final sentence of this question restricts the answer to four labels.
    "paradigm": '''Extract all the modeling paradigms used in the proposed method. For each pre-trained language model (PLM) that provides a specific functionality in the proposed system, classify its modeling paradigm type following the below steps:

    1.	Check the Role of PLM: Is the PLM primarily used as a feature extractor?
        •	If yes, move to step 2.
        •	If no, go to step 3.
    2.	Feature Extraction Details: Does the PLM output embeddings or tokens?
        •	If embeddings, classify as "PLM Embeddings + RS".
        •	If tokens, classify as "PLM Tokens + RS".
    3.	PLM Integration: Is the PLM itself configured to function directly as a recommendation system?
        •	If yes, classify as "PLM as RS".
        •	If no, further analysis of the paper is required to determine the specific paradigm. Classify as "Unknown".

This decision tree should help you effectively categorize the paradigms discussed in a given paper. Extract the necessary details at each step to ensure accurate classification. The paradigm type must be "PLM Embeddings + RS", "PLM Tokens + RS", "PLM as RS" or "Unknown".''', 
}

# attr2question = {
#     "base_model": "Which foundation models are used in the proposed method? A foundation model is a machine learning or deep learning model that is trained on broad data such that it can be applied across a wide range of use cases through further training or direct use.", 
#     "adaption": '''What adaptations are applied to the base models in the proposed method? The adaptation includes
    
#     1. Fine-tuning: The models mainly serve as encoders to extract representations of users or items, and the parameters of the models are subsequently fine-tuned on the specific loss functions of downstream recommendation tasks.
#     2. Prompt tuning with discriminative models: Prompt tuning with discriminative models aims to align the representations of pre-trained models like BERT with the domain-specific data through pre-trained loss, using hard/soft prompts and a label word verbalizer.
#     3. Prompt tuning with generative models: In the prompt tuning with generative models, the output of the models is consistently textual, and their parameters are trained using the loss of language modeling predominantly on a specific task.
#     4. Instruction tuning: In the instruction tuning, the output of the models is consistently textual, and their parameters are trained using the loss of language modeling predominantly on multiple tasks with different types of instructions.
#     5. Prompting: Prompting aims to design more suitable instructions and prompts to help models better understand and solve the tasks without training.
#     6. In-context learning: In-context learning is a technique using a few demonstration input-label pairs to quickly adapt the models to new tasks and information and predicting the label for an unseen input without additional parameter updates.
    
#     Extract the adaptations used in the proposed method and generate your response in the following format: "[Adaptation type]: [Detailed description of the adaptation in the proposed method]."''', 
#     "task": "On which tasks is the proposed method evaluated? Only return the general tasks and do not return the datasets.", 
#     "paradigm": '''What modeling paradigms are used in the proposed method? Modeling paradigms describe the components in a system and their interaction with one another. The modeling paradigm includes
    
#     1. LLM Embeddings + RS. This modeling paradigm views the language model as a feature extractor, which feeds the features of items and users into LLMs and outputs corresponding embeddings. A traditional RS model can utilize knowledgeaware embeddings for various recommendation tasks.
#     2. LLM Tokens + RS. Similar to the former method, this approach generates tokens based on the inputted items' and users' features. The generated tokens capture potential preferences through semantic mining, which can be integrated into the decision-making process of a recommendation system.
#     3. LLM as RS. Different from (1) and (2), this paradigm aims to directly transfer pre-trained LLM into a powerful recommendation system. The input sequence usually consists of the profile description, behavior prompt, and task instruction. The output sequence is expected to offer a reasonable recommendation result.
    
#     Extract the modeling paradigms used in the proposed method and generate your response in the following format: "[Modeling paradigm type]: [Detailed description of the modeling paradigm in the proposed method]."''', 
# }

# attr2format = {
#     "adaption": "Separate each extracted piece with '\n\n'.", 
#     "base_model": "Separate each extracted piece with '\n\n'.", 
#     "task": "Separate each extracted piece with '\n\n'.", 
#     "paradigm": "Separate each extracted piece with '\n\n'.", 
# }

# Prompt templates, one per attribute in `attr2question`.
#
# Each template keeps a literal `{context}` placeholder that the experiment
# cells fill in with `prompts[attr].format(context=...)`. Without the
# placeholder that call is a no-op and the model is asked about an
# "above context" it never receives.
#
# Braces inside a question are doubled so `str.format` treats them as plain
# text instead of as replacement fields.
prompts = {
    attr: (
        'Answer the question based only on the following context:\n\nContext:\n\n{context}\n\n'
        + 'Answer the question based on the above context: '
        + question.replace('{', '{{').replace('}', '}}')
    )
    for attr, question in attr2question.items()
}

In [None]:
# Show the paradigm question with every run of whitespace collapsed to a
# single space (print's default separator does the joining).
print(*attr2question['paradigm'].split())

In [None]:
# Dump every prompt template under its attribute name, followed by a divider.
for attr, prompt in prompts.items():
    print(attr, prompt, '----------------\n\n', sep='\n')

In [None]:
# Load the annotated papers. Later cells iterate over the entries and read a
# 'file' key (PDF file name) and a 'qa' key (gold answers per attribute) from each.
dataset = read_json('dataset.json')

# Experiments

In [None]:
# embeder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': False})
# embeder = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
# embeder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
# embeder = HuggingFaceBgeEmbeddings(model_name="intfloat/e5-mistral-7b-instruct", model_kwargs={'device': 'cuda:1'}, query_instruction='Instruct: Given a search query, retrieve relevant passages that answer the query\nQuery: ')


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

# Model identifier handed to `Factory`; the part after the last "/" is reused
# in the names of the result files written by the experiment cells.
llm_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# llm_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Alternative kept for reference: build the chat model in-process from a local
# HuggingFace text-generation pipeline and attach it to a Factory by hand.
# llm = ChatHuggingFace(
#     llm=HuggingFacePipeline(pipeline = pipeline(
#         "text-generation", model=llm_name, device_map="auto", max_new_tokens=2000
#     )),
#     tokenizer=AutoTokenizer.from_pretrained(llm_name),
#     model_id=llm_name)
# f = Factory(llm_name=None)
# f.llm_name = llm_name
# f.llm = llm
# f.llm_tokenizer = llm.tokenizer

# Later cells use `f.llm`, `f.llm_tokenizer` and `f.llm_name`.
# NOTE(review): `Factory` is presumably provided by the `src.index_files` star
# import, and `base_url` looks like the address of a model-serving host — confirm
# the address is still valid and whether it should come from configuration.
f = Factory(llm_name=llm_name, base_url='128.174.136.27')

In [None]:
# Convert one sample paper to Markdown so the extraction output can be inspected.
sample_pdf_path = os.path.join('../../data/systematic_review_papers/', 'Agent4Rec.pdf')
article = pymupdf4llm.to_markdown(sample_pdf_path)

In [None]:
# Display the Markdown text produced for the sample paper.
print(article)

In [None]:
# Full-context experiment: give the LLM the (token-cropped) text of each paper
# and ask every attribute question in one batched call per paper.
results = defaultdict(dict)
for test_data in tqdm(dataset):
    test_file = test_data['file']
    pdf_path = os.path.join('../../data/systematic_review_papers/', test_file)
    article = pymupdf4llm.to_markdown(pdf_path)  # , page_chunks=True)

    # Drop the '-----' separator blocks, then collapse whitespace inside each passage.
    passages = article.replace('\n\n\n-----\n\n', ' ').split('\n\n')
    concated_article = '\n\n'.join(' '.join(passage.split()) for passage in passages)

    # Keep only the first 28000 tokens of the article (LLM tokenizer, no special tokens).
    token_ids = f.llm_tokenizer.encode(concated_article, add_special_tokens=False)
    cropped_article = f.llm_tokenizer.decode(token_ids[:28000])

    # Filled-in prompt -> attribute, so each generation can be matched back to its question.
    prompt2attr = {
        prompts[attr].format(context=cropped_article): attr
        for attr in attr2question
    }

    test_prompts = list(prompt2attr)
    batch = [[HumanMessage(content=full_prompt)] for full_prompt in test_prompts]
    generations = f.llm.generate(batch, max_tokens=2000).generations
    for prompt, gen in zip(test_prompts, generations):
        attr = prompt2attr[prompt]
        results[test_file][f'{attr}_gen'] = gen[0].text
        results[test_file][f'{attr}_prompt'] = prompt

write_json(f'sys_review_openllm_{f.llm_name.split("/")[-1]}.json', results)

In [None]:
# Retrieval experiment: for every embedding model, chunk each paper, retrieve
# the chunks most similar to each attribute question, and ask the LLM with only
# those chunks as context. One result file is written per (ret_num, embedder).
# All three embedding models are constructed up front, each on 'cuda:1'.
embeders = [
    HuggingFaceBgeEmbeddings(model_name="intfloat/e5-mistral-7b-instruct", model_kwargs={'device': 'cuda:1'}, query_instruction='Instruct: Given a search query, retrieve relevant passages that answer the query\nQuery: '),
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': False}),
    HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
]
for embeder in embeders:

    # Chunk lengths are measured in tokens of the current embedder's tokenizer.
    # The lambda reads the loop variable `embeder` when called; the splitter is
    # rebuilt and used inside the same iteration, so it sees the matching model.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=20,
        length_function=lambda x: len(embeder.client.tokenizer.encode(x, add_special_tokens=False)),
        separators=[
            "\n\n",
            "\n",
            ".",
            ",",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            " ",
            "",
        ],
        # Existing args
    )
    
    # NOTE(review): `results` is created once per embedder, outside the ret_num
    # loop; with more than one ret_num value the same dict would be reused and
    # its entries overwritten between writes.
    results = defaultdict(dict)
    # ret_num is the number of retrieved chunks kept as context.
    for ret_num in [10]:
        print('ret_num:', ret_num)
        for test_data in tqdm(dataset):
            test_file = test_data['file']
            print(test_file)
            article = pymupdf4llm.to_markdown(os.path.join('../../data/systematic_review_papers/', test_file))#, page_chunks=True)
            # Drop the '-----' separator blocks, then collapse whitespace inside each passage.
            concated_article = '\n\n'.join([' '.join(passage.split()) for passage in article.replace('\n\n\n-----\n\n', ' ').split('\n\n')])
            chunks = text_splitter.create_documents([concated_article])
            # for cid, chunk in enumerate(chunks):
            #     chunk.metadata['cid'] = str(cid)
            print(len(chunks))

            # Index this paper's chunks; ids are the chunk positions as strings.
            db_chroma = Chroma.from_documents(chunks, embeder, ids=[str(i) for i in range(len(chunks))])
            
            # Filled-in prompt -> attribute, used to match generations back to questions.
            prompt2attr = dict[str, str]()
            for attr, question in attr2question.items():
                # docs_chroma = db_chroma.similarity_search_with_score('Instruct: Given a search query, retrieve relevant passages that answer the query.\nQuery: ' + question, k=ret_num)
                # Ask for every chunk, then keep the first ret_num results
                # (presumably ordered best-first by the vector store — TODO confirm).
                docs_chroma = db_chroma.similarity_search_with_score(question, k=len(chunks))[:ret_num]
                # docs_chroma.sort(key=lambda x: x[0].metadata['cid'])
                results[test_file][f'{attr}_retrieve_context'] = [(doc.page_content, _score) for doc, _score in docs_chroma]
                context_text = "\n\n".join([doc.page_content for doc, _score in docs_chroma])
                prompt2attr[prompts[attr].format(context=context_text)] = attr
            
            # The collection is only needed for retrieval; remove it before generating.
            db_chroma.delete_collection()
            del db_chroma
            test_prompts = list(prompt2attr)
            # One batched generation call per paper, one message list per prompt.
            for prompt, gen in zip(test_prompts, f.llm.generate([[HumanMessage(content=full_prompt)] for full_prompt in test_prompts], max_tokens=2000).generations):
                results[test_file][f'{prompt2attr[prompt]}_gen'] = gen[0].text
                
        # File name encodes ret_num, the LLM name and the embedding model name.
        write_json(f'sys_review_ret_{ret_num}_{f.llm_name.split("/")[-1]}_{embeder.model_name.split("/")[-1]}.json', results)

# Evaluation

In [None]:
# Index the annotated papers by file name for direct lookup.
samples = dict((entry['file'], entry) for entry in dataset)

In [None]:
# Load the saved generations to inspect. The commented lines are alternative
# result files that can be loaded instead; only the Llama 3.1 full-context
# results are active.
# sys_review_chatgpt_pdf = read_json('sys_review_chatgpt_pdf.json')
# sys_review_chatgpt = read_json('sys_review_chatgpt.json')
# sys_review_ret_mistral_mpnet = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_all-mpnet-base-v2.json')
# sys_review_ret_mistral_bge = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_bge-large-en-v1.5.json')
# sys_review_ret_mistral_mistral = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_e5-mistral-7b-instruct.json')
# sys_review_ret_llama3_mpnet = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_all-mpnet-base-v2.json')
# sys_review_ret_llama3_bge = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_bge-large-en-v1.5.json')
# sys_review_ret_llama3_mistral = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_e5-mistral-7b-instruct.json')
# sys_review_openllm_mistral = read_json('sys_review_openllm_Mistral-7B-Instruct-v0.3.json')
sys_review_openllm_llama3 = read_json('sys_review_openllm_Meta-Llama-3.1-8B-Instruct.json')

In [None]:
# Spot-check the exact prompt that was stored for one paper and attribute.
print(sys_review_openllm_llama3['RankGPT.pdf']['base_model_prompt'])

In [None]:
# Print, for every paper and attribute, the gold answer next to the generated
# one so the generations can be graded by hand.
# Swap in one of the alternatives below to review a different result set.
# test_data = sys_review_openllm_mistral_eval
# test_data = sys_review_openllm_llama3_eval
# test_data = sys_review_chatgpt_pdf_eval
test_data = sys_review_openllm_llama3

PAPER_DIVIDER = '-------------------------------------\n\n'
ATTR_DIVIDER = '-----\n'

for sample in samples:
    print(sample)
    for attr in attr2question:
        print(attr)
        print('Question:', attr)
        print('Gold Standard:', samples[sample]['qa'][attr])
        cls_key = f'{attr}_cls'
        gen_key = f'{attr}_gen'
        # Some result sets also carry a '<attr>_cls' entry; show it when present.
        if cls_key in test_data[sample]:
            print('Generation:', test_data[sample][cls_key], '\n\n')
        print('Generation:', test_data[sample][gen_key])
        print(ATTR_DIVIDER)
    print(PAPER_DIVIDER)

In [None]:
# Hand-assigned scores per system, paper and attribute. Every score is a
# 2-tuple; the summary cells report index 0 as `*_p` and index 1 as `*_r`
# (presumably precision and recall).
_ATTRIBUTES = ("base_model", "adaption", "task", "paradigm")

# ChatGPT scores: one row per paper, one tuple per attribute in `_ATTRIBUTES` order.
_CHATGPT_SCORES = {
    'ReprBERT.pdf':   ((1/2, 1), (1, 1),     (1, 1),   (1, 1)),
    'MESE.pdf':       ((1, 1),   (1, 1),     (1, 1),   (1/2, 1/2)),
    'ChatGPT.pdf':    ((1, 1),   (1, 1),     (1, 1),   (1, 1)),
    'LLM-Rate.pdf':   ((1, 1),   (1, 1),     (1, 1),   (1, 1)),
    'PEPLER.pdf':     ((1, 1),   (1, 1),     (1, 1),   (0, 0)),
    'Agent4Rec.pdf':  ((1, 1),   (1, 1),     (1, 7/8), (0, 0)),
    'GLRec.pdf':      ((1, 1),   (1/3, 1),   (1, 1),   (1, 1)),
    'ONCE.pdf':       ((1, 1),   (1, 1),     (1, 1),   (1, 1)),
    'UniCRS.pdf':     ((1, 1),   (0, 0),     (1, 1),   (1/2, 1/2)),
    'SpeedyFeed.pdf': ((1/2, 1), (1, 1),     (1, 1),   (1, 1)),
    'RankGPT.pdf':    ((3/4, 1), (2/3, 2/3), (1, 1),   (1/2, 1/2)),
}


def _blank_scores():
    """Return a fresh all-zero score sheet covering every paper and attribute."""
    return {
        file: {attr: (0, 0) for attr in _ATTRIBUTES}
        for file in _CHATGPT_SCORES
    }


eval_results = {
    'chatgpt': {
        file: dict(zip(_ATTRIBUTES, row))
        for file, row in _CHATGPT_SCORES.items()
    },
    # All zeros, identical in content to the 'template' sheet.
    'openllm_llama3': _blank_scores(),
    # Blank sheet to copy when scoring another system.
    'template': _blank_scores(),
}

In [None]:
# Split the per-paper scores for one attribute into papers whose gold standard
# lists a single answer and papers with several answers, then summarise the
# single-answer group. Each `eval_results` entry is a 2-tuple: index 0 is
# reported as `*_p` and index 1 as `*_r` (presumably precision and recall).
question = 'base_model'
single_answer_df = []
multi_answer_df = []
for sample in dataset:
    file = sample['file']
    chatgpt_scores = eval_results['chatgpt'][file][question]
    llama3_scores = eval_results['openllm_llama3'][file][question]
    if len(sample['qa'][question]) > 1:
        multi_answer_df.append({
            'file': file,
            'chatgpt_r': chatgpt_scores[1],
            'chatgpt_p': chatgpt_scores[0],
            'llama3_r': llama3_scores[1],
            'llama3_p': llama3_scores[0],
        })
    else:
        # Only the `*_p` value is reported for single-answer papers.
        single_answer_df.append({
            'file': file,
            'chatgpt_p': chatgpt_scores[0],
            'llama3_p': llama3_scores[0],
        })

print(f'{question}_single_answer')
# Passing `columns` keeps the score columns present even when no paper falls in
# this group, so the lookups below no longer raise KeyError on an empty frame.
df = pd.DataFrame(single_answer_df, columns=['file', 'chatgpt_p', 'llama3_p'])
print(df)
print(f"chatgpt_p: {df['chatgpt_p'].mean()}", f"llama3_p: {df['llama3_p'].mean()}")

In [None]:
# Summarise the multi-answer papers collected by the previous cell; the means
# are printed in the order chatgpt_r, chatgpt_p, llama3_r, llama3_p.
print(f'{question}_multi_answer')
# Passing `columns` keeps the score columns present even when no paper has
# several gold answers, so the lookups below no longer raise KeyError.
df = pd.DataFrame(multi_answer_df, columns=['file', 'chatgpt_r', 'chatgpt_p', 'llama3_r', 'llama3_p'])
print(df)
print(df['chatgpt_r'].mean(), df['chatgpt_p'].mean(), df['llama3_r'].mean(), df['llama3_p'].mean())

In [None]:
# Split the per-paper scores for one attribute into papers whose gold standard
# lists a single answer and papers with several answers, then summarise the
# single-answer group. Each `eval_results` entry is a 2-tuple: index 0 is
# reported as `*_p` and index 1 as `*_r` (presumably precision and recall).
question = 'task'
single_answer_df = []
multi_answer_df = []
for sample in dataset:
    file = sample['file']
    chatgpt_scores = eval_results['chatgpt'][file][question]
    llama3_scores = eval_results['openllm_llama3'][file][question]
    if len(sample['qa'][question]) > 1:
        multi_answer_df.append({
            'file': file,
            'chatgpt_r': chatgpt_scores[1],
            'chatgpt_p': chatgpt_scores[0],
            'llama3_r': llama3_scores[1],
            'llama3_p': llama3_scores[0],
        })
    else:
        # Only the `*_p` value is reported for single-answer papers.
        single_answer_df.append({
            'file': file,
            'chatgpt_p': chatgpt_scores[0],
            'llama3_p': llama3_scores[0],
        })

print(f'{question}_single_answer')
# Passing `columns` keeps the score columns present even when no paper falls in
# this group, so the lookups below no longer raise KeyError on an empty frame.
df = pd.DataFrame(single_answer_df, columns=['file', 'chatgpt_p', 'llama3_p'])
print(df)
print(f"chatgpt_p: {df['chatgpt_p'].mean()}", f"llama3_p: {df['llama3_p'].mean()}")

In [None]:
# Summarise the multi-answer papers collected by the previous cell; the means
# are printed in the order chatgpt_r, chatgpt_p, llama3_r, llama3_p.
print(f'{question}_multi_answer')
# Passing `columns` keeps the score columns present even when no paper has
# several gold answers, so the lookups below no longer raise KeyError.
df = pd.DataFrame(multi_answer_df, columns=['file', 'chatgpt_r', 'chatgpt_p', 'llama3_r', 'llama3_p'])
print(df)
print(df['chatgpt_r'].mean(), df['chatgpt_p'].mean(), df['llama3_r'].mean(), df['llama3_p'].mean())

In [None]:
# Split the per-paper scores for one attribute into papers whose gold standard
# lists a single answer and papers with several answers, then summarise the
# single-answer group. Each `eval_results` entry is a 2-tuple: index 0 is
# reported as `*_p` and index 1 as `*_r` (presumably precision and recall).
question = 'adaption'
single_answer_df = []
multi_answer_df = []
for sample in dataset:
    file = sample['file']
    chatgpt_scores = eval_results['chatgpt'][file][question]
    llama3_scores = eval_results['openllm_llama3'][file][question]
    if len(sample['qa'][question]) > 1:
        multi_answer_df.append({
            'file': file,
            'chatgpt_r': chatgpt_scores[1],
            'chatgpt_p': chatgpt_scores[0],
            'llama3_r': llama3_scores[1],
            'llama3_p': llama3_scores[0],
        })
    else:
        # Only the `*_p` value is reported for single-answer papers.
        single_answer_df.append({
            'file': file,
            'chatgpt_p': chatgpt_scores[0],
            'llama3_p': llama3_scores[0],
        })

print(f'{question}_single_answer')
# Passing `columns` keeps the score columns present even when no paper falls in
# this group, so the lookups below no longer raise KeyError on an empty frame.
df = pd.DataFrame(single_answer_df, columns=['file', 'chatgpt_p', 'llama3_p'])
print(df)
print(f"chatgpt_p: {df['chatgpt_p'].mean()}", f"llama3_p: {df['llama3_p'].mean()}")

In [None]:
# Summarise the multi-answer papers collected by the previous cell; the means
# are printed in the order chatgpt_r, chatgpt_p, llama3_r, llama3_p.
print(f'{question}_multi_answer')
# Passing `columns` keeps the score columns present even when no paper has
# several gold answers, so the lookups below no longer raise KeyError.
df = pd.DataFrame(multi_answer_df, columns=['file', 'chatgpt_r', 'chatgpt_p', 'llama3_r', 'llama3_p'])
print(df)
print(df['chatgpt_r'].mean(), df['chatgpt_p'].mean(), df['llama3_r'].mean(), df['llama3_p'].mean())

In [None]:
# Split the per-paper scores for one attribute into papers whose gold standard
# lists a single answer and papers with several answers, then summarise the
# single-answer group. Each `eval_results` entry is a 2-tuple: index 0 is
# reported as `*_p` and index 1 as `*_r` (presumably precision and recall).
question = 'paradigm'
single_answer_df = []
multi_answer_df = []
for sample in dataset:
    file = sample['file']
    chatgpt_scores = eval_results['chatgpt'][file][question]
    llama3_scores = eval_results['openllm_llama3'][file][question]
    if len(sample['qa'][question]) > 1:
        multi_answer_df.append({
            'file': file,
            'chatgpt_r': chatgpt_scores[1],
            'chatgpt_p': chatgpt_scores[0],
            'llama3_r': llama3_scores[1],
            'llama3_p': llama3_scores[0],
        })
    else:
        # Only the `*_p` value is reported for single-answer papers.
        single_answer_df.append({
            'file': file,
            'chatgpt_p': chatgpt_scores[0],
            'llama3_p': llama3_scores[0],
        })

print(f'{question}_single_answer')
# Passing `columns` keeps the score columns present even when no paper falls in
# this group, so the lookups below no longer raise KeyError on an empty frame.
df = pd.DataFrame(single_answer_df, columns=['file', 'chatgpt_p', 'llama3_p'])
print(df)
print(f"chatgpt_p: {df['chatgpt_p'].mean()}", f"llama3_p: {df['llama3_p'].mean()}")

In [None]:
# Summarise the multi-answer papers collected by the previous cell; the means
# are printed in the order chatgpt_r, chatgpt_p, llama3_r, llama3_p.
print(f'{question}_multi_answer')
# Passing `columns` keeps the score columns present even when no paper has
# several gold answers, so the lookups below no longer raise KeyError.
df = pd.DataFrame(multi_answer_df, columns=['file', 'chatgpt_r', 'chatgpt_p', 'llama3_r', 'llama3_p'])
print(df)
print(df['chatgpt_r'].mean(), df['chatgpt_p'].mean(), df['llama3_r'].mean(), df['llama3_p'].mean())

# Test

In [None]:
from spire.pdf.common import *
from spire.pdf import *

# Open one of the review papers with Spire.PDF for ad-hoc inspection.
# Both spellings of the path variable are kept bound to the same string.
inputFile = inputfile = "../../data/systematic_review_papers/RankGPT.pdf"

doc = PdfDocument()
doc.LoadFromFile(inputFile)

In [None]:
# Take the page at index 0 of the loaded document.
page = doc.Pages[0]

In [None]:
# Bookmark collection of the loaded document (presumably its outline entries — TODO confirm).
bookmarks = doc.Bookmarks

In [None]:
# Display how many bookmarks the collection holds.
bookmarks.Count

In [None]:
# Display the bookmark at index 1.
# NOTE(review): presumably fails when the document has fewer than two bookmarks — confirm.
bookmarks.get_Item(1)

In [None]:
# Display the text extracted from the selected page.
# NOTE(review): the meaning of the `True` argument is not visible here — check the Spire.PDF docs.
page.ExtractText(True)