In [None]:
!ls ../../data/systematic_review_papers

In [1]:
import sys
sys.path.append('../..')

from src.index_files import *

from langchain_text_splitters import RecursiveCharacterTextSplitter
# import pymupdf4llm

In [2]:
# Question posed to the LLM for each attribute extracted from a paper.
# Insertion order matters: prompts are built and sent in this order downstream.
attr2question = dict(
    adaption="What training steps are taken to adapt the base models in the proposed method?",
    base_model="Which base models are adapted in the proposed method?",
    task="On which tasks is the proposed method evaluated?",
    paradigm="Describe the modeling paradigm of the proposed method, ensuring that the explanation clearly outlines both its components and how they interact with one another.",
)

# One prompt template per attribute. The question is baked in immediately by
# the f-string half; the plain-string half keeps a literal `{context}`
# placeholder that callers fill in later with `str.format(context=...)`.
prompts = {
    attr: (
        'Answer the question based only on the following context:\n\nContext:\n\n{context}\n\n'
        f'Answer the question based on the above context: {question}'
    )
    for attr, question in attr2question.items()
}

In [None]:
prompts

Answer the question based on the above context: Extract all the text that introduces the adaptions made to the base models in the proposed method.\nSeparate each extracted piece with '\n\n'.

Answer the question based on the above context: Extract all the text that introduces the base models used in the proposed method.\nSeparate each extracted piece with '\n\n'.

Answer the question based on the above context: Extract all the text that introduces the tasks being experimented in the paper.\nSeparate each extracted piece with '\n\n'.

Answer the question based on the above context: Extract all the text that introduces the modeling paradigm of the proposed method. Ensure that the extracted content clearly outlines both the components of the method and the relationships between these components.\nSeparate each extracted piece with '\n\n'.

In [9]:
# Load the dataset description from JSON. Later cells iterate over `dataset`
# and read each entry's 'file' key as a PDF filename located under
# ../../data/systematic_review_papers/.
# NOTE(review): `read_json` is not defined in this notebook — presumably it is
# provided by the star import from src.index_files; confirm.
dataset = read_json('../../data/systematic_review_papers/dataset.json')

# Experiments

In [6]:
# embeder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': False})
# embeder = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
# embeder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
# embeder = HuggingFaceBgeEmbeddings(model_name="intfloat/e5-mistral-7b-instruct", model_kwargs={'device': 'cuda:1'}, query_instruction='Instruct: Given a search query, retrieve relevant passages that answer the query\nQuery: ')


In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

import os  # explicit import so this cell does not rely on names leaked by a star import

# Identifier of the chat model used for every generation in this notebook.
llm_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# llm_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Alternative setup, kept for reference: build the model in-process through a
# HuggingFace text-generation pipeline and attach it to an otherwise empty
# Factory, instead of passing a `base_url`.
# llm = ChatHuggingFace(
#     llm=HuggingFacePipeline(pipeline = pipeline(
#         "text-generation", model=llm_name, device_map="auto", max_new_tokens=2000
#     )),
#     tokenizer=AutoTokenizer.from_pretrained(llm_name),
#     model_id=llm_name)
# f = Factory(llm_name=None)
# f.llm_name = llm_name
# f.llm = llm
# f.llm_tokenizer = llm.tokenizer

# Address handed to Factory as `base_url`. It used to be hard-coded; it can now
# be overridden through the SYS_REVIEW_LLM_BASE_URL environment variable, and
# falls back to the original address when the variable is unset.
llm_base_url = os.environ.get('SYS_REVIEW_LLM_BASE_URL', '128.174.136.27')

# `f` is the shared handle used by the experiment cells, which read
# `f.llm`, `f.llm_tokenizer` and `f.llm_name` from it.
# NOTE(review): `Factory` is not defined in this notebook — presumably it comes
# from src.index_files; how it interprets `base_url` is not visible here.
f = Factory(llm_name=llm_name, base_url=llm_base_url)

In [None]:
# Full-context baseline: for every paper, feed the (token-cropped) article text
# directly into each attribute prompt and store the generations.
# Layout of `results`: results[<pdf file>]['<attr>_gen'] holds the generated
# text and results[<pdf file>]['<attr>_prompt'] the exact prompt that was sent.
results = defaultdict(dict)
for test_data in tqdm(dataset):
    test_file = test_data['file']
    pdf_path = os.path.join('../../data/systematic_review_papers/', test_file)
    article = pymupdf4llm.to_markdown(pdf_path)

    # Replace the '-----' separator blocks with a space, then collapse all
    # whitespace runs inside each blank-line-delimited passage.
    passages = article.replace('\n\n\n-----\n\n', ' ').split('\n\n')
    concated_article = '\n\n'.join(' '.join(passage.split()) for passage in passages)

    # Keep only the first 28000 tokens of the article (counted with the LLM's
    # own tokenizer) and turn them back into text.
    token_ids = f.llm_tokenizer.encode(concated_article, add_special_tokens=False)
    cropped_article = f.llm_tokenizer.decode(token_ids[:28000])

    # Map each fully formatted prompt back to the attribute it asks about.
    prompt2attr = {
        prompts[attr].format(context=cropped_article): attr
        for attr in attr2question
    }

    # Send all prompts for this paper in one generate() call.
    test_prompts = list(prompt2attr)
    messages = [[HumanMessage(content=full_prompt)] for full_prompt in test_prompts]
    generations = f.llm.generate(messages, max_tokens=2000).generations
    for prompt, gen in zip(test_prompts, generations):
        attr_name = prompt2attr[prompt]
        results[test_file][f'{attr_name}_gen'] = gen[0].text
        results[test_file][f'{attr_name}_prompt'] = prompt

write_json(f'sys_review_openllm_{f.llm_name.split("/")[-1]}.json', results)

In [None]:
# Retrieval-augmented experiment: for each embedding model, chunk every paper,
# index the chunks in Chroma, retrieve the top `ret_num` chunks per attribute
# question, and generate an answer from only those chunks.
# One JSON file is written per (ret_num, LLM, embedding model) combination.
embeders = [
    HuggingFaceBgeEmbeddings(model_name="intfloat/e5-mistral-7b-instruct", model_kwargs={'device': 'cuda:1'}, query_instruction='Instruct: Given a search query, retrieve relevant passages that answer the query\nQuery: '),
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': False}),
    HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
]
for embeder in embeders:

    # Chunk sizes are measured in tokens of the *current* embedding model's
    # tokenizer, so the splitter has to be rebuilt for every embedder.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=20,
        length_function=lambda x: len(embeder.client.tokenizer.encode(x, add_special_tokens=False)),
        separators=[
            "\n\n",
            "\n",
            ".",
            ",",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            " ",
            "",
        ],
    )

    # results[<pdf file>]['<attr>_retrieve_context'] -> list of (chunk text, score)
    # results[<pdf file>]['<attr>_gen']              -> generated answer text
    results = defaultdict(dict)
    for ret_num in [10]:
        print('ret_num:', ret_num)
        for test_data in tqdm(dataset):
            test_file = test_data['file']
            print(test_file)
            article = pymupdf4llm.to_markdown(os.path.join('../../data/systematic_review_papers/', test_file))
            # Replace the '-----' separator blocks with a space, then collapse
            # whitespace inside each blank-line-delimited passage.
            concated_article = '\n\n'.join([' '.join(passage.split()) for passage in article.replace('\n\n\n-----\n\n', ' ').split('\n\n')])
            chunks = text_splitter.create_documents([concated_article])
            print(len(chunks))

            db_chroma = Chroma.from_documents(chunks, embeder, ids=[str(i) for i in range(len(chunks))])

            prompt2attr = dict[str, str]()
            try:
                for attr, question in attr2question.items():
                    # Rank every chunk and keep the first `ret_num` results.
                    # NOTE(review): asking for k=len(chunks) and slicing is
                    # presumably meant to force a full ranking — confirm.
                    docs_chroma = db_chroma.similarity_search_with_score(question, k=len(chunks))[:ret_num]
                    results[test_file][f'{attr}_retrieve_context'] = [(doc.page_content, _score) for doc, _score in docs_chroma]
                    context_text = "\n\n".join([doc.page_content for doc, _score in docs_chroma])
                    prompt2attr[prompts[attr].format(context=context_text)] = attr
            finally:
                # Always drop this paper's collection, even if retrieval fails.
                # No collection name is passed to from_documents, so a
                # collection left behind by an aborted run could otherwise be
                # picked up again on the next call.
                # NOTE(review): assumes the default collection is shared
                # in-process across from_documents calls — confirm.
                db_chroma.delete_collection()
                del db_chroma

            # Send all prompts for this paper in one generate() call.
            test_prompts = list(prompt2attr)
            for prompt, gen in zip(test_prompts, f.llm.generate([[HumanMessage(content=full_prompt)] for full_prompt in test_prompts], max_tokens=2000).generations):
                results[test_file][f'{prompt2attr[prompt]}_gen'] = gen[0].text

        write_json(f'sys_review_ret_{ret_num}_{f.llm_name.split("/")[-1]}_{embeder.model_name.split("/")[-1]}.json', results)

In [27]:
sys_review_chatgpt_pdf = {
   "ReprBERT.pdf": {
      "adaption_gen": '''In the proposed ReprBERT method, several training steps are taken to adapt the base models:

1. **Pre-training and Knowledge Distillation**: The ReprBERT model is initialized using a pre-trained StructBERT model as the teacher model. StructBERT is trained on both general and e-commerce-specific datasets to capture in-domain knowledge. The knowledge distillation step involves making ReprBERT approximate the output of the StructBERT teacher by minimizing the cross-entropy loss between the teacher's soft labels and the student's predictions.

2. **Context-Guided Attention Mechanism**: This mechanism is introduced to improve the encoding of query and product representations by considering context-level interactions between tokens. The goal is to better capture fine-grained word-level relationships that are typically lost in simple pooling-based approaches.

3. **Late Interaction Module**: After the embeddings for both the query and product are generated, a late interaction module is employed to enhance the interaction between these embeddings. This includes operations like summation, subtraction, and max-pooling, allowing the model to incorporate more fine-grained interaction information.

4. **Intermediate Interaction Module**: To address the gap caused by the absence of intermediate layer interactions, a novel intermediate interaction strategy is introduced. This module performs interactions between intermediate representations of the query and product at multiple layers and aggregates them using weighted pooling.

5. **Fine-tuning**: Finally, ReprBERT is fine-tuned using a combination of soft labels from the teacher model and hard labels from a human-annotated dataset. This step is crucial to ensure that the model adapts effectively to the specific task and dataset.

These steps collectively help reduce the performance gap between ReprBERT and the original BERT model while maintaining efficiency.''',
      "base_model_gen": '''The base models adapted in the proposed ReprBERT method are:

1. **BERT (Bidirectional Encoder Representations from Transformers)**: BERT serves as the foundation for the architecture, known for its strong performance in NLP tasks, including text matching.

2. **StructBERT**: A variant of BERT, StructBERT is used as the teacher model for knowledge distillation. StructBERT improves upon BERT by incorporating language structure information during pre-training. It is also fine-tuned on domain-specific e-commerce data to better suit the task of product-query relevance.

These two models form the base from which ReprBERT is distilled and adapted to a more efficient representation-based architecture for e-commerce relevance tasks.''',
      "task_gen": '''The proposed ReprBERT method is evaluated on the **e-commerce relevance task**, which involves measuring the semantic relevance between product titles and user queries in large-scale e-commerce search engines. Specifically, the task focuses on improving the efficiency and accuracy of text matching between queries and product descriptions to enhance search engine relevance, user experience, and business outcomes such as **Gross Merchandise Value (GMV)**.

The method is also evaluated through both **offline evaluations** (using large-scale human-annotated datasets with metrics like ROC-AUC and Negative PR-AUC) and **online evaluations** (including A/B testing in real-world e-commerce platforms such as Taobao, measuring improvements in transactions and relevance rates).''',
      "paradigm_gen": '''The modeling paradigm of the proposed **ReprBERT** method consists of a combination of components that interact to distill the BERT model into a more efficient, representation-based architecture for e-commerce relevance tasks. The key components and their interactions are as follows:

### 1. **ReprBERT Encoder**:
   - The core of the model uses **BERT** as the encoder to generate representations for both queries and products.
   - The encoder employs a **multi-layer architecture**, where each layer consists of a **multi-head attention mechanism** and a **feed-forward network (FFN)** to capture context and semantic relationships between tokens.
   - Instead of directly using token embeddings (such as the [CLS] token), ReprBERT introduces a **context-guided attention mechanism** that improves how representations are generated by considering the relationships among all token embeddings. This helps better encode the fine-grained interactions between tokens.

### 2. **Late Interaction Module**:
   - ReprBERT includes a **late interaction module**, which combines the representations of the query and product after they have been encoded.
   - This interaction involves summation, subtraction, and max-pooling operations to create a more **fine-grained interaction representation** that enhances the model’s ability to predict relevance by incorporating detailed interaction information between the encoded query and product representations.

### 3. **Intermediate Interaction Module**:
   - To further improve performance, ReprBERT includes an **intermediate interaction module**. Unlike typical representation-based models that perform interaction only at the final stage, this module introduces **interactions between the intermediate layers** of the query and product.
   - This component extracts intermediate representations from each layer of both the query and product, performs fine-grained interactions (similar to the late interaction module), and then aggregates these intermediate interactions using **weighted pooling**.
   - By incorporating interactions across multiple layers, this module mimics the more fine-grained interactions seen in the original BERT model, reducing the performance gap between interaction-based and representation-based architectures.

### 4. **Knowledge Distillation**:
   - ReprBERT is trained using **knowledge distillation**. A **StructBERT model** (a variant of BERT with enhanced pre-training on e-commerce data) serves as the **teacher model**, providing high-quality soft labels.
   - The **student model** (ReprBERT) learns to approximate the outputs of the teacher model by minimizing the loss between the teacher's soft labels and its own predictions. This process is aimed at transferring knowledge from the computationally expensive interaction-based model to the more efficient representation-based model.
   - The training process consists of two stages: (1) using a large, unlabeled dataset for distillation and (2) fine-tuning on a human-annotated dataset to improve accuracy further.

### Interaction Between Components:
   - The **ReprBERT encoder** generates initial representations for queries and products using a context-guided attention mechanism.
   - These representations are refined through both the **late interaction module**, which operates on the final embeddings, and the **intermediate interaction module**, which captures information across different layers.
   - **Knowledge distillation** ensures that the student model learns effectively from the teacher model's outputs, resulting in a model that combines the efficiency of a representation-based architecture with the fine-grained interaction capabilities of interaction-based models.

This paradigm enables ReprBERT to achieve high relevance performance with low latency, making it suitable for real-time, large-scale e-commerce platforms.'''
   },
   "MESE.pdf": {
      "adaption_gen": '''The proposed method takes the following steps to adapt the base models:

1. **Item Metadata Encoding**: The system uses an item encoder to map the metadata of items (e.g., titles, genres, plot descriptions) into embeddings. This encoder is based on a DistilBERT model, which converts the concatenated metadata into vector embeddings that represent the rich information of the items.

2. **Joint Training with Pre-trained Language Model (PLM)**: The item encoder and a pre-trained language model (PLM) are jointly trained. This allows the system to extract relevant co-occurring information between the dialog context and the item metadata. The PLM processes the dialog context, and the encoded item embeddings are consumed by the self-attention mechanism within the PLM, aligning the item metadata with the dialog context.

3. **Recommendation and Response Generation**: The system is trained simultaneously on both recommendation and response generation tasks. This joint training optimizes the model for generating high-quality recommendations that are context-aware and for producing relevant and fluent responses in the conversation. The training loss combines objectives for candidate selection, ranking, and response generation.

4. **Candidate Selection and Ranking**: During training, the system learns to generate candidates for recommendation and refines their ranking by computing the similarity between the dialog context and the candidate items using the self-attention mechanism of the PLM.

In summary, the method adapts the base models by incorporating item metadata into the dialog context through a joint training process, aligning the dialog context with item metadata, and using both for improved recommendation and response generation.''',
      "base_model_gen": '''The base models adapted in the proposed method are:

1. **DistilBERT**: This model is used as the core of the item encoder. It processes item metadata such as titles, genres, and plot descriptions, transforming them into vector embeddings that represent the items in a way that aligns with the dialog context.

2. **Pre-trained Language Model (PLM)**: Specifically, the system employs **GPT-2** as the backbone for dialog generation. The GPT-2 model is used to handle dialog context and generate responses, integrating the context-aware item embeddings produced by the DistilBERT-based item encoder. 

Both DistilBERT and GPT-2 are jointly trained to align item metadata with dialog context for improved recommendation and response generation.''',
      "task_gen": '''The proposed method is evaluated on two main tasks:

1. **Recommendation**: The system is evaluated on its ability to recommend relevant items (e.g., movies) based on the dialog context. Metrics such as Recall@1, Recall@10, and Recall@50 are used to assess how accurately the system can recommend the correct items.

2. **Response Generation**: The method is also evaluated on its ability to generate fluent and relevant responses in a conversation. Metrics for evaluating this include perplexity, distinct n-grams (Dist2, Dist3, Dist4), and BLEU scores, which measure the diversity and relevance of the generated responses.

Both tasks are evaluated on benchmark datasets, specifically the **ReDial** and **INSPIRED** datasets.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method integrates item metadata and dialog context through a two-component architecture, which allows for seamless recommendation and response generation in conversational systems. Here's a detailed outline of the components and their interactions:

1. **Item Metadata Encoder (DistilBERT-based)**:
   - The method uses a DistilBERT-based item metadata encoder. This encoder takes in detailed metadata for each item (e.g., titles, genres, plot descriptions) and transforms this information into a vector embedding that represents the item's attributes.
   - The metadata is concatenated with a special separator token (`[SEP]`) before being processed by the DistilBERT model. The output is a single vector embedding per item, created after a pooling and feed-forward layer.

2. **Pre-trained Language Model (PLM - GPT-2)**:
   - The method employs GPT-2 as the PLM responsible for handling the dialog context and generating responses.
   - The GPT-2 model takes both the dialog history and the encoded item embeddings (from the item metadata encoder) as inputs.
   - A special placeholder token (`[PH]`) is introduced in place of specific item names during training to ensure the PLM can generalize better. The placeholder is later replaced with the actual item name during inference.

3. **Joint Training**:
   - The item encoder and the PLM are jointly trained, allowing the model to learn the alignment between dialog context and item metadata. This joint training ensures that the two components interact seamlessly.
   - The PLM uses self-attention mechanisms to integrate both the dialog context (word token embeddings) and the item embeddings. The goal is to generate both relevant recommendations and coherent responses based on this combined input.

4. **Recommendation Process**:
   - **Candidate Selection**: A special `[REC]` token is appended to the dialog context to trigger the recommendation process. The dialog context is processed by the PLM, which generates an embedding (`DR`) representing the summarized dialog context. This is compared with a pool of candidate item embeddings generated by the item encoder to select items for recommendation.
   - **Candidate Ranking**: After initial candidate selection, the PLM refines the ranking of these candidates using its self-attention mechanism to determine the most contextually appropriate recommendation.

5. **Response Generation**:
   - Once an item is selected for recommendation, the PLM continues generating a response based on the dialog context and the item's metadata. The generated response includes a placeholder token (`[PH]`), which is replaced with the actual item name before final output.

6. **Joint Loss Optimization**:
   - The entire system is trained using a combined loss function, which balances between the recommendation objective (candidate selection and ranking) and the response generation objective. This ensures that the model performs well in both tasks simultaneously.

In summary, the proposed method integrates a DistilBERT-based item encoder and a GPT-2 language model, where the item metadata is encoded and aligned with the dialog context through joint training. The components interact via self-attention, enabling both high-quality recommendations and fluent, context-aware responses.'''
   },
   "ChatGPT.pdf":{
      "adaption_gen": '''The training steps taken to adapt the base models in the proposed method, as described in the document, primarily revolve around the use of **prompt engineering** instead of traditional fine-tuning approaches. Here are the key steps outlined:

1. **Task-Specific Prompt Construction**: Prompts are tailored to the nature of each recommendation task. For instance, prompts are designed to predict user ratings or generate sequential recommendations. Each prompt typically includes three parts:
   - **Task description**: Converts recommendation tasks into natural language tasks.
   - **Behavior injection**: Incorporates user-item interaction data to help capture user preferences.
   - **Format indicator**: Constrains the output format, ensuring the results are more structured and easier to evaluate.

2. **Few-Shot Prompting**: This technique is used to inject a small amount of interaction data that represents user preferences. This helps the model better understand user needs without traditional fine-tuning. For example, few-shot prompts are used to improve rating predictions or sequential recommendations by providing additional context from historical user interactions.

3. **Output Refinement**: Since ChatGPT introduces randomness in its outputs, an additional step involves refining the results. If the generated output does not meet the expected format, it is corrected based on predefined rules. If necessary, the model re-generates recommendations until the correct format is achieved.

These steps are primarily focused on adapting the model via **prompt design and injection of few-shot learning information**, rather than retraining or modifying the model's internal parameters.''',
      "base_model_gen": '''The proposed method in the document adapts **ChatGPT** as the base model. It does not involve fine-tuning or modifying any traditional recommendation models; instead, it uses **ChatGPT** as a general-purpose recommendation model. The method focuses on adapting ChatGPT for different recommendation tasks, including rating prediction, sequential recommendation, direct recommendation, explanation generation, and review summarization.

While **few-shot prompting** is used to improve performance by injecting interaction data, the model itself remains ChatGPT, leveraging its extensive pre-trained linguistic and world knowledge from large-scale corpora.''',
      "task_gen": '''The proposed method is evaluated on the following five tasks:

1. **Rating Prediction**: Predicting the rating a user would give to a particular item.
2. **Sequential Recommendation**: Predicting the next item a user is likely to interact with based on their interaction history.
3. **Direct Recommendation**: Recommending the most appropriate item from a list of candidates.
4. **Explanation Generation**: Generating textual explanations that justify why a specific item is recommended to the user.
5. **Review Summarization**: Creating concise summaries of user reviews for different products.

These tasks evaluate the model's ability to handle both accuracy-oriented and explainability-oriented recommendation challenges.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method is built around **ChatGPT** as a base model and follows a **prompt-based approach** for handling recommendation tasks. The method does not fine-tune the base model but instead relies on a structured interaction between specially designed prompts and ChatGPT to perform various tasks. Here’s how the components of the modeling paradigm interact:

### 1. **Task-Specific Prompt Construction**
   - **Components**:
     - **Task Description**: Each recommendation task is first translated into a natural language task using a prompt that describes the task (e.g., predicting ratings, sequential recommendations).
     - **Behavior Injection**: Few-shot learning is used by incorporating user-item interaction data. This behavior injection helps ChatGPT better capture user preferences and make more relevant recommendations.
     - **Format Indicator**: Prompts are also equipped with a format indicator to control the structure of the output, ensuring that the model generates results in a comprehensible and assessable manner.

   - **Interaction**: The prompt serves as an input to ChatGPT, transforming structured recommendation tasks into natural language queries. For example, a rating prediction task prompt might ask ChatGPT to predict how a user would rate a specific product based on their historical ratings.

### 2. **Few-Shot Prompting**
   - **Components**: This involves adding a few user interactions or item features to the prompt to give ChatGPT context about user behavior or preferences.
   
   - **Interaction**: The few-shot examples guide ChatGPT in understanding user-specific patterns, enhancing its ability to make personalized predictions without needing large-scale model retraining. It enables ChatGPT to simulate learning from limited data, improving performance on tasks like sequential or direct recommendation.

### 3. **Output Refinement**
   - **Components**: A post-processing step ensures that the output from ChatGPT adheres to the desired format. The refinement module checks the output and applies predefined rules for correction if needed.
   
   - **Interaction**: If ChatGPT’s output does not meet the required format (e.g., list of items, specific numerical ratings), the model is prompted to regenerate or correct the output. This ensures the recommendations are both accurate and well-structured.

### 4. **Evaluation and Feedback**
   - **Components**: The model's outputs are evaluated both through automated metrics (e.g., RMSE for rating prediction, Hit Ratio for recommendations) and human assessments (for tasks like explanation generation and review summarization).
   
   - **Interaction**: Evaluation feedback helps understand the strengths and weaknesses of ChatGPT in recommendation tasks. Human evaluators assess the quality of outputs like explanations and summaries, helping to refine future prompt designs.

### Summary of Interaction:
The method relies heavily on **prompt-based interaction** with ChatGPT, where carefully designed prompts (with task descriptions, behavior injection, and format indicators) guide the model to generate task-specific recommendations. Few-shot learning aids in capturing user preferences, while output refinement ensures the structured delivery of recommendations. This entire process circumvents the need for model fine-tuning by leveraging ChatGPT’s pre-existing language and knowledge capabilities, adapting it to the recommendation domain through natural language instructions.'''
   },
   "LLM-Rate.pdf": {
      "adaption_gen": '''The training steps taken to adapt the base models in the proposed method involve fine-tuning Large Language Models (LLMs) with human interaction data to improve their performance on the rating prediction task. Specifically:

1. **Multi-class classification:** The task is framed as a classification problem where the rating scale (1 to 5) is treated as distinct classes. During training, cross-entropy loss is used to optimize the model, and the LLM predicts the class (rating) with the highest probability during inference.

2. **Regression:** Another approach involves modifying the LLM’s output layer to predict a continuous rating. Mean squared error (MSE) loss is applied during training to minimize the difference between predicted and true ratings.

These approaches allow the LLM to leverage user-item interaction data for more accurate predictions while adapting the base models to suit the specific task requirements.''',
      "base_model_gen": '''The base models adapted in the proposed method include the **Flan-T5** family, specifically **Flan-T5-Base** and **Flan-T5-XXL**, as well as other Large Language Models (LLMs) like **GPT-3** (including **text-davinci-003** and **ChatGPT**) and **Flan-U-PaLM**. These models are fine-tuned for the rating prediction task using human interaction data.''',
      "task_gen": '''The proposed method is evaluated on the task of **user rating prediction**. Specifically, the evaluation focuses on predicting a user’s rating for a given item based on their past interactions. The performance of the method is tested in three different settings: **zero-shot**, **few-shot**, and **fine-tuning**. The evaluation is conducted on two widely adopted benchmark datasets: **MovieLens** and **Amazon-Books**, which involve user ratings for movies and books, respectively.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method integrates **Large Language Models (LLMs)** with user interaction data to predict user ratings for items. It operates in three key components:

1. **Input Features**: The model uses a sequence of user interactions, which includes metadata about the items (e.g., title, genre, or brand) and user-specific past ratings. This data forms the context for the LLM to make predictions.

2. **Model Architecture**: The method leverages both **Encoder-Decoder** and **Decoder-Only** LLM architectures, such as **Flan-T5** and **GPT-3**. These models are pre-trained on vast amounts of textual data and fine-tuned for rating prediction. Two key modeling approaches are explored:
   - **Multi-class classification**: The model treats the rating prediction as a classification problem where each possible rating (from 1 to 5) is a class. The LLM projects the final layer’s output into class probabilities using a softmax function and applies a cross-entropy loss to optimize the prediction.
   - **Regression**: The LLM is modified to directly output a continuous numeric value as the predicted rating, and the model is trained using a mean-squared error (MSE) loss.

3. **Fine-tuning**: The LLMs are further adapted by fine-tuning them on human interaction data. This step is crucial as it incorporates the user-item interaction patterns into the model. Fine-tuning allows the model to understand dataset-specific characteristics (such as average ratings or trends) and improve prediction accuracy.

These components interact such that the LLM receives structured input (past interactions and item features), processes it using either classification or regression objectives, and adapts through fine-tuning on interaction data. This interaction allows the model to predict user ratings more effectively by leveraging both pre-trained knowledge and specific dataset training.'''
   },
   "GIRL.pdf": {
      "adaption_gen": '''The proposed method in the document utilizes a **three-step training process** to adapt the base models for job recommendation. These steps are:

1. **Supervised Fine-Tuning (SFT)**: This initial step focuses on teaching the Large Language Model (LLM) how to generate appropriate Job Descriptions (JDs) based on given Curriculum Vitae (CV). A dataset of matched CV-JD pairs is used, and the instruction-tuning method is applied to train the LLM generator to generate personalized job descriptions.

2. **Reward Model Training (RMT)**: A dataset consisting of both matched and mismatched CV-JD pairs, including recruiter feedback, is used to train a reward model. This model is designed to distinguish between suitable and unsuitable CV-JD pairs, thus simulating real-world recruiter preferences. The reward model is trained using a pairwise ranking loss to improve its predictive capabilities.

3. **Reinforcement Learning from Recruiter Feedback (RLRF)**: In this step, Proximal Policy Optimization (PPO)-based reinforcement learning is used to further refine the LLM. The reward model developed in the previous step helps align the LLM generator with recruiter preferences, making the job recommendations more tailored to both job seekers' profiles and market demands.

This comprehensive approach ensures that the LLM not only generates relevant job descriptions but also adapts to recruiter feedback, improving the overall quality and effectiveness of the recommendations.''',
      "base_model_gen": '''In the proposed method, two base models are adapted:

1. **Large Language Model (LLM) Generator**: This model is fine-tuned to generate personalized job descriptions (JDs) for job seekers based on their Curriculum Vitae (CV). The fine-tuning process involves supervised training and reinforcement learning to align the model with both job seeker characteristics and recruiter feedback.

2. **BERT**: BERT (Bidirectional Encoder Representations from Transformers) is used as the text encoder to generate embeddings for both CVs and JDs. These embeddings are used in the discriminative recommendation task to calculate the matching scores between job seekers and job descriptions.

These models are adapted using the three-step training process: Supervised Fine-Tuning (SFT), Reward Model Training (RMT), and Reinforcement Learning from Recruiter Feedback (RLRF).''',
      "task_gen": '''The proposed method is evaluated on the following tasks:

1. **Generation Quality Evaluation**: The method is evaluated on the quality of the job descriptions (JDs) generated by the Large Language Model (LLM). The evaluation focuses on aspects such as the level of detail, relevance to the candidate's profile, and conciseness. ChatGPT is used to rank the generated JDs in comparison to baseline methods.

2. **Enhanced Job Recommendation Performance**: The method is also evaluated on how well the generated JDs enhance the performance of traditional discriminative job recommendation models. Metrics such as AUC (Area Under the Curve) and LogLoss are used to measure the effectiveness of the model in improving recommendation accuracy.

3. **Cold-Start Condition Performance**: The method's performance is specifically tested under cold-start conditions, where job recommendations are made for job seekers who have not appeared in the training data. This is done to assess how well the generated JDs help in improving recommendation models when little or no prior information about the job seeker is available.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method, **GIRL** (GeneratIve Job Recommendation based on Large Language Models), is a generative recommendation framework designed to enhance job recommendations by directly generating personalized Job Descriptions (JDs) based on the Curriculum Vitae (CV) of a job seeker. The paradigm is comprised of several key components and interactions, structured as follows:

### 1. **LLM-based Generator**
   - The core component is a **Large Language Model (LLM)** trained to generate JDs. The generator takes a job seeker's CV as input and outputs a tailored JD. 
   - **Supervised Fine-Tuning (SFT)**: Initially, the generator is fine-tuned using a dataset of matched CV-JD pairs. This teaches the model to produce job descriptions aligned with the skills, experience, and background of the job seeker.

### 2. **Reward Model**
   - A separate **Reward Model** is trained to evaluate the suitability of the generated JDs. It is responsible for scoring the matching degree between CVs and JDs.
   - **Reward Model Training (RMT)**: The reward model is trained using both matched and mismatched CV-JD pairs, incorporating recruiter feedback to simulate real-world job market preferences. This helps the model learn which job descriptions are suitable or unsuitable for specific candidates.

### 3. **Reinforcement Learning from Recruiter Feedback (RLRF)**
   - The generator is further optimized using **Proximal Policy Optimization (PPO)**, a reinforcement learning technique.
   - In this stage, the generator (LLM) and reward model interact, where the generator produces new JDs, and the reward model evaluates them. Feedback from this evaluation is used to improve the generator’s performance in aligning with recruiter preferences, thus balancing the job seeker’s profile with market demands.

### Interaction Between Components
   - The **LLM-based generator** produces job descriptions based on the CV.
   - The **Reward Model** evaluates these descriptions, giving feedback on whether the JD appropriately matches the job seeker's profile.
   - Through **Reinforcement Learning (PPO)**, the generator refines its outputs, aligning them not only with job seeker preferences but also with recruiter and market requirements.

### Enhanced Recommendation Process
   - The generated JD can serve two purposes:
     1. **Stand-alone Job Suggestion**: The generated JDs can directly provide job seekers with personalized career guidance.
     2. **Enhancement of Traditional Models**: The generated JDs are also used to improve traditional discriminative recommendation models by bridging the semantic gap between CVs and JDs, enhancing the accuracy and performance of these models.

In summary, the modeling paradigm consists of an LLM-based generator, a reward model for evaluating generated JDs, and a reinforcement learning process to fine-tune the generator. These components work together to produce personalized and recruiter-aligned job descriptions, which improve the job recommendation process.'''
   },
   "Agent4Rec.pdf": {
      "adaption_gen": '''The training steps to adapt the base models in the proposed method involve several key processes:

1. **Initialization of User Profiles**: The agents are initialized based on real-world datasets such as MovieLens, Steam, and Amazon-Book. This step ensures that the profile modules capture users' unique tastes and social traits.
   
2. **Memory Module**: The memory module logs both factual and emotional memories. It includes an emotion-driven reflection mechanism that allows the system to adapt based on previous user interactions and emotional responses.

3. **Action Module Interaction**: Agents interact with personalized recommender models in a page-by-page manner. The action module governs their behaviors, including taste-driven and emotion-driven actions. This interaction is crucial for refining the base models, as feedback is continuously generated.

4. **Feedback-Driven Enhancement**: After completing a recommendation simulation, both viewed and unviewed items are collected and added as positive signals to the training dataset of each user. This feedback is then used to re-train the recommendation algorithms, which helps adapt the base models to better capture user preferences over time.''',
      "base_model_gen": '''The base models adapted in the proposed method are primarily **collaborative filtering-based recommendation models**. Specifically, the models used include:

1. **Matrix Factorization (MF)**
2. **LightGCN** 
3. **MultVAE** 

These models are used in the recommendation environment to generate item recommendations for the agents in a page-by-page manner. The system's adaptability allows for these models to be re-trained and enhanced based on feedback generated from agent interactions.''',
      "task_gen": '''The proposed method, Agent4Rec, is evaluated on several tasks, including:

1. **User Alignment**: Evaluating how well the agents simulate real users' preferences, social traits, and behaviors. This involves experiments to ensure that generative agents maintain coherent long-term preferences aligned with real user data.

2. **Recommendation Strategy Evaluation**: Assessing the effectiveness of different recommendation algorithms (such as Matrix Factorization, LightGCN, and MultVAE) by measuring user satisfaction, number of items viewed, and overall user engagement.

3. **Filter Bubble Effect**: Replicating the filter bubble phenomenon, where users are exposed to a narrowing range of content due to feedback loops in recommendation algorithms, to understand how algorithms might contribute to this effect.

4. **Causal Discovery in Recommender Systems**: Using the simulator to explore latent causal relationships within the data, including identifying factors like movie quality, popularity, and exposure rates that influence movie ratings.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method, **Agent4Rec**, involves a simulation framework that uses **LLM-empowered generative agents** to emulate human-like behaviors and preferences in recommendation environments. The paradigm is structured around two core components: **generative agents** and a **recommendation environment**.

### Key Components:

1. **Generative Agents**:
   These agents are designed to mimic human cognitive processes, preferences, and actions. Each agent is equipped with three key modules:
   
   - **Profile Module**: 
     This module initializes the agent’s social traits and unique tastes using real-world datasets (e.g., MovieLens, Steam, and Amazon-Book). It records activity levels, conformity (alignment with average user ratings), and diversity of preferences, providing a foundation for personalizing recommendations.
   
   - **Memory Module**: 
     The memory module stores both **factual memories** (such as previous interactions with the system) and **emotional memories** (user feelings like satisfaction or fatigue). It supports memory retrieval, writing, and reflection, enabling agents to recall past experiences and use them in decision-making.

   - **Action Module**: 
     This governs the agents' behaviors in the recommendation environment. It includes **taste-driven actions** (e.g., viewing, rating, or skipping recommended items) and **emotion-driven actions** (e.g., deciding to exit the system, rating the system itself, or providing feedback through interviews). These actions are influenced by both the factual and emotional states stored in the memory module.

2. **Recommendation Environment**:
   This component simulates real-world recommendation scenarios. The environment includes:
   
   - **Item Profiles**: Each item (such as a movie) is characterized by its quality, popularity, genre, and a summary, providing context for the recommendations.
   
   - **Page-by-Page Recommendation System**: The environment mimics platforms like Netflix or YouTube by displaying recommendations page by page. Agents interact with these recommendations in real-time, allowing for feedback-driven refinement of their preferences.
   
   - **Recommendation Algorithms**: The system uses pre-implemented collaborative filtering-based models like **Matrix Factorization (MF)**, **LightGCN**, and **MultVAE** to generate recommendations, which are tailored to each agent’s preferences and actions. The environment is designed to be extensible, allowing for the integration of other algorithms.

### Interaction Between Components:
- **Agent-Environment Interaction**: The generative agents interact with the recommendation system by viewing, rating, or skipping items based on their personal preferences, social traits, and memories. They also provide feedback, which influences future recommendations.
  
- **Feedback Loop**: The agents' actions, such as their ratings and choices to view or skip content, generate feedback that is used to retrain the recommendation algorithms. This feedback-driven interaction allows for iterative refinement, improving the system’s ability to tailor recommendations to the agents over time.

- **Emotion and Memory Reflection**: Emotional states (like satisfaction or fatigue) and factual interactions (e.g., items watched or skipped) feed into the agents’ memory module. As agents encounter new recommendations, they retrieve relevant memories to guide their decision-making, ensuring a coherent simulation of real user behavior.

In summary, the modeling paradigm of Agent4Rec integrates **LLM-powered generative agents** that simulate human cognitive and emotional processes with a **recommendation environment** that generates and refines personalized recommendations based on real-time agent feedback.'''
   },
   "SpeedyFeed.pdf": {
      "adaption_gen": '''The training steps taken to adapt the base models in the proposed SpeedyFeed method include:

1. **Input Processing**: The input data (users’ news history and candidate news) is gathered and tokenized. News articles are truncated or padded into sequences of a unified length. This step formats the raw data to be loaded for training .
  
2. **Cache Acceleration**: A caching mechanism is used to avoid redundant encoding of frequently clicked news. Cached embeddings are reused, while non-cached news articles are encoded using BusLM (Bus Language Modeling)  [oai_citation:2,SpeedyFeed.pdf](file-service://file-0Fbh72UuvQPi1io0TG5bpAgM).

3. **Autoregressive User Modeling**: The user history is encoded more efficiently by treating all historical news clicks as one training instance. This avoids the need to repeatedly re-encode overlapping portions of the user history [oai_citation:1,SpeedyFeed.pdf](file-service://file-0Fbh72UuvQPi1io0TG5bpAgM) .

4. **Bus Language Modeling (BusLM)**: News articles are divided into segments, such as title, abstract, and body, to reduce encoding complexity. This allows transformers to process shorter segments while still enabling cross-segment communication via proxy embeddings .

These methods aim to increase efficiency by reusing data, minimizing unnecessary encodings, and improving the utilization of historical user behavior during training .''',
      "base_model_gen": '''The base models adapted in the proposed SpeedyFeed method are **Pretrained Language Models (PLMs)**, specifically **BERT** and **RoBERTa**. These PLMs are utilized for encoding news articles due to their ability to capture deep semantic information from text. Additionally, **UniLM** (Unified Language Model) is mentioned as a key PLM used in the system. These models are adapted to efficiently handle large-scale news recommendations through techniques like caching, autoregressive user modeling, and Bus Language Modeling (BusLM).''',
      "task_gen": '''The proposed method, SpeedyFeed, is evaluated on the task of **news recommendation**. Specifically, it is tested on large-scale datasets, including production data from **Microsoft News** and a public dataset called **MIND**. The evaluation focuses on how well the model can predict users' future news preferences based on their historical news clicks. The method's performance is assessed using standard ranking metrics like **AUC**, **MRR**, and **NDCG**, as well as **Recall@K** for measuring its ability to retrieve relevant news from a large index.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method, **SpeedyFeed**, is designed for efficient training of **Pretrained Language Models (PLMs)** in large-scale news recommendation systems. It consists of several key components that interact to improve both the efficiency and quality of the model:

1. **Centralized News Encoding**: 
   - **Description**: All news articles in a mini-batch are gathered into a merged set for joint encoding. This removes duplicated and padded articles, avoiding unnecessary computation.
   - **Interaction**: It reduces the redundant encoding of repeated news articles and enhances data efficiency.

2. **Cache Acceleration**:
   - **Description**: A caching mechanism stores freshly generated news embeddings for reuse across multiple training steps, particularly for popular news articles that appear frequently in users’ histories.
   - **Interaction**: When news articles are re-encountered during training, their embeddings are retrieved from the cache instead of being re-encoded, reducing computational cost.

3. **Bus Language Modeling (BusLM)**:
   - **Description**: News articles are divided into smaller segments (e.g., title, abstract, body) to reduce encoding complexity. A "bus" connection is established between the segments, allowing information to be exchanged between them through a proxy embedding (often the [CLS] token).
   - **Interaction**: This technique allows segments to be processed more efficiently while maintaining the quality of the final news embeddings by preserving cross-segment context.

4. **Autoregressive User Modeling**:
   - **Description**: Instead of encoding the user’s history for each individual prediction, this method reuses previously encoded parts of the user’s history to predict subsequent clicks. It treats the entire user history as a unified training instance and progressively predicts future clicks.
   - **Interaction**: By reusing user history embeddings, the system reduces the need for redundant encoding, speeding up the training process and improving efficiency.

5. **Content Refinement**:
   - **Description**: This component refines news content by removing non-informative parts and compactly representing the important content using a method called **Ordered Bag-of-Words (OBoW)**.
   - **Interaction**: It ensures that only the most relevant and informative parts of a news article are passed to the encoder, further reducing the complexity and improving training quality.

Together, these components form a pipeline where news and user data are efficiently processed through **centralized encoding**, cached for reuse, and further optimized using **BusLM** for high-quality embeddings. **Autoregressive user modeling** ensures that the user history is encoded only once and reused, while **content refinement** guarantees the input is as compact as possible without losing critical information. This interaction of components leads to high-speed training without sacrificing the quality of recommendations.'''
   },
   "RankGPT.pdf": {
      "adaption_gen": '''The proposed method adapts the base models through several key training steps:

1. **Instructional Permutation Generation**: The base large language model (LLM) is instructed to generate a permutation of passages based on their relevance to a query. This approach directly outputs a ranked list without intermediate relevance scores.

2. **Sliding Window Strategy**: To handle the token limitation of LLMs, a sliding window strategy is introduced. The model re-ranks passages by sliding through them in smaller groups, ensuring all passages are ranked.

3. **Permutation Distillation**: To adapt the base model, a permutation distillation process is employed. This involves distilling the ranking ability of the large model (such as ChatGPT) into a smaller specialized model by using ChatGPT-generated permutations as targets. This is achieved through a RankNet-based distillation objective, which minimizes the difference between the rankings generated by ChatGPT and those by the smaller model.

4. **Training Objective**: The training objective is to optimize the student model's ranking performance using the RankNet loss function, which focuses on the correct relative ordering of passages.

These steps collectively aim to transfer the ranking capability of large models to more efficient, smaller models.''',
      "base_model_gen": '''The proposed method adapts the following base models:

1. **DeBERTa-large**: A BERT-like model used as a cross-encoder. It concatenates the query and passage, and the relevance is estimated using the representation of the [CLS] token.
   
2. **LLaMA-7B**: A GPT-like model that uses a zero-shot relevance generation instruction. It classifies the query and passage as relevant or irrelevant by generating a relevance token, with the relevance score defined by the generation probability of this token.''',
      "task_gen": '''The proposed method is evaluated on the following tasks:

1. **Passage Re-Ranking**: The method is tested on several passage re-ranking benchmarks, specifically:
   - **TREC-DL (2019 and 2020)**: A well-established benchmark dataset for document ranking.
   - **BEIR (Benchmarking Information Retrieval)**: Includes diverse retrieval tasks from various domains such as scientific articles (Covid), biomedical information retrieval (NFCorpus), argument retrieval (Touche), entity retrieval (DBPedia), claim verification (SciFact), news retrieval (News), and others.
   - **Mr.TyDi**: A multilingual benchmark for passage retrieval across low-resource languages such as Arabic, Bengali, Finnish, Indonesian, Japanese, Korean, Russian, Swahili, Telugu, and Thai.
   
2. **NovelEval**: A newly constructed test set with 21 novel questions covering various domains such as sports, technology, movies, and other topics to evaluate the model's performance on previously unseen knowledge.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method consists of several components that work together to adapt large language models (LLMs) for efficient passage re-ranking. The key components and their interactions are as follows:

### 1. **Instructional Permutation Generation**
   - **Description**: This approach uses a listwise method where the LLM is instructed to generate a permutation of passages (ranked by relevance to a query). Instead of assigning a score to each passage, the LLM directly outputs a ranked list using passage identifiers.
   - **Interaction**: The LLM (e.g., ChatGPT or GPT-4) processes a group of passages identified by unique labels (e.g., [1], [2]), and ranks them in descending order of relevance to a specific query. This process bypasses intermediate steps such as generating relevance scores.

### 2. **Sliding Window Strategy**
   - **Description**: To overcome the token limitations of LLMs, the method employs a sliding window approach. The LLM can only process a limited number of passages at once, so passages are divided into smaller groups (windows) and ranked progressively.
   - **Interaction**: Passages are re-ranked in batches, and as the window slides, the rankings from the previous batch are used to inform the ranking of the next set of passages. This ensures all passages are ranked despite the token limit.

### 3. **Permutation Distillation**
   - **Description**: The ranking ability of the large LLM (such as ChatGPT) is distilled into a smaller, specialized model through a process called permutation distillation. ChatGPT-generated passage rankings serve as the training targets for the smaller model.
   - **Interaction**: The smaller model is trained to replicate the ranking performance of the LLM by minimizing the differences in their outputs using a **RankNet loss**. This enables the smaller model to efficiently approximate the LLM's ranking capabilities, reducing computational cost.

### 4. **Training Objective (RankNet Loss)**
   - **Description**: The **RankNet loss** function is used to train the specialized model. This loss optimizes the model by focusing on the correct relative ordering of passages (i.e., the permutation), rather than individual relevance scores.
   - **Interaction**: The RankNet loss ensures that the order of the passages predicted by the smaller model closely matches the permutation generated by the LLM. The training leverages many passage pairs to fine-tune the model to match ChatGPT's ranking behavior.

### Component Interaction Summary:
   - The LLM generates permutations of passages based on their relevance to a query. Due to token limitations, a sliding window approach ensures all passages are ranked. The generated rankings are used as training data to distill the ranking ability into a smaller, specialized model, which is then trained using the RankNet loss. This combination of instructional permutation generation, sliding windows, and distillation allows the method to balance accuracy with efficiency.

By integrating these components, the proposed method leverages the powerful ranking capabilities of LLMs while distilling them into smaller, cost-efficient models that maintain competitive performance.'''
   },
}

sys_review_chatgpt = {
   "ReprBERT.pdf": {
      "adaption_gen": '''The training steps to adapt the base models in the proposed method, ReprBERT, involve the following key processes:

1. **Teacher Model Construction**: A 12-layer StructBERT model is used as the teacher, which has been pre-trained on general domain data and further continue-trained on a large, in-domain e-commerce dataset. This fine-tuning with e-commerce data helps the model perform better on the relevance task.

2. **Student Model Initialization**: The ReprBERT model, acting as the student, is initialized using the parameters from the first several layers of the teacher model to capture general knowledge. The method of initializing ReprBERT with jump layers (e.g., 1, 3, 5 layers) from the teacher model was also explored, but the results were not better.

3. **Knowledge Distillation**: The teacher model is used to annotate a large-scale unlabeled dataset to generate "soft labels." The student model is trained to minimize the cross-entropy loss between its predictions and the teacher’s soft labels. This step ensures that ReprBERT learns from the outputs of the more complex teacher model.

4. **Finetuning**: After the knowledge distillation, ReprBERT is fine-tuned on a human-annotated dataset using both soft and hard labels. The loss function combines the cross-entropy loss between the model's output and the teacher's soft labels and between the model's output and the actual hard labels. This finetuning process further improves the model’s performance for the specific e-commerce relevance task.

These steps help adapt the base BERT model into an efficient, representation-based ReprBERT model suitable for online deployment in e-commerce search engines.''',
      "base_model_gen": '''The base models adapted in the proposed method, ReprBERT, are:

1. **BERT**: A pre-trained deep bidirectional transformer model widely used for various natural language processing (NLP) tasks. BERT serves as the foundational architecture for both the teacher and student models.

2. **StructBERT**: A variant of BERT used as the teacher model in ReprBERT. StructBERT incorporates additional language structure information during pre-training and continues training on a large-scale e-commerce dataset. This model is more fine-tuned for e-commerce relevance tasks and serves as the source for knowledge distillation to the ReprBERT student model.

These base models are adapted through the use of knowledge distillation, where StructBERT (the teacher) transfers knowledge to the student ReprBERT model.''',
      "task_gen": '''The proposed method, ReprBERT, is evaluated on the following tasks:

1. **E-commerce Relevance Task**: The primary evaluation is on measuring the semantic relevance between queries and products in e-commerce search engines. This involves determining whether the retrieved products match the user's search intent based on the input query.

2. **Binary Classification Task**: Since the relevance data is binary (Good or Bad), the task is framed as a binary classification problem, where the goal is to predict the correct category (relevant or irrelevant) for each query-product pair.

The method is evaluated both offline using metrics like ROC-AUC and Neg PR-AUC, and online through A/B testing on the Taobao search engine to assess real-world performance improvements in relevance and user engagement.''',
      "paradigm_gen": '''The proposed method, **ReprBERT**, uses a **representation-based modeling paradigm** with several components designed to balance efficiency and performance for e-commerce relevance tasks. Here's a detailed outline of its components and how they interact:

1. **Base Encoder (BERT)**: 
   - ReprBERT uses a **BERT-based encoder** to encode queries and product titles into embeddings (representations). BERT's multi-layer architecture allows for rich context encoding.
   - The base model is adapted by using **StructBERT** as a teacher, a variant of BERT that is pre-trained and continue-trained on e-commerce data, improving relevance performance.

2. **Knowledge Distillation**:
   - **StructBERT** serves as the teacher model, which is trained on general and e-commerce-specific datasets. The knowledge distillation process transfers knowledge from this powerful teacher model to ReprBERT (the student).
   - ReprBERT is trained to mimic StructBERT’s outputs by minimizing the loss between the teacher’s "soft labels" (predicted probabilities) and the student’s outputs, as well as between the student’s outputs and the true labels. This step allows ReprBERT to achieve high performance while being more efficient.

3. **Siamese Architecture**:
   - ReprBERT adopts a **Siamese architecture**, where two identical BERT encoders are used to separately encode the query and product title into embeddings. This allows the embeddings of queries and products to be pre-computed offline, making the model efficient for real-time applications.

4. **Context-Guided Attention**:
   - A **context-guided attention mechanism** is introduced to improve the quality of the embeddings. This mechanism captures interactions between the query and product tokens, allowing ReprBERT to model relationships between words at a finer granularity than simple pooling techniques.

5. **Late Interaction Module**:
   - To address the limitation of representation-based models, which lack interaction between query and product embeddings, ReprBERT introduces a **late interaction module**. After the embeddings are generated, **summation, subtraction, and max pooling** operations are applied to the query and product embeddings to capture fine-grained matching information.
   - This allows for richer interaction without sacrificing the efficiency of the Siamese architecture.

6. **Intermediate Interaction Module**:
   - To further close the performance gap with interaction-based models like BERT, ReprBERT includes an **intermediate interaction strategy**. It performs interactions at multiple intermediate layers, allowing representations from different layers of the query and product to interact with each other.
   - These interactions are pooled to form a final representation that better captures the nuanced relationships between the query and product at different semantic levels.

7. **Final Relevance Prediction**:
   - After the embeddings go through late and intermediate interactions, the resulting representations are passed through a **multi-layer perceptron (MLP)** with a residual connection to predict the relevance score.
   - The relevance score is generated as a binary classification (relevant or irrelevant) using a softmax function, indicating whether the product matches the query.

**Interaction Between Components**:
- The **BERT encoder** produces initial embeddings for the query and product. 
- The **context-guided attention** enhances these embeddings by attending to token-level interactions.
- The **late interaction module** captures interaction between the final query and product embeddings, while the **intermediate interaction module** captures interactions between the intermediate layers’ representations.
- The final, refined representations from both interaction modules are passed through the **MLP** for relevance scoring.

This architecture achieves a balance of efficiency (with pre-computed embeddings) and performance (by incorporating interaction mechanisms), making it suitable for large-scale e-commerce search engines with real-time constraints.'''
   },
   "MESE.pdf": {
      "adaption_gen": '''In the proposed method, the training process involves several key steps to adapt the base models, including the pre-trained language model (PLM) and the item metadata encoder, for both recommendation and response generation tasks. These steps are as follows:

1. **Joint Training of Item Encoder and PLM**: The item encoder and the PLM are trained together to extract useful item metadata and align it with the dialog context. The item encoder learns to map item metadata to embeddings that can be integrated into the dialog context, and the PLM consumes these embeddings to generate recommendations and responses.

2. **Candidate Selection Training**: During training, the system selects candidate items from the database by calculating similarity scores between the dialog context and the item metadata. A special token, "[REC]," is used to indicate the recommendation process, and the model is trained to predict the ground truth item over negative examples using cross-entropy loss.

3. **Candidate Ranking Training**: In this step, after candidate selection, the system performs more fine-grained scoring using the self-attention mechanism of the PLM. It recomputes similarity scores between the dialog context and the candidate items, further ranking them to reflect the user’s preferences. This process also uses cross-entropy loss.

4. **Response Generation Training**: The PLM is trained to reconstruct system utterances by using both the dialog context and the metadata of the recommended items. This involves generating responses with placeholders for the recommended items, which are later replaced with the actual item titles. The loss function for this step is optimized to improve the accuracy of generating relevant responses.

5. **Loss Function Optimization**: A combined loss function is used to jointly train the system on recommendation and response generation tasks. This involves optimizing three objectives: candidate selection loss, candidate ranking loss, and response generation loss, with specific weights assigned to each.

These training steps adapt the base models to perform well on both recommendation and response generation by integrating item metadata and dialog context effectively.''',
      "base_model_gen": '''The proposed method adapts two primary base models:

1. **Pre-trained Language Model (PLM)**: Specifically, the method uses **GPT-2** as the backbone for dialog generation. The PLM is adapted to integrate item metadata with the conversational context using self-attention mechanisms to generate recommendations and responses.

2. **DistilBERT**: This model is used as the foundation for the **item metadata encoder**, which encodes item metadata (such as titles, genres, and plots) into embeddings. These embeddings are then used alongside the dialog context in the PLM for recommendation and response generation tasks.

Both of these base models—**GPT-2** and **DistilBERT**—are adapted and jointly trained to align item metadata with the dialog context for improved conversational recommendations and response generation.''',
      "task_gen": '''The proposed method is evaluated on two main tasks:

1. **Recommendation Task**: This task involves recommending items (e.g., movies) based on the dialog context. The system is evaluated on metrics such as Recall@X (R@1, R@10, R@50) to assess whether the recommended items include the ground truth items provided by human recommenders.

2. **Response Generation Task**: This task focuses on generating coherent and contextually appropriate responses in conversation. The method is evaluated using language metrics such as perplexity (PPL), distinct n-grams (Dist2, Dist3, Dist4), and BLEU scores (BLEU2, BLEU4) to measure the fluency and diversity of the generated responses.

Additionally, human evaluation is conducted to assess the fluency and informativeness of the responses. The model is compared against baselines on both recommendation and response generation tasks.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method, called **MESE** (Metadata Enhanced learning approach via Semantic Extraction), integrates both item metadata and dialog context to improve conversational recommendation systems (CRS). It consists of two primary components: a **pre-trained language model (PLM)** and an **item metadata encoder**. These components work together to provide high-quality recommendations and generate coherent responses during conversations. Here’s how they interact:

### 1. **Item Metadata Encoder**:
   - **DistilBERT** is used as the core of the item metadata encoder. It takes item metadata (e.g., movie title, genre, actors, plot) as input and encodes this information into a vector embedding.
   - The metadata is concatenated using a special separator token "[SEP]" before being passed through the encoder. 
   - The encoder outputs an **item token embedding (ITE)**, which captures rich information about the item, making it context-aware.

### 2. **Pre-trained Language Model (PLM)**:
   - **GPT-2** is the PLM used for both recommendation and response generation tasks.
   - The PLM processes the dialog context as a sequence of word token embeddings (WTE).
   - The ITE produced by the item encoder is concatenated with the WTE from the dialog context to form a unified sequence that incorporates both dialog and item information. This sequence is then consumed by the PLM’s self-attention mechanism.

### 3. **Interaction Between Components**:
   - The **ITE** from the item encoder and the **WTE** from the dialog context are combined and fed into the PLM. This interaction allows the model to align and integrate item metadata with the conversational flow seamlessly.
   - The PLM uses self-attention to jointly process the dialog context and item information, ensuring that item recommendations are contextually relevant and the generated responses reflect the user’s needs.

### 4. **Recommendation Process**:
   - The recommendation task follows a two-phase process: **candidate selection** and **candidate ranking**.
     1. **Candidate Selection**: The entire item database is first narrowed down to a few hundred candidates based on the similarity between the dialog context and item metadata.
     2. **Candidate Ranking**: The PLM recomputes finer-grained similarity scores between the dialog context and the candidate items to rank them in accordance with the user’s preferences.

### 5. **Response Generation**:
   - The PLM is also responsible for generating natural language responses based on the dialog history and recommended items. 
   - When an item is recommended, a placeholder token "[PH]" is used during generation, which is replaced with the actual item name in the final response.

### Summary of Interaction:
- The item metadata encoder produces rich, context-aware embeddings for the items.
- These embeddings are combined with the dialog context and fed into the PLM.
- The PLM, with its self-attention mechanism, integrates this information to simultaneously handle both recommendation and response generation tasks.
- The interaction between the two components ensures that recommendations are contextually aligned with the dialog and that the responses naturally incorporate the recommended items.

This paradigm allows the model to handle both dynamic, real-time recommendations and high-quality conversational response generation in an integrated, seamless manner.'''
   },
   "ChatGPT.pdf":{
      "adaption_gen": '''In the proposed method, no fine-tuning of ChatGPT is performed on any specific recommendation data. Instead, the approach relies solely on task-specific prompts to adapt ChatGPT for recommendation tasks. These prompts include three key components: task description, behavior injection, and format indicators. Behavior injection helps ChatGPT understand user preferences through few-shot prompting, where user-item interaction data is introduced to enhance ChatGPT's ability to capture user interests. Additionally, the model's output is refined using predefined rules to ensure it matches the required format. Thus, the adaptation process involves prompt engineering and output refinement, not traditional model training or fine-tuning.''',
      "base_model_gen": '''The base model adapted in the proposed method is **ChatGPT** (specifically, the GPT-3.5-turbo version). No other traditional base models are fine-tuned or used directly in the proposed method. Instead, ChatGPT is utilized for various recommendation tasks such as rating prediction, sequential recommendation, direct recommendation, explanation generation, and review summarization. While other models like P5, MF (Matrix Factorization), MLP (Multilayer Perceptron), and BERT are mentioned as baselines or in comparison, the main focus of the proposed method is adapting **ChatGPT** through prompt engineering without fine-tuning.''',
      "task_gen": '''The proposed method is evaluated on the following five tasks:

1. **Rating Prediction** – Predicting how a user will rate a particular item.
2. **Sequential Recommendation** – Predicting the next item a user is likely to interact with based on their interaction history.
3. **Direct Recommendation** – Recommending the most relevant item from a pool of potential candidates based on explicit feedback.
4. **Explanation Generation** – Generating explanations for why a particular item is recommended to a user.
5. **Review Summarization** – Summarizing user reviews into concise, meaningful sentences.

These tasks are used to evaluate ChatGPT's performance in comparison to traditional recommendation models.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method uses **ChatGPT** as a general-purpose recommendation system without any fine-tuning on recommendation-specific data. Instead, it relies entirely on **prompt engineering** to adapt ChatGPT for various recommendation tasks. The paradigm consists of the following key components:

1. **Task-Specific Prompt Construction**: Each recommendation task (rating prediction, sequential recommendation, direct recommendation, explanation generation, and review summarization) is adapted into a natural language processing task using carefully designed prompts. Each prompt has three parts:
   - **Task Description**: Converts the recommendation task into a natural language task.
   - **Behavior Injection**: Introduces user interaction history or few-shot learning information to help ChatGPT better capture user preferences and interests.
   - **Format Indicator**: Specifies the required format of the output to ensure the response is useful and consistent with expectations (e.g., output as a list or a rating).

2. **Interaction Between Components**: 
   - **Task-specific prompts** are fed into ChatGPT to generate recommendation outputs. The behavior injection component in the prompt allows the model to integrate user-item interaction data, helping ChatGPT understand users' preferences without requiring traditional model training.
   - **Output Refinement Module**: After ChatGPT generates its responses, an output refinement module checks the format of the responses. If the output does not match the expected format, it is corrected based on predefined rules. If the correction is unsuccessful, the prompt is re-fed into ChatGPT for re-generation until the format requirements are met.

This interaction between the task-specific prompts, ChatGPT’s natural language processing capabilities, and the output refinement module allows ChatGPT to function as a recommendation system, generating both accuracy-oriented results (e.g., rating prediction, direct recommendation) and explainability-oriented results (e.g., explanation generation, review summarization).'''
   },
   "LLM-Rate.pdf": {
      "adaption_gen": '''The proposed method adapts the base models, specifically the Flan-T5 models, through fine-tuning with human interaction data. The key training steps include:

1. **Formulation of the Task**: The rating prediction task is framed as either a multi-class classification or regression problem. In the multi-class classification setup, ratings are treated as discrete classes, and a cross-entropy loss is applied to train the model. For the regression task, a mean-squared-error (MSE) loss is used to optimize the model for continuous rating predictions.

2. **Fine-tuning Configuration**: The models are fine-tuned with specific hyperparameters, including a learning rate of 5e-5, batch size of 64, and a dropout rate of 0.1. They are trained for 50,000 steps across all datasets.

3. **Loss Function Selection**: It is observed that fine-tuning with a regression loss performs better than fine-tuning with a classification loss in most cases, indicating the importance of selecting the appropriate optimization objective for the task.

By following these steps, the base models are adapted for more accurate rating predictions through efficient fine-tuning on a small fraction of data.''',
      "base_model_gen": '''In the proposed method, the base models that are adapted are the **Flan-T5** models, specifically the **Flan-T5-Base** (250M parameters) and **Flan-T5-XXL** (11B parameters). These models are fine-tuned for the user rating prediction task using either a multi-class classification or regression approach.''',
      "task_gen": '''The proposed method is evaluated on the task of **user rating prediction**. This involves predicting a user's rating for a candidate item based on their past interactions. Specifically, the method is tested in three different settings:

1. **Zero-shot** prediction.
2. **Few-shot** prediction.
3. **Fine-tuning** on human interaction data.

The evaluation is conducted on two widely adopted benchmark datasets for recommendation systems:

- **MovieLens-1M**, which includes 1 million user ratings for movies.
- **Amazon-Books**, which contains user ratings on books from the Amazon Review Dataset.''',
      "paradigm_gen": '''The proposed method follows a modeling paradigm that adapts **Large Language Models (LLMs)**, specifically the **Flan-T5** models, to the task of **user rating prediction**. This paradigm consists of several components and their interactions, as outlined below:

1. **Input Features and Representation**:
   - The model takes a user’s historical interaction sequence, which includes information such as item IDs, titles, metadata (e.g., genre, brand), and past ratings.
   - These features are represented as text-based input prompts, which are fed into the LLM. The model processes both the user’s interaction history and the candidate item’s metadata as part of the input.

2. **Task Formulation**:
   - The rating prediction task is formulated in two ways:
     1. **Multi-class Classification**: Ratings (1 to 5) are treated as discrete classes. The LLM predicts the most likely class for the given input using a cross-entropy loss.
     2. **Regression**: The model predicts a continuous rating by modifying the projection layer to output a single value. The mean-squared-error (MSE) loss is used for optimization in this setting.

3. **Zero-shot and Few-shot Learning**:
   - In zero-shot settings, the LLM uses pre-existing world knowledge to predict ratings without any fine-tuning, solely based on the prompt structure.
   - In few-shot settings, a small number of labeled examples are included in the prompt to help the model better understand the task.

4. **Fine-tuning on Human Interaction Data**:
   - The LLMs are fine-tuned on datasets containing user interaction data. This process involves optimizing the model with the target dataset’s specific rating distribution, improving the LLM's ability to understand user preferences.
   - Fine-tuning is crucial for improving performance over zero-shot and few-shot approaches, particularly in dataset-specific contexts.

5. **Interaction Between Components**:
   - The input prompts encapsulate both the user’s historical data and item features in a textual format. The LLM processes these inputs to predict a rating using either classification or regression approaches.
   - The fine-tuning process leverages a small amount of interaction data to further adapt the LLM to the specific task, improving its accuracy by allowing it to learn from real-world user preferences.

In summary, the proposed method leverages LLMs' ability to process natural language representations of user interaction data and adapts them through fine-tuning for more accurate user rating predictions, with tasks formulated either as classification or regression.'''
   },
   "GIRL.pdf": {
      "adaption_gen": '''The proposed method takes three main training steps to adapt the base models:

1. **Supervised Fine-Tuning (SFT):** This step involves training the Large Language Model (LLM)-based generator to generate appropriate job descriptions (JDs) based on the curriculum vitae (CV) of a job seeker. The training is conducted using matched CV-JD pairs, where the model is fine-tuned with a manually designed prompt template to understand the task of job recommendation.

2. **Reward Model Training (RMT):** The reward model is trained to predict the matching score between CV-JD pairs by distinguishing between matched and mismatched pairs. This is achieved using recruiter feedback to create training data, and the reward model is optimized using a pairwise ranking loss.

3. **Reinforcement Learning from Recruiter Feedback (RLRF):** Proximal Policy Optimization (PPO)-based reinforcement learning is used to further align the LLM generator with recruiter preferences. The generator model is fine-tuned to ensure that it not only considers job seekers' preferences but also captures practical market demands. The reward model, developed in the previous step, provides feedback to guide this fine-tuning process.''',
      "base_model_gen": '''The proposed method adapts two types of base models:

1. **BERT:** This model is used as a text encoder to extract embeddings for both the job seeker's CV and the job description (JD) in the discriminative recommendation tasks. BERT helps compute matching scores by transforming the text into vector representations.

2. **GPT-based Large Language Models (LLMs):** These generative models are fine-tuned to generate personalized job descriptions (JDs) for job seekers based on their CVs. The GPT architecture serves as the foundation for generating text, and it is adapted using Supervised Fine-Tuning (SFT), Reward Model Training (RMT), and Reinforcement Learning (RL) from recruiter feedback to better align with the preferences of both job seekers and recruiters.

These base models (BERT for discriminative tasks and GPT-based LLMs for generative tasks) are adapted to improve both job description generation and recommendation quality.''',
      "task_gen": '''The proposed method is evaluated on two main tasks:

1. **Job Description Generation Quality:** The method is evaluated on its ability to generate high-quality job descriptions (JDs) based on a job seeker's curriculum vitae (CV). This is assessed through several metrics, such as the level of detail, relevance to the job seeker's profile, and conciseness of the generated JDs. The quality is compared against baseline models to determine if the generated results are effective.

2. **Enhanced Discriminative Job Recommendation:** The method is also evaluated on its ability to enhance the performance of traditional discriminative job recommendation models. The generated JDs are used to improve the matching between job seekers and job descriptions, and the evaluation metrics include AUC and LogLoss to measure the effectiveness of these enhancements. Additionally, the method's performance is tested under cold-start conditions, where it recommends jobs for job seekers who were not part of the training data.

These evaluations demonstrate the method's ability to generate relevant job descriptions and improve job recommendations.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method consists of two primary components: **Generative Job Recommendation** and **Generation-Enhanced Job Recommendation.** These components interact to provide personalized job recommendations based on Large Language Models (LLMs). Here’s how the method is structured:

### 1. **Generative Job Recommendation:**
   - **Goal:** The aim is to generate personalized job descriptions (JDs) for job seekers based on their curriculum vitae (CV). This differs from traditional recommendation systems that retrieve and rank existing jobs. Instead, this component generates new JDs that align with the job seeker’s profile.
   - **LLM-based Generator:** At the core is a generative model (based on LLMs like GPT) that is fine-tuned to understand job recommendation tasks using Supervised Fine-Tuning (SFT). The model generates a job description that is most suitable for the given CV of a job seeker.
   - **Training Process:**
     1. **Supervised Fine-Tuning (SFT):** The model is trained on matched CV-JD pairs to generate appropriate JDs for specific job seekers.
     2. **Reward Model Training (RMT):** A reward model is trained to assess the quality of the generated JDs by comparing matched and mismatched CV-JD pairs, using feedback from recruiters.
     3. **Reinforcement Learning from Recruiter Feedback (RLRF):** Proximal Policy Optimization (PPO)-based reinforcement learning further aligns the generator with recruiter preferences, fine-tuning it to generate JDs that better fit market demands.

### 2. **Generation-Enhanced Job Recommendation:**
   - **Goal:** This component aims to enhance the performance of traditional job recommendation models by using the generated JDs from the first component to bridge the semantic gap between CVs and existing JDs.
   - **How It Works:**
     - The generated JDs are treated as additional features for traditional discriminative recommendation models. 
     - In the enhanced recommendation, the generated JD is embedded using a text encoder (such as BERT), and its representation is used along with the original CV and JD embeddings to improve the matching process.
     - Two methods are proposed:
       1. **MLP Predictor:** It concatenates the embeddings of the CV, original JD, and generated JD to predict the matching score.
       2. **Dot Predictor:** It enhances the job seeker’s embedding by combining the CV with the generated JD and then calculates the matching score with the job description.

### Interaction Between Components:
   - **Generated JDs:** The job descriptions generated by the LLM in the first component serve as an additional input for the second component. These generated JDs help improve the overall accuracy of job recommendations by offering richer semantic information about the job seeker.
   - **Enhancing Discriminative Models:** By integrating the generated JDs, the traditional discriminative job recommendation models benefit from more personalized and accurate matching between job seekers and job descriptions. This interaction enables the system to improve job recommendations and address challenges like cold-start conditions.

### Summary:
The proposed method combines a **generative model** to create personalized job descriptions and a **discriminative model** to improve job recommendation performance. These components interact through the generation of job descriptions, which are then used to enhance the discriminative models by providing richer, more tailored information for matching job seekers with suitable jobs.'''
   },
   "Agent4Rec.pdf": {
      "adaption_gen": '''The training steps taken to adapt the base models in the proposed method, Agent4Rec, involve several key processes designed to emulate and refine user behavior in recommendation environments:

1. **Initialization**: Generative agents are initialized using real-world datasets (such as MovieLens-1M, Steam, and Amazon-Book) that reflect users' personalized preferences and social traits. These datasets form the foundation for agent profiles, which consist of unique tastes and social traits like activity, conformity, and diversity.

2. **Feedback-driven Retraining**: After the recommendation simulation is completed, both viewed and unviewed movies from the simulation are added as signals to the training set of each agent. The recommendation algorithms are then retrained using these augmented datasets, aiming to better capture user preferences.

3. **Iterative Refinement**: The recommendation system iteratively retrains after each simulation round. Agents provide feedback based on their satisfaction and preferences, and the system adjusts its recommendations accordingly. This iterative process is particularly evident in experiments like the page-by-page recommendation enhancement, where retrained models incorporate agent interactions to improve recommendation accuracy.

4. **Memory Modules and Reflection**: Agents store both factual and emotional memories of their interactions with the recommendation system. These memories influence subsequent behaviors and decisions, contributing to the agents' ability to reflect on satisfaction levels and preferences, which in turn help refine the recommendation models.

Through these steps, the base models are adapted to align better with user preferences, enabling the agents to more accurately simulate human-like behaviors in recommendation tasks.''',
      "base_model_gen": '''The base models adapted in the proposed method, **Agent4Rec**, are primarily **collaborative filtering-based recommendation algorithms**. Specifically, the following models are adapted within the Agent4Rec framework:

1. **Matrix Factorization (MF)**: This is a widely-used collaborative filtering model that decomposes the interaction matrix into lower-dimensional representations to predict user-item interactions.
  
2. **LightGCN**: A simplified version of Graph Convolutional Networks (GCNs) tailored for recommendation tasks, focusing on leveraging user-item interactions through graph structures.

3. **MultVAE**: A variational autoencoder-based collaborative filtering model that captures latent factors of users and items, aiming to enhance the diversity and quality of recommendations.

4. **Random and Most Popular**: These are baseline models used for comparison in the simulation, with random recommendations and recommendations based on the most popular items in the dataset.

These base models are integrated into the recommendation environment and adapted through the iterative simulation and feedback-driven retraining process using agent interactions and preferences. The system's architecture allows for flexibility, enabling researchers to extend the simulation to other recommendation algorithms as needed.''',
      "task_gen": '''The proposed method, **Agent4Rec**, is evaluated on several tasks related to user simulation and recommendation system performance. These tasks aim to assess both the accuracy of agent behavior in simulating real users and the effectiveness of different recommendation strategies. The key tasks are:

1. **User Preference Alignment**:  
   This task evaluates how well the generative agents align with real users’ preferences. Specifically, the agents are tested on their ability to distinguish between items that real users have interacted with and those they haven't. The alignment is measured using metrics like **accuracy, precision, recall,** and **F1 score** across various real-world datasets (MovieLens-1M, Steam, Amazon-Book).

2. **Rating Distribution Alignment**:  
   This task assesses whether the agents can replicate the overall distribution of user ratings. The goal is to compare the agents' simulated rating distributions with the actual rating distributions from the MovieLens-1M dataset. The alignment is evaluated by analyzing how closely the agents' ratings match the real-world data.

3. **Social Traits Alignment**:  
   This task evaluates how well the agents' behavior reflects their social traits, such as **activity**, **conformity**, and **diversity**. The agents are grouped based on these traits, and their interaction patterns are analyzed to ensure they exhibit differentiated behaviors in line with the expected traits.

4. **Recommendation Strategy Evaluation**:  
   This task compares the effectiveness of different recommendation algorithms, such as **Matrix Factorization (MF)**, **LightGCN**, **MultVAE**, and baseline methods (e.g., random, most popular). The agents' satisfaction, viewing ratios, and exit behaviors are used to evaluate how well each algorithm performs in terms of providing personalized recommendations.

5. **Page-by-Page Recommendation Enhancement**:  
   In this task, the system evaluates the ability of recommendation algorithms to improve over time through feedback. Agents' interactions (both viewed and unviewed items) are used to retrain the recommendation models, and improvements in metrics like **Recall@20** and **NDCG@20** are tracked to assess the effectiveness of feedback-driven recommendation refinement.

6. **Filter Bubble Effect Simulation**:  
   This task explores whether the simulator can replicate the **filter bubble effect**, where users are exposed to increasingly homogeneous content. The diversity of genres recommended to agents is tracked over several rounds of simulations to measure the reduction in content diversity caused by feedback loops.

7. **Causal Relationship Discovery**:  
   This task uses the simulator to explore causal relationships in recommendation systems. Data from agent simulations (e.g., movie quality, popularity, exposure rate, and number of views) is used to construct a **causal graph** to uncover underlying factors that influence movie ratings and popularity in the simulated environment.

These tasks comprehensively evaluate Agent4Rec’s ability to simulate user behavior and improve recommendation systems based on agent feedback and interactions.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method, **Agent4Rec**, involves two core facets: **LLM-empowered generative agents** and the **recommendation environment**. These components are designed to simulate user behavior and evaluate recommendation algorithms. Here's a breakdown of both components and how they interact:

### 1. **LLM-Empowered Generative Agents**

The generative agents in Agent4Rec are built upon **Large Language Models (LLMs)**, specifically using the **GPT-3.5-turbo** model as their foundation. These agents are enhanced with three specialized modules to emulate human-like behaviors in recommendation scenarios:

- **Profile Module**: This module is initialized using real-world datasets (e.g., MovieLens, Steam, Amazon-Book) and captures each agent's **social traits** (activity, conformity, and diversity) and **personalized preferences** (e.g., taste in movies). These traits guide the agent’s interactions and preferences, ensuring that its actions align with human behavior patterns.

- **Memory Module**: This logs both **factual and emotional memories** from the agents' interactions within the recommendation environment. Factual memories record which items were recommended and how the agent responded (e.g., watching, rating). Emotional memories capture feelings, such as satisfaction and fatigue, allowing the agent to reflect on these and use them in future decisions.

- **Action Module**: This enables agents to take both **taste-driven actions** (e.g., watching, rating, and reacting to movies) and **emotion-driven actions** (e.g., exiting the system, rating the recommendation list). These actions depend on the agent's current emotional state and satisfaction with previous recommendations.

### 2. **Recommendation Environment**

The recommendation environment presents personalized movie recommendations to agents in a **page-by-page manner**, simulating real-world recommendation systems (like Netflix or YouTube). This environment is composed of the following elements:

- **Item Profile Generation**: Movies and other items are represented with profiles that include key features such as **quality**, **popularity**, **genre**, and a summary generated by LLM. These profiles help the agents make decisions about which items to engage with.

- **Recommendation Algorithms**: The recommendation engine supports various collaborative filtering-based algorithms, including **Matrix Factorization (MF)**, **LightGCN**, and **MultVAE**, along with baseline methods like **random** and **most popular** recommendations. These algorithms generate the items shown to the agents.

- **Feedback Mechanism**: The environment is designed to collect feedback from the agents based on their actions and emotional responses. This feedback is used to refine the recommendation algorithms, improving the quality of recommendations over multiple simulation rounds.

### Interaction Between Components

The generative agents interact with the recommendation environment in a **dynamic, iterative process**:

1. **Recommendation Presentation**: The environment presents a set of recommendations (movies) to the agents using one of the collaborative filtering algorithms.
   
2. **Agent Interaction**: The agents evaluate the recommendations using their **profile** (preferences and social traits) and their **memory** (past experiences and emotions). They then decide to watch, rate, or ignore items and may eventually exit the system based on their satisfaction.

3. **Feedback Loop**: The agents' actions and emotional responses provide feedback to the system, which is logged in the memory module. This feedback is used to **retrain** the recommendation models, fine-tuning them to better match the agents' preferences in future rounds.

4. **Simulation Iteration**: The system iteratively refines recommendations based on the agents' preferences and feedback, leading to enhanced recommendation strategies over time. This process also helps replicate real-world phenomena like the **filter bubble effect**.

### Summary

In the proposed **Agent4Rec** paradigm, the **LLM-empowered agents** and the **recommendation environment** continuously interact. Agents make decisions based on their profile, memory, and emotions, while the environment adapts its recommendations using feedback from these interactions. This setup allows for a comprehensive simulation of user behavior and the evaluation of various recommendation strategies in a dynamic, evolving system.'''
   },
   "SpeedyFeed.pdf": {
      "adaption_gen": '''The proposed method, SpeedyFeed, takes several key training steps to adapt the base models effectively:

1. **Input Processing**: Raw user interaction data is processed, where news articles and user histories are tokenized and padded/truncated to a unified length for training.

2. **Centralized News Encoding**: News articles from both user histories and candidate news are gathered into a merged set for centralized encoding. This step eliminates redundant padded data, increasing data efficiency by jointly encoding all news articles and dispatching embeddings to their original positions.

3. **Cache-Accelerated News Encoding**: A caching mechanism is used to reuse recently generated news embeddings. Cached embeddings are reused if they haven’t expired, significantly reducing redundant encoding operations and improving training speed.

4. **Autoregressive User Modeling**: The model reuses encoded user history for predicting multiple news clicks in one shot, rather than re-encoding history for each new training instance. This autoregressive method reduces computation and speeds up training.

5. **Bus Language Modeling (BusLM)**: To handle the encoding of longer news articles, BusLM splits the articles into smaller segments while maintaining information exchange between segments through a "bus connection". This reduces the encoding complexity and preserves news embedding quality.

6. **Content Refinement**: The informative parts of news articles are extracted and transformed into a compact representation before being processed by the PLMs, reducing unnecessary data processing.

7. **Dynamic Batching**: Training instances are dynamically grouped and batched based on the lengths of news articles, minimizing padded data and maximizing GPU utilization.

These steps optimize the training workflow to adapt base models like UniLM for large-scale news recommendation, improving both efficiency and effectiveness.''',
      "base_model_gen": '''In the proposed method, SpeedyFeed, the base models that are adapted include:

1. **UniLM**: A full-scale pretrained language model (PLM), UniLM (Unified Language Model), serves as the primary base model. It is a 12-layer model with 768 hidden dimensions and is used as the news encoder. The UniLM model is trained end-to-end to achieve the best performance in news recommendation.

2. **MiniLM**: A distilled, simplified version of UniLM, MiniLM has its depth and width reduced by 50% (i.e., 6 layers and 384 hidden dimensions). This model is used as a smaller alternative to the full UniLM model.

3. **UniLM-Half**: Another simplified version of UniLM, UniLM-Half has the same size as MiniLM but inherits the model weights directly from the full UniLM.

4. **UniLM-Last**: This version uses the entire UniLM but only fine-tunes the last layer during training, in contrast to the default method, which trains UniLM end-to-end.

All of these base models are trained and adapted using the SpeedyFeed framework, with modifications such as centralized news encoding, cache acceleration, and BusLM to make their training more efficient for large-scale news recommendation tasks.''',
      "task_gen": '''The proposed method, SpeedyFeed, is evaluated on the following tasks:

1. **News Recommendation**: The primary task involves predicting users’ future news preferences based on their historical news clicks. The effectiveness of the method is tested in both offline and online environments.
   
2. **Offline Evaluation**: SpeedyFeed is tested using a large dataset from MSN users' news reading behaviors over four months. Metrics such as AUC, MRR, NDCG@5, NDCG@10, and Recall@K are used to assess the ranking and recall performance in recommending news articles.

3. **Online A/B Testing**: The method is evaluated on live Microsoft News users, focusing on key performance metrics like Clicks per User Usage (the number of clicks per user session) and User Engagement (the ratio of users who give feedback such as likes or shares after reading).

4. **Public Dataset Evaluation**: The method is also evaluated on the MIND dataset, a public dataset for news recommendation. In this evaluation, SpeedyFeed's performance is compared with several competitive baseline models.

These tasks assess both the recommendation quality and the training efficiency of the SpeedyFeed framework in real-world and controlled environments.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method, SpeedyFeed, is designed to efficiently train large-scale news recommenders based on Pretrained Language Models (PLMs) like UniLM. The method optimizes both the training speed and recommendation quality by employing a series of components that interact in a coordinated workflow. Here's an outline of the components and how they interact:

### 1. **News and User Encoding:**
   - **Centralized News Encoding:** 
     News articles from user histories and candidate news are gathered into a "merged set" and encoded together. This approach eliminates unnecessary padded data and duplicated news articles, improving data efficiency. News embeddings are generated once and then dispatched to their respective original positions in the training instances.
     
   - **Bus Language Modeling (BusLM):** 
     To manage the complexity of encoding long news articles, BusLM divides the text into segments and connects them via a "bus" mechanism. This enables information exchange between segments, reducing encoding complexity while preserving the overall quality of the news embeddings.

   - **Cache-Accelerated News Encoding:**
     A caching mechanism is introduced to reuse previously generated news embeddings. Frequently encountered news articles, especially highly popular ones, are stored in cache and reused across training steps, further reducing the computational load.

   - **Content Refinement:** 
     To reduce the amount of non-informative data processed, a content refinement step is applied. It extracts the most important parts of each news article (e.g., titles, abstracts) to create more compact representations before encoding them with the PLMs.

### 2. **Autoregressive User Modeling:**
   - **User Embedding Reuse:** 
     Instead of re-encoding user history for every single click prediction, the method uses autoregressive modeling, where the encoded prefix of a user's news clicks is reused for predicting subsequent news clicks. This saves computational costs by encoding the full user history in one go and applying it across multiple training steps.

### 3. **Dynamic Batching:**
   - **Efficient Data Loading:** 
     A dynamic batching mechanism adjusts the size of mini-batches based on the lengths of the news articles. Training instances with similar lengths are grouped together to minimize padding, making the process more efficient and maximizing GPU utilization.

### Interaction Between Components:
   - **Centralized News Encoding** and **BusLM** work together to handle large-scale news encoding by segmenting and encoding news articles in a more efficient manner. The **Cache** and **Autoregressive User Modeling** components help in reducing redundant computations by reusing embeddings from previous steps.
   
   - The **Content Refinement** ensures that only the most relevant information is encoded, reducing the overall data processed by PLMs.
   
   - The **Dynamic Batching** component improves the computational efficiency by minimizing the use of padded data and adjusting batch sizes to the capabilities of the training hardware.

### Workflow:
The overall training process starts with **Input Processing** to format the data, followed by **News & User Encoding** through centralized encoding and caching. **BusLM** ensures efficient encoding of long news articles, and the cached embeddings are reused when applicable. The **Autoregressive User Modeling** helps reuse user embeddings for multiple click predictions, and **Dynamic Batching** optimizes the training efficiency by adapting batch sizes.

Together, these components interact in a way that drastically improves the training speed (by over 100x in some cases) while maintaining or enhancing the quality of news recommendations. This makes the method scalable and effective for large-scale PLMs-based recommender systems.'''
   },
   "RankGPT.pdf": {
      "adaption_gen": '''The training steps taken to adapt the base models in the proposed method include:

1. **Data Collection**: The researchers randomly sample 10,000 queries from the MS MARCO training set. For each query, BM25 retrieves 20 candidate passages, which are used for further re-ranking.

2. **Permutation Distillation**: The re-ranking capability of ChatGPT is distilled into smaller, specialized models. This involves using ChatGPT to generate the permutation of the passages, which serves as the training target for the student model.

3. **Model Training**: The base models (DeBERTa and LLaMA) are trained using the permutation generated by ChatGPT. The distillation objective minimizes the difference between the permutation outputs of the student and ChatGPT. RankNet loss is used to optimize the specialized model.

4. **Sliding Window Strategy**: Due to token limitations of LLMs, a sliding window strategy is employed, which allows for ranking larger sets of passages in stages.

5. **Training Hyperparameters**: The models are trained with a batch size of 32 using the AdamW optimizer, a constant learning rate of 5×10⁻⁵, and for two epochs.

These steps adapt the base models for passage re-ranking tasks by leveraging the re-ranking capabilities of ChatGPT through permutation distillation and fine-tuning on the MS MARCO dataset.''',
      "base_model_gen": '''The base models adapted in the proposed method are:

1. **DeBERTa**: Specifically, various sizes of the DeBERTa model are adapted, including DeBERTa-large, DeBERTa-base, DeBERTa-small, and DeBERTa-xsmall.

2. **LLaMA**: The LLaMA-7B model is adapted, with relevance generation instructions for fine-tuning.

These models are trained using the permutation generation method and the re-ranking outputs from ChatGPT to create specialized models for passage re-ranking tasks.''',
      "task_gen": '''The proposed method is evaluated on the following tasks:

1. **TREC Deep Learning (DL) Track**: This includes the TREC-DL19 and TREC-DL20 test sets, which are widely used benchmarks for information retrieval tasks.

2. **BEIR Benchmark**: A heterogeneous benchmark consisting of diverse retrieval tasks and domains, including:
   - **Covid**: Retrieval of scientific articles related to COVID-19.
   - **NFCorpus**: A biomedical information retrieval dataset.
   - **Touche**: An argument retrieval dataset.
   - **DBPedia**: Entity retrieval from the DBpedia corpus.
   - **SciFact**: Claim verification through retrieval of scientific evidence.
   - **Signal**: Retrieval of relevant tweets based on a news title.
   - **News**: Retrieval of relevant news articles from headlines.
   - **Robust04**: Evaluation on poorly performing topics.

3. **Mr.TyDi**: A multilingual passage retrieval dataset in ten low-resource languages, including Arabic, Bengali, Finnish, Indonesian, Japanese, Korean, Russian, Swahili, Telugu, and Thai.

4. **NovelEval Test Set**: A newly collected test set with 21 novel questions, designed to evaluate the method's performance on unfamiliar and recent knowledge, ensuring no data contamination from LLM training data.

These tasks cover various benchmarks from different domains, languages, and retrieval challenges.''',
      "paradigm_gen": '''The modeling paradigm of the proposed method involves adapting large language models (LLMs) like ChatGPT and GPT-4 for passage re-ranking tasks through **instructional permutation generation** and then distilling their ranking capabilities into smaller, specialized models. The key components and their interactions are outlined as follows:

### 1. **Large Language Models (LLMs) as Re-Ranking Agents**:
   - **ChatGPT and GPT-4** are employed for passage re-ranking using a novel **instructional permutation generation** method. These LLMs are prompted to directly rank passages based on their relevance to a given query, generating permutations of the passage identifiers instead of intermediate relevance scores.
   - The LLMs process a group of passages and output their ranking in a format such as `[2] > [3] > [1]`, where the most relevant passages appear first.

### 2. **Sliding Window Strategy**:
   - To handle token limitations in LLMs, the **sliding window strategy** is introduced. This technique divides a large set of passages into smaller overlapping windows, allowing LLMs to re-rank them iteratively. The first window's top-ranked passages influence the re-ranking of subsequent windows, ensuring a comprehensive comparison across all passages.

### 3. **Permutation Distillation**:
   - The **permutation generated by ChatGPT or GPT-4** is distilled into smaller, specialized models. The purpose of distillation is to transfer the ranking capability of LLMs to more efficient models.
   - A **student model** (e.g., a DeBERTa or LLaMA model) learns to replicate the LLM-generated permutation using a **RankNet-based distillation objective**, which focuses on preserving the relative order of passages.
   - This distillation process significantly reduces the computational cost and enables the deployment of efficient models that mimic the high-performance ranking abilities of larger LLMs.

### 4. **Specialized Models**:
   - The **specialized models** are based on smaller architectures like DeBERTa (e.g., DeBERTa-large, DeBERTa-base, etc.) and LLaMA-7B, fine-tuned using the distilled permutations.
   - These models are optimized for specific tasks and demonstrate improved efficiency and stability compared to directly using LLMs for re-ranking.

### 5. **Training and Optimization**:
   - The specialized models are trained on datasets like MS MARCO, using the permutation rankings from ChatGPT as ground truth.
   - The training objective involves **RankNet loss**, which minimizes the difference between the predicted and actual passage order. This pairwise loss measures how well the model ranks one passage relative to another.

### Interactions:
   - The **LLMs first generate passage rankings** through permutation generation, ensuring effective ranking decisions based on relevance.
   - The **sliding window strategy** enables handling larger sets of passages, and the rankings from the LLM are then distilled into smaller models using **permutation distillation**.
   - **Specialized models**, once trained, operate independently, replicating the performance of the LLMs with significantly reduced computational cost.

In summary, the modeling paradigm integrates **LLM-based passage ranking**, a **sliding window strategy** for managing large passage sets, and **distillation** into smaller models for practical, efficient deployment.'''
   },
}

# Blank answer sheet: one entry per reviewed paper, each holding an empty string
# for every '<attribute>_gen' field.
sys_review_template = {
    paper: {
        field: ''
        for field in ("adaption_gen", "base_model_gen", "task_gen", "paradigm_gen")
    }
    for paper in (
        "ReprBERT.pdf",
        "MESE.pdf",
        "ChatGPT.pdf",
        "LLM-Rate.pdf",
        "GIRL.pdf",
        "Agent4Rec.pdf",
        "SpeedyFeed.pdf",
        "RankGPT.pdf",
    )
}

# Persist the two ChatGPT answer dictionaries; the evaluation cells reload these
# same file names with read_json. The file names contain no placeholders, so
# plain string literals are used instead of f-strings.
write_json('sys_review_chatgpt_pdf.json', sys_review_chatgpt_pdf)
write_json('sys_review_chatgpt.json', sys_review_chatgpt)

# Evaluation

In [10]:
# Index the dataset records by their 'file' value so each paper can be looked up by name.
samples = dict((record['file'], record) for record in dataset)

In [28]:
# Reload the saved answer dictionaries for each system under evaluation.
sys_review_chatgpt_pdf = read_json('sys_review_chatgpt_pdf.json')
sys_review_chatgpt = read_json('sys_review_chatgpt.json')
# NOTE(review): the retrieval-based result sets below are commented out, yet later cells
# still reference sys_review_ret_llama3_bge, sys_review_ret_mistral_bge and
# sys_review_ret_llama3_mpnet; those cells cannot run on a fresh kernel unless these
# loads are restored.
# sys_review_ret_mistral_mpnet = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_all-mpnet-base-v2.json')
# sys_review_ret_mistral_bge = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_bge-large-en-v1.5.json')
# sys_review_ret_mistral_mistral = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_e5-mistral-7b-instruct.json')
# sys_review_ret_llama3_mpnet = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_all-mpnet-base-v2.json')
# sys_review_ret_llama3_bge = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_bge-large-en-v1.5.json')
# sys_review_ret_llama3_mistral = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_e5-mistral-7b-instruct.json')
# NOTE(review): the two files below are not written by any cell visible here — confirm they exist.
sys_review_openllm_mistral = read_json('sys_review_openllm_Mistral-7B-Instruct-v0.3.json')
sys_review_openllm_llama3 = read_json('sys_review_openllm_Meta-Llama-3.1-8B-Instruct.json')

In [None]:
# Spot-check the 'adaption_prompt' entry stored for RankGPT.pdf in the Mistral results.
print(sys_review_openllm_mistral['RankGPT.pdf']['adaption_prompt'])

In [13]:
# Classification prompt templates, keyed by attribute. Each template has a single
# '{context}' slot that receives the generated description to classify.
# Only 'adaption' and 'paradigm' have a template; 'base_model' and 'task' have none.
# NOTE(review): "knowledgeaware" in the paradigm prompt looks like a lost hyphen
# ("knowledge-aware"); it is kept verbatim so the prompt sent to the model does not change.
attr2classfication = {
    "adaption": (
        "Classify the following adaption description into one of the adaption types.\n\n"
        "Adaption Description:\n{context}\n\n"
        "Adaption types:\n"
        "1. Prompting\n"
        "2. In-context Learning\n"
        "3. Fine-tuning\n"
        "4. Prompt Tuning\n"
        "5. Instruction Tuning"
    ),
    "paradigm": (
        "Classify the following modeling paradigm description into one of the paradigm types.\n\n"
        "Modeling Paradigm Description:\n{context}\n\n"
        "Paradigm types:\n"
        "1. LLM Embeddings + RS. This modeling paradigm views the language model as a feature extractor, "
        "which feeds the features of items and users into LLMs and outputs corresponding embeddings. "
        "A traditional RS model can utilize knowledgeaware embeddings for various recommendation tasks.\n"
        "2. LLM Tokens + RS. Similar to the former method, this approach generates tokens based on the inputted items' and users' features. "
        "The generated tokens capture potential preferences through semantic mining, "
        "which can be integrated into the decision-making process of a recommendation system.\n"
        "3. LLM as RS. Different from (1) and (2), this paradigm aims to directly transfer pre-trained LLM into a powerful recommendation system. "
        "The input sequence usually consists of the profile description, behavior prompt, and task instruction. "
        "The output sequence is expected to offer a reasonable recommendation result."
    ),
}

In [29]:
# For every paper, ask the LLM to classify each generated answer that has a
# classification template, store the reply under '<attr>_cls', then save everything.
# test_data is the same object as sys_review_chatgpt, so that dict is updated in place.
test_data = sys_review_chatgpt
classified_attrs = [attr for attr in attr2question if attr in attr2classfication]
for sample in samples:
    for attr in classified_attrs:
        generated_answer = test_data[sample][f'{attr}_gen']
        cls_prompt = attr2classfication[attr].format(context=generated_answer)
        # One single-message conversation per call; the reply is capped at 200 tokens.
        llm_result = f.llm.generate([[HumanMessage(content=cls_prompt)]], max_tokens=200)
        test_data[sample][f'{attr}_cls'] = llm_result.generations[0][0].text
write_json('sys_review_chatgpt_eval.json', test_data)

In [5]:
# Load the saved evaluation files for each system.
# NOTE(review): among the cells visible here only 'sys_review_chatgpt_eval.json' is written;
# confirm the other three files exist and that 'sys_review_mistral_eval.json' /
# 'sys_review_llama3_eval.json' are the intended names for the open-LLM results.
sys_review_chatgpt_pdf_eval = read_json('sys_review_chatgpt_pdf_eval.json')
sys_review_chatgpt_eval = read_json('sys_review_chatgpt_eval.json')
# sys_review_ret_mistral_mpnet = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_all-mpnet-base-v2.json')
# sys_review_ret_mistral_bge = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_bge-large-en-v1.5.json')
# sys_review_ret_mistral_mistral = read_json('sys_review_ret_10_Mistral-7B-Instruct-v0.3_e5-mistral-7b-instruct.json')
# sys_review_ret_llama3_mpnet = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_all-mpnet-base-v2.json')
# sys_review_ret_llama3_bge = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_bge-large-en-v1.5.json')
# sys_review_ret_llama3_mistral = read_json('sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_e5-mistral-7b-instruct.json')
sys_review_openllm_mistral_eval = read_json('sys_review_mistral_eval.json')
sys_review_openllm_llama3_eval = read_json('sys_review_llama3_eval.json')

In [None]:
# Show which fields are stored for one paper in the ChatGPT evaluation file.
sys_review_chatgpt_eval['ReprBERT.pdf'].keys()

In [None]:
# Walk through every paper and attribute, printing the question, the gold-standard
# answer and the system output together so they can be compared by eye.
test_data = sys_review_chatgpt_eval
for sample in samples:
    print(sample)
    for attr in attr2question:
        print(attr)
        print('Question:', attr2question[attr])
        print('Gold Standard:', samples[sample][attr])
        # Only some attributes carry a '<attr>_cls' entry. It is labelled separately so it
        # cannot be mistaken for the free-text generation printed right after it.
        if f'{attr}_cls' in test_data[sample]:
            print('Classification:', test_data[sample][f'{attr}_cls'], '\n\n')
        print('Generation:', test_data[sample][f'{attr}_gen'])
        print('-----\n')
    print('-------------------------------------\n\n')

In [None]:
# Per-system list of per-paper marks for each attribute (values are 0 or 1).
# NOTE(review): presumably 1 means the answer was judged correct — confirm.
# The 'chatgpt' list has no GIRL.pdf record (it is disabled below); the other three
# systems are all zeros for all eight papers, which looks like unfilled placeholders — confirm.
eval_results = {
    'chatgpt': [
        {'file': paper, 'adaption': adaption, 'base_model': base_model, 'task': task, 'paradigm': paradigm}
        for paper, adaption, base_model, task, paradigm in (
            ('ReprBERT.pdf', 1, 1, 1, 1),
            ('MESE.pdf', 0, 1, 1, 0),
            ('ChatGPT.pdf', 1, 1, 1, 1),
            ('LLM-Rate.pdf', 1, 1, 1, 1),
            # ('GIRL.pdf', 1, 0, 0, 0),
            ('Agent4Rec.pdf', 0, 0, 1, 0),
            ('SpeedyFeed.pdf', 1, 1, 1, 1),
            ('RankGPT.pdf', 1, 1, 1, 1),
        )
    ],
    **{
        system: [
            {'file': paper, 'adaption': 0, 'base_model': 0, 'task': 0, 'paradigm': 0}
            for paper in (
                'ReprBERT.pdf',
                'MESE.pdf',
                'ChatGPT.pdf',
                'LLM-Rate.pdf',
                'GIRL.pdf',
                'Agent4Rec.pdf',
                'SpeedyFeed.pdf',
                'RankGPT.pdf',
            )
        ]
        for system in ('chatgpt_pdf', 'openllm_mistral', 'openllm_llama3')
    },
}

In [15]:
# Section titles recorded per attribute, written as (title, repeat count) runs and
# expanded into flat lists.
# NOTE(review): presumably the paper sections each answer was drawn from — confirm.
chatgpt_section = {
    attribute: [title for title, repeats in runs for _ in range(repeats)]
    for attribute, runs in (
        ('adaption', (
            ('INTRODUCTION', 4),
            ('INTRODUCTION', 3),
            ('RECOMMENDATION WITH CHATGPT', 2),
            ('Implementation Details', 2),
            ('Specialization by Permutation Distillation', 1),
            ('Experimental Results of Specialization', 1),
            ('Abstract', 3),
            ('Approach', 1),
        )),
        ('base_model', (
            ('INTRODUCTION', 1),
            ('METHODOLOGY', 1),
            ('Implementation Details', 2),
            ('Specialization by Permutation Distillation', 2),
            ('Abstract', 2),
            ('Approach', 3),
        )),
        ('task', (
            ('EXPERIMENTS', 2),
            ('INTRODUCTION', 1),
            ('RECOMMENDATION WITH CHATGPT', 5),
            ('INTRODUCTION', 2),
            ('Datasets', 1),
            ('Abstract', 1),
            ('Experiments', 1),
        )),
    )
}

In [None]:
# Per attribute, print its name followed by how often each section title occurs,
# comparing titles case-insensitively.
for attribute, section_titles in chatgpt_section.items():
    title_counts = Counter(title.lower() for title in section_titles)
    print(attribute, title_counts, sep='\n')

In [None]:

# Flatten `result` into one row per model with '<q>_p' / '<q>_r' columns and display it.
# NOTE(review): `result` is not defined in any cell visible here — confirm where it comes from.
# Each entry of `data` is unpacked as a (p, r) pair; presumably precision and recall — TODO confirm.
a = []
for model, data in result.items():
    temp = {'model': model}
    for q, (p, r) in data.items():
        temp[f'{q}_p'] = p
        temp[f'{q}_r'] = r
    a.append(temp)
df = pd.DataFrame(a)
df

In [None]:
# NOTE(review): 'haq-et-al-2017.pdf' and 'research_gap_gen' do not match the papers and
# attributes used elsewhere in the visible cells; this looks like a leftover from another
# dataset — confirm before relying on it.
print(sys_review_openllm_mistral['haq-et-al-2017.pdf']['research_gap_gen'])

In [None]:
# NOTE(review): 'haq-et-al-2017.pdf' and 'variable_definition_gen' do not match the papers
# and attributes used elsewhere in the visible cells; looks like a leftover from another
# dataset — confirm before relying on it.
print(sys_review_openllm_mistral['haq-et-al-2017.pdf']['variable_definition_gen'])

In [None]:
# Print the first element of each stored retrieval entry, one per block; the second
# element of each pair is ignored.
# NOTE(review): sys_review_ret_llama3_bge is only assigned in a commented-out line of the
# visible loader cells, so this cell cannot run on a fresh kernel as-is.
retrieval_content = sys_review_ret_llama3_bge['haq-et-al-2017.pdf']['research_gap_retrieve_context']
for sent, _ in retrieval_content:
    print(sent)
    print('-------------------------\n\n')

In [None]:
# Print the first element of each stored retrieval entry, one per block; the second
# element of each pair is ignored.
# NOTE(review): sys_review_ret_mistral_bge is only assigned in a commented-out line of the
# visible loader cells, so this cell cannot run on a fresh kernel as-is.
retrieval_content = sys_review_ret_mistral_bge['haq-et-al-2017.pdf']['variable_definition_retrieve_context']
for sent, _ in retrieval_content:
    print(sent)
    print('-------------------------\n\n')

In [28]:
# Build one summary row per model by averaging per-file scores with np.mean.
# NOTE(review): this reads 'EBP', 'age' and 'gender' from each record, but the eval_results
# dict defined in the visible cells only holds 'adaption', 'base_model', 'task' and
# 'paradigm'; against that dict this raises KeyError. Looks like a leftover from another
# dataset — confirm which eval_results this cell expects.
df_record = []
for model, temp_results in eval_results.items():
    ebp = np.mean([temp_result['EBP'] for temp_result in temp_results])
    # 'age' / 'gender' values are indexed [0] for the *_r figure and [1] for the *_p figure;
    # presumably (recall, precision) pairs — TODO confirm.
    # kamps-et-al-2016.pdf is left out of all four age/gender means; gender_p additionally
    # leaves out kodak-et-al-2011.pdf and brogan-et-al-2017.pdf.
    age_r = np.mean([temp_result['age'][0] for temp_result in temp_results if temp_result['file'] not in ["kamps-et-al-2016.pdf"]])
    age_p = np.mean([temp_result['age'][1] for temp_result in temp_results if temp_result['file'] not in ["kamps-et-al-2016.pdf"]])
    gender_r = np.mean([temp_result['gender'][0] for temp_result in temp_results if temp_result['file'] not in ["kamps-et-al-2016.pdf"]])
    gender_p = np.mean([temp_result['gender'][1] for temp_result in temp_results if temp_result['file'] not in ["kamps-et-al-2016.pdf", "kodak-et-al-2011.pdf", "brogan-et-al-2017.pdf"]])
    df_record.append({
        'model': model,
        'ebp': ebp,
        'age_r': age_r,
        'age_p': age_p,
        'gender_r': gender_r,
        'gender_p': gender_p,
    })

In [None]:
# Render the per-model summary dicts collected in df_record as a table.
pd.DataFrame(df_record)

In [None]:
# Tabulate eval_chatgpt with the columns file / EBP / age / gender.
# 'diagnosis', 'comorbidity', 'race' and 'research_strategy' are currently left out.
pd.DataFrame(
    eval_chatgpt,
    columns=['file', 'EBP', 'age', 'gender'],
)

In [None]:
# Hard-coded per-attribute scores for the ChatGPT run.
# NOTE(review): presumably transcribed by hand from the eval_chatgpt table -- confirm the source.
chatgpt_acc = dict(
    EBP=0.5,
    age=1.0,
    diagnosis=1.0,
    comorbidity=0.9,
    gender=1.0,
    race=1.0,
    research_strategy=1.0,
)

In [None]:
# Tabulate eval_openllm with one column per evaluated attribute.
pd.DataFrame(
    eval_openllm,
    columns=[
        'file',
        'EBP',
        'age',
        'diagnosis',
        'comorbidity',
        'gender',
        'race',
        'research_strategy',
    ],
)

In [None]:
# Hard-coded per-attribute scores for the open-LLM run.
# NOTE(review): presumably transcribed by hand from the eval_openllm table -- confirm the source.
openllm_acc = dict(
    EBP=0.5,
    age=0.3,
    diagnosis=0.95,
    comorbidity=0.7,
    gender=0.4,
    race=0.8,
    research_strategy=1.0,
)

In [None]:
# Tabulate eval_ret_all with one column per evaluated attribute.
pd.DataFrame(
    eval_ret_all,
    columns=[
        'file',
        'EBP',
        'age',
        'diagnosis',
        'comorbidity',
        'gender',
        'race',
        'research_strategy',
    ],
)

In [None]:
# Hard-coded per-attribute scores for the retrieval-augmented run.
# NOTE(review): presumably transcribed by hand from the eval_ret_all table -- confirm the source.
rag_acc = dict(
    EBP=0.4,
    age=0.25,
    diagnosis=0.7,
    comorbidity=0.7,
    gender=0.4,
    race=0.8,
    research_strategy=1.0,
)

In [None]:
# Display the value stored under 'comorbidity_gen' for jung-et-al-2008.pdf in sys_review_chatgpt.
sys_review_chatgpt['jung-et-al-2008.pdf']['comorbidity_gen']

In [None]:
# Pick one paper and one attribute to inspect; later cells reuse test_file,
# test_attr and retrieve_context.
test_file = 'drasgow-et-al-2015.pdf'
test_attr = 'age'
# Stored '<attr>_retrieve_context' entry for that file in sys_review_ret_llama3_mpnet.
retrieve_context = sys_review_ret_llama3_mpnet[test_file][f'{test_attr}_retrieve_context']
retrieve_context

In [65]:
# Delete the collection behind db_chroma.
# NOTE(review): in file order db_chroma is only assigned in a later cell
# (Chroma.from_documents), so on a fresh top-to-bottom run this presumably
# raises NameError unless it is also defined further up -- confirm.
db_chroma.delete_collection()

In [33]:
# Splitter used to cut an article into small overlapping chunks.
# chunk_size / chunk_overlap are measured with length_function below, i.e. in
# tokens of f.embed_tokenizer rather than in characters.
# NOTE(review): `f` is not defined in this part of the notebook; presumably it
# comes from `from src.index_files import *` -- confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=20,
    # Length of a text = number of ids the tokenizer returns, without special tokens.
    length_function=lambda x: len(f.embed_tokenizer.encode(x, add_special_tokens=False)),
    # Separators are listed from coarse to fine: paragraph and line breaks first,
    # then ASCII and CJK punctuation, then spaces, and finally "".
    separators=[
        "\n\n",
        "\n",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        " ",
        "",
    ],
    # Existing args
)

In [None]:
# Convert the test paper to markdown, normalise its whitespace, split it into
# chunks and index the unique chunks in a Chroma store.
# NOTE(review): `import pymupdf4llm` is commented out in the notebook's import
# cell; this cell presumably relies on `from src.index_files import *` to
# provide it -- confirm on a fresh kernel.
article = pymupdf4llm.to_markdown(os.path.join('../../data/systematic_review_papers/', test_file))

# Replace every '\n\n\n-----\n\n' marker with a space, then collapse runs of
# whitespace inside each blank-line-delimited passage.
concated_article = '\n\n'.join([' '.join(passage.split()) for passage in article.replace('\n\n\n-----\n\n', ' ').split('\n\n')])

chunks = text_splitter.create_documents([concated_article])
# De-duplicate chunk texts with dict.fromkeys: unlike set(), it keeps
# first-seen order, so the documents are indexed in the same order on every run.
chunks = [Document(page_content) for page_content in dict.fromkeys(chunk.page_content for chunk in chunks)]
print(len(chunks))

db_chroma = Chroma.from_documents(chunks, f.embeder)

In [None]:
# NOTE(review): search() is called with no arguments. If db_chroma is a
# LangChain vector store, search() expects a query and a search_type, so this
# cell presumably raises TypeError -- confirm, then supply arguments or remove.
db_chroma.search()

In [67]:
# Query the index with the question for test_attr and print every position at
# which the returned document text differs from the stored retrieve_context.
# NOTE(review): attr2question as defined at the top of the notebook has no
# 'age' key; presumably it is redefined in a later cell -- confirm.
# NOTE(review): k=74 is hard-coded; presumably the chunk count printed by the
# indexing cell -- confirm.
doc_list = db_chroma.similarity_search_with_score(attr2question[test_attr], k=74)
# Only the texts are compared; score1 and score2 are unpacked but not used in these lines.
for pid, ((doc, score1), (text, score2)) in enumerate(zip(doc_list[:len(retrieve_context)], retrieve_context)):
    if doc.page_content != text:
        print(pid)