# Set up

## Imports

In [54]:
import os
import getpass
import langchain
import pandas as pd
import shutil
import logging
import time
import ast
from typing import Optional
from typing import Dict

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage


from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI

from pydantic import BaseModel, Field


from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from pydantic import BaseModel, Field



from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import RetrievalQA


from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


## Paths

Set paths.

**folder_path** : Path where all the pdf files exist.


**chromadb_folder_path** : Path where you want chromadb indexes to be saved.

In [55]:
# Directory that holds the full-text PDF files of the articles.
folder_path = './FullTextArticles/'
# Directory under which the Chroma indexes are saved (one sub-folder per dataframe index).
# NOTE: embed_all() deletes this whole directory before re-creating the indexes.
chromadb_folder_path = "./chroma_db/"

## READ excel file

Loading the excel file as a pandas dataframe with each row representing an article. We are setting row indices to refer to files more easily when manipulating the dataframe.

In [56]:
# Load the list of articles; every spreadsheet row describes one article.
df = pd.read_excel('Articles_for_extraction.xlsx')

# Label the rows 1..N (instead of pandas' default 0-based labels) so that each
# article can be referred to by a simple 1-based index.
df = df.assign(index=range(1, len(df) + 1)).set_index('index')


## OpenAI model name

In [59]:
# Name of the OpenAI chat model used for both sub-question generation and question answering.
openai_model_name='gpt-3.5-turbo'

# Alternative models (uncomment exactly one line to switch):
# openai_model_name = 'gpt-3.5-turbo-1106'

#openai_model_name = 'gpt-4'

#openai_model_name ="gpt-4-32k"

In [60]:
# Provide the OpenAI API key without hard-coding it in the notebook.
#
# A key that is already present in the environment (exported in the shell, or
# loaded from a .env file with python-dotenv as sketched below) is kept as-is.
# Only when no key is set do we ask for it interactively, so that the secret
# never ends up in the saved notebook and an existing key is never overwritten
# with an empty string.

# from dotenv import load_dotenv

# load_dotenv()

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

True

## Diagram of the processing pipeline

![pipeline_diagram.jpg](pipeline_diagram.jpg)

# Step 1: Loading Documents

### PDFs vs FullAbstract


We operate with two types of documents:

- **PDFs**: pdf documents of articles, located at `folder_path` set above
- **FullAbstract**: a text copy of the abstract if there is no pdf for the corresponding articles

The type of the document can be determined by referring to the dataframe.

For **PDFs**, the `FullText` column will contain the name of the pdf file to be loaded and processed.

For **FullAbstract** documents, the `FullText` column will contain the text `'Meeting Abstract'` and the `FullAbstract` column will contain the text of the actual abstract that is to be processed.

### LangChain Document loaders

PDFs are loaded via LangChain's document loaders, which return the content as `Document` objects. The later processing steps expect the same kind of object for every article, so for FullAbstract documents (where there is no file to load) we create the objects ourselves using the following structure:

In [None]:
# This pydantic class is used to wrap the text of the FullAbstract column.
# The text has to be wrapped in a Document-like object (with `page_content`
# and `metadata`) so that it can be processed the same way as the loaded PDFs.

class Document(BaseModel):
    """A single piece of text together with its metadata.

    Instances are created in `embed_each_file()` from a dictionary with the
    keys `page_content` and `metadata`.
    """
    # The raw text that will be chunked and embedded.
    page_content: str
    # Descriptive key/value pairs; both keys and values are strings.
    metadata: Dict[str, str]


The actual document loading will be taking place below, in the next section.

# Step 2: Chunking and Embedding

In the chunking and embedding part of the process, we segment the text into contextually meaningful chunks (1,200 characters with a 400-character overlap in the code below), convert these chunks into vector embeddings for a nuanced contextual representation, and store the embeddings in a Vector Storage.


## Single file processing

For each row in our dataframe, we load the corresponding document, perform the chunking via `RecursiveCharacterTextSplitter`, convert the chunks into embeddings with OpenAI embeddings, and use Chroma to store the results in `chromadb_folder_path` under the name corresponding to the index of the row.

In [11]:

def embed_each_file(index, file, pdf=True, doc=''):
    """Chunk a single document, embed the chunks and store them in Chroma.

    Parameters:
    index (int): The index identifying the document in the pandas dataframe loaded above (excel file)
    file (str): File path of the pdf file of that specific document, or empty string if document is a FullAbstract
    pdf (boolean): True, if the document at index in the pandas dataframe is a pdf, else False.
    doc (str | dict): Ignored (empty string) if the document is a pdf. For a FullAbstract document, a dictionary
                      of the form {'page_content': <text of the abstract>, 'metadata': {'source': <index as str>}}

    Returns:
    bool: Always True. Any failure is reported by raising an exception.

    Raises:
    ValueError: If `pdf` is False but no `doc` dictionary was supplied.

    Output:
    Creates and saves the embeddings for that particular index inside the chromadb_folder_path.
    For example, the data at index 1 in the pandas dataframe will be stored in "./chroma_db/1/" if the chromadb_folder_path
    is set to chromadb_folder_path = "./chroma_db/"

    """

    if pdf:
        # Load the pdf from disk; the loader returns one document per page.
        pages = PyPDFLoader(file).load_and_split()
    else:
        # Fail early with a clear message instead of an obscure unpacking error
        # when the caller forgot to pass the abstract.
        if not doc:
            raise ValueError(
                f"No 'doc' dictionary supplied for non-pdf document at index {index}"
            )
        # Wrap the abstract in a single Document so it is handled like a pdf page.
        pages = [Document(**doc)]

    # See the "Note on chunking parameters" for how to choose these values.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=400,
        length_function=len,
    )

    # Break down documents into chunks
    all_splits = text_splitter.split_documents(pages)

    # Generate embeddings for each chunk and store them in the Vector Storage.
    # NOTE: the target directory is built by plain concatenation, so
    # chromadb_folder_path is expected to end with a path separator.
    Chroma.from_documents(
        documents=all_splits,
        embedding=OpenAIEmbeddings(),
        persist_directory=chromadb_folder_path + str(index),
    )
    print(f"Embedded {index} Successfully")

    return True


#### Note on `doc` parameter

As mentioned in Step 1 (Loading Documents), LangChain document loaders require a Document object. In order to load the information for FullAbstract documents in the correct format, `embed_each_file()` takes in the `doc` parameter, which is a python dictionary object of the following form

    dict: {'page_content': 'String containing text version of FullAbstract document',
           'metadata': {'source': 'Index of FullAbstract document in the pandas dataframe'}}

This dictionary is used to create a Document class object that behaves the same way as the pdf files loaded via the LangChain loaders.

PDFs do not need to have additional Document objects created, so if an article has a PDF file, the `doc` parameter is set to an empty string.


#### Note on chunking parameters

The `chunk_size` and `chunk_overlap` variables in the code above can be changed to better suit your model.

gpt-3.5-turbo performs well for chunk_size 1000. [Source: https://eminent-crowberry-bb4.notion.site/Lex-GPT-74400b08f5f84cb5830baf2528f86b2f?pvs=4].

If you're using gpt-4, we recommend a `chunk_size` of more than 15000 and a `chunk_overlap` between 1500 and 2000.

## Processing the whole dataframe

In this part we iterate through the whole dataframe row by row, processing each document according to the procedure defined above in 'Single file processing'.

In [12]:

def embed_all():
    """Embed all the documents listed in the dataframe `df`.

    Any existing `chromadb_folder_path` directory is deleted first, so the
    indexes are always rebuilt from scratch. Each row is then embedded with
    `embed_each_file()`: rows whose 'FullText' column is 'Meeting Abstract'
    are embedded from the text in their 'FullAbstract' column, all other rows
    from the pdf file named in 'FullText' (looked up inside `folder_path`).

    An error in one row is printed and does not stop the processing of the
    remaining rows; the indexes that failed are listed at the end.
    """

    # Start from a clean slate so that indexes of earlier runs are not mixed in.
    if os.path.exists(chromadb_folder_path):
        shutil.rmtree(chromadb_folder_path)

    failed_indexes = []

    # Each row represents an article
    for index, row in df.iterrows():

        print(f'Processing index number: {index}')

        try:
            if row['FullText'] == 'Meeting Abstract':
                # No pdf available: chunk only the abstract
                doc = {
                    'page_content': row['FullAbstract'],
                    'metadata': {'source': f'{index}'},
                }
                embed_each_file(index, '', pdf=False, doc=doc)
            else:
                # Chunk the whole pdf file
                file_path = os.path.join(folder_path, row['FullText'])
                embed_each_file(index, file_path)

        except Exception as e:
            # Best effort: report the problem and carry on with the next article.
            print(f'Error while reading index {index}', e)
            failed_indexes.append(index)

    if failed_indexes:
        print(f'Failed to embed {len(failed_indexes)} document(s): {failed_indexes}')



### Embed all

In [13]:
# Build the embeddings for every row of `df`.
# NOTE: this deletes the existing `chromadb_folder_path` directory first.
embed_all()

Processing index number: 1
Embedded 1 Successfully
Processing index number: 2
Embedded 2 Successfully
Processing index number: 3
Embedded 3 Successfully
Processing index number: 4
Embedded 4 Successfully
Processing index number: 5
Embedded 5 Successfully
Processing index number: 6
Embedded 6 Successfully
Processing index number: 7
Embedded 7 Successfully
Processing index number: 8
Embedded 8 Successfully
Processing index number: 9
Embedded 9 Successfully
Processing index number: 10
Embedded 10 Successfully
Processing index number: 11
Embedded 11 Successfully
Processing index number: 12
Embedded 12 Successfully
Processing index number: 13
Embedded 13 Successfully
Processing index number: 14
Embedded 14 Successfully
Processing index number: 15
Embedded 15 Successfully
Processing index number: 16
Embedded 16 Successfully
Processing index number: 17
Embedded 17 Successfully
Processing index number: 18
Embedded 18 Successfully
Processing index number: 19
Embedded 19 Successfully
Processing 

# Step 3, Part 1: Retrieval, Multi-Query Generation

In this part of the process, we use a multi-query retriever technique to strategically retrieve relevant text from the vector storage.

We use our LLM to divide each of the main questions into 3 sub-questions and aggregate everything in a multiple-question query collection. Later, in Step 3 Part 2, we will use the collection to query the Vector Storage, retrieve the `K` most relevant chunks per question and amalgamate the results to form the Full Context.


## Question type and Schemas

The variable **`question_types`** is a dictionary mapping each question type (a short string identifier for each question) to the actual text of the question that we ask our LLM.

The variable **`schemas`**, in contrast, contains the specification of the **output** that we want for each question. An example of such a specification would be requiring a certain structure or data type, such as 'list of dictionary', 'Integer' or 'String'.

**NOTE: the key in `question_types` and `schemas` should always match**

In [61]:
# Question type (short identifier) -> text of the question that is put to the LLM.
question_types = dict(
    review="Identify and determine if the manuscript under consideration is a review or meta-analysis article.",
    llm="Assess if one or several large language model (LLMs) are components within the patient-trial matching system.",
    llm_name="Extract and list all large language models that are mentioned as components within the patient-trial matching system.",
    structured_data="Assess if the patient-trial matching system incorporate structured data from the electronic health record system?",
    list_of_medical_conditions="Extract and list all medical conditions or diseases referenced within the text related to the clinical trials.",
    evaluate_patient_trial="Evaluate the patient-trial matching system described in the paper and assess if it positively impacted the diversity of subjects participating in the trial. Provide a concise textual assessment, detailing whether the system contributed to enhancing the diversity of subjects or not.",
)

# The three kinds of output format that we request from the LLM.
_YES_NO_FORMAT = "output should be in YES or NO"
_DISTINCT_LIST_FORMAT = "output this in python list i.e [] of DISTINCT comma separated strings. Leave empty list i.e [] if the information is missing."
_DETAILED_TEXT_FORMAT = "output this in a detailed text format"

# Question type -> description of the expected format of its answer.
_answer_formats = {
    "review": _YES_NO_FORMAT,
    "llm": _YES_NO_FORMAT,
    "llm_name": _DISTINCT_LIST_FORMAT,
    "structured_data": _YES_NO_FORMAT,
    "list_of_medical_conditions": _DISTINCT_LIST_FORMAT,
    "evaluate_patient_trial": _DETAILED_TEXT_FORMAT,
}

# Question type -> output schema: a single field called "answer" for every question.
schemas = {
    question_key: [ResponseSchema(name="answer", description=answer_format)]
    for question_key, answer_format in _answer_formats.items()
}



## Generating multiple sub-questions from each question

The function **`generate_multiple_queries(question, max_results)`** generates multiple versions of a question. The reason for generating multiple sub-questions is that, while retrieving context, using only one query may not be sufficient to retrieve all the relevant context that we pass to the LLM.

Another reason for generating multiple sub-questions is that a specific question may sometimes be a combination of multiple questions; such a question is easier to handle if we break it down into multiple sub-questions.


For example:

    Extract the number of patients participating in each trial discussed within the paper.

This question can be a combination of two sub-questions.

        1. The number of patients
        2. The trials discussed in the paper.


**By generating multiple sub-questions like above, we can retrieve the relevant chunks from our index for each sub-question and combine all the chunks in one place, forming the Full Context. The Full Context thus represents the information from the index that is most relevant for answering the original question, and it is passed to the LLM (gpt-3.5-turbo or gpt-4) instead of the entire indexed document, which also contains less relevant information. The LLM is then asked to use the Full Context to generate an accurate answer to the original question. The main thing to notice here is that we use the sub-questions only to retrieve chunks from our index, whereas the question we ask the LLM is always the original main question defined in the `question_types` dictionary.**

To understand more deeply, please refer to the diagram of the processing pipeline at the beginning of the document.

In [18]:
def generate_multiple_queries(question, max_results):
    """ Function that generates multiple versions of a question.

    The LLM is asked to decompose `question` into `max_results` questions and
    to return them in a structured format, which is then parsed.

    Parameters:
    question (str): The original question to generate multiple questions for.
    max_results (int): The number of questions to generate


    Returns:
    str: The string representation of the parsed "answer" field, e.g.
         "['query1', 'query2', 'query3']", which can be turned into a list with
         `ast.literal_eval`. If the response of the LLM cannot be parsed, the raw
         text of the response is returned instead.

    """

    # Sample Prompt for generating multiple questions
    sample = """You will be given a question. That question can be a complex question or it can be combination of
    more than one questions. Your task is to decompose that question into {max_results} different questions {format_instructions}
    Question: {question}
    Helpful Answer:"""

    template = ChatPromptTemplate.from_messages(
        [
            SystemMessage(
                content=(
                    "You are a helpful assistant that does tasks faithfully described to you."
                )
            ),
            HumanMessagePromptTemplate.from_template(sample),
        ]
    )

    #We've used gpt for generating multiple questions.
    llm = ChatOpenAI(model_name=openai_model_name)

    #define the output structure
    schema = [
        ResponseSchema(
            name="answer",
            description="output should be in list of string format for example: ['query1','query2','query3'] ",
        ),
    ]

    #format the prompt
    output_parser = StructuredOutputParser.from_response_schemas(schema)
    format_instructions = output_parser.get_format_instructions()

    #Get the response from LLM
    answer = llm(
        template.format_messages(
            max_results=max_results,
            question=question,
            format_instructions=format_instructions,
        )
    )

    try:
        #Parse the JSON output from the LLM
        final_ans = str(output_parser.parse(answer.content)["answer"])

    except Exception:
        # The response did not follow the requested format: fall back to the raw
        # text. `Exception` (rather than a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate.
        final_ans = str(answer.content)

    return final_ans


The code below generates multiple sub-questions for each question in the `question_types` dictionary.

**NOTE: Our original question is also included at the end of each sub-question list so that we don't miss out on querying with our original question.**

In [11]:
# Question type -> its generated sub-questions, followed by the original question.
query_collection = {}
for question_key, question_text in question_types.items():
    print('Generating Multiple Question for: ', question_key)
    # The LLM answer comes back as the string form of a python list.
    generated_queries = ast.literal_eval(generate_multiple_queries(question_text, 3))
    # Append the original question so that it is used for retrieval as well.
    query_collection[question_key] = generated_queries + [question_text]

Generating Multiple Question for:  review
Generating Multiple Question for:  llm
Generating Multiple Question for:  llm_name
Generating Multiple Question for:  structured_data
Generating Multiple Question for:  list_of_medical_conditions
Generating Multiple Question for:  evaluate_patient_trial




**NOTE: We have already generated multiple sub-questions for each question and saved them in a dictionary called `query_collection` (see the cell further below). Run the cell above only if you want to generate the sub-questions yourself; otherwise you can skip it and just execute the cell that defines the saved `query_collection`.**

In [12]:
# Display the generated sub-questions for inspection.
query_collection

{'review': ['What type of manuscript is under consideration?',
  'Does the manuscript meet the criteria for a review article?',
  'Does the manuscript meet the criteria for a meta-analysis article?',
  'Identify and determine if the manuscript under consideration is a review or meta-analysis article.'],
 'llm': ['What is the structure of the patient-trial matching system?',
  'Are there any large language models incorporated in the patient-trial matching system?',
  'If yes, how many large language models are used and what are their roles in the system?',
  'Assess if one or several large language model (LLMs) are components within the patient-trial matching system.'],
 'llm_name': ['What are large language models?',
  'What components are mentioned within the patient-trial matching system?',
  'Among those components, which are large language models?',
  'Extract and list all large language models that are mentioned as components within the patient-trial matching system.'],
 'structur

In [19]:
# Sub-questions saved from an earlier run of the generation cell, so that they do not
# have to be generated again on every run.
# The keys have to match the keys of `question_types` and `schemas`. Each list holds three
# sub-questions followed by one main question; all four are used for retrieval only.

query_collection = {
    'review': [
         'What is the definition of a review article?',
         'What is the definition of a meta-analysis article?',
         'What are the key characteristics of the manuscript under consideration?',
         'Identify and determine if the manuscript under consideration is categorized as a review or meta-analysis article.'],
     'llm': [
        'Does the text mention large language models?',
        'Are there any indications in the text that large language models are being used in the system being described?',
        'Does the text provide any context or details about the use of large language models in the system?',
        'Assess from the text if large language models are part of the system that the text describes.'],
    'llm_name': [
        'What are the large language models mentioned in the text?',
        'Which large language models are identified as components within the system?',
        'Are any of these models specifically used for patient matching in clinical trials?',
        'Extract and list all large language models that are specifically mentioned or identified as components within the system described in the text related to patient matching for clinical trials.'],
    'structured_data': [
        'What is the patient-trial matching system described in the text?',
        'Does the patient-trial matching system incorporate structured data as part of its input?',
        'What type of non-textual data does the system incorporate, if any?',
        'Analyze the text to determine if the patient-trial matching system described incorporates structured data (non-textual) as part of its input.'],
    'list_of_medical_conditions': [
        'What are all the medical conditions or diseases mentioned in the text?',
        'Which of these conditions or diseases are related to clinical trials?',
        'Which of these conditions or diseases are unique?',
        'Extract and list all unique medical conditions or diseases referenced within the text related to the clinical trials.'],
    'evaluate_patient_trial': [
        'What is the patient-trial matching system described in the paper?',
        'Did the patient-trial matching system positively impact the diversity of subjects participating in the trial?',
        'Did the system contribute to enhancing the diversity of subjects participating in the trial?',
        'Evaluate the patient-trial matching system described in the paper and assess if it positively impacted the diversity of subjects participating in the trial. Provide a concise textual assessment, detailing whether the system contributed to enhancing the diversity of subjects or not.']
}

# Step 3, Part 2 and Step 4: Retrieving Full Context and Question Answering

The concluding phase of our data extraction process involves generating answers to our questions through a meticulous recipe. To produce our answers, we amalgamate several crucial components. First, we use the query collection generated above to retrieve relevant text from the Vector Storage and form the **Full Context**. Then, we use the **Full Context**, the desired **Output Schema** and a carefully crafted **Prompt** to ask the LLM for an accurate answer to the **original question**. Following this orchestrated assembly, we conduct Question Answering for all queries and archive the responses, utilizing an Excel file as our storage repository.


A more detailed explanation of the process can be found below.



For each `index` and each **original question** we will go through the following procedure:

- Query the Vector Storage at `index` with each sub-question from the query collection that is associated with the **original question**. This query will retrieve the `K` most relevant text chunks for each sub-question. (A larger `K` value accommodates complex questions, facilitating a comprehensive understanding by yielding more relevant text. In our implementation we set it around 5-20.)

- Amalgamate the chunks from the previous step and remove duplicates to form the **Full Context**. The **Full Context** represents the relevant contextual backdrop of `index` that is sufficient and necessary to answer the **original question**.

- Retrieve the specific **Output Schema** corresponding to the question type of **original question** from `schemas` and then populate the **Prompt** template with the **Full Context**, **original question** and **Output Schema** information.

- Generate the LLM answer by passing in the populated **Prompt** from the above step and save the answer in the dataframe.




**NOTE: While running the code below for each index in the dataframe, when the output of `llm` is `'NO'` we don't extract or process further for its `llm_name`, because we assume that no LLM is used in the paper. The code responsible for this skipping is essentially**

        if (answers.get('llm') is not None) and (answers.get('llm_name') is None):
            if answers.get('llm').lower() == 'no':
                answers['llm_name'] = '[]'
                print('skipping llm_name call..')
                continue


In [62]:
def question_answer(query_collection, index, schemas, k):
    """ Function that asks the LLM all the questions defined in `query_collection` with respect to the document
    located at `index` and outputs the result formatted according to the structure defined in `schemas`.

    For every question type, the sub-questions are used to retrieve the `k` most similar chunks each from the
    Chroma index of the document; the de-duplicated chunks form the context. The LLM is then asked the original
    question (taken from the module-level `question_types`) together with that context and the format
    instructions derived from the schema of the question type.

    The 'llm_name' question is not sent to the LLM when the 'llm' question has already been answered with 'NO';
    its answer is set to '[]' instead.

    Parameters:
    query_collection (dict {str: list[str]}):  Collection of multiple questions mapping each question type to
                                               the corresponding sub-questions generated above in Step 3, Part 1
    index (int):  The index identifying the document of interest in the dataframe
    schemas (dict {str: list[ResponseSchema]}):  Collection mapping each question type to the desired Output Schema
    k (int) :  The number of relevant chunks of text to be retrieved for each of the queries.

    Returns:
    dict: A collection mapping each question type to the corresponding answer of the LLM (as a string; the raw
          response text if it could not be parsed). Additionally maps 'file_name' to `index`.

    An example of a return dictionary:

    {'file_name': 5,
     'review': 'NO',
     'llm': 'YES',
     'llm_name': "['Watson for Clinical Trial Matching (CTM) cognitive system']",
     'structured_data': 'NO',
     'list_of_medical_conditions': "['cancer', 'breast cancer']",
     'evaluate_patient_trial': 'The paper does not provide specific information about the impact of the patient-trial matching system on the diversity of subjects participating in the trial.'}

    """

    #Initialize the dictionary that collects the answer to each question for this file
    answers = {"file_name": index}

    #get the Chroma embeddings of index from the vector storage
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma(persist_directory=chromadb_folder_path + str(index), embedding_function=embeddings)

    #Sample Prompt for our QA
    sample = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Try to be on point and don't make unwanted and unrelated answers.
    You will be asked questions which will be related to research paper.
    There may be questions that require complex reasoning, in such cases, You should think step-by-step and process your answer to the question
    You will be provided context which are from the research paper, the contexts are: {context} {format_instructions}
    Question: {question}
    Helpful Answer:"""

    template = ChatPromptTemplate.from_messages(
        [
            SystemMessage(
                content=(
                    "You are a helpful assistant that answers question from the provided context which are retrieved from research papers"
                )
            ),
            HumanMessagePromptTemplate.from_template(sample),
        ]
    )

    #initialize our LLM
    llm = ChatOpenAI(model_name=openai_model_name)

    #We ask GPT to answer our questions, which are saved in query_collection.
    for question_key, question_value in query_collection.items():

        # If the answer to 'llm' is 'NO', don't run the query for 'llm_name'.
        # The check is tied to the 'llm_name' key itself, so that no other question
        # is skipped by accident if the order of the questions changes.
        if question_key == 'llm_name' and answers.get('llm', '').strip().lower() == 'no':
            answers['llm_name'] = '[]'
            print('skipping llm_name call..')
            continue

        #For each question, retrieve relevant chunks, and ignore the duplicates.
        full_context = []
        for sub_query in question_value: #question_value contains multiple questions
            sub_query_documents = vectorstore.similarity_search(sub_query, k=k)
            #filter out the duplicates
            for each_docs in sub_query_documents:
                if each_docs not in full_context:
                    full_context.append(each_docs)

        #Choose which schema to use for our questions, question specific schema is chosen
        question_schema = schemas[question_key]
        output_parser = StructuredOutputParser.from_response_schemas(question_schema)

        #Turn the schema into format instructions that are added to the prompt.
        format_instructions = output_parser.get_format_instructions()

        #get the original question
        main_question = question_types[question_key]

        #Ask question to our LLM, and save answer.
        #NOTE: full_context is passed as the list of retrieved documents and is
        #interpolated into the prompt as-is.
        answer = llm(template.format_messages(context=full_context, question=main_question, format_instructions=format_instructions))

        print(answer.content)

        #save each answer in our dictionary called answers
        try:
            answers[question_key] = str(output_parser.parse(answer.content)["answer"])
        except Exception:
            # The response did not follow the requested format: keep the raw text.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt propagate.
            answers[question_key] = str(answer.content)

    return answers

The code below runs the Question Answering task for each row in our dataframe and saves the results for each question in their respective columns in the pandas dataframe.



**NOTE: The parameter to change/experiment with here is the `k` value. If you're using a model with a larger context length, such as gpt-4-32k, then increasing the `k` value to 20-25 yields better results. We've also found that our last question, `evaluate_patient_trial`, requires a lot of context to be answered properly, whereas the other questions can be answered quite easily with a normal value of `k`=10.**



In [63]:
def run(query_collection, schemas, k=10):
    """ Function performing the Question Answering process for all the datafiles.

    Parameters:
    query_collection (dict {str: list[str]}):  Collection of multiple questions mapping each question type to
                                               the corresponding sub-questions generated above in Step 3, Part 1
    schemas (dict {str: list[ResponseSchema]}):  Collection mapping each question type to the desired Output Schema
    k (int) :  The number of relevant chunks of text to be retrieved for each of the queries.

    Output:

    Returns nothing. Saves all the results in our pandas dataframe `df`, in one column per key of
    `question_types`. Rows whose processing raised an error get an empty string in every answer column.
    """

    #Create (or reset) one column for each key in our question_types.
    for key in question_types:
        df[key] = None

    #Perform question answering and save it in results dictionary
    results = dict()
    for index in df.index:
        try:
            print(f'---------------------- processing index {index} -------------------------')
            results[index]  = question_answer(query_collection, index, schemas, k = k)

            #Time sleeping to avoid rate limit error.
            time.sleep(20)
        except Exception as e:
            #Best effort: report the failure and continue with the next row.
            print('Error!', e)

    #save the answers from results dictionary to pandas dataframe, in their specific column
    for index in df.index:
        #Rows that failed above have no entry in results; default to an empty dict
        #(not a string) so that the .get() lookups below still work.
        result_by_index = results.get(index, {})

        #get all the keys from question_types
        for key in question_types:
            df.at[index, key] = result_by_index.get(key, '')


In [64]:
#Run the Question Answering task on every row, retrieving k=10 chunks per query.
run(query_collection=query_collection, schemas=schemas, k=10)

---------------------- processing index 1 -------------------------
```json
{
	"answer": "NO"
}
```
```json
{
	"answer": "YES"
}
```
```json
{
	"answer": ["BERT", "Transformer", "LSTM", "match-LSTM", "SPINN", "Word-by-word Attention"]
}
```
```json
{
	"answer": "YES"
}
```
```json
{
	"answer": ["Alzheimer’s disease", "heart failure", "idiopathic pulmonary fibrosis", "cancer", "diabetes"]
}
```
```json
{
	"answer": "The patient-trial matching system described in the paper, DeepEnroll, positively impacted the diversity of subjects participating in the trial. It outperformed the best state-of-the-art baselines by up to 12.4% in average F1 and 6.8% in PR-AUC, demonstrating improved patient enrollment for trials. Additionally, it showed minimal performance reduction when evaluating patient-trial matching for rare diseases, indicating its effectiveness in recruiting suitable patients for challenging cases. Therefore, DeepEnroll contributed to enhancing the diversity of subjects participating

# Post-processing


In [65]:
#Display the dataframe, which now holds one answer column per question type.
df

Unnamed: 0_level_0,Rank,FullTextLink,FullText,Title,Link,FullAbstract,PublicationDate,novel,novel_source,AI_ML_source,...,AI_ML,NLP,Resolution_AI_ML,Resolution_NLP,review,llm,llm_name,structured_data,list_of_medical_conditions,evaluate_patient_trial
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,https://arxiv.org/abs/2001.08179,Zhang_2020.pdf,DeepEnroll: patient-trial matching with deep e...,https://dl.acm.org/doi/abs/10.1145/3366423.338...,Clinical trials are essential for drug develop...,2020-04-20 00:00:00,YES,,"To address these challenges, we proposed a cro...",...,YES,YES,YES,YES,NO,YES,"['BERT', 'Transformer', 'LSTM', 'match-LSTM', ...",YES,"['Alzheimer’s disease', 'heart failure', 'idio...",The patient-trial matching system described in...
2,3,https://arxiv.org/abs/2006.08765,Gao_2020.pdf,COMPOSE: Cross-modal pseudo-siamese network fo...,https://dl.acm.org/doi/abs/10.1145/3394486.340...,Clinical trials play important roles in drug d...,2020-06-15 00:00:00,YES,,"In this paper, we proposed CrOss-Modal PseudO-...",...,YES,YES,YES,YES,NO,YES,"['BERT', 'GloVe']",YES,"['chronic pain', 'chronic obstructive pulmonar...",The patient-trial matching system described in...



### Removing brackets and quotes

The code below removes the brackets and quotes from the **`llm_name`** and **`list_of_medical_conditions`** columns of the dataframe. We do this by taking the value already saved in each of those columns, converting it to a Python list using `ast.literal_eval()`, and then joining the items of that list into a single string separated by ', '.

For example:

The value that was initially

    ['BERT', 'Clinical BERT']

will be converted to

    BERT, Clinical BERT

**NOTE: An entry that held an empty list `[]` will become an empty value, because the list contains no items and the brackets are stripped.**

In [66]:
def _list_literal_to_text(value):
    """Turn a stringified list such as "['a', 'b']" into the plain text 'a, b'.

    Anything that is not a string, or a string that does not parse as a Python
    literal (for example '', free text, or an already converted 'a, b'), is
    returned unchanged, so this cell can be re-run without raising.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        return value
    if isinstance(parsed, (list, tuple)):
        #str() guards against non-string items, which str.join would reject.
        return ', '.join(str(item) for item in parsed)
    if isinstance(parsed, str):
        #A single quoted name: strip the quotes instead of joining its characters.
        return parsed
    return value


#make a string separated by comma for every entry of the two list-valued columns
for column in ('llm_name', 'list_of_medical_conditions'):
    df[column] = df[column].map(_list_literal_to_text)


In [67]:
#Display the dataframe after llm_name and list_of_medical_conditions were converted to plain text.
df

Unnamed: 0_level_0,Rank,FullTextLink,FullText,Title,Link,FullAbstract,PublicationDate,novel,novel_source,AI_ML_source,...,AI_ML,NLP,Resolution_AI_ML,Resolution_NLP,review,llm,llm_name,structured_data,list_of_medical_conditions,evaluate_patient_trial
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,https://arxiv.org/abs/2001.08179,Zhang_2020.pdf,DeepEnroll: patient-trial matching with deep e...,https://dl.acm.org/doi/abs/10.1145/3366423.338...,Clinical trials are essential for drug develop...,2020-04-20 00:00:00,YES,,"To address these challenges, we proposed a cro...",...,YES,YES,YES,YES,NO,YES,"BERT, Transformer, LSTM, match-LSTM, SPINN, Wo...",YES,"Alzheimer’s disease, heart failure, idiopathic...",The patient-trial matching system described in...
2,3,https://arxiv.org/abs/2006.08765,Gao_2020.pdf,COMPOSE: Cross-modal pseudo-siamese network fo...,https://dl.acm.org/doi/abs/10.1145/3394486.340...,Clinical trials play important roles in drug d...,2020-06-15 00:00:00,YES,,"In this paper, we proposed CrOss-Modal PseudO-...",...,YES,YES,YES,YES,NO,YES,"BERT, GloVe",YES,"chronic pain, chronic obstructive pulmonary di...",The patient-trial matching system described in...


## Save to excel

Finally, we archive the dataframe by saving it as an excel file.

In [68]:
#Make sure the output folder exists; to_excel does not create missing directories.
os.makedirs('results', exist_ok=True)

#The 'index' labels are only a helper for referring to rows inside this notebook,
#so they are left out of the file via index=False. The earlier `df.drop('index')`
#searched for a *row* labelled 'index' and discarded its result, so it never did this.
df.to_excel('results/FullText_reviewer_GPT4.xlsx', index=False)