In [None]:
!ls ../../data/systematic_review_papers

In [26]:
import sys
sys.path.append('../..')

from src.index_files import *

from langchain_text_splitters import RecursiveCharacterTextSplitter
import pymupdf4llm

In [3]:
# Evidence-based practices: maps each practice's abbreviation to a
# (full practice name, description) tuple. The abbreviations are the same
# codes used for the "EBP" field of the example dataset records, and the
# (name, description) pairs are rendered into `ebp_description_str`.
ebp_description_dict = {
    "ABI": ("Antecedent-Based Interventions", "Arrangement of events or circumstances that precede an activity or demand in order to increase the occurrence of a behavior or lead to the reduction of the challenging/interfering behaviors."), 
    "AAC": ("Augmentative and Alternative Communication", "Interventions using and/or teaching the use of a system of communication that is not verbal/vocal which can be aided (e.g., device, communication book) or unaided (e.g., sign language)"), 
    "BMI": ("Behavioral Momentum Intervention", "The organization of behavior expectations in a sequence in which low probability, or more difficult, responses are embedded in a series of high probability, or less effortful, responses to increase persistence and the occurrence of the low probability responses."), 
    "CBIS": ("Cognitive Behavioral/Instructional Strategies", "Instruction on management or control of cognitive processes that lead to changes in behavioral, social, or academic behavior."), 
    "DR": ("Differential Reinforcement of Alternative, Incompatible, or Other Behavior", "A systematic process that increases desirable behavior or the absence of an undesirable behavior by providing positive consequences for demonstration/non-demonstration of such behavior. These consequences may be provided when the learner is: a) engaging in a specific desired behavior other than the undesirable behavior (DRA), b) engaging in a behavior that is physically impossible to do while exhibiting the undesirable behavior (DRI), or c) not engaging in the undesirable behavior (DRO)."), 
    "DI": ("Direct Instruction", "A systematic approach to teaching using a sequenced instructional package with scripted protocols or lessons. It emphasizes teacher and student dialogue through choral and independent student responses and employs systematic and explicit error corrections to promote mastery and generalization."), 
    "DTT": ("Discrete Trial Training", "Instructional approach with massed or repeated trials with each trial consisting of the teacher's instruction/presentation, the child's response, a carefully planned consequence, and a pause prior to presenting the next instruction."), 
    "EXM": ("Exercise and Movement", "Interventions that use physical exertion, specific motor skills/techniques, or mindful movement to target a variety of skills and behaviors."), 
    "EXT": ("Extinction", "The removal of reinforcing consequences of a challenging behavior in order to reduce the future occurrence of that behavior."), 
    "FBA": ("Functional Behavioral Assessment", "A systematic way of determining the underlying function or purpose of a behavior so that an effective intervention plan can be developed."), 
    "FCT": ("Functional Communication Training", "A set of practices that replace a challenging behavior that has a communication function with more appropriate and effective communication behaviors or skills."), 
    "MD": ("Modeling", "Demonstration of a desired target behavior that results in use of the behavior by the learner and that leads to the acquisition of the target behavior."), 
    "MMI": ("Music-Mediated Intervention", "Intervention that incorporates songs, melodic intonation, and/or rhythm to support learning or performance of skills/behaviors. It includes music therapy, as well as other interventions that incorporate music to address target skills."), 
    "NI": ("Naturalistic Intervention", "A collection of techniques and strategies that are embedded in typical activities and/or routines in which the learner participates to naturally promote, support, and encourage target skills/behaviors."), 
    "PII": ("Parent-Implemented Intervention", "Parent delivery of an intervention to their child that promotes their social communication or other skills or decreases their challenging behavior."), 
    "PBII": ("Peer-Based Instruction and Intervention", "Intervention in which peers directly promote autistic children's social interactions and/or other individual learning goals, or the teacher/other adult organizes the social context (e.g. play groups, social network groups, recess) and when necessary provides support (e.g., prompts, reinforcement) to the autistic children and their peer to engage in social interactions."), 
    "PP": ("Prompting", "Verbal, gestural, or physical assistance given to learners to support them in acquiring or engaging in a targeted behavior or skill."), 
    "R": ("Reinforcement", "The application of a consequence following a learner's use of a response or skills that increases the likelihood that the learner will use the response/skills in the future."), 
    "RIR": ("Response Interruption/Redirection", "The introduction of a prompt, comment, or other distractors when an interfering behavior is occurring that is designed to divert the learner's attention away from the interfering behavior and results in its reduction."), 
    "SM": ("Self-Management", "Instruction focusing on learners discriminating between appropriate and inappropriate behaviors, accurately monitoring and recording their own behaviors, and rewarding themselves for behaving appropriately."), 
    "SI": ("Sensory Integration", "Interventions that target a person's ability to integrate sensory information (visual, auditory, tactile, proprioceptive, and vestibular) from their body and environment in order to respond using organized and adaptive behavior."), 
    "SN": ("Social Narratives", "Interventions that describe social situations in order to highlight relevant features of a target behavior or skill and offer examples of appropriate responding."), 
    "SST": ("Social Skills Training", "Group or individual instruction designed to teach learners ways to appropriately and successfully participate in their interactions with others."), 
    "TA": ("Task Analysis", "A process in which an activity or behavior is divided into small, manageable steps in order to assess and teach the skill. Other practices, such as reinforcement, video modeling, or time delay, are often used to facilitate acquisition of the smaller steps."), 
    "TAII": ("Technology-Aided Instruction and Intervention", "Instruction or intervention in which technology is the central feature and the technology is specifically designed or employed to support the learning or performance of a behavior or skill for the learner."), 
    "TD": ("Time Delay", "A practice used to systematically fade the use of prompts during instructional activities by using a brief delay between the initial instruction and any additional instructions or prompts."), 
    "VM": ("Video Modeling", "A video-recorded demonstration of the targeted behavior or skill shown to the learner to assist learning in or engaging in a desired behavior or skill."), 
    "VS": ("Visual Supports", "A visual display that supports the learner engaging in a desired behavior or skills independent of additional prompts."), 
}

# Diagnosis labels offered as answer choices by the (currently disabled)
# "diagnosis" entry of attr2format, one label per line of the prompt.
diagnosis = [
    "autism spectrum disorder",
    "autism",
    "Asperger syndrome",  # fixed: the leading "A" was missing ("sperger syndrome")
    "pervasive developmental disorder",
    "pervasive developmental disorder-not otherwise specified",
    "high-functioning autism",
]

# Co-occurring condition categories; these are the answer choices listed by
# the (currently disabled) "comorbidity" entry of attr2format.
co_conditions = [
    'intellectual disability',
    'genetic syndrome (e.g., Fragile X, Down syndrome)',
    'seizure disorder',
    'mental health condition (e.g., anxiety, depression, obsessive compulsive disorder)',
    'attention deficit/hyperactivity disorder (i.e. ADHD)',
    'physical disability (e.g., cerebral palsy, orthopedic impairment)',
    'hearing impairment',
    'visual impairment',
    'learning disability',
]

# Race categories; these are the answer choices listed by the (currently
# disabled) "race" entry of attr2format.
races = [
    'African American/Black',
    'Asian',
    'Hispanic/Latino',
    'Middle Eastern',
    'Native American',
    'Native Hawaiian/Pacific Islander',
    'White',
    'Two or more/Multi-racial',
    'Other',
    'Not specified',
    'Not reported',
]

# Outcome categories mapped to their descriptions. The pairs are rendered into
# `outcome_str`, which the (currently disabled) "outcome" prompt presents to
# the model as the set of allowed outcome types.
outcome_area = {
    "Academic/Pre-academic": "Outcomes broadly related to performance on tasks typically taught and used in school settings",
    "Adaptive/Self-help": "Outcomes related to independent living skills and personal care skills",
    # Spelling fixed ("Interferring" -> "Interfering"): this label is shown to
    # the model verbatim, so a misspelled key ends up as a misspelled category.
    "Challenging/Interfering behavior": "Outcomes related to decreasing or eliminating behaviors that interfere with the individual's ability to learn",
    "Cognitive": "Outcomes related to performance on measures of intelligence, executive function, problem solving, information processing, reasoning, theory of mind, memory, creativity, or attention",
    "Communication": "Outcomes related to ability to express wants, needs, choices, feelings, or ideas",
    "Joint attention": "Outcomes related to behaviors needed for sharing interests and/or experiences",
    "Mental health": "Outcomes related to emotional well-being",
    "Motor": "Outcomes related to movement or motion, including both fine and gross motor skills, or related to sensory system/sensory functioning",
    "Play": "Outcomes related to the use of toys or leisure materials",
    "Self-determination": "Outcomes related to self-directed actions in setting and achieving goals or making decisions and problem-solving",
    "School readiness": "Outcomes related to task performance versus task content or curriculum area (e.g., on task behavior, engagement)",
    "Social": "Outcomes related to skills needed to interact with others",
    "Vocational": "Outcomes related to employment or employment preparation or relate to technical skills required for a specific job"
}

In [4]:
# Seed the global RNG so the shuffled practice order below is repeatable.
random.seed(0)

# One "Practice name / Description" entry per practice (the abbreviation keys
# are not part of the text). Sorting first gives the shuffle a fixed starting
# order that does not depend on the dict's insertion order.
ebp_description_list = sorted(
    f"Practice name: {name}\nDescription: {desc}"
    for name, desc in ebp_description_dict.values()
)
random.shuffle(ebp_description_list)
ebp_description_str = "\n\n".join(ebp_description_list)

# Outcome types keep the dict's own order; entries are separated by a blank line.
outcome_str = "\n\n".join(
    f"Outcome type: {area}\nDescription: {meaning}"
    for area, meaning in outcome_area.items()
)

In [5]:
# Question put to the LLM for each extracted attribute. Only the uncommented
# entries are active; the disabled ones are kept for reference. The dict's
# order is the order in which prompts are built and sent.
attr2question = {
    "age": "What are the ages and genders of the studied participants with autism in the experiment?",
    "variable_definition": "Extract all the text that conceptually introduces the key variables associated with the practice design.",
    "research_gap": "Extract all the text that explicitly identifies or discusses the research gap addressed in the paper.",
    # --- disabled questions ---
    # "EBP": "Which type of Evidence-Based Practice is mainly studied in this paper?",
    # "diagnosis": "What are the diagnoses of the studied participants in the experiment?",
    # "comorbidity": "What are the comorbidities of the studied participants with autism in the experiment?",
    # "gender": "What are the genders of the studied participants with autism in the experiment?",
    # "race": "What are the races of the studied participants with autism in the experiment?",
    # "ethnicity": "What are the ethnicities of the participants in the experiment?",
    # "nationality": "What are the nationalities of the participants in the experiment?",
    # "research_strategy": "What type of research strategy is implemented in the experiment?",
    # "outcome": "What are the outcomes of the participants reported in the experiment?"
}

# Answer-format instruction appended after each question. Every active key of
# attr2question needs an entry here, because `prompts` indexes this dict by
# attribute. NOTE: "ethnicity" and "nationality" have no entry (not even a
# disabled one), so enabling those questions as-is would raise KeyError.
attr2format = {
    "age": "For each participant, return the age and gender reported in the paper.",
    "variable_definition": "Separate each extracted piece with '\n\n'.",
    "research_gap": "Separate each extracted piece with '\n\n'.",
    # --- disabled formats ---
    # "EBP": "The answer should be one of the following practices. The description of each practice is provided.\n\n" + ebp_description_str,
    # "diagnosis": "The answer should be one of the following symptoms:\n\n" + '\n'.join(diagnosis),
    # "comorbidity": "If the comorbidities belong to following conditions, reply the conditions of the comorbidities.\nIf the comorbidity does not belong to any of the conditions, reply 'Other'.\nIf the participants have no comorbidity, reply 'No comorbidity'.\nIf the comorbidity is not reported, reply 'Not reported'.\n\nConditions:\n" + '\n'.join(co_conditions),
    # "gender": "If the genders of the participants are not reported, reply 'Not reported'.",
    # "race": "The answer should be one of the following choices:\n\n" + '\n'.join(races),
    # "research_strategy": "The answer should be either 'group design' or 'single case design'.",
    # "outcome": "The answer should belong to following outcome types. The description of each type is provided.\n\n" + outcome_str
}

# Per-attribute prompt template. The only placeholder left unfilled is
# {context}; callers supply it later via `prompts[attr].format(context=...)`.
# (A context-free variant would simply drop the first piece of the join.)
prompts = {
    attr: ''.join([
        'Answer the question based only on the following context:\n\nContext:\n\n{context}\n\n',
        'Answer the question based on the above context: ',
        question,
        '\n',
        attr2format[attr],
    ])
    for attr, question in attr2question.items()
}

In [None]:
prompts

Answer the question based on the above context: What are the ages and genders of the studied participants with autism in the experiment?\nFor each participant, return the age and gender reported in the paper.

Answer the question based on the above context: Describe the detailed implementation of the evidence-based practice proposed by the paper.\nOrganize your answers with bullet points.

Answer the question based on the above context: Describe the novel ideas proposed by the paper.\nOrganize your answers with bullet points.

In [7]:
# dataset = [
#     {"file": "haq-et-al-2017.pdf", "EBP": "AAC", "age": [10], "diagnosis": ["ASD"], "comorbidity": [], "gender": ["male"], "race": [], "research_strategy": "single case design", "outcome": []},
#     {"file": "brogan-et-al-2017.pdf", "EBP": "DR", "age": [5,5,6,6,5,6,7,3,5,5,4,3,4], "diagnosis": ["ASD", "Speech delay"], "comorbidity": ["Not reported"], "gender": ["Not reported"], "race": ["Not reported"], "research_strategy": "single case design", "outcome": []},
#     {"file": "enloe-rapp-2013.pdf", "EBP": "ABI", "age": [8,13,6], "diagnosis": ['autism'], "comorbidity": ["hearing impairment"], "gender": ["Both male and female"], "race": [], "research_strategy": "single case design", "outcome": []},
#     {"file": "leaf-et-al-2014.pdf", "EBP": "DTT", "age": [5,4,5,6], "diagnosis": ['autistic disorder'], "comorbidity": [], "gender": ['Male'], "race": [], "research_strategy": "single case design", "outcome": []},
#     {"file": "jung-et-al-2008.pdf", "EBP": "BMI", "age": [5,6,6], "diagnosis": ["autism spectrum disorder", "PDD-NOS"], "comorbidity": [], "gender": ["Male"], "race": [], "research_strategy": "single case design", "outcome": []},
#     {"file": "neely-et-al-2014.pdf", "EBP": "EXM", "age": [7,8], "diagnosis": ["ASD"], "comorbidity": ['intellectual disability'], "gender": ["Both male and female"], "race": [], "research_strategy": "single case design", "outcome": []},
#     {"file": "howorth-et-al-2016.pdf", "EBP": "CBIS", "age": [10,11], "diagnosis": ["ASD"], "comorbidity": [], "gender": ["Male"], "race": [], "research_strategy": "single case design", "outcome": []},
#     {"file": "drasgow-et-al-2015.pdf", "EBP": "EXT", "age": [4,3,3], "diagnosis": ["autism"], "comorbidity": ['genetic syndrome (e.g., Fragile X, Down syndrome)'], "gender": ["Male"], "race": ["White"], "research_strategy": "single case design", "outcome": []},
#     {"file": "kamps-et-al-2016.pdf", "EBP": "DI", "age": [62,82], "diagnosis": ['ASD'], "comorbidity": [], "gender": ["Both male and female"], "race": ['White', 'African-American', 'Hispanic', 'Asian', 'other'], "research_strategy": "group design", "outcome": []},
#     {"file": "kodak-et-al-2011.pdf", "EBP": "FBA", "age": [8,4,5,3,4,10,3,4,7,4,4], "diagnosis": ["autism"], "comorbidity": [], "gender": [], "race": [], "research_strategy": "single case design", "outcome": []},
# ]

In [6]:
# Load the per-paper records of the systematic-review set. The experiment
# cells read each record's 'file' key to locate the PDF in the same directory.
# NOTE(review): `read_json` is not imported explicitly in this notebook;
# presumably it arrives through `from src.index_files import *` — confirm.
dataset = read_json('../../data/systematic_review_papers/dataset.json')

# Experiments

In [9]:
# embeder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': False})
# embeder = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
# embeder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
# embeder = HuggingFaceBgeEmbeddings(model_name="intfloat/e5-mistral-7b-instruct", model_kwargs={'device': 'cuda:1'}, query_instruction='Instruct: Given a search query, retrieve relevant passages that answer the query\nQuery: ')


In [None]:
# Full-text baseline: every question is answered from the whole article
# (cropped to a token budget) instead of from retrieved chunks.
f = Factory(llm_name='meta-llama/Meta-Llama-3.1-8B-Instruct')
results = defaultdict(dict)
llm_tokenizer = AutoTokenizer.from_pretrained(f.llm_name)

for test_data in tqdm(dataset):
    test_file = test_data['file']
    pdf_path = os.path.join('../../data/systematic_review_papers/', test_file)
    article = pymupdf4llm.to_markdown(pdf_path)  # page_chunks=True intentionally left off

    # Replace each '-----' separator block with a space (presumably the page
    # break emitted by pymupdf4llm), then squeeze every run of whitespace
    # inside a paragraph down to a single space.
    passages = article.replace('\n\n\n-----\n\n', ' ').split('\n\n')
    concated_article = '\n\n'.join(' '.join(passage.split()) for passage in passages)

    # Keep only the first 28000 LLM tokens of the article.
    # NOTE(review): the text is re-decoded even when it is under the limit;
    # confirm the tokenizer round trip leaves such text unchanged.
    token_ids = llm_tokenizer.encode(concated_article, add_special_tokens=False)
    cropped_article = llm_tokenizer.decode(token_ids[:28000])

    # Index the filled-in prompts by their text so each generation can be
    # mapped back to the attribute it answers.
    prompt2attr = {}
    for attr, question in attr2question.items():
        prompt2attr[prompts[attr].format(context=cropped_article)] = attr

    # One batched generate call per paper; generations come back in prompt order.
    test_prompts = list(prompt2attr)
    messages = [[HumanMessage(content=full_prompt)] for full_prompt in test_prompts]
    generations = f.llm.generate(messages, max_tokens=2000).generations
    for prompt, gen in zip(test_prompts, generations):
        results[test_file][prompt2attr[prompt] + '_gen'] = gen[0].text

# Results for all papers are written once, after the loop has finished.
model_tag = f.llm_name.split("/")[-1]
write_json(f'sys_review_openllm_{model_tag}.json', results)

In [None]:
# Embedding models compared in the retrieval experiment; each one gets its own
# pass over the dataset and its own output file. All three objects are
# constructed here, up front, and all target 'cuda:1'.
# NOTE(review): if construction loads the weights, the three models (one of
# them a 7B model) sit on the same GPU at once — confirm this fits in memory.
embeders = [
    HuggingFaceBgeEmbeddings(model_name="intfloat/e5-mistral-7b-instruct", model_kwargs={'device': 'cuda:1'}, query_instruction='Instruct: Given a search query, retrieve relevant passages that answer the query\nQuery: '),
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': False}),
    HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5", model_kwargs={'device': 'cuda:1'}, encode_kwargs={'normalize_embeddings': True})
]
# Retrieval experiment: for every embedding model, split each paper into
# chunks, retrieve the top-ranked chunks per question, and let the LLM answer
# from those chunks only. One JSON file is written per (ret_num, LLM, embedder).
for embeder in embeders:
    # A fresh Factory per embedder; it is the source of the embedder, its
    # tokenizer (embed_tokenizer) and the LLM used below.
    f = Factory(embeder=embeder, llm_name='meta-llama/Meta-Llama-3.1-8B-Instruct')


    # Chunk sizes are measured in tokens of the embedder's tokenizer (see
    # length_function), not in characters: 300 tokens per chunk, 20 overlap.
    # The lambda reads `f` at call time; the splitter is rebuilt on every
    # embedder iteration, so it uses the tokenizer of the current Factory.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=20,
        length_function=lambda x: len(f.embed_tokenizer.encode(x, add_special_tokens=False)),
        # Separators are listed from coarsest (paragraph break) to finest
        # (empty string).
        separators=[
            "\n\n",
            "\n",
            ".",
            ",",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            " ",
            "",
        ],
        # Existing args
    )
    
    # NOTE: `results` is created once per embedder, outside the ret_num loop,
    # so it is shared by every ret_num value. With more than one ret_num the
    # same dict is overwritten in place and written out again each time.
    results = defaultdict(dict)
    # ret_num = number of retrieved chunks placed in the prompt context.
    for ret_num in [10]:
        print('ret_num:', ret_num)
        for test_data in tqdm(dataset):
            test_file = test_data['file']
            print(test_file)
            article = pymupdf4llm.to_markdown(os.path.join('../../data/systematic_review_papers/', test_file))#, page_chunks=True)
            # Replace the '-----' separator blocks with a space, then collapse
            # all whitespace inside each paragraph to single spaces.
            concated_article = '\n\n'.join([' '.join(passage.split()) for passage in article.replace('\n\n\n-----\n\n', ' ').split('\n\n')])
            chunks = text_splitter.create_documents([concated_article])
            # for cid, chunk in enumerate(chunks):
            #     chunk.metadata['cid'] = str(cid)
            print(len(chunks))

            # Index this paper's chunks; ids are the chunk positions as strings.
            db_chroma = Chroma.from_documents(chunks, f.embeder, ids=[str(i) for i in range(len(chunks))])
            
            # Maps each filled-in prompt text back to the attribute it answers.
            prompt2attr = dict[str, str]()
            for attr, question in attr2question.items():
                # docs_chroma = db_chroma.similarity_search_with_score('Instruct: Given a search query, retrieve relevant passages that answer the query.\nQuery: ' + question, k=ret_num)
                # Every chunk is requested (k=len(chunks)) and the list is then
                # cut to the first ret_num hits.
                # NOTE(review): looks deliberate (a ranking over all chunks
                # rather than a direct top-k query) — confirm; k=ret_num
                # would be cheaper if the results are the same.
                docs_chroma = db_chroma.similarity_search_with_score(question, k=len(chunks))[:ret_num]
                # docs_chroma.sort(key=lambda x: x[0].metadata['cid'])
                # Keep the retrieved text and score next to the generation so
                # retrieval quality can be inspected afterwards.
                results[test_file][f'{attr}_retrieve_context'] = [(doc.page_content, _score) for doc, _score in docs_chroma]
                # Context is the retrieved chunks in ranked order, blank-line separated.
                context_text = "\n\n".join([doc.page_content for doc, _score in docs_chroma])
                prompt2attr[prompts[attr].format(context=context_text)] = attr
            
            # Drop the collection before generating — presumably so that the
            # chunks of one paper cannot leak into the next paper's index.
            db_chroma.delete_collection()
            del db_chroma
            # One batched generate call per paper; zip pairs each prompt with
            # its generation, and gen[0] is the first candidate for it.
            test_prompts = list(prompt2attr)
            for prompt, gen in zip(test_prompts, f.llm.generate([[HumanMessage(content=full_prompt)] for full_prompt in test_prompts], max_tokens=2000).generations):
                results[test_file][f'{prompt2attr[prompt]}_gen'] = gen[0].text
            # break
                
        # File name encodes ret_num, the LLM name and the embedder name
        # (last path component of each model id).
        write_json(f'sys_review_ret_{ret_num}_{f.llm_name.split("/")[-1]}_{f.embeder.model_name.split("/")[-1]}.json', results)

In [12]:
sys_review_chatgpt = {
    "haq-et-al-2017.pdf": {
        "EBP_gen": '''The evidence-based practice studied in the paper is **Functional Communication Training (FCT)**. FCT is a set of practices designed to replace challenging behaviors, which serve a communication function, with more appropriate and effective communication behaviors or skills. In this study, FCT was used to teach participants with autism spectrum disorder (ASD) alternative communication responses to reduce challenging behaviors such as aggression and self-injury [oai_citation:1,haq-et-al-2017.pdf](file-service://file-nzak4c6Ukc3C0etfqRjyxx4L).''',
        "age_gen": '''The participants in the experiment were both 10 years old [oai_citation:1,haq-et-al-2017.pdf](file-service://file-nzak4c6Ukc3C0etfqRjyxx4L).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are **autism spectrum disorder** [oai_citation:1,haq-et-al-2017.pdf](file-service://file-nzak4c6Ukc3C0etfqRjyxx4L).''',
        "comorbidity_gen": '''The comorbidities of the participants in the experiment are **Not reported** [oai_citation:1,haq-et-al-2017.pdf](file-service://file-nzak4c6Ukc3C0etfqRjyxx4L).''',
        "gender_gen": '''The genders of the participants in the experiment are reported as both being male. Specifically, both participants were 10-year-old boys with parent-reported diagnoses of Autism Spectrum Disorder (ASD) [oai_citation:1,haq-et-al-2017.pdf](file-service://file-mFXbl8jIK8PWJvGejgWMKWxP).''',
        "race_gen": '''The races of the participants in the experiment are **Not reported** [oai_citation:1,haq-et-al-2017.pdf](file-service://file-nzak4c6Ukc3C0etfqRjyxx4L).''',
        "research_strategy_gen": '''The research strategy implemented in the experiment is **single case design** [oai_citation:1,haq-et-al-2017.pdf](file-service://file-nzak4c6Ukc3C0etfqRjyxx4L).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment belong to the following types:\n\n- **Challenging/Interfering behavior**: The study focused on reducing challenging behaviors such as aggression, disruption, and self-injury.\n- **Communication**: The study aimed to increase functional communication through functional communication training (FCT) [oai_citation:1,haq-et-al-2017.pdf](file-service://file-nzak4c6Ukc3C0etfqRjyxx4L).'''
    },
    "brogan-et-al-2017.pdf": {
        "EBP_gen": '''The paper **"Further Analysis of the Predictive Effects of a Free-Operant Competing Stimulus Assessment on Stereotypy"** focuses on **Differential Reinforcement of Alternative Behavior (DRA)**, particularly in the context of reducing stereotypy. In Experiment 5, a DRA procedure is used where participants earn access to a high-preference, low-stereotypy (HP-LS) stimulus contingent upon correct responses during discrete-trial training sessions. This suggests that the primary practice studied in the paper falls under **Reinforcement**, as the interventions rely on reinforcing desired behaviors to reduce stereotypy [oai_citation:1,brogan-et-al-2017.pdf](file-service://file-ulwBdjGNe9n8MqxNvyQ0jbTl).''',
        "age_gen": '''The ages of the participants in the experiment are reported as follows:\n\n- Aiden: 5 years\n- Max: 5 years\n- Jeremy: 6 years\n- Bentley: 6 years\n- Casey: 5 years\n- Caden: 6 years\n- Gavin: 7 years\n- Mark: 3 years\n- Steven: 5 years\n- James: 5 years\n- Nick: 4 years\n- Alex: 3 years\n- Sam: 4 years [oai_citation:1,brogan-et-al-2017.pdf](file-service://file-ulwBdjGNe9n8MqxNvyQ0jbTl).''',
        "diagnosis_gen": '''Based on the document content provided, all the participants in the experiments, except one, were diagnosed with **autism spectrum disorder (ASD)**. Specifically, 12 out of 13 participants were diagnosed with ASD, while one participant, Alex, was diagnosed with a **speech delay** [oai_citation:1,brogan-et-al-2017.pdf](file-service://file-w5NXAl6V226tEteVxYLOk5Fs). \n\nTherefore, the primary diagnosis for the participants in the experiment would be **autism spectrum disorder**.''',
        "comorbidity_gen": '''The document does not explicitly mention comorbidities that align with the provided conditions. Therefore, based on the search results, the comorbidities of the participants are **not reported**.''',
        "gender_gen": '''Based on the document, the genders of the participants in the experiment are **not reported** [oai_citation:1,brogan-et-al-2017.pdf](file-service://file-w5NXAl6V226tEteVxYLOk5Fs).''',
        "race_gen": '''Based on the document, the races of the participants in the experiment are **not reported** [oai_citation:1,brogan-et-al-2017.pdf](file-service://file-w5NXAl6V226tEteVxYLOk5Fs).''',
        "research_strategy_gen": '''Based on the document, the research strategy implemented in the experiment is **single case design** [oai_citation:1,brogan-et-al-2017.pdf](file-service://file-w5NXAl6V226tEteVxYLOk5Fs).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment are related to **Challenging/Interfering behavior**, as the experiment focused on decreasing or eliminating behaviors such as stereotypy that interfere with the participants' ability to learn [oai_citation:1,brogan-et-al-2017.pdf](file-service://file-w5NXAl6V226tEteVxYLOk5Fs).'''
    },
    "enloe-rapp-2013.pdf": {
        "EBP_gen": '''Based on the details from the paper "Effects of Noncontingent Social Interaction on Immediate and Subsequent Engagement in Vocal and Motor Stereotypy in Children With Autism" by Enloe and Rapp, the type of evidence-based practice studied is **Noncontingent Reinforcement**, which aligns most closely with **Reinforcement** as defined in the provided descriptions.\n\nIn the study, social interaction was used as a form of noncontingent reinforcement to reduce engagement in vocal and motor stereotypy in children with autism. This practice focuses on the application of a consequence (in this case, social interaction) that decreases unwanted behaviors and could increase the likelihood of desired social or communication behaviors in the future [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-9mw2Bqn40eECLtYtAwhWQF4f).''',
        "age_gen": '''The ages of the participants in the experiment are as follows:\n\n- Jessie: 8 years old\n- Buddy: 13 years old\n- George: 6 years old [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-9mw2Bqn40eECLtYtAwhWQF4f).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are as follows:\n\n- Jessie: autism\n- Buddy: autism\n- George: autism [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-9mw2Bqn40eECLtYtAwhWQF4f).''',
        "comorbidity_gen": '''The comorbidities of the participants in the experiment are as follows:\n\n- Jessie: Not reported\n- Buddy: No comorbidity\n- George: Seizure disorder (epilepsy), hearing impairment [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-9mw2Bqn40eECLtYtAwhWQF4f).''',
        "gender_gen": '''The genders of the participants in the study are as follows:\n\n- Jessie: Girl (8 years old)\n- Buddy: Boy (13 years old)\n- George: Boy (6 years old) [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-oMQQYutFjYqWF32MPv7Tp8Mg).''',
        "race_gen": '''The races of the participants in the experiment are **Not reported** [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-9mw2Bqn40eECLtYtAwhWQF4f).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is **single case design** [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-9mw2Bqn40eECLtYtAwhWQF4f).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment fall under the **Challenging/Interfering behavior** and **Motor** outcome types. The study focuses on reducing vocal and motor stereotypy, which are behaviors that interfere with the individual's ability to learn and engage appropriately [oai_citation:1,enloe-rapp-2013.pdf](file-service://file-9mw2Bqn40eECLtYtAwhWQF4f).'''
    },
    "leaf-et-al-2014.pdf": {
        "EBP_gen": '''The paper you uploaded focuses on comparing **Flexible Prompt Fading** and **Error Correction** for teaching children with Autism Spectrum Disorder how to expressively label pictures. This study evaluates two instructional strategies within the framework of **Discrete Trial Teaching (DTT)**.\n\nGiven the list of evidence-based practices, this study fits under the **Prompting** category. Prompting is described as verbal, gestural, or physical assistance provided to support learners in acquiring or engaging in a targeted behavior or skill. Both **Flexible Prompt Fading** and **Error Correction** are prompt-based strategies designed to teach correct responses by providing various levels of assistance based on the learner's needs.\n\nThus, the evidence-based practice studied in this paper is **Prompting**.''',
        "age_gen": '''The ages of the participants in the experiment are reported. The participants ranged in age from 4 to 6 years old:\n\n- Rob: 5 years old\n- Jimmy: 4 years old\n- Billy: 5 years old\n- Kenny: 6 years old [oai_citation:1,leaf-et-al-2014.pdf](file-service://file-VrjVcWpvIqysQFL51rCPcFK7).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are **autism**. All participants were independently diagnosed with **autistic disorder** [oai_citation:1,leaf-et-al-2014.pdf](file-service://file-VrjVcWpvIqysQFL51rCPcFK7).''',
        "comorbidity_gen": '''The comorbidities of the participants in the experiment are **Not reported** [oai_citation:1,leaf-et-al-2014.pdf](file-service://file-VrjVcWpvIqysQFL51rCPcFK7).''',
        "gender_gen": '''The genders of the participants in the experiment are not reported in the document. Therefore, the answer is "Not reported."''',
        "race_gen": '''The races of the participants in the experiment are **Not reported** [oai_citation:1,leaf-et-al-2014.pdf](file-service://file-VrjVcWpvIqysQFL51rCPcFK7).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is **single case design** [oai_citation:1,leaf-et-al-2014.pdf](file-service://file-VrjVcWpvIqysQFL51rCPcFK7).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment belong to the **Communication** outcome type. The experiment focused on teaching children with autism spectrum disorder to expressively label pictures, which is related to their ability to express wants, needs, or ideas [oai_citation:1,leaf-et-al-2014.pdf](file-service://file-VrjVcWpvIqysQFL51rCPcFK7).'''
    },
    "jung-et-al-2008.pdf": {
        "EBP_gen": '''The evidence-based practice studied in this paper is "Behavioral Momentum Intervention." The intervention described involves using high-probability request sequences (a sequence of easy tasks, followed by more difficult ones) to increase social interactions in young children with autism. This approach matches the description of Behavioral Momentum Intervention, where low-probability responses are embedded in a sequence of high-probability responses to increase the persistence and occurrence of the more challenging responses [oai_citation:1,jung-et-al-2008.pdf](file-service://file-HnF6mTjqfydhWacL1shOWhU2).''',
        "age_gen": '''The ages of the participants in the experiment are as follows:\n\n- Eric: 5 years, 3 months\n- Paul: 6 years, 3 months\n- Chris: 6 years, 5 months [oai_citation:1,jung-et-al-2008.pdf](file-service://file-HnF6mTjqfydhWacL1shOWhU2).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are:\n\n- Eric: pervasive developmental disorder-not otherwise specified (PDD-NOS)\n- Paul: pervasive developmental disorder-not otherwise specified (PDD-NOS)\n- Chris: autism [oai_citation:1,jung-et-al-2008.pdf](file-service://file-HnF6mTjqfydhWacL1shOWhU2).''',
        "comorbidity_gen": '''The comorbidities of the participants in the experiment are:\n\n- Eric: intellectual disability\n- Paul: intellectual disability\n- Chris: intellectual disability [oai_citation:1,jung-et-al-2008.pdf](file-service://file-HnF6mTjqfydhWacL1shOWhU2).''',
        "gender_gen": '''The genders of the participants with autism in the study are all male. Specifically, the study reports three participants: Eric, Paul, and Chris, who are all identified as male [oai_citation:1,jung-et-al-2008.pdf](file-service://file-Np92HvlOuAS6Etcs3K7hOE6p).''',
        "race_gen": '''The races of the participants in the experiment are: Not reported [oai_citation:1,jung-et-al-2008.pdf](file-service://file-HnF6mTjqfydhWacL1shOWhU2).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is "single case design" [oai_citation:1,jung-et-al-2008.pdf](file-service://file-HnF6mTjqfydhWacL1shOWhU2).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment fall under the following outcome types:\n\n- Social: The intervention increased the participants' social interactions with their peers.\n- Challenging/Interfering behavior: The intervention decreased disruptive behaviors during social interactions [oai_citation:1,jung-et-al-2008.pdf](file-service://file-HnF6mTjqfydhWacL1shOWhU2).'''
    },
    "neely-et-al-2014.pdf": {
        "EBP_gen": '''The evidence-based practice studied in the paper you uploaded, titled "Effects of Antecedent Exercise on Academic Engagement and Stereotypy During Instruction," is **Exercise and Movement**. \n\nThe study evaluates the effects of physical exertion, specifically antecedent physical exercise, on reducing challenging behavior (stereotypy) and increasing academic engagement in children with autism spectrum disorder (ASD) [oai_citation:1,neely-et-al-2014.pdf](file-service://file-JlXxqN2mTsaL4pEnPm6Q98xw).''',
        "age_gen": '''The ages of the participants in the experiment are reported as follows:\n\n- Ally: 8 years old\n- Chad: 7 years old [oai_citation:1,neely-et-al-2014.pdf](file-service://file-JlXxqN2mTsaL4pEnPm6Q98xw).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are **autism spectrum disorder** [oai_citation:1,neely-et-al-2014.pdf](file-service://file-JlXxqN2mTsaL4pEnPm6Q98xw).''',
        "comorbidity_gen": '''The comorbidities of the participants in the experiment are as follows:\n\n- Ally: **No comorbidity** (no comorbidities reported).\n- Chad: **Intellectual disability** (severe intellectual disability) [oai_citation:1,neely-et-al-2014.pdf](file-service://file-JlXxqN2mTsaL4pEnPm6Q98xw).''',
        "gender_gen": '''The genders of the participants with autism in the experiment are reported as follows:\n\n- One participant, Ally, is an 8-year-old girl.\n- The other participant, Chad, is a 7-year-old boy [oai_citation:1,neely-et-al-2014.pdf](file-service://file-goosfNsbPwXB7fGFu2zJTsrL).''',
        "race_gen": '''The races of the participants in the experiment are **Not reported** [oai_citation:1,neely-et-al-2014.pdf](file-service://file-JlXxqN2mTsaL4pEnPm6Q98xw).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is **single case design** [oai_citation:1,neely-et-al-2014.pdf](file-service://file-JlXxqN2mTsaL4pEnPm6Q98xw).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment belong to the following outcome types:\n\n- **Academic/Pre-academic**: The study measured **academic engagement** during instructional sessions.\n- **Challenging/Interfering behavior**: The study measured **stereotypy** (a type of challenging behavior) and its reduction following antecedent physical exercise [oai_citation:1,neely-et-al-2014.pdf](file-service://file-JlXxqN2mTsaL4pEnPm6Q98xw).'''
    },
    "howorth-et-al-2016.pdf": {
        "EBP_gen": '''The type of Evidence-Based Practice studied in the paper is **Cognitive Behavioral/Instructional Strategies**. This is evident from the use of the TWA (think before reading, think while reading, think after reading) strategy, which teaches students self-regulated strategy development for improving reading comprehension. The strategy explicitly focuses on managing cognitive processes, aligning with the description of Cognitive Behavioral/Instructional Strategies [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-TIvMEU5LGPrAlKd0vSE1ynEE).''',
        "age_gen": '''The ages of the participants in the experiment are reported as 10 and 11 years old [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-TIvMEU5LGPrAlKd0vSE1ynEE).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are **autism spectrum disorder** [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-TIvMEU5LGPrAlKd0vSE1ynEE).''',
        "comorbidity_gen": '''The comorbidities of the participants are **Not reported** [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-TIvMEU5LGPrAlKd0vSE1ynEE).''',
        "gender_gen": '''The genders of the studied participants with autism in the experiment are all male, as the study involved four boys with autism spectrum disorder (ASD) [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-jt3gZhw7yHSpP1aZ9ELCoLM6).''',
        "race_gen": '''The races of the participants in the experiment are **Not reported** [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-TIvMEU5LGPrAlKd0vSE1ynEE).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is **single case design** [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-TIvMEU5LGPrAlKd0vSE1ynEE).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment belong to the **Academic/Pre-academic** outcome type. The study focuses on improving reading comprehension, a task typically taught and used in school settings [oai_citation:1,howorth-et-al-2016.pdf](file-service://file-TIvMEU5LGPrAlKd0vSE1ynEE).'''
    },
    "drasgow-et-al-2015.pdf": {
        "EBP_gen": '''The evidence-based practice studied in this paper is **Functional Communication Training (FCT)**. The paper focuses on replacing existing challenging behaviors used for communication (such as reaching) with more appropriate and effective communication behaviors (mand training with the signs for "More" and "Please") for children with autism and severe language delays. This aligns with FCT, which is designed to replace inappropriate behaviors that serve a communication function with more appropriate alternatives [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-Iiey8jG5rQz5ghLQRZ66RhtQ).''',
        "age_gen": '''The ages of the participants in the experiment are reported as follows:\n\n- Charles: 4 years old\n- David: 3 years old\n- Jacob: 3 years old [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-Iiey8jG5rQz5ghLQRZ66RhtQ).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are **autism**. Specifically, all three children—Charles, David, and Jacob—were diagnosed with autism. Additionally, Jacob also had a diagnosis of Fragile X syndrome [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-Iiey8jG5rQz5ghLQRZ66RhtQ).''',
        "comorbidity_gen": '''The comorbidity reported for one of the participants, Jacob, is **genetic syndrome (Fragile X syndrome)**. There are no other comorbidities mentioned for the other participants [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-Iiey8jG5rQz5ghLQRZ66RhtQ).''',
        "gender_gen": '''The genders of the participants in the study were reported as follows:\n\n- Charles: 4-year-old male\n- David: 3-year-old male\n- Jacob: 3-year-old male [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-opjKFDW6o4hjK9PT5wkPMiVj).''',
        "race_gen": '''The races of the participants in the experiment are reported as **White** [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-Iiey8jG5rQz5ghLQRZ66RhtQ).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is **single case design** [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-Iiey8jG5rQz5ghLQRZ66RhtQ).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment belong to the **Communication** outcome type. The study focused on improving the participants' ability to use new mands (i.e., communication behaviors such as "More" and "Please") to express their needs and replace less socially appropriate behaviors [oai_citation:1,drasgow-et-al-2015.pdf](file-service://file-Iiey8jG5rQz5ghLQRZ66RhtQ).'''
    },
    "kamps-et-al-2016.pdf": {
        "EBP_gen": '''The evidence-based practice studied in the paper "Effects of Reading Mastery as a Small Group Intervention for Young Children with ASD" is **Direct Instruction**.\n\nThe study examines the use of the **Reading Mastery curriculum**, a systematic and sequenced instructional package characterized by scripted protocols. This aligns with the description of Direct Instruction, which emphasizes teacher-student dialogue, error correction, and mastery of content [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-3eqBuYHMi9x2JW28QnNLtJZG).''',
        "age_gen": '''The ages of the participants in the experiment are reported as ranging from **62 to 82 months** [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-3eqBuYHMi9x2JW28QnNLtJZG).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are **autism spectrum disorder** [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-3eqBuYHMi9x2JW28QnNLtJZG).''',
        "comorbidity_gen": '''**Not reported** [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-3eqBuYHMi9x2JW28QnNLtJZG).''',
        "gender_gen": '''The study reports the genders of the participants with autism as 53 males and 9 females [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-49MnUIeMsYHX4imkaCoIVpBd).''',
        "race_gen": '''The races of the participants in the experiment include:\n\n- **White**\n- **African American/Black**\n- **Hispanic/Latino**\n- **Asian** [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-3eqBuYHMi9x2JW28QnNLtJZG).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is **group design** [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-3eqBuYHMi9x2JW28QnNLtJZG).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment belong to the **Academic/Pre-academic** outcome type. The study focused on literacy skills, including decoding, word recognition, oral reading, and comprehension [oai_citation:1,kamps-et-al-2016.pdf](file-service://file-3eqBuYHMi9x2JW28QnNLtJZG).'''
    },
    "kodak-et-al-2011.pdf": {
        "EBP_gen": '''Based on the paper titled *"Functional assessment of instructional variables: Linking assessment and treatment"* [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh), the evidence-based practice being studied is **Reinforcement**. \n\nThe paper describes how different instructional variables, including **reinforcement (Sr+)**, were assessed to influence the acquisition of conditional discriminations in children diagnosed with autism. Specifically, reinforcement was used as a key component of the interventions, with stimuli or preferred items provided contingent on correct responses to increase correct behavior.''',
        "age_gen": '''The ages of the participants in the experiment are reported in Table 1 of the paper. Here are the ages of the participants:\n\n- Victor: 8 years, 7 months\n- Doug: 4 years, 2 months\n- Mark: 5 years, 2 months\n- Oliver: 3 years, 8 months\n- Eric: 4 years, 10 months\n- Andrew: 10 years, 1 month\n- Rose: 3 years, 8 months\n- Kevin: 4 years, 4 months\n- Linda: 7 years, 1 month\n- Bobby: 4 years, 2 months\n- Hal: 4 years, 1 month [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh).''',
        "diagnosis_gen": '''The diagnoses of the participants in the experiment are all **autism** [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh).''',
        "comorbidity_gen": '''The comorbidities of the participants in the experiment are **Not reported** [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh).''',
        "gender_gen": '''The genders of the participants in the experiment are **Not reported** [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh).''',
        "race_gen": '''The races of the participants in the experiment are **Not reported** [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh).''',
        "research_strategy_gen": '''The type of research strategy implemented in the experiment is **single case design** [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh).''',
        "outcome_gen": '''The outcomes of the participants reported in the experiment fall under the **Academic/Pre-academic** outcome type, as the study focuses on the acquisition of conditional discriminations, which are skills typically taught and used in school settings [oai_citation:1,kodak-et-al-2011.pdf](file-service://file-0Safwz05QDI9YtuhjPZl98Gh).'''
    },
}

# Persist the ChatGPT answers collected above to sys_review_chatgpt.json.
# The file name is a plain literal: the original f-prefix had no placeholders.
write_json('sys_review_chatgpt.json', sys_review_chatgpt)

# Evaluation

In [7]:
# Index the gold-standard records by their 'file' field for direct lookup.
samples = dict((record['file'], record) for record in dataset)

In [9]:
# Load the saved generations of every system under evaluation.
# (ChatGPT answers are defined inline earlier in the notebook, hence the disabled reload.)
# sys_review_chatgpt = read_json('sys_review_chatgpt.json')
(
    sys_review_ret_mistral_mpnet,
    sys_review_ret_mistral_bge,
    sys_review_ret_mistral_mistral,
    sys_review_ret_llama3_mpnet,
    sys_review_ret_llama3_bge,
    sys_review_ret_llama3_mistral,
    sys_review_openllm_mistral,
    sys_review_openllm_llama3,
) = (
    read_json(result_path)
    for result_path in (
        'sys_review_ret_10_Mistral-7B-Instruct-v0.3_all-mpnet-base-v2.json',
        'sys_review_ret_10_Mistral-7B-Instruct-v0.3_bge-large-en-v1.5.json',
        'sys_review_ret_10_Mistral-7B-Instruct-v0.3_e5-mistral-7b-instruct.json',
        'sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_all-mpnet-base-v2.json',
        'sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_bge-large-en-v1.5.json',
        'sys_review_ret_10_Meta-Llama-3.1-8B-Instruct_e5-mistral-7b-instruct.json',
        'sys_review_openllm_Mistral-7B-Instruct-v0.3.json',
        'sys_review_openllm_Meta-Llama-3.1-8B-Instruct.json',
    )
)

In [None]:
# Side-by-side dump of the gold annotations and one system's generations.
# Point `test_data` at a different loaded result set to review another system.
test_data = sys_review_openllm_mistral
for sample, gold_record in samples.items():
    print(sample)
    for attr, gold_value in gold_record.items():
        if attr == 'file':
            continue
        print(attr)
        # EBP is only named here; its gold/generated pair is not printed.
        if attr != 'EBP':
            print('Gold Standard:', gold_value)
            print('Generation:', test_data[sample][f'{attr}_gen'])
            print('-----\n')
    print('-------------------------------------\n\n')

In [None]:
# List the fields stored for one paper in the Mistral + bge retrieval run.
sys_review_ret_mistral_bge['haq-et-al-2017.pdf'].keys()

In [52]:
# Hard-coded scores for the two free-text questions, one row per system:
#   (system, variable_definition pair, research_gap pair)
# Each pair is unpacked as (p, r) by the cell that tabulates `result`.
# The mpnet-retriever rows are left out, as in the earlier commented-out entry
# (ret_mistral_mpnet: variable_definition (1./11, 0.25), research_gap (0, 0)).
_result_rows = [
    ('ret_mistral_mistral', (2./9, 0.25), (3./9, 0.6)),
    ('ret_mistral_bge', (0, 0), (0, 0)),
    ('ret_llama3_mistral', (2./50, 0.25), (2./5, 0.4)),
    ('ret_llama3_bge', (0, 0), (0, 0)),
    ('openllm_llama3', (4./15, 1), (3./15, 0.6)),
    ('openllm_mistral', (None, 0.75), (1, 0.6)),
    ('chatgpt', (3./7, 0.75), (1, 0.6)),
]

result = {
    system: {'variable_definition': var_def, 'research_gap': gap}
    for system, var_def, gap in _result_rows
}

In [None]:

def _flatten_scores(model, data):
    """Turn one system's {question: (p, r)} mapping into a flat row dict."""
    row = {'model': model}
    for q, (p, r) in data.items():
        row.update({f'{q}_p': p, f'{q}_r': r})
    return row


# One row per system, with <question>_p / <question>_r columns.
df = pd.DataFrame([_flatten_scores(model, data) for model, data in result.items()])
df

In [None]:
# Show the open-LLM Mistral generation for the research-gap question on one paper.
print(sys_review_openllm_mistral['haq-et-al-2017.pdf']['research_gap_gen'])

In [None]:
# Show the open-LLM Mistral generation for the variable-definition question on one paper.
print(sys_review_openllm_mistral['haq-et-al-2017.pdf']['variable_definition_gen'])

In [None]:
# Print every passage stored as retrieval context for the research-gap question
# (Llama-3 + bge run); the second element of each pair is ignored.
retrieval_content = sys_review_ret_llama3_bge['haq-et-al-2017.pdf']['research_gap_retrieve_context']
for passage, _score in retrieval_content:
    print(passage, '-------------------------\n\n', sep='\n')

In [None]:
# Print every passage stored as retrieval context for the variable-definition
# question (Mistral + bge run); the second element of each pair is ignored.
retrieval_content = sys_review_ret_mistral_bge['haq-et-al-2017.pdf']['variable_definition_retrieve_context']
for passage, _score in retrieval_content:
    print(passage, '-------------------------\n\n', sep='\n')

In [27]:
# Hard-coded per-paper scores for every evaluated system.
# Layout: {system name: [one dict per paper]}. Each dict holds the paper's 'file',
# an 'EBP' flag (0 or 1) and a 2-tuple each for 'age' and 'gender'.
# NOTE(review): the tuple order appears to be (precision, recall) — index 1 carries
# fractions such as 2./11 and 2./3 — confirm against the summary cell that averages
# index 0 and index 1 separately.
eval_results = {'chatgpt' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 0,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 0,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (1,1),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 1,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 1,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 1,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1,1),
        'gender': (1,1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 0,
        'age': (1,1),
        'gender': (1,1),
    },
],
'openllm_mistral' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1,1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 1,
        'age': (1,1),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 1,
        'age': (1,1),
        'gender': (0,0),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 1,
        'age': (1,2./11),
        'gender': (1,1),
    },
],
'openllm_llama3' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
],
'ret_mistral_mistral' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 1,
        'age': (2./3, 10./11),
        'gender': (1, 1),
    },
],
'ret_mistral_bge' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 0,
        'age': (1, 2./3),
        'gender': (0,0),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1, 1),
    },
],
'ret_mistral_mpnet' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (0.8, 1),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 2./3),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (0,0),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
],
'ret_llama3_mistral' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 0,
        'age': (1, 1./3),
        'gender': (0,0),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 0.75),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 0,
        'age': (0.5,0.5),
        'gender': (1, 1),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (1, 1),
    },
],
'ret_llama3_bge' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 1,
        'age': (1, 2./3),
        'gender': (1, 1),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
],
'ret_llama3_mpnet' : [
    {
        'file': 'haq-et-al-2017.pdf',
        'EBP': 0,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'brogan-et-al-2017.pdf',
        'EBP': 1,
        'age': (0,0),
        'gender': (1, 1),
    },
    {
        'file': 'enloe-rapp-2013.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 2./3),
    },
    {
        'file': 'leaf-et-al-2014.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (0,0),
    },
    {
        'file': 'jung-et-al-2008.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'neely-et-al-2014.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'howorth-et-al-2016.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'drasgow-et-al-2015.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kamps-et-al-2016.pdf',
        'EBP': 1,
        'age': (1, 1),
        'gender': (1, 1),
    },
    {
        'file': 'kodak-et-al-2011.pdf',
        'EBP': 0,
        'age': (1, 1),
        'gender': (1, 1),
    },
]}

In [28]:
# Aggregate the per-paper scores in `eval_results` into one summary row per model.
#
# Score tuples are read as (precision, recall): index 1 holds fractions such as
# 2./11, 1./3 and 2./3, and the `result` table in this notebook is unpacked as
# (p, r) as well. The previous version of this cell attached the `_r` label to
# index 0 and `_p` to index 1, i.e. the two column names were swapped.
# NOTE(review): confirm the tuple order with whoever scored the papers.
_PRECISION_IDX, _RECALL_IDX = 0, 1

# Papers left out of each mean. kamps-et-al-2016 is skipped for every age/gender
# metric (presumably because it is a group study without per-participant values).
# The second tuple element for gender additionally skips two papers, presumably
# those whose gold standard reports no gender — TODO confirm.
_SKIP_DEFAULT = ["kamps-et-al-2016.pdf"]
_SKIP_GENDER_RECALL = ["kamps-et-al-2016.pdf", "kodak-et-al-2011.pdf", "brogan-et-al-2017.pdf"]


def _mean_score(paper_scores, attr, idx, skip):
    """Average element `idx` of each paper's `attr` tuple, ignoring files in `skip`."""
    return np.mean([scores[attr][idx] for scores in paper_scores if scores['file'] not in skip])


df_record = []
for model, temp_results in eval_results.items():
    df_record.append({
        'model': model,
        # EBP is a 0/1 flag per paper, so its mean is the fraction of papers scored 1.
        'ebp': np.mean([temp_result['EBP'] for temp_result in temp_results]),
        'age_r': _mean_score(temp_results, 'age', _RECALL_IDX, _SKIP_DEFAULT),
        'age_p': _mean_score(temp_results, 'age', _PRECISION_IDX, _SKIP_DEFAULT),
        'gender_r': _mean_score(temp_results, 'gender', _RECALL_IDX, _SKIP_GENDER_RECALL),
        'gender_p': _mean_score(temp_results, 'gender', _PRECISION_IDX, _SKIP_DEFAULT),
    })

In [None]:
# Render the per-model summary rows as a table.
pd.DataFrame(df_record)

In [None]:
# Blank scoring sheet: one entry per paper with every score preset to 1 / (1, 1).
_TEMPLATE_FILES = (
    'haq-et-al-2017.pdf',
    'brogan-et-al-2017.pdf',
    'enloe-rapp-2013.pdf',
    'leaf-et-al-2014.pdf',
    'jung-et-al-2008.pdf',
    'neely-et-al-2014.pdf',
    'howorth-et-al-2016.pdf',
    'drasgow-et-al-2015.pdf',
    'kamps-et-al-2016.pdf',
    'kodak-et-al-2011.pdf',
)

eval_template = [
    {'file': file_name, 'EBP': 1, 'age': (1, 1), 'gender': (1, 1)}
    for file_name in _TEMPLATE_FILES
]

In [None]:
# Tabulate the ChatGPT scores, restricted to the file/EBP/age/gender columns.
# NOTE(review): `eval_chatgpt` is not assigned in any visible cell (ChatGPT scores
# are stored under eval_results['chatgpt']) — confirm this runs on a fresh kernel.
pd.DataFrame(eval_chatgpt, columns=['file', 'EBP', 'age', 'gender'])#, 'diagnosis', 'comorbidity', 'race', 'research_strategy'])

In [None]:
# Per-attribute summary numbers recorded for the ChatGPT run
# (presumably accuracies, going by the `_acc` name — TODO confirm).
chatgpt_acc = dict(
    EBP=0.5,
    age=1.0,
    diagnosis=1.0,
    comorbidity=0.9,
    gender=1.0,
    race=1.0,
    research_strategy=1.0,
)

In [None]:
# Tabulate the open-LLM scores across all annotated attributes.
# NOTE(review): `eval_openllm` is not assigned in any visible cell — confirm this
# runs on a fresh kernel.
pd.DataFrame(eval_openllm, columns=['file', 'EBP', 'age', 'diagnosis', 'comorbidity', 'gender', 'race', 'research_strategy'])

In [None]:
# Per-attribute summary numbers recorded for the open-LLM run
# (presumably accuracies, going by the `_acc` name — TODO confirm).
openllm_acc = dict(
    EBP=0.5,
    age=0.3,
    diagnosis=0.95,
    comorbidity=0.7,
    gender=0.4,
    race=0.8,
    research_strategy=1.0,
)

In [None]:
# Tabulate the retrieval-based scores across all annotated attributes.
# NOTE(review): `eval_ret_all` is not assigned in any visible cell — confirm this
# runs on a fresh kernel.
pd.DataFrame(eval_ret_all, columns=['file', 'EBP', 'age', 'diagnosis', 'comorbidity', 'gender', 'race', 'research_strategy'])

In [None]:
# Per-attribute summary numbers recorded for the retrieval-augmented run
# (presumably accuracies, going by the `_acc` name — TODO confirm).
rag_acc = dict(
    EBP=0.4,
    age=0.25,
    diagnosis=0.7,
    comorbidity=0.7,
    gender=0.4,
    race=0.8,
    research_strategy=1.0,
)

In [None]:
# Spot-check ChatGPT's comorbidity answer for a single paper.
sys_review_chatgpt['jung-et-al-2008.pdf']['comorbidity_gen']

In [None]:
# Choose the paper/attribute pair to debug and pull the retrieval context
# stored for it in the Llama-3 + mpnet run.
test_file, test_attr = 'drasgow-et-al-2015.pdf', 'age'
retrieve_context = sys_review_ret_llama3_mpnet[test_file][test_attr + '_retrieve_context']
retrieve_context

In [65]:
# Drop the collection behind `db_chroma` so the index can be rebuilt.
# NOTE(review): `db_chroma` is assigned by Chroma.from_documents in a cell further
# down, so running the notebook top to bottom on a fresh kernel likely fails here
# with a NameError — confirm and consider moving this after the index is built.
db_chroma.delete_collection()

In [33]:
def _embed_token_count(text):
    """Length of `text` measured in embedding-tokenizer tokens, without special tokens."""
    return len(f.embed_tokenizer.encode(text, add_special_tokens=False))


# Split points, tried in the order listed.
_SPLIT_SEPARATORS = [
    "\n\n",
    "\n",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    " ",
    "",
]

# Chunk size and overlap are counted with `_embed_token_count`, i.e. in tokens
# rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=_SPLIT_SEPARATORS,
    chunk_size=50,
    chunk_overlap=20,
    length_function=_embed_token_count,
)

In [None]:
# Convert the paper under test to Markdown text.
article = pymupdf4llm.to_markdown(os.path.join('../../data/systematic_review_papers/', test_file))#, page_chunks=True)

# Replace the '-----' rule blocks (presumably pymupdf4llm's page separators) with a
# space, then collapse all whitespace inside each blank-line-delimited paragraph.
paragraphs = article.replace('\n\n\n-----\n\n', ' ').split('\n\n')
concated_article = '\n\n'.join(' '.join(passage.split()) for passage in paragraphs)

chunks = text_splitter.create_documents([concated_article])
# Drop duplicate chunk texts while keeping first-seen order. dict.fromkeys is used
# instead of set() so the chunk order (and therefore what gets inserted into the
# index) is the same on every kernel run.
unique_texts = dict.fromkeys(chunk.page_content for chunk in chunks)
chunks = [Document(page_content) for page_content in unique_texts]
print(len(chunks))

# Embed the de-duplicated chunks and build the Chroma index used by the queries below.
db_chroma = Chroma.from_documents(chunks, f.embeder)

In [None]:
# NOTE(review): called without arguments; LangChain's vector-store `search` appears
# to require a query and a search type, so this looks like leftover exploration
# that would raise a TypeError — confirm and remove or complete.
db_chroma.search()

In [67]:
# Re-run the attribute question against the rebuilt index and print every rank at
# which the fresh hit's text differs from the stored retrieval context.
doc_list = db_chroma.similarity_search_with_score(attr2question[test_attr], k=74)
paired_hits = zip(doc_list[:len(retrieve_context)], retrieve_context)
mismatched_ranks = [
    rank
    for rank, ((doc, _fresh_score), (text, _stored_score)) in enumerate(paired_hits)
    if doc.page_content != text
]
for pid in mismatched_ranks:
    print(pid)

In [None]:
# Look at the stored retrieval-context entry at index 5.
retrieve_context[5]

In [None]:
# Display every (document, score) hit returned by the fresh query.
doc_list

In [None]:
# Look at the fresh query's hit at index 7.
doc_list[7]

In [None]:
# Show the question text mapped to the 'gender' attribute.
attr2question['gender']

In [None]:
# Query the index with a literal question and return relevance scores for up to 58 hits.
# NOTE(review): k=58 is a magic number, presumably tied to the chunk count printed
# when the index was built — confirm.
db_chroma.similarity_search_with_relevance_scores('What are the ages of the participants?', k=58)