## Overview  
  
The **AzureOpenAI-ResearchPaperAnalyzer** project is designed to facilitate the analysis of research papers using the powerful capabilities of Azure OpenAI. The primary objective of this project is to automate the extraction and categorization of key insights from research papers written in markdown format. By leveraging Azure OpenAI's advanced natural language processing abilities, the script can:  
  
- Process markdown files of research papers.  
- Extract key insights and bullet points from the text.  
- Categorize the extracted information according to different research fields, funding sources, and affiliations.  
- Generate a summary of the paper's content and structure.  
- Format and save the results as a Markdown table (`results.md`) for easy analysis and reporting.  
  
This tool aims to save researchers significant time and effort by automating the tedious process of reading and summarizing lengthy research papers, allowing them to focus more on the critical aspects of their work.  



In [32]:
from openai import AzureOpenAI

import os
from dotenv import load_dotenv

from IPython.display import Markdown, display, Image

import glob


In [None]:
# Load settings from a .env file into the process environment; the Azure OpenAI
# key and endpoint are read from os.environ in the configuration cell.
load_dotenv()


In [None]:
# Name of the Azure OpenAI model deployment used for every request in this notebook.
deployment_name = "gpt-4o-mini"

# The API key for your Azure OpenAI resource.
api_key = os.environ["AZURE_OPENAI_API_KEY"]

# The base URL for your Azure OpenAI resource. e.g. "https://<your resource name>.openai.azure.com"
azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]

api_version = "2024-02-15-preview"  # This seems to work

# Print the configuration as a quick sanity check. The API key is a secret and
# notebook output is saved (and often shared) with the notebook, so only report
# whether a key is present instead of echoing its value.
print("Azure OpenAI API Key: ", "********" if api_key else "<empty>")
print("Azure OpenAI Endpoint: ", azure_endpoint)
print("Azure OpenAI API Version: ", api_version)

# Shared client used by all later cells.
client = AzureOpenAI(
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)

In [None]:
# Smoke test: send one trivial question to check that the client and deployment respond.
smoke_test_messages = [
    # System message: provides context to the model.
    {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"},
    # User message: the request the model generates a response for.
    {"role": "user", "content": "Hello! Could you solve 2+2?"},
]
completion = client.chat.completions.create(
    model=deployment_name,
    messages=smoke_test_messages,
)

assistant_reply = completion.choices[0].message.content
print("Assistant: " + assistant_reply)

In [36]:
# Load the article files (markdown) from the current folder.
# This notebook writes its own "*_notes.md" and "results.md" files into the same
# folder, so they are filtered out here; otherwise a re-run would treat its own
# output as new articles. glob returns names in arbitrary order, so the list is
# sorted to make the selection repeatable.
article_files = sorted(
    path
    for path in glob.glob("*.md")
    if not path.endswith("_notes.md") and path != "results.md"
)

# Limit the run to the first 5 files for now.
article_files = article_files[:5]



In [37]:
# Chunk the text into smaller parts so each part fits in a single request.
# NOTE: chunks do not overlap. Carrying the last sentence of the previous chunk
# into the next one (to keep context) is not implemented here.
def split_text(text, limit):
    """
    Split the text into consecutive parts of at most `limit` characters.

    The text is cut only after a period, so sentences are never broken in the
    middle. Joining the returned parts reproduces `text` exactly; no characters
    are added or dropped.

    Args:
        text: The full text to split.
        limit: Maximum number of characters per part. A single sentence that is
            longer than `limit` is kept whole and becomes its own part.

    Returns:
        A list of non-empty strings. Empty input returns an empty list.
    """
    # Keep each period attached to the sentence it terminates. The final piece
    # has no terminating period (it is empty when the text ends with "."), so
    # it is added as-is and only when it has content.
    pieces = text.split(".")
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])

    text_parts = []
    current_part = ""
    for sentence in sentences:
        # Start a new part only when the current one already holds something;
        # this prevents an empty leading part when one sentence exceeds the limit.
        if current_part and len(current_part) + len(sentence) > limit:
            text_parts.append(current_part)
            current_part = sentence
        else:
            current_part += sentence
    if current_part:
        text_parts.append(current_part)
    return text_parts


#lets split the full transcript into smaller parts




In [38]:
# System prompt for the first pass: it is sent together with one chunk of a paper
# per request and asks the model to condense that chunk into bullet points.
first_system_message = {"role": "system", "content": "You are an AI assistant that helps with creating bullet points of long research papers. you will be given chunks of the research paper. From the eyes of a experienced researcher you will check the chunk and then using condensed bullet points encapsulate the key points of the paper without extraneous details. We need to use this to determine the Field Of Research Code, any Funding sources, and any affiliaitons to La Trobe university so meticulous attention to detail is required."}

In [None]:
# For each article: read it, split it into chunks, summarise every chunk with the
# model and write the summaries to "<article name>_notes.md".

# Upper bound for one chunk, measured in characters (len() of the text), not tokens.
chunk_size = 8000

for article_file in article_files:
    # Read the whole article up front and close the file straight away, so no
    # handle stays open while the (slow) API calls run.
    with open(article_file, 'r', encoding="utf-8") as source_file:
        article = source_file.read()

    article_length = len(article)
    print(f"The article {article_file} is {article_length} characters long and will be split into chunks of up to {chunk_size} characters.")
    text_parts = split_text(article, chunk_size)
    print(f"Transcript split into {len(text_parts)} parts.")

    # Notes file name: swap only the extension, so a ".md" elsewhere in the
    # file name is left alone.
    notes_file = os.path.splitext(article_file)[0] + "_notes.md"
    with open(notes_file, 'w', encoding='utf-8') as notes:
        notes.write("# Notes\n\n")
        for part_number, text_part in enumerate(text_parts, start=1):
            # A blank chunk carries no information; skip it instead of paying
            # for a request with an empty user message.
            if not text_part.strip():
                continue
            print(f"Processing part {part_number}/{len(text_parts)}")
            completion = client.chat.completions.create(
                model=deployment_name,
                messages=[first_system_message, {"role": "user", "content": text_part}]
            )
            response = completion.choices[0].message.content
            notes.write(f"## Part {part_number}\n\n")
            notes.write(f"{response}\n\n")
                #print(response)
        
  

In [None]:
# The analysis system prompts are run against each of the generated _notes.md files.
# Collect those notes files from the current folder, keeping at most the first 5 for now.
article_notes = glob.glob("*_notes.md")[:5]

# Bare expression so the notebook displays the selected files.
article_notes

In [54]:
# System prompt for the Field of Research step: the model is given the notes of one
# paper and asked for the 4-digit ANZSRC FoR code that best matches it.
field_of_research_code_system_prompt = {"role": "system", "content": """You are an expert in the Australian and New Zealand Standard Research Classification (ANZSRC) system, specifically the Fields of Research (FoR) codes. Your task is to analyze detailed notes about a research paper and identify the most appropriate **4-digit Fields of Research (FoR) code** based on the study's focus, methodology, subject matter, and key findings.  
  
The notes may be divided into multiple parts (e.g., Part 1, Part 2, etc.), but they all pertain to the same research paper. Treat all parts as a single cohesive input and use the entire set of notes to determine the primary field of research.  
  
The ANZSRC FoR classification system is hierarchical:  
1. **2-digit codes** represent broad research divisions (e.g., 11 Medical and Health Sciences).  
2. **4-digit codes** represent specific research groups within the divisions (e.g., 1106 Human Movement and Sports Science).  
3. **6-digit codes** provide even finer granularity but are not required for this task.  
  
Your task is to:  
1. Focus on the **4-digit code** level to classify the research into its most relevant research group.  
2. Analyze all parts of the notes provided, including the study’s focus, keywords, methodology, findings, and implications, to determine the research discipline.  
3. Prioritize the **primary field of research** rather than secondary or interdisciplinary areas, even if the study spans multiple fields.  
  
### Guidance for Identifying the Correct 4-Digit FoR Code:  
- Consider the **research focus**: What is the primary subject or discipline of the study? (e.g., sports performance, biomechanics, environmental science).  
- Look for **methodology and tools**: What scientific methods, techniques, or tools were used? (e.g., statistical parametric mapping, biomechanical analysis).  
- Refer to **keywords and findings**: What are the main themes or keywords associated with the research? (e.g., para-biathlon, pacing strategies, skiing performance).  
- Contextualize the **discipline**: Think about which academic field the study's content aligns with most closely.  
  
If the research overlaps multiple disciplines, choose the **primary discipline** that best represents the core focus of the study. Use the content, keywords, and context provided in the notes to guide your classification.  
  
### Examples:  
Input: "This research examines the biomechanics of elite swimmers to improve stroke efficiency."  
Output: "1106"  
  
Input: "This study investigates the impact of social media algorithms on user behavior and engagement."  
Output: "0806"  
  
Input: "This paper analyzes the genetic basis of drought resistance in wheat using genome-wide association studies."  
Output: "0604"  
  
### Analyze the following notes (divided into multiple parts) and provide the **4-digit Fields of Research (FoR) code**:  
""" } # Update the FoR code detection prompt here if needed

In [55]:
# System prompt for the funding step: the model is given the notes of one paper and
# asked to list the funding sources, or to state that none were reported.
funding_sources_system_prompt = {"role": "system", "content": """You are an expert research assistant tasked with identifying funding sources for academic studies. Your job is to analyze detailed notes about a research paper, which may be divided into multiple parts but pertains to a single study. Your goal is to extract all mentions of funding sources or statements indicating the lack of funding from the notes.  
  
Carefully review all parts of the notes as a single cohesive input and provide one of the following outputs:  
1. A list of funding sources mentioned in the notes, including grant names, funding agencies, organizations, or any other details related to funding.  
2. If no funding information is mentioned in the notes, explicitly state: "No funding sources were reported in the provided notes."  
  
### Examples:  
Input:  
"The research was funded by the European Research Council (ERC Starting Grant 6789). Additional funding was provided by the Swedish Research Council."  
Output:  
"Funding Sources: European Research Council (ERC Starting Grant 6789), Swedish Research Council."  
  
Input:  
"The authors declared no funding sources for this research. Ethical approval was obtained from the university ethics committee."  
Output:  
"No funding sources were reported in the provided notes."  
  
Input:  
"The study acknowledges financial support from the National Institutes of Health (NIH grant R01-AB123456)."  
Output:  
"Funding Sources: National Institutes of Health (NIH grant R01-AB123456)."  
  
Now, analyze the following notes and extract all funding-related information:  ."""}

In [None]:
# University whose affiliations are looked for; this value is interpolated into the
# affiliation prompt text (the `content` f-string).
university = "La Trobe University"

In [None]:
# Text of the affiliation system prompt. This is an f-string, so every {university}
# below is replaced with the current value of `university` when this cell runs;
# re-run this cell after changing `university`.
content = f"""You are an expert research assistant tasked with identifying affiliations to a specific university in academic research. Your job is to analyze detailed notes about a research paper, which may be divided into multiple parts but pertains to a single study. The university name will be provided as a placeholder `{university}`, and you must check if any authors or affiliations mentioned in the notes are explicitly connected to this university.  
  
Carefully review all parts of the notes as a single cohesive input and provide one of the following outputs:  
1. If any author or affiliation is explicitly linked to `{university}`, list the relevant details (e.g., author name, department, or any specific mention of the university).  
2. If no affiliation to `{university}` is mentioned in the notes, explicitly state: "No affiliations to {university} were reported in the provided notes."  
  
### Guidelines:  
- Look for explicit mentions of `{university}` in any part of the notes, including author affiliations, acknowledgments, or other sections.  
- Treat the notes as a single cohesive input, even if they are divided into multiple parts.  
- Only report affiliations explicitly linked to `{university}`. Do not infer or assume affiliations based on other universities or organizations.  
  
### Examples:  
Input:  
"University: La Trobe University    
Notes:    
- Authors: Dr. John Smith (La Trobe University, Department of Sports Science), Dr. Jane Doe (University of Melbourne).    
- Acknowledgments: The authors thank La Trobe University for providing access to research facilities."  
Output:  
"Affiliations to La Trobe University: Dr. John Smith (Department of Sports Science), acknowledgment of research facilities."  
  
Input:  
"university: La Trobe University    
Notes:    
- Authors: Dr. Emily Brown (University of Sydney), Dr. Mark Wilson (Monash University).    
- No mention of La Trobe University in the acknowledgments or affiliations."  
Output:  
"No affiliations to La Trobe University were reported in the provided notes."  
  
Input:  
"university: University of Sydney    
Notes:    
- Authors: Dr. Alex Green (University of Sydney, Faculty of Medicine), Dr. Lisa White (University of Queensland).    
- The study was supported by the University of Sydney research grant."  
Output:  
"Affiliations to University of Sydney: Dr. Alex Green (Faculty of Medicine), research grant support."""



In [58]:
# Wrap the affiliation prompt text in a system message for the chat completions call.
affiliation_system_prompt = {"role": "system", "content": content}


In [59]:
# System prompt for the final formatting step: the model is given the collated
# analysis results for one paper and asked to return a single Markdown table row.
md_friendly_format_system_prompt = {"role": "system", "content": """YYou are an expert research assistant tasked with consolidating the results of multiple analyses into a structured Markdown table. Each iteration will provide the following information about a research paper:  
1. **Title of the Paper:** The title of the research paper being analyzed.  
2. **FoR Code Classification:** The primary Fields of Research (FoR) 4-digit code that best represents the study, along with the reasons for selecting this code.  
3. **Funding Sources Extraction:** A list of funding sources mentioned in the notes or confirmation that no funding was reported.  
4. **University Affiliations Extraction:** A list of affiliations to a specific university or confirmation that no affiliations were reported.  
  
Your task is to:  
1. Extract the most important pieces of information from the results.  
2. Combine this information into **a single Markdown table row**, excluding the header row and divider row, as they are already pre-generated.  
  
### Output Format:  
The output must be a **Markdown-formatted row** for the table, with data separated by vertical bars (`|`). Do not include the header or divider rows. Make sure the row has the following structure:  
| [Title] | [FoR Code] | [Reason for FoR Code] | [Funding Sources] | [Affiliations] |   """}


In [None]:
# Build results.md: a Markdown table with the columns Title, FoR Code, Reason for
# FoR Code, Funding Sources and Affiliations to University. One row is appended
# per notes file.


def _ask(system_prompt, user_content):
    """Send one system prompt plus one user message; return the reply text ("" if none)."""
    completion = client.chat.completions.create(
        model=deployment_name,
        messages=[system_prompt, {"role": "user", "content": user_content}],
    )
    return completion.choices[0].message.content or ""


# Start a fresh results file containing only the table header and divider rows.
with open("results.md", 'w', encoding='utf-8') as results_file:
    results_file.write(
        "| Title                                   | FoR Code | Reason for FoR Code                     | Funding Sources          | Affiliations to University           |\n"
        "|-----------------------------------------|----------|-----------------------------------------|--------------------------|---------------------------------------|\n"
    )

# Run the three analysis prompts over each notes file, then ask the model to turn
# the collated answers into a single table row.
for article_note in article_notes:
    # Read the notes and close the file before the API calls start.
    with open(article_note, 'r', encoding="utf-8") as notes_file:
        article = notes_file.read()

    fieldOfResearchResult = _ask(field_of_research_code_system_prompt, article)
    print(fieldOfResearchResult)

    fundingSourcesResult = _ask(funding_sources_system_prompt, article)
    print(fundingSourcesResult)

    affiliationsResult = _ask(affiliation_system_prompt, article)
    print(affiliationsResult)

    # The formatting prompt expects a paper title, but the three results above do
    # not reliably contain one. Use the notes file name without its "_notes.md"
    # suffix as the title, so the model does not have to invent it.
    notes_suffix = "_notes.md"
    if article_note.endswith(notes_suffix):
        paper_title = article_note[:-len(notes_suffix)]
    else:
        paper_title = article_note

    # Collate the results, labelled with the section names the formatting prompt uses.
    collatedResults = (
        f"Title of the Paper: {paper_title}\n\n"
        f"FoR Code Classification:\n{fieldOfResearchResult}\n\n"
        f"Funding Sources Extraction:\n{fundingSourcesResult}\n\n"
        f"University Affiliations Extraction:\n{affiliationsResult}"
    )

    md_friendly_format = _ask(md_friendly_format_system_prompt, collatedResults)
    # A table row must stay on one line; collapse any line breaks in the reply.
    md_friendly_format = " ".join(md_friendly_format.split())

    # Append the row to results.md.
    with open("results.md", 'a', encoding='utf-8') as results_file:
        results_file.write(f"{md_friendly_format}\n")
    print(md_friendly_format)
    print("\n\n")

