In [1]:
import os
import re
import json
# from cerebras.cloud.sdk import Cerebras
import ollama

### Preprocessing

In [2]:
def preprocess_tex(text):
    # Basic preprocessing to remove LaTeX commands, but keep section headers
    text = re.sub(r'\\(?!section|subsection|subsubsection)([a-zA-Z]+)(\[.*?\])?({.*?})?', '', text)
    return text

#? getting all the tex files from every subdirectory
def get_all_tex_files(directory):
    tex_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.tex'):
                tex_files.append(os.path.join(root, file))
    return tex_files

### Preparing prompt context for first inference

In [3]:
base_dir = 'dataset' #? Where all you extracted zip of every paper is.
tex_files = get_all_tex_files(base_dir) #? Getting all the tex files from the directory

# Process files
papers = {}
for file in tex_files: # Going through one folder at a time
    with open(file, 'r') as f:
        content = f.read()
    preprocessed_content = preprocess_tex(content) # Gets rid of all the latex commands

    # remove 1000 characters from the end of the content and the start of the content
    preprocessed_content = preprocessed_content[1000:-1000]

    paper_name = os.path.relpath(file, base_dir).split(os.sep)[0] #? Getting the name of the paper from the folder name

    # paper name here actually has all the subdirectories as well, so we remove them by omitting all which do not have the exact paper name.    
    if paper_name not in papers:
        papers[paper_name] = ""
    papers[paper_name] += preprocessed_content + "\n\n"

In [4]:
def generate_paper_prompt(paper_title, paper_content, analysis_questions):
    questions_str = "\n".join(f"- {q}" for q in analysis_questions)
    return f"""
======== START OF PAPER: {paper_title} ========

{paper_content}

======== END OF PAPER: {paper_title} ========

Please analyze this paper and answer the following questions:

{questions_str}

Base your analysis ONLY on the provided context. If you can't find information for a question, respond with "Not available".
"""

def parse_paper_analysis(response):
    lines = response.split('\n')
    analysis = {}
    current_key = None
    for line in lines:
        if line.strip().startswith('1.') or line.strip().startswith('2.'):
            current_key = line.strip()
            analysis[current_key] = ""
        elif current_key and line.strip():
            analysis[current_key] += ' ' + line.strip()
    return analysis

### Inference 1:

Inference on each paper indiviually.

In [5]:
# Initialize Cerebras client
# client = Cerebras(api_key='csk-83cjr2kjp5dke2f2ddpwe85529yyv8jc38e8rcmjkxxcx3hx')

# Define analysis questions
analysis_questions = [
    "Does the paper talk about a medical diseases? If so which?", 
    "What is the paper doing to tackle the disease?",
]

In [6]:
# Process each paper individually
paper_analyses = []

for paper_title, paper_content in papers.items():
    prompt = generate_paper_prompt(paper_title, paper_content, analysis_questions)

    # Ollama generate request
    response = ollama.generate(
        model='llama3',  # Change this to 'llama3' if you have that model available
        prompt=prompt,
        system="You are a helpful assistant analyzing research papers.",
        options={
            "num_predict": 2000,
            "temperature": 0.7,
            "top_p": 1
        }
    )
    
    generated_text = response['response']
    
    print('--------------------------------------------------------------')
    print(f"Paper: {paper_title}")
    print('--------------------------------------------------------------')
    print('RAW: \n', generated_text)

    analysis = parse_paper_analysis(generated_text)
    analysis['PAPER_TITLE'] = paper_title
    paper_analyses.append(analysis)
    
    print()
    print('FILTERED')
    print(json.dumps(analysis, indent=2))   
    print('--------------------------------------------------------------')

--------------------------------------------------------------
Paper: osteophyte
--------------------------------------------------------------
RAW: 
 Based on the provided context, here are the answers to your questions:

1. Does the paper talk about a medical diseases? If so which?
The paper talks about spinal osteophyte detection.

2. What is the paper doing to tackle the disease?

The paper presents a study for spinal osteophyte detection using a specialized patching strategy called SegPatch in conjunction with a fine-tuned DenseNet-121 network. The authors also investigate the use of off-the-shelf detectors, specifically FasterRCNN, but find poor performance. The paper concludes that a more efficient clinical approach to identifying osteophytes can be achieved through the proposed method, which achieves an accuracy score of 84.5% and a specificity score of 86.62%.

FILTERED
{
  "1. Does the paper talk about a medical diseases? If so which?": " The paper talks about spinal osteophy

### Preparing prompt context for second inference

In [7]:
def generate_summary_analysis_prompt(paper_analyses):
    def safe_get(analysis, key):
        # Try different possible keys
        possible_keys = [
            key,
            key.lower(),
            key.replace("?", "").strip(),
            ' '.join(key.split()[1:])  # Remove the number at the start
        ]
        for possible_key in possible_keys:
            if possible_key in analysis:
                return analysis[possible_key]
        return "N/A"  # Return N/A if no matching key is found

    papers_info = "\n\n".join([
        f"Paper: {analysis.get('PAPER_TITLE', 'Untitled')}\n"
        f"Disease: {safe_get(analysis, '1. Does the paper talk about a medical disease? If so, which?')}\n"
        f"Approach: {safe_get(analysis, '2. What is the paper doing to tackle the disease?')}"
        for analysis in paper_analyses
    ])

    prompt = f"""Based on the following summaries of research papers, provide a general overview of the medical diseases discussed and the approaches being used to tackle them. Your response should include:

1. A comprehensive answer summarizing the diseases mentioned and the various approaches used across all papers.
2. A list of the specific papers you considered in your analysis.

Here are the paper summaries:

{papers_info}

Please structure your response as follows:
1. Overall Summary: [Your comprehensive answer here]
2. Papers Analyzed: [List of paper titles]
"""
    return prompt

summary_prompt = generate_summary_analysis_prompt(paper_analyses)

### Inference 2:

Inference on the combined output of Inference 1 for each paper.

In [8]:
# Send the prompt to the Ollama model for summary
summary_response = ollama.generate(
    model='llama3',  # Change this to 'llama3' if you have that model available
    prompt=summary_prompt,
    system="You are a helpful assistant analyzing research papers.",
    options={
        "num_predict": 2000,
        "temperature": 0.7,
        "top_p": 1
    }
)

summary_text = summary_response['response']

print('SUMMARY ANALYSIS:')
print(summary_text)

SUMMARY ANALYSIS:
**Overall Summary**

The medical diseases discussed in these research papers are not explicitly stated, as three out of the four papers do not mention a specific disease. However, the approaches used in these studies focus on developing computer-aided diagnosis (CAD) systems for identifying and detecting various types of lesions or abnormalities.

Three papers employ deep learning models to analyze medical images: one uses a DenseNet-121 network for spinal osteophyte detection, another explores the use of 2D-Slice-CNN models with slice encoders pre-trained on natural images for predicting Alzheimer's disease (AD) and brain age from MRI scans, and the third develops a CAD system using nnU-Net to identify cystic fibrosis (CF) lesions in CT scans. These studies aim to improve the accuracy of disease detection and provide better patient outcomes.

**Papers Analyzed**

1. Osteophyte
2. P-ADIC
3. Natural_to_mri
4. CT


### Preparing prompt context for third inference

In [9]:
def handle_followup_questions(paper_analyses, summary_response):
    context = f"""
Summary of Research Papers:
{summary_response}

Detailed Paper Analyses:
{json.dumps(paper_analyses, indent=2)}
"""

    print("\nYou can now ask follow-up questions about the research papers. Type 'exit' to end the session.")

    while True:
        user_question = input("Your question: ")
        
        if user_question.lower() == 'exit':
            print("Ending the follow-up session. Thank you!")
            break

        prompt = f"""
Based on the following context about several research papers, please answer the user's question:

{context}

User's question: {user_question}

Please provide a concise and accurate answer based only on the information given in the context. If the question cannot be answered using the provided information, please respond with "I'm sorry, but I don't have enough information to answer that question based on the given context."
"""

        response = ollama.generate(
            model='llama3',  # Change this to 'llama3' if you have that model available
            prompt=prompt,
            system="You are a helpful assistant analyzing research papers.",
            options={
                "num_predict": 500,
                "temperature": 0.7,
                "top_p": 1
            }
        )

        answer = response['response']
        print("\nAnswer:", answer.strip())

    return response

### Inference 3:

Asking for a follow_up_response based on everything above.

In [10]:
# Start the interactive follow-up session
follow_up_response = handle_followup_questions(paper_analyses, summary_response)


You can now ask follow-up questions about the research papers. Type 'exit' to end the session.

Answer: Based on the provided context, there is no mention of corn or any related medical diseases. The papers discussed are about spinal osteophyte detection, Alzheimer's Disease (AD), and Chronic Fibrosis (CF). Therefore, I'm sorry, but I don't have enough information to answer that question based on the given context.

Answer: Based on the provided context, it appears that there is a paper titled "P-ADIC" which mentions no specific details about p-adic numbers or whether machine learning (ML) is used to tackle any related problems. Therefore, I can only say that there is no information in this context that suggests ML is good for p-adic numbers or if it is even relevant to the topic.

Answer: I'm sorry, but I don't have enough information to answer that question based on the given context.

Answer: Based on the provided context, there is some evidence of medical disease detection being e