In [1]:
import os
import re
import json
from cerebras.cloud.sdk import Cerebras

### Preprocessing

In [2]:
def preprocess_tex(text):
    """Strip LaTeX commands from ``text`` while leaving sectioning commands in place.

    A command is a backslash followed by ASCII letters. It is removed together
    with one optional ``[...]`` argument and one optional ``{...}`` argument
    that directly follow it. Commands whose name starts with ``section``,
    ``subsection`` or ``subsubsection`` are skipped so headers survive.

    Both argument matches are non-greedy and do not cross line breaks, so an
    argument that spans several lines is left behind.

    Args:
        text: Raw LaTeX source.

    Returns:
        The source with the matched commands (and their arguments) deleted.
    """
    command_pattern = r'\\(?!section|subsection|subsubsection)([a-zA-Z]+)(\[.*?\])?({.*?})?'
    stripped = re.sub(command_pattern, '', text)
    return stripped

#? getting all the tex files from every subdirectory
def get_all_tex_files(directory):
    """Collect the paths of all ``.tex`` files below ``directory``.

    The tree is traversed with ``os.walk``, so files in nested subdirectories
    are included. Results come back in traversal order.

    Args:
        directory: Root folder to search.

    Returns:
        A list of paths, each built by joining the containing folder with the
        file name. Empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.tex')
    ]

### Preparing prompt context for first inference

In [3]:
base_dir = 'dataset'  #? Folder holding one extracted sub-folder per paper.
tex_files = get_all_tex_files(base_dir)  #? Every .tex file below base_dir, at any depth.

# Build a mapping of paper name -> concatenated, preprocessed text of all its .tex files.
papers = {}
for file in tex_files:  # One .tex file at a time.
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # and substitute undecodable bytes rather than aborting the whole run on
    # a single file saved in a legacy encoding.
    with open(file, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    preprocessed_content = preprocess_tex(content)  # Drop LaTeX commands, keep section headers.

    # Trim 500 characters from the start and 500 from the end (1000 in total).
    # Files of 1000 characters or fewer therefore contribute no text.
    preprocessed_content = preprocessed_content[500:-500]

    # The paper name is the first path component below base_dir, so files in
    # nested subdirectories are grouped under their top-level paper folder.
    paper_name = os.path.relpath(file, base_dir).split(os.sep)[0]

    papers[paper_name] = papers.get(paper_name, "") + preprocessed_content + "\n\n"

In [4]:
def generate_paper_prompt(paper_title, paper_content, analysis_questions):
    """Build the single-paper analysis prompt.

    The paper text is wrapped between START/END marker lines carrying the
    title, followed by the questions as a dash-bulleted list and an
    instruction to answer only from the supplied text.

    Args:
        paper_title: Name shown in the START/END marker lines.
        paper_content: Text of the paper placed between the markers.
        analysis_questions: Iterable of questions; each becomes one "- ..." line.

    Returns:
        The assembled prompt string.
    """
    bullets = [f"- {question}" for question in analysis_questions]
    questions_str = "\n".join(bullets)
    return f"""
======== START OF PAPER: {paper_title} ========

{paper_content}

======== END OF PAPER: {paper_title} ========

Please analyze this paper and answer the following questions:

{questions_str}

Base your analysis ONLY on the provided context. If you can't find information for a question, respond with "Not available".
"""

def parse_paper_analysis(response):
    """Split a model reply into a mapping of question line -> answer text.

    A line whose stripped form starts with "1." or "2." opens a new entry,
    and that stripped line becomes the key. Each following non-blank line is
    appended to the current entry, preceded by a single space. Lines before
    the first numbered line are ignored.

    Args:
        response: Full text returned by the model.

    Returns:
        Dict of numbered question line to the accumulated answer text.
    """
    answers = {}
    active = None
    for raw_line in response.split('\n'):
        stripped = raw_line.strip()
        if stripped.startswith(('1.', '2.')):
            # A numbered line starts (or restarts) an entry.
            active = stripped
            answers[active] = ""
            continue
        if active and stripped:
            answers[active] = answers[active] + ' ' + stripped
    return answers

### Inference 1:

Inference on each paper individually.

In [5]:
# Initialize the Cerebras client.
# The API key is read from the environment so that no credential is stored in
# the notebook. Any key that was previously committed here should be treated
# as leaked and revoked.
api_key = os.environ.get("CEREBRAS_API_KEY")
if not api_key:
    raise RuntimeError("Set the CEREBRAS_API_KEY environment variable before running this cell.")
client = Cerebras(api_key=api_key)

# Questions asked about every paper; each one becomes a bullet in the prompt.
analysis_questions = [
    "Does the paper talk about a medical diseases? If so which?", 
    "What is the paper doing to tackle the disease?",
]

In [6]:
# Analyse every paper on its own: one streamed chat completion per paper,
# parsed into a dict and collected in paper_analyses.
# NOTE(review): the saved output of this cell ends with a 400
# `context_length_exceeded` error (limit 8192), so long papers probably need
# to be shortened or chunked before this call. Confirm against the model's
# context window.
paper_analyses = []

for paper_title, paper_content in papers.items():
    prompt = generate_paper_prompt(paper_title, paper_content, analysis_questions)

    stream = client.chat.completions.create(
        model="llama3.1-70b",
        messages=[
            {"role": "system", "content": "You are a helpful assistant analyzing research papers."},
            {"role": "user", "content": prompt},
        ],
        stream=True,
        max_tokens=2000,
        temperature=0.7,
        top_p=1,
    )

    # Join the streamed deltas; a chunk with no content contributes nothing.
    response = "".join(chunk.choices[0].delta.content or "" for chunk in stream)

    print(
        '--------------------------------------------------------------',
        f"Paper: {paper_title}",
        '--------------------------------------------------------------',
        sep='\n',
    )
    print('RAW: \n', response)

    analysis = parse_paper_analysis(response)
    analysis['PAPER_TITLE'] = paper_title  # Tag the parsed answers with their paper.
    paper_analyses.append(analysis)

    print('\nFILTERED')
    print(json.dumps(analysis, indent=2))
    print('--------------------------------------------------------------')

--------------------------------------------------------------
Paper: osteophyte
--------------------------------------------------------------
RAW: 
 Based on the provided paper, here are the answers to the questions:

1. Does the paper talk about a medical disease? If so, which?

Yes, the paper talks about a medical condition called Osteoarthritis (OA) and specifically focuses on the detection of osteophytes, which are small bone growths associated with the development and progression of OA.

2. What is the paper doing to tackle the disease?

The paper proposes a novel automated patch generation technique called SegPatch, which is used to detect osteophytes in spinal X-ray images. The method combines SegPatch with a fine-tuned DenseNet-121 network to classify patches and identify the presence of osteophytes. The goal is to develop an efficient clinical approach to identifying osteophytes in X-rays, which can help streamline the diagnostic process and improve patient care and treatmen

BadRequestError: Error code: 400 - {'message': 'Please reduce the length of the messages or completion. Current length is 10799 while limit is 8192', 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}

### Preparing prompt context for second inference

In [None]:
def generate_summary_analysis_prompt(paper_analyses):
    """Build the cross-paper summary prompt from the per-paper analyses.

    Each analysis is rendered as a "Paper / Disease / Approach" entry. The
    disease and approach answers are looked up by question text, because the
    analysis keys are the numbered question lines found in the model's reply.

    Args:
        paper_analyses: List of dicts, each optionally holding 'PAPER_TITLE'
            plus one entry per numbered question line.

    Returns:
        The prompt string asking for an overall summary and the list of
        papers considered.
    """
    def safe_get(analysis, key):
        """Return the answer stored under ``key`` or a close variant, else "N/A"."""
        # Try different possible keys
        possible_keys = [
            key,
            key.lower(),
            key.replace("?", "").strip(),
            ' '.join(key.split()[1:])  # Remove the number at the start
        ]
        for possible_key in possible_keys:
            if possible_key in analysis:
                return analysis[possible_key]

        # Fallback: the stored key is whatever wording the model used for the
        # question, which may differ from the text expected here. Match on the
        # leading question number ("1.", "2.", ...) so that a reworded
        # question still finds its answer.
        key_parts = key.split()
        number_prefix = key_parts[0] if key_parts else ""
        if number_prefix.endswith('.') and number_prefix[:-1].isdigit():
            for stored_key, value in analysis.items():
                if isinstance(stored_key, str) and stored_key.strip().startswith(number_prefix):
                    return value
        return "N/A"  # Return N/A if no matching key is found

    papers_info = "\n\n".join([
        f"Paper: {analysis.get('PAPER_TITLE', 'Untitled')}\n"
        f"Disease: {safe_get(analysis, '1. Does the paper talk about a medical disease? If so, which?')}\n"
        f"Approach: {safe_get(analysis, '2. What is the paper doing to tackle the disease?')}"
        for analysis in paper_analyses
    ])

    prompt = f"""Based on the following summaries of research papers, provide a general overview of the medical diseases discussed and the approaches being used to tackle them. Your response should include:

1. A comprehensive answer summarizing the diseases mentioned and the various approaches used across all papers.
2. A list of the specific papers you considered in your analysis.

Here are the paper summaries:

{papers_info}

Please structure your response as follows:
1. Overall Summary: [Your comprehensive answer here]
2. Papers Analyzed: [List of paper titles]
"""
    return prompt

# Build the cross-paper prompt from the analyses collected in paper_analyses.
summary_prompt = generate_summary_analysis_prompt(paper_analyses)

### Inference 2:

Inference on the combined output of Inference 1 for each paper.

In [None]:
# Ask the model for the cross-paper overview, streaming the reply.
stream = client.chat.completions.create(
    model="llama3.1-70b",
    messages=[
        {"role": "system", "content": "You are a helpful assistant analyzing research papers."},
        {"role": "user", "content": summary_prompt},
    ],
    stream=True,
    max_tokens=2000,
    temperature=0.7,
    top_p=1,
)

# Join the streamed deltas; a chunk with no content contributes nothing.
summary_response = "".join(chunk.choices[0].delta.content or "" for chunk in stream)

print('SUMMARY ANALYSIS:', summary_response, sep='\n')

SUMMARY ANALYSIS:
**Overall Summary**

This analysis covers research papers discussing various medical diseases and the approaches being used to tackle them. The diseases mentioned include osteoarthritis (OA), Alzheimer's Disease (AD), and cystic fibrosis (CF). 

To address these diseases, researchers are leveraging deep learning techniques and Convolutional Neural Networks (CNNs) to analyze medical images such as X-ray images, MRI scans, and CT scans. The approaches being used include:

* Automated patch extraction and classification for osteophyte detection in spinal X-ray images (SegPatch and DenseNet-121 network)
* Deep learning methods, specifically 2D-Slice-CNN models, for Alzheimer's disease diagnosis and brain age prediction from MRI scans
* Convolutional Neural Networks (CNNs) in both 2D and 3D formats for segmenting cystic fibrosis lesions from CT scans

These approaches aim to improve diagnosis accuracy, streamline clinical workflows, and ultimately enhance patient care. The

### Preparing prompt context for third inference

In [None]:
def handle_followup_questions(client, paper_analyses, summary_response):
    """Run an interactive question/answer loop over the analysed papers.

    The summary and the per-paper analyses are combined into one context
    block. Each question read from ``input()`` is sent to the model with that
    context, and the streamed answer is printed. The loop continues until the
    user types 'exit' (case-insensitive, surrounding whitespace ignored).

    Args:
        client: Chat client exposing ``chat.completions.create``.
        paper_analyses: Per-paper analysis dicts; serialized as JSON into the context.
        summary_response: Text of the cross-paper summary.

    Returns:
        The answer to the last question asked, or None if the user exits
        before asking anything.
    """

    context = f"""
Summary of Research Papers:
{summary_response}

Detailed Paper Analyses:
{json.dumps(paper_analyses, indent=2)}
"""

    print("\nYou can now ask follow-up questions about the research papers. Type 'exit' to end the session.")

    last_response = None
    while True:
        user_question = input("\nYour follow-up question: ")

        if user_question.strip().lower() == 'exit':
            print("Ending the follow-up session. Thank you!")
            break

        prompt = f"""
Based on the following context about several research papers, please answer the user's question:

{context}

User's question: {user_question}

Please provide a concise and accurate answer based only on the information given in the context. If the question cannot be answered using the provided information, please respond with "I'm sorry, but I don't have enough information to answer that question based on the given context."
"""

        stream = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant analyzing research papers."},
                {"role": "user", "content": prompt}
            ],
            model="llama3.1-70b",
            stream=True,
            max_tokens=500,
            temperature=0.7,
            top_p=1
        )

        response = ""
        for chunk in stream:
            # A chunk with no content contributes nothing.
            response += chunk.choices[0].delta.content or ""

        print("\nAnswer:", response.strip())
        last_response = response

    # Return only after the user ends the session; returning inside the loop
    # would stop after the first question.
    return last_response

### Inference 3:

Asking for a follow_up_response based on everything above.

In [None]:
# Start the interactive follow-up session (reads questions via input()).
# follow_up_response holds an answer returned by the session, or None if the
# user types 'exit' before asking anything.
follow_up_response = handle_followup_questions(client, paper_analyses, summary_response)


You can now ask follow-up questions about the research papers. Type 'exit' to end the session.

Answer: I'm sorry, but I don't have enough information to answer that question based on the given context.

The provided context discusses the application of deep learning techniques and Convolutional Neural Networks (CNNs) in medical research, specifically in the diagnosis and treatment of osteoarthritis, Alzheimer's Disease, and cystic fibrosis. However, it does not provide a general evaluation or assessment of AI as a whole, nor does it discuss the broader implications or value of AI beyond the specific medical applications mentioned.
