In [None]:
pip install pypdf pandas your-google-generativeai python-dotenv python-docx

In [27]:
import os
import sys
import pandas as pd
from typing import Dict, List, Optional
from pypdf import PdfReader
import google.generativeai as genai
from dotenv import load_dotenv
from docx import Document
from docx.shared import Pt, Inches, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from typing import List, Dict

In [None]:
# Global variables
# BASE_DIR: folder that the rest of the notebook expects to contain the
# "algorithm pdfs" directory and the google_api_key.env file (both paths are
# built from it with os.path.join further down).
BASE_DIR = "your-base-dir"
# LLM: shared model handle; stays None until main() or the setup cell assigns
# the object returned by setup_genai().
LLM = None

In [6]:
# Questions dictionary with main questions and their subquestions.
# Each key is a main question that is asked about every paper; its value lists
# the follow-up subquestions (an empty list means there are none).
# analyze_single_paper() builds result column names from truncated copies of
# these strings (first 50 characters of a main question for "Main__" columns,
# first 30 characters of the main question and of the subquestion for "Sub__"
# columns), so editing the wording here also changes the column names.
DETAILED_QUESTIONS = {
    "What were the features used in the AI model for delirium prediction?": [],  # Empty list for no subquestions
    "What was the outcome that the model tried to predict?": [],
    "Which population did they study?": [],
    "What is the specific purpose and context of the AI model in delirium prediction?": [
        "Why was the model built?",
        "Was there a specific gap in the literature that it aimed to address?",
        "Does the model focus on prevalent vs. incident delirium?",
        "What is the prediction window?",
        "What type of ICU population is targeted?"
    ],
    "How was the model developed, and what data were used for training?": [
        "What data source was used for training the model?",
        "What is the distribution of the data?",
        "Is the data representative of the target population?",
        "What were the steps or methods used to define and select the features?",
        "How were missing data and outliers assessed and managed?",
        "What was the gold standard for delirium?",
        "What type of ML models were tested?",
        "How was hyperparameter tuning performed?",
        "Was cross-validation done? What type and how many fold?"
    ],
    "Has the model been externally validated, and how does it perform in different clinical settings?": [
        "Is the model externally validated?",
        "What are the performance metrics?",
        "What are the subgroup analysis performance metrics?",
        "Are fairness metrics reported?",
        "Was clinical utility tested?"
    ],
    "How interpretable are the model's outputs, and can clinicians understand the reasoning behind predictions?": [
        "Have they looked at results that are wrong and tried to understand the reasoning behind the mistakes?",
        "Report SHAP values or other feature ranking"
    ],
    "Are there any ethical, legal, or social concerns related to the use of the model?": [
        "Has the paper discussed any of these aspects?",
        "If so, which aspect and how the paper tried to address it?"
    ],
    "What training and support will be provided to clinicians to effectively use and interpret the model's predictions?": [
        "Has the paper discussed training and support?",
        "If so, what steps will be taken to allow the clinician to use the model and understand the model's interpretation output?"
    ],
    "How does the model integrate into existing clinical workflows and complement current practices?": [
        "Do the authors describe the role of the model in clinical practice?",
        "Are there steps on how the model should be used in a clinical setting? If yes, what are the steps?"
    ],
    "Has the use of the model been shown to improve patient care and outcomes in prospective studies?": [
        "Has the paper reported any prospective studies on patient outcomes?",
        "If so, what were the key findings regarding patient care improvement?"
    ],
    "What are the potential risks or harms associated with implementing the model in clinical practice?": [
        "Has a risk assessment been done?",
        "If so, what are the identified potential risks and harm?"
    ],
    "How will the model be maintained and updated over time to ensure continued accuracy and relevance?": [
        "Has the life cycle of the model been discussed?",
        "Are there specific plans for model updates and maintenance?"
    ],
    "What measures are in place to monitor the model's performance?": [
        "Are there monitoring systems after implementation suggested in the paper? If so, what are they?",
        "How frequently is the model's performance evaluated?"
    ],
    "How does the model compare to existing clinical methods for delirium prediction?": [
        "Was model performance in clinical practice compared with routine clinical practice?",
        "Was the Cost:Benefit ratio mentioned? If, so what was it, clinical as well as technical?"
    ]
}

In [7]:
def process_pdf(pdf_path: str) -> Optional[str]:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The page texts in page order, each followed by a newline, or ``None``
        when the file cannot be read or yields no extractable text (for
        example an image-only scan). Failures are reported with ``print``.
    """
    try:
        reader = PdfReader(pdf_path)
        # `or ""` is defensive: a page without a text layer must not abort the
        # whole document if the extractor hands back None instead of a string.
        text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {e}")
        return None

    # Nothing but whitespace means there is nothing to analyse; returning None
    # lets the caller skip the paper instead of querying the model on no text.
    if not text.strip():
        print(f"Error processing PDF {pdf_path}: no extractable text found")
        return None
    return text

In [8]:
def setup_genai(api_key: str,
                model_name: str = 'gemini-1.5-pro',
                max_output_tokens: int = 2048):
    """Configure the Google Generative AI client and build a model handle.

    Args:
        api_key: API key passed to ``genai.configure``.
        model_name: Name of the model to instantiate. Defaults to the model
            this notebook was written against; pass another name to switch
            models without editing the function.
        max_output_tokens: Upper bound on the length of each generated answer.

    Returns:
        The ``genai.GenerativeModel`` instance, configured with temperature 0,
        top_p 1 and top_k 1.

    Raises:
        SystemExit: With exit code 1 when configuration or model construction
            fails (the error is printed first).
    """
    try:
        genai.configure(api_key=api_key)
        generation_config = genai.GenerationConfig(
            temperature=0,
            top_p=1,
            top_k=1,
            max_output_tokens=max_output_tokens)
        return genai.GenerativeModel(model_name,
                                     generation_config=generation_config)
    except Exception as e:
        print(f"Error setting up GenAI: {e}")
        sys.exit(1)

In [9]:
def analyze_text(text: str, question: str, llm) -> str:
    """Ask the model a single question about one paper.

    Args:
        text: Full text of the paper, embedded verbatim in the prompt.
        question: The question to answer from that text.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The model's answer with surrounding whitespace removed, or the literal
        string "Error in analysis" when the call or reading the answer fails
        (the error is printed).
    """
    request = f"""Analyze this research paper and answer the following question:
    Question: {question}
    
    Paper text:
    {text}
    
    Provide a concise but detailed answer. If the information is not found, say "Not mentioned in the paper."
    """

    try:
        # Reading `.text` stays inside the try block so a failure there is
        # reported the same way as a failed request.
        answer = llm.generate_content(request).text.strip()
    except Exception as e:
        print(f"Error analyzing text: {e}")
        return "Error in analysis"
    return answer

In [10]:
def analyze_single_paper(pdf_path: str, llm) -> Optional[Dict[str, str]]:
    """Run every configured question against one paper.

    Args:
        pdf_path: Path of the PDF to analyse.
        llm: Model handle forwarded to ``analyze_text``.

    Returns:
        A dict mapping column names to answers, or ``None`` when no text could
        be extracted from the PDF. Keys are ``'title'``,
        ``'Main__<first 50 chars of the main question>'`` and
        ``'Sub__<first 30 chars of main>__<first 30 chars of subquestion>'``.
        Should two questions truncate to the same name, later ones get a
        " (2)", " (3)", ... suffix instead of overwriting the earlier answer.
    """
    # Extract text from PDF
    text = process_pdf(pdf_path)
    if text is None:
        return None

    results: Dict[str, str] = {}

    def store(column_name: str, answer: str) -> None:
        # Column names are built from truncated question text, so distinct
        # questions sharing a long prefix would otherwise collide silently.
        unique_name, attempt = column_name, 2
        while unique_name in results:
            unique_name = f"{column_name} ({attempt})"
            attempt += 1
        results[unique_name] = answer

    # Extract title
    title_prompt = "What is the title of this research paper? Return only the title."
    store('title', analyze_text(text, title_prompt, llm))

    for main_question, subquestions in DETAILED_QUESTIONS.items():
        # Truncate long questions to keep column names manageable
        store(f"Main__{main_question[:50]}",
              analyze_text(text, main_question, llm))

        for subq in subquestions:
            store(f"Sub__{main_question[:30]}__{subq[:30]}",
                  analyze_text(text, subq, llm))

    return results

In [11]:
def create_analysis_dataframe(llm=None) -> pd.DataFrame:
    """Analyse every PDF under ``BASE_DIR/algorithm pdfs`` into a DataFrame.

    Args:
        llm: Model handle to use. Defaults to the module-level ``LLM`` so
            existing zero-argument calls keep working.

    Returns:
        A DataFrame with one row per successfully processed PDF: the columns
        produced by ``analyze_single_paper`` plus ``'File'`` (the file name).
        Rows follow the sorted file names so repeated runs give the same order.

    Raises:
        ValueError: If no model is available or no PDF produced results.
        FileNotFoundError: If the directory contains no PDF files.
    """
    try:
        model = llm if llm is not None else LLM
        if model is None:
            # Fail before any PDF is parsed rather than recording an error
            # answer for every single question.
            raise ValueError("LLM is not initialised; call setup_genai() first")

        # Setup paths
        pdf_directory = os.path.join(BASE_DIR, "algorithm pdfs")

        # Sorted because os.listdir order is arbitrary; extension matched
        # case-insensitively so files named "*.PDF" are not skipped.
        pdf_files = sorted(
            f for f in os.listdir(pdf_directory) if f.lower().endswith('.pdf')
        )
        if not pdf_files:
            raise FileNotFoundError("No PDF files found in directory")

        print(f"Found {len(pdf_files)} PDF files to process")

        # Process all PDFs
        results = []
        for i, pdf_file in enumerate(pdf_files, 1):
            print(f"Processing file {i}/{len(pdf_files)}: {pdf_file}")
            pdf_path = os.path.join(pdf_directory, pdf_file)
            paper_results = analyze_single_paper(pdf_path, model)
            if paper_results:
                paper_results['File'] = pdf_file
                results.append(paper_results)
            else:
                print(f"Warning: Could not process {pdf_file}")

        if not results:
            raise ValueError("No results were generated from any PDFs")

        # Create and return DataFrame
        return pd.DataFrame(results)

    except Exception as e:
        print(f"Error in analysis: {e}")
        raise

In [12]:
def setup_environment() -> str:
    """Load ``google_api_key.env`` from BASE_DIR and return the API key.

    Returns:
        The value of the ``GOOGLE_API_KEY`` environment variable after the env
        file has been loaded.

    Raises:
        FileNotFoundError: If the env file does not exist under BASE_DIR.
        ValueError: If ``GOOGLE_API_KEY`` is unset or empty after loading.
    """
    dotenv_location = os.path.join(BASE_DIR, 'google_api_key.env')

    if os.path.exists(dotenv_location):
        load_dotenv(dotenv_location)
        key = os.getenv('GOOGLE_API_KEY')
        if key:
            return key
        raise ValueError("GOOGLE_API_KEY not found in google_api_key.env file")

    raise FileNotFoundError(f"Environment file not found at: {dotenv_location}")

# Before running, make sure GOOGLE_API_KEY is defined in google_api_key.env inside BASE_DIR.

In [13]:
def main() -> pd.DataFrame:
    """Run the whole analysis pipeline and return the resulting DataFrame.

    Loads the API key, stores the initialised model in the module-level
    ``LLM``, analyses the PDFs and prints a short summary. Any failure is
    reported with a category-specific message followed by ``sys.exit(1)``.
    """
    global LLM

    try:
        print("Setting up environment...")
        key = setup_environment()
        print("Using API key from google_api_key.env")

        print("Initializing GenAI model...")
        LLM = setup_genai(key)

        print("Starting analysis...")
        analysis = create_analysis_dataframe()

        print("\nAnalysis completed successfully")
        print(f"DataFrame shape: {analysis.shape}")
        print("First few columns:", list(analysis.columns)[:5])
        return analysis

    except Exception as e:
        # Choose the message pair by error category; the most specific
        # category is checked first.
        if isinstance(e, FileNotFoundError):
            label, advice = "File error", "Please ensure all required files are present"
        elif isinstance(e, ValueError):
            label, advice = "Configuration error", "Please check your configuration files"
        else:
            label, advice = "Unexpected error", "Please check the logs for more details"
        print(f"{label}: {e}")
        print(advice)
        sys.exit(1)
# Before running, make sure GOOGLE_API_KEY is defined in google_api_key.env inside BASE_DIR.

In [None]:
if __name__ == "__main__":
    # Run the pipeline through main() rather than repeating its steps here.
    # main() loads the API key from google_api_key.env under BASE_DIR, stores
    # the model in the global LLM, builds the analysis DataFrame and prints a
    # short summary; on any failure it prints the reason and calls sys.exit(1).
    df = main()

    # Keep the DataFrame in memory for further inspection/processing
    # You can now examine df or save it using a separate script

In [20]:
def create_custom_styles(doc: Document):
    """Register the report's three paragraph styles on ``doc``.

    Adds 'StudyTitle' (16 pt, bold, dark blue), 'MainQuestion' (12 pt, bold)
    and 'SubQuestion' (11 pt, italic, indented 0.25 in), each with its own
    spacing before and after the paragraph.
    """
    def new_paragraph_style(name, size, before, after):
        # Settings every style shares: font size and vertical spacing.
        style = doc.styles.add_style(name, WD_STYLE_TYPE.PARAGRAPH)
        style.font.size = Pt(size)
        style.paragraph_format.space_before = Pt(before)
        style.paragraph_format.space_after = Pt(after)
        return style

    # Title style
    study_title = new_paragraph_style('StudyTitle', 16, 24, 12)
    study_title.font.bold = True
    study_title.font.color.rgb = RGBColor(0, 0, 139)  # Dark blue

    # Main question style
    main_question = new_paragraph_style('MainQuestion', 12, 12, 6)
    main_question.font.bold = True

    # Sub question style
    sub_question = new_paragraph_style('SubQuestion', 11, 6, 6)
    sub_question.font.italic = True
    sub_question.paragraph_format.left_indent = Inches(0.25)

In [21]:
def format_question_text(col_name: str,
                         questions: Optional[Dict[str, List[str]]] = None) -> str:
    """Turn a result column name back into a readable question.

    Column names carry only a truncated copy of the question, so the full
    wording is recovered from ``questions`` when exactly one entry starts with
    the fragment stored in the column name. Otherwise the fragment itself is
    returned with underscores turned into spaces and a '?' appended if missing.

    Args:
        col_name: A 'Main__...' or 'Sub__...__...' column name; any other
            name is returned unchanged.
        questions: Mapping of main question to its list of subquestions.
            Defaults to the module-level ``DETAILED_QUESTIONS`` when defined.

    Returns:
        The readable question text.
    """
    if questions is None:
        # Looked up lazily so the function still works when the questions
        # dictionary has not been defined in the current session.
        questions = globals().get('DETAILED_QUESTIONS') or {}

    if col_name.startswith('Main__'):
        fragment = col_name[len('Main__'):]
        candidates = list(questions)
    elif col_name.startswith('Sub__'):
        parts = col_name.split('__')
        fragment = parts[-1]
        main_fragment = parts[1] if len(parts) > 2 else ''
        # Only subquestions of the matching main question are considered.
        candidates = [sub
                      for main, subs in questions.items()
                      if main.startswith(main_fragment)
                      for sub in subs]
    else:
        return col_name

    matches = {q for q in candidates if fragment and q.startswith(fragment)}
    if len(matches) == 1:
        return matches.pop()

    question = fragment.replace('_', ' ')
    if not question.endswith('?'):
        question += '?'
    return question

In [None]:
# Start the Word report that the following cells fill in from the existing
# DataFrame `df`: choose the output file name, create an empty document,
# register the custom paragraph styles and add the report title (level 0).
output_path = "delirium_analysis_report.docx"
doc = Document()
create_custom_styles(doc)
doc.add_heading('Delirium Prediction Models Analysis', 0)

In [23]:
# Process each paper: one report section per DataFrame row.
NOT_MENTIONED_ANSWER = "not mentioned in the paper"


def _answer_text(value) -> str:
    """Return a cell value as text, mapping missing values (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


def _is_unanswered(value) -> bool:
    """Tell whether a cell holds no usable answer.

    True for missing or empty values and for the model's "not mentioned"
    reply. The prompt asks for that reply with a trailing period, so the
    comparison ignores case, surrounding whitespace and a final '.'.
    """
    normalized = _answer_text(value).strip().rstrip('.').strip().lower()
    return normalized in ("", NOT_MENTIONED_ANSWER)


# Column groupings do not depend on the row, so they are computed once.
main_columns = [c for c in df.columns if c.startswith('Main__')]
sub_columns_by_main = {
    main_col: [c for c in df.columns
               if c.startswith(f"Sub__{main_col.split('__')[1][:30]}")]
    for main_col in main_columns
}

for _, row in df.iterrows():
    # Add study title
    doc.add_paragraph(_answer_text(row['title']), style='StudyTitle')

    # Add file name in smaller text
    file_info = doc.add_paragraph(f"Source file: {row['File']}")
    file_info.runs[0].font.size = Pt(8)
    file_info.runs[0].font.italic = True

    for main_col in main_columns:
        # Main question followed by its answer
        doc.add_paragraph(format_question_text(main_col), style='MainQuestion')
        doc.add_paragraph(_answer_text(row[main_col]))

        # Related subquestions; those without a usable answer are left out
        for sub_col in sub_columns_by_main[main_col]:
            if _is_unanswered(row[sub_col]):
                continue
            sub_p = doc.add_paragraph(style='SubQuestion')
            sub_p.add_run(f"{format_question_text(sub_col)}\n").bold = True
            sub_p.add_run(_answer_text(row[sub_col]))

    # Add page break between papers
    doc.add_page_break()

In [None]:
# Save the document
# A failure to write the file is reported with print() and not re-raised, so
# the cell always finishes.
try:
    doc.save(output_path)
    print(f"Successfully saved analysis to {output_path}")
except Exception as e:
    print(f"Error saving document: {e}")

In [28]:
def format_question(col_name: str,
                    questions: Optional[Dict[str, List[str]]] = None) -> str:
    """Convert a result column name back to a readable question.

    Column names hold only a truncated copy of the question. Because the
    returned text is also sent to the model as the question being summarised,
    the full wording is recovered from ``questions`` when exactly one entry
    starts with the stored fragment. Otherwise the fragment is returned with
    underscores turned into spaces and a '?' appended if missing.

    Args:
        col_name: A 'Main__...' or 'Sub__...__...' column name; any other
            name is returned unchanged.
        questions: Mapping of main question to its list of subquestions.
            Defaults to the module-level ``DETAILED_QUESTIONS`` when defined.

    Returns:
        The readable question text.
    """
    if questions is None:
        # Looked up lazily so the function still works when the questions
        # dictionary has not been defined in the current session.
        questions = globals().get('DETAILED_QUESTIONS') or {}

    if col_name.startswith('Main__'):
        fragment = col_name[len('Main__'):]
        candidates = list(questions)
    elif col_name.startswith('Sub__'):
        parts = col_name.split('__')
        fragment = parts[-1]
        main_fragment = parts[1] if len(parts) > 2 else ''
        # Only subquestions of the matching main question are considered.
        candidates = [sub
                      for main, subs in questions.items()
                      if main.startswith(main_fragment)
                      for sub in subs]
    else:
        return col_name

    matches = {q for q in candidates if fragment and q.startswith(fragment)}
    if len(matches) == 1:
        return matches.pop()

    question = fragment.replace('_', ' ')
    if not question.endswith('?'):
        question += '?'
    return question

In [29]:
def analyze_question_with_genai(responses: List[str], question: str, model) -> str:
    """Use GenAI to summarise the per-paper answers to one question.

    Args:
        responses: One answer per paper; they are numbered "Paper 1", "Paper 2",
            ... in the prompt in the order given.
        question: The question those answers respond to.
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The model's summary with surrounding whitespace removed, or a string
        starting with "Error generating summary:" when the call fails.
    """
    # Joined outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12.
    numbered_responses = '\n'.join(
        f'Paper {i}: {resp}' for i, resp in enumerate(responses, 1)
    )

    prompt = f"""You are analyzing multiple research papers about delirium prediction models. 
    Analyze these {len(responses)} different responses to the question: "{question}"
    
    Responses:
    {numbered_responses}
    
    Provide a comprehensive summary that includes:
    1. General trends and common approaches across papers
    2. Unique or innovative approaches mentioned
    3. Gaps or limitations commonly noted
    4. Important technical details (if relevant)
    5. Clinical implications
    
    Format the response in clear paragraphs. Focus on what's most relevant for a medical journal manuscript.
    Be specific about numbers (e.g., "3 out of 5 studies used...") when possible.
    """

    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        return f"Error generating summary: {e}"

In [30]:
def generate_comprehensive_summary(df: pd.DataFrame, model) -> str:
    """Build a markdown summary covering every question column of ``df``.

    Each 'Main__' column with at least one non-null answer gets a '##' section
    containing the model's summary of those answers. Its 'Sub__' columns (found
    through the first 30 characters of the main question) follow under a
    '### Detailed Analysis' heading, one '####' entry per subquestion that has
    answers. Every section ends with a '---' rule.

    Args:
        df: Analysis results, one row per paper.
        model: Model handle forwarded to ``analyze_question_with_genai``.

    Returns:
        The assembled markdown text.
    """
    pieces = ["# Comprehensive Analysis of Delirium Prediction Models\n\n"]
    all_columns = list(df.columns)
    main_columns = list(filter(lambda name: name.startswith('Main__'), all_columns))

    for main_col in main_columns:
        heading = format_question(main_col)
        answers = df[main_col].dropna().tolist()
        if not answers:
            # No paper answered this question: the whole section is skipped.
            continue

        pieces.append(f"## {heading}\n\n")
        main_summary = analyze_question_with_genai(answers, heading, model)
        pieces.append(f"{main_summary}\n\n")

        # Subquestion columns embed the first 30 characters of their main question.
        sub_prefix = f"Sub__{main_col.split('__')[1][:30]}"
        sub_columns = [name for name in all_columns if name.startswith(sub_prefix)]

        if sub_columns:
            pieces.append("### Detailed Analysis\n\n")
        for sub_col in sub_columns:
            sub_heading = format_question(sub_col)
            sub_answers = df[sub_col].dropna().tolist()
            if not sub_answers:
                continue
            pieces.append(f"#### {sub_heading}\n\n")
            sub_summary = analyze_question_with_genai(sub_answers, sub_heading, model)
            pieces.append(f"{sub_summary}\n\n")

        pieces.append("---\n\n")

    return "".join(pieces)

In [31]:
def save_summary(summary_text: str, base_output_path: str):
    """Write the summary to ``<base_output_path>.md`` and ``<base_output_path>.txt``.

    Both files receive the same UTF-8 text; a confirmation line is printed
    after each file is written.

    Args:
        summary_text: Text to write.
        base_output_path: Output path without extension.
    """
    # (file extension, format label used in the confirmation message)
    for extension, label in (("md", "markdown"), ("txt", "text")):
        target = f"{base_output_path}.{extension}"
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(summary_text)
        print(f"Summary saved as {label}: {target}")

In [None]:
if __name__ == "__main__":
    try:
        # Load environment variables from the same location the analysis step
        # uses (google_api_key.env under BASE_DIR) rather than from whatever
        # the current working directory happens to be.
        load_dotenv(os.path.join(BASE_DIR, 'google_api_key.env'))
        api_key = os.getenv('GOOGLE_API_KEY')
        if not api_key:
            raise ValueError("GOOGLE_API_KEY not found in environment variables")

        # Setup GenAI with API key
        print("Setting up Google's Generative AI...")
        model = setup_genai(api_key)

        # Generate comprehensive summary using the existing DataFrame (df)
        print("\nGenerating comprehensive summary...")
        summary_text = generate_comprehensive_summary(df, model)

        # Save the summary (writes delirium_models_analysis.md and .txt)
        print("\nSaving summary...")
        save_summary(summary_text, "delirium_models_analysis")

        # Print preview
        print("\nFirst few lines of the summary:")
        print("\n".join(summary_text.split("\n")[:10]))

        print("\nSummary generation complete!")

    except Exception as e:
        print(f"Error in summary generation: {e}")