In [None]:
pip install pypdf pandas your-google-generativeai matplotlib seaborn python-dotenv

In [51]:
# Import required packages
import datetime
import os
import re
import subprocess
import sys
from typing import Dict, List, Optional

In [52]:
def install_required_packages():
    """Install every third-party package the notebook needs that cannot be imported.

    Each entry maps the module name used in ``import`` statements to the name
    passed to ``pip install``. The two can differ (``python-dotenv`` is imported
    as ``dotenv``), so the import name must not be derived from the pip name.

    Raises:
        subprocess.CalledProcessError: if a ``pip install`` command fails.
    """
    import importlib

    # import name -> pip distribution name
    # NOTE(review): 'your-google-generativeai' looks like a placeholder name;
    # confirm the intended distribution before relying on the automatic install.
    required_packages = {
        'pypdf': 'pypdf',
        'pandas': 'pandas',
        'google.generativeai': 'your-google-generativeai',
        'dotenv': 'python-dotenv'
    }

    for import_name, pip_name in required_packages.items():
        try:
            importlib.import_module(import_name)
            print(f"{pip_name} is already installed")
        except ImportError:
            print(f"Installing {pip_name}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])
            # Let the running interpreter see the freshly installed package.
            importlib.invalidate_caches()
            print(f"Successfully installed {pip_name}")

In [None]:
# Install required packages
install_required_packages()

In [54]:
# Now proceed with the rest of the imports
import pandas as pd
from dotenv import load_dotenv
import google.generativeai as genai

In [None]:
# Configuration
BASE_DIR = "your-base-dir"

In [56]:
# Global variable for LLM
llm = None

In [75]:
def check_dependencies():
    """Verify that every third-party package the analysis needs can be imported.

    Prints the ``pip install`` command for anything missing and then terminates
    via ``sys.exit(1)``. Returns ``None`` when all imports succeed.
    """
    import importlib

    # import name -> pip distribution name
    required_packages = {
        'pypdf': 'pypdf',
        'pandas': 'pandas',
        'google.generativeai': 'your-google-generativeai',
        'dotenv': 'python-dotenv'
    }

    missing_packages = []
    for package, pip_name in required_packages.items():
        try:
            # Import the full dotted path: the top-level 'google' package can be
            # importable even when the 'generativeai' submodule is not installed.
            importlib.import_module(package)
        except ImportError:
            missing_packages.append(pip_name)

    if missing_packages:
        print("Missing required packages. Please install them using:")
        print(f"pip install {' '.join(missing_packages)}")
        sys.exit(1)

In [59]:
def setup_genai(api_key: str):
    """Configure the Google Generative AI client and build the model object.

    Args:
        api_key: Key passed to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-1.5-flash' created with a
        ``GenerationConfig`` whose temperature is 0.

    Any exception raised during setup is printed and the process exits with
    status 1.
    """
    try:
        genai.configure(api_key=api_key)
        generation_settings = genai.GenerationConfig(temperature=0)
        model = genai.GenerativeModel('gemini-1.5-flash',
                                      generation_config=generation_settings)
    except Exception as setup_error:
        print(f"Error setting up GenAI: {setup_error}")
        sys.exit(1)
    return model

In [60]:
def set_api_key(key: str):
    """Build the model for ``key`` and publish it as the module-level ``llm``.

    Args:
        key: API key forwarded to ``setup_genai``.

    Returns:
        The object returned by ``setup_genai`` (also stored in the global ``llm``).
    """
    global llm
    model = setup_genai(key)
    llm = model
    return model

In [61]:
def process_pdf(pdf_path: str) -> Optional[str]:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts concatenated in page order, each followed by a newline,
        or ``None`` if the file could not be read or parsed (the error is
        printed).
    """
    try:
        from pypdf import PdfReader
        reader = PdfReader(pdf_path)
        # A page that yields no text contributes an empty string rather than
        # aborting the whole document; join avoids repeated string copies.
        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
        return "".join(page_texts)
    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {e}")
        return None

In [89]:
def extract_title(text: str, llm) -> str:
    """Ask the LLM for the paper's main title.

    Args:
        text: Paper text embedded in the prompt.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The stripped response text, or 'Not found' if the request raises.
    """
    title_request = f"""Analyze this research paper text and extract ONLY the main title.
    Return just the title, nothing else.
    If no title can be found, return 'Not found'.

    Text to analyze:
    {text}
    """
    try:
        reply = llm.generate_content(title_request)
        title = reply.text.strip()
    except Exception:
        return 'Not found'
    return title

In [88]:
def extract_first_author(text: str, llm) -> str:
    """Ask the LLM for the last name of the paper's first author.

    Args:
        text: Paper text embedded in the prompt.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The stripped response text, or 'Not found' if the request raises.
    """
    author_request = f"""Analyze this research paper text and extract ONLY the last name of the first author.
    Return just the last name, nothing else.
    If no author can be found, return 'Not found'.

    Text to analyze:
    {text}
    """
    try:
        reply = llm.generate_content(author_request)
        last_name = reply.text.strip()
    except Exception:
        return 'Not found'
    return last_name

In [87]:
def extract_feature_count(text: str, llm) -> str:
    """Ask the LLM how many features the final model uses.

    Args:
        text: Paper text embedded in the prompt.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The first run of digits in the response, as a string, or
        'Not reported' when the response has no digits or the request raises.
    """
    count_request = f"""Analyze this research paper and find the number of features or variables used in the final predictive model.
    Return ONLY the number.
    Look for phrases like 'features', 'variables', 'predictors', 'input parameters'.
    If multiple numbers are mentioned, return the final number used in the model.
    If no specific number is found, return 'Not reported'.

    Text to analyze:
    {text}
    """
    try:
        reply = llm.generate_content(count_request)
        first_number = re.search(r'\d+', reply.text)
    except Exception:
        return 'Not reported'
    if first_number is None:
        return 'Not reported'
    return first_number.group()

In [86]:
def extract_outcome_variable(text: str, llm) -> str:
    """Ask the LLM which outcome the paper's model predicts.

    Args:
        text: Paper text embedded in the prompt.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The stripped response text truncated to 50 characters, or
        'Not specified' if the request raises.
    """
    outcome_request = f"""Analyze this research paper and identify the main outcome variable that the model is trying to predict.
    Return ONLY the outcome in 1-2 words (e.g., 'incident delirium', 'mortality', 'ICU readmission').
    Look for phrases like 'outcome variable', 'target variable', 'predicted variable', 'dependent variable'.
    If not clearly specified, return 'Not specified'.

    Text to analyze:
    {text}
    """
    max_length = 50
    try:
        reply = llm.generate_content(outcome_request)
        outcome = reply.text.strip()
    except Exception:
        return 'Not specified'
    return outcome[:max_length]

In [128]:
def extract_publication_year_from_pdf(text: str, llm, min_year: int = 2000,
                                      max_year: Optional[int] = None) -> str:
    """Find the publication year of a paper using regexes first, then the LLM.

    Only the first 10000 characters of ``text`` are inspected. Three strategies
    are tried in order and the first one that yields an in-range year wins:

    1. Publisher/copyright/date regexes on the lower-cased sample; the largest
       matching year is returned.
    2. A prompt to ``llm``; the first ``20xx`` year in the response is used.
    3. Regexes for a year immediately followed by a month abbreviation; the
       largest matching year is returned.

    Args:
        text: Full text of the paper.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.
        min_year: Smallest year accepted (inclusive).
        max_year: Largest year accepted (inclusive). Defaults to the current
            calendar year so recently published papers are not rejected.

    Returns:
        The year as a 4-digit string, or 'Not found'.
    """
    if max_year is None:
        max_year = datetime.date.today().year

    def in_range(year: int) -> bool:
        return min_year <= year <= max_year

    # First try exact patterns in the text itself without LLM
    text_sample = text[:10000].lower()

    # Common patterns for publication years
    patterns = [
        r'copyright.*?(20\d{2})',
        r'published.*?(20\d{2})',
        r'publication\s+date.*?(20\d{2})',
        r'accepted.*?(20\d{2})',
        r'received.*?(20\d{2})',
        r'\b(20\d{2})\s*elsevier',
        r'\b(20\d{2})\s*springer',
        r'\b(20\d{2})\s*wiley',
        r'©\s*(20\d{2})',
        r'doi:.*?(20\d{2})',
        r'volume.*?(20\d{2})',
        r'\b(20\d{2})\s*by\s+the\s+authors',
        r'published online.*?(20\d{2})',
        r'\b(20\d{2})\s*;',
        r'\b(20\d{2})\s*\.',
        r'first published.*?(20\d{2})',
    ]

    years_found = [
        int(year)
        for pattern in patterns
        for year in re.findall(pattern, text_sample)
        if in_range(int(year))
    ]

    # If found years through patterns, return the most recent one
    if years_found:
        return str(max(years_found))

    # If no years found through patterns, try LLM with more specific prompt
    prompt = f"""Analyze this academic paper text and find the publication year.
    Look for ALL of these patterns:
    1. Copyright statements (e.g., "© 2023")
    2. Publication dates (e.g., "Published: 2023", "First published 2023")
    3. Received/Accepted/Published dates
    4. Journal citations with years
    5. DOI dates
    6. Volume/Issue dates
    7. Footer/Header dates
    8. Conference proceedings dates
    9. Online publication dates
    10. Publisher copyright statements
    
    Return ONLY a 4-digit year between {min_year}-{max_year}.
    If multiple years found, return the latest publication year.
    If no clear publication year found, return 'Not found'.

    Text to analyze:
    {text[:10000]}
    """

    try:
        response = llm.generate_content(prompt)
        year_match = re.search(r'\b(20\d{2})\b', response.text)
        if year_match and in_range(int(year_match.group(1))):
            return year_match.group(1)
    except Exception as e:
        # Best effort: an LLM failure falls through to the last regex pass.
        print(f"Error in LLM year extraction: {e}")

    # If still not found, look for a year directly followed by a month abbreviation
    month_pattern = r'(20\d{2})\s*(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
    years_found = [
        int(year)
        for year in re.findall(month_pattern, text_sample)
        if in_range(int(year))
    ]

    if years_found:
        return str(max(years_found))

    return 'Not found'

In [67]:
def extract_yes_no(text):
    """Reduce a free-text explanation to "Yes" or "No" with a keyword score.

    Every phrase below that occurs as a substring of the lower-cased text adds
    its group's weight once. One extra point is added if the text contains
    ' therefore ', ' thus ' or ' hence ', and one more if it contains any digit.

    Args:
        text: Explanation to classify.

    Returns:
        "Yes" when the text has at least five whitespace-separated words and
        the score is 2 or more; otherwise "No".
    """
    lowered = text.lower()

    # (weight, phrases) pairs; each phrase is counted at most once per group.
    weighted_phrases = (
        # Strong negative
        (-2, (
            'no evidence', 'not mentioned', 'not addressed', 'not discussed',
            'not described', 'not implemented', 'not provided', 'not included',
            'lacks', 'missing', 'absent', 'none', 'not found', 'did not',
            'does not', 'has not', 'have not', 'was not', 'were not',
            'no information', 'no data', 'no details', 'no assessment',
            'no analysis', 'no evaluation', 'no consideration', 'no documentation',
            'failed to', 'neglected to', 'omitted', 'excluded',
        )),
        # Weak negative
        (-1, (
            'limited', 'insufficient', 'inadequate', 'unclear how',
            'not clear', 'not specified', 'vague', 'ambiguous',
            'minimal', 'poor', 'lacking', 'without',
        )),
        # Strong positive
        (2, (
            'clearly described', 'well documented', 'explicitly stated',
            'thoroughly', 'comprehensively', 'extensively', 'detailed',
            'implemented', 'established', 'confirmed', 'validated',
            'demonstrated', 'proved', 'showed', 'revealed', 'identified',
            'verified', 'conducted', 'performed', 'executed', 'completed',
            'achieved', 'succeeded in', 'accomplished',
        )),
        # Weak positive
        (1, (
            'discussed', 'mentioned', 'noted', 'indicated', 'suggested',
            'proposed', 'considered', 'addressed', 'included', 'incorporated',
            'integrated', 'utilized', 'employed', 'applied', 'attempted',
            'presented', 'reported', 'described',
        )),
        # Context-specific positive: validation, methodology, results,
        # implementation, evaluation
        (2, (
            'externally validated', 'cross-validated', 'test set', 'validation cohort',
            'methods include', 'methodology described', 'statistical analysis', 'approach involved',
            'results show', 'findings indicate', 'analysis revealed', 'study found',
            'implemented in', 'deployed', 'put into practice', 'integrated into workflow',
            'evaluated using', 'assessed through', 'measured by', 'tested with',
        )),
        # Active-voice statements
        (1, (
            'we implemented', 'authors developed', 'study implemented',
            'researchers conducted', 'team performed', 'analysis showed',
        )),
    )

    score = sum(
        weight
        for weight, phrases in weighted_phrases
        for phrase in phrases
        if phrase in lowered
    )

    # A single bonus point however many connectives appear.
    if any(connective in lowered for connective in (' therefore ', ' thus ', ' hence ')):
        score += 1

    if any(ch.isdigit() for ch in text):
        score += 1

    # Blank or very short answers are never accepted as "Yes".
    if len(text.split()) < 5:
        return "No"

    return "Yes" if score >= 2 else "No"

In [68]:
def analyze_subquestion(text: str, question: str, llm) -> str:
    """Ask the LLM one question about the paper and reduce the answer to Yes/No.

    Args:
        text: Paper text embedded in the prompt.
        question: Question the LLM is asked to answer with an explanation.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The result of ``extract_yes_no`` on the response text, or 'No' if
        anything raises.
    """
    question_request = f"""
    Based on the following text, answer this specific question with a detailed explanation:
    Question: {question}
    
    If the information is not explicitly mentioned or is unclear, explain what is missing.
    
    Text to analyze:
    {text}
    """
    try:
        explanation = llm.generate_content(question_request).text
        verdict = extract_yes_no(explanation)
    except Exception:
        return 'No'
    return verdict

In [122]:
def analyze_single_paper(pdf_path: str, llm) -> Optional[Dict]:
    """Run every extraction and every question against one PDF.

    Args:
        pdf_path: Path of the PDF to analyze.
        llm: Model object forwarded to each extraction helper.

    Returns:
        A dict with the keys 'title', 'first_author', 'publication_year',
        'feature_count' and 'outcome_variable', followed by one 'Main__…' entry
        per main question and one 'Sub__…__…' entry per subquestion (the text
        before the first '?' of each question is used in the key). Returns
        ``None`` when the PDF text could not be extracted.
    """
    # Extract text from PDF
    text = process_pdf(pdf_path)
    if text is None:
        return None

    # Initialize results dictionary
    results = {}

    # Extract basic information
    results['title'] = extract_title(text, llm)
    results['first_author'] = extract_first_author(text, llm)
    # The year extractor takes (text, llm) only.
    results['publication_year'] = extract_publication_year_from_pdf(text, llm)
    results['feature_count'] = extract_feature_count(text, llm)
    results['outcome_variable'] = extract_outcome_variable(text, llm)

    # Analyze main questions first so their columns precede the subquestion columns
    for main_question in DETAILED_QUESTIONS.keys():
        column_name = f"Main__{main_question.split('?')[0]}"
        results[column_name] = analyze_subquestion(text, main_question, llm)

    # Analyze subquestions
    for main_question, subquestions in DETAILED_QUESTIONS.items():
        for subq in subquestions:
            column_name = f"Sub__{main_question.split('?')[0]}__{subq.split('?')[0]}"
            results[column_name] = analyze_subquestion(text, subq, llm)

    return results

In [70]:
def save_results(results_list: List[Dict], output_path: str):
    """Write the per-paper result dicts to a CSV file.

    Args:
        results_list: One dict per paper; keys become the CSV columns.
        output_path: Destination path of the CSV (written without the index).

    Returns:
        The ``pandas.DataFrame`` built from ``results_list``.
    """
    results_frame = pd.DataFrame(results_list)
    results_frame.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")
    return results_frame

In [71]:
def print_summary_statistics(df: pd.DataFrame):
    """Print the Yes/No counts and percentages for every question column.

    Descriptive columns ('title', 'first_author', 'publication_year',
    'feature_count', 'outcome_variable', 'File') are skipped because they do
    not hold Yes/No answers. Every value other than 'Yes' is counted as "No".

    Args:
        df: One row per paper, as produced by ``save_results``.
    """
    metadata_columns = {
        'title', 'first_author', 'publication_year',
        'feature_count', 'outcome_variable', 'File',
    }

    total = len(df)
    if total == 0:
        # Avoid dividing by zero when there is nothing to summarize.
        print("No papers to summarize")
        return

    for col in df.columns:
        if col in metadata_columns:
            continue
        yes_count = int((df[col] == 'Yes').sum())
        no_count = total - yes_count
        print(f"\n{col}:")
        print(f"Yes: {yes_count} ({(yes_count/total*100):.1f}%)")
        print(f"No: {no_count} ({(no_count/total*100):.1f}%)")

In [78]:
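# Maps each main question to its list of subquestions. The text before the first '?'
# of each question is used to build the output column names ('Main__…' and 'Sub__…__…').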
DETAILED_QUESTIONS = {
    "What is the specific purpose and context of the AI model in delirium prediction?": [
        "Why was the model built?",
        "Was there a specific gap in the literature that it aimed to address?",
        "Does the model focus on prevalent vs. incident delirium?",
        "What is the prediction window?",
        "What type of ICU population is targeted?"
    ],
    "How was the model developed, and what data were used for training?": [
        "What data source was used for training the model?",
        "What is the distribution of the data?",
        "Is the data representative of the target population?",
        "What were the steps or methods used to define and select the features?",
        "How were missing data and outliers assessed and managed?",
        "What was the gold standard for delirium?",
        "What type of ML models were tested?",
        "How was hyperparameter tuning performed?",
        "Was cross-validation done? What type and how many fold?"
    ],
    "Has the model been externally validated, and how does it perform in different clinical settings?": [
        "Is the model externally validated?",
        "What are the performance metrics?",
        "What are the subgroup analysis performance metrics?",
        "Are fairness metrics reported?",
        "Was clinical utility tested?"
    ],
    "How interpretable are the model's outputs, and can clinicians understand the reasoning behind predictions?": [
        "Have they looked at results that are wrong and tried to understand the reasoning behind the mistakes?",
        "Report SHAP values or other feature ranking"
    ],
    "Are there any ethical, legal, or social concerns related to the use of the model?": [
        "Has the paper discussed any of these aspects?",
        "If so, which aspect and how the paper tried to address it?"
    ],
    "What training and support will be provided to clinicians to effectively use and interpret the model's predictions?": [
        "Has the paper discussed training and support?",
        "If so, what steps will be taken to allow the clinician to use the model and understand the model's interpretation output?"
    ],
    "How does the model integrate into existing clinical workflows and complement current practices?": [
        "Do the authors describe the role of the model in clinical practice?",
        "Are there steps on how the model should be used in a clinical setting? If yes, what are the steps?"
    ],
    "Has the use of the model been shown to improve patient care and outcomes in prospective studies?": [
        "Has the paper reported any prospective studies on patient outcomes?",
        "If so, what were the key findings regarding patient care improvement?"
    ],
    "What are the potential risks or harms associated with implementing the model in clinical practice?": [
        "Has a risk assessment been done?",
        "If so, what are the identified potential risks and harm?"
    ],
    "How will the model be maintained and updated over time to ensure continued accuracy and relevance?": [
        "Has the life cycle of the model been discussed?",
        "Are there specific plans for model updates and maintenance?"
    ],
    "What measures are in place to monitor the model's performance?": [
        "Are there monitoring systems after implementation suggested in the paper? If so, what are they?",
        "How frequently is the model's performance evaluated?"
    ],
    "How does the model compare to existing clinical methods for delirium prediction?": [
        "Was model performance in clinical practice compared with routine clinical practice?",
        "Was the Cost:Benefit ratio mentioned? If, so what was it, clinical as well as technical?"
    ]
}

In [73]:
def main():
    """Analyze every PDF under ``BASE_DIR/algorithm pdfs`` and save the results.

    Uses the module-level ``llm``, so ``set_api_key`` must have been called
    first. Results are written to ``BASE_DIR/detailed_analysis_results.csv``
    and summary statistics are printed.

    Returns:
        The results ``DataFrame`` (one row per successfully processed PDF).

    Any exception is printed and the process exits with status 1.
    """
    try:
        # Check dependencies
        check_dependencies()

        # Without a model every extraction helper would swallow the resulting
        # error and return its default, producing a meaningless CSV.
        if llm is None:
            raise RuntimeError("LLM is not initialized; call set_api_key() before main()")

        # Set up paths using global config
        pdf_directory = os.path.join(BASE_DIR, "algorithm pdfs")
        output_csv = os.path.join(BASE_DIR, "detailed_analysis_results.csv")

        # Verify directories exist
        if not os.path.exists(pdf_directory):
            raise FileNotFoundError(f"PDF directory not found: {pdf_directory}")

        # Sorted for a reproducible row order; extension matched case-insensitively
        pdf_files = sorted(
            f for f in os.listdir(pdf_directory) if f.lower().endswith('.pdf')
        )
        if not pdf_files:
            raise FileNotFoundError("No PDF files found in directory")

        print(f"Found {len(pdf_files)} PDF files to process")

        # Process all PDFs
        results = []
        for i, pdf_file in enumerate(pdf_files, 1):
            print(f"Processing file {i}/{len(pdf_files)}: {pdf_file}")
            pdf_path = os.path.join(pdf_directory, pdf_file)
            paper_results = analyze_single_paper(pdf_path, llm)
            if paper_results:
                paper_results['File'] = pdf_file
                results.append(paper_results)
            else:
                print(f"Warning: Could not process {pdf_file}")

        if not results:
            raise ValueError("No results were generated from any PDFs")

        # Save and analyze results
        df = save_results(results, output_csv)

        # Print summary statistics
        print("\nAnalysis Summary:")
        print(f"Total papers processed: {len(results)} out of {len(pdf_files)}")
        print_summary_statistics(df)

        # Print column names for verification
        print("\nColumns in output:")
        for col in df.columns:
            print(f"- {col}")

        return df

    except Exception as e:
        print(f"Error in main execution: {e}")
        sys.exit(1)

In [None]:
# Notebook entry point (takes the place of an `if __name__ == "__main__"` block):
# load the API key from BASE_DIR/google_api_key.env, initialize the LLM, run main().
try:
    # Use the global BASE_DIR
    env_path = os.path.join(BASE_DIR, 'google_api_key.env')

    if not os.path.exists(env_path):
        print(f"Environment file not found at: {env_path}")
        print("Please create google_api_key.env file with your API key")
        sys.exit(1)

    load_dotenv(env_path)
    api_key = os.getenv('GOOGLE_API_KEY')
    if not api_key:
        print("GOOGLE_API_KEY not found in google_api_key.env file")
        print("Please create google_api_key.env file with your API key")
        sys.exit(1)

    print("Using API key from google_api_key.env")
    set_api_key(api_key)

    # Run the main analysis
    df = main()

except Exception as e:
    print(f"Error in setup: {e}")
    sys.exit(1)

In [None]:
# Write the results frame to BASE_DIR/delirium_analysis_results.csv and echo a quick check
results_csv_path = os.path.join(BASE_DIR, "delirium_analysis_results.csv")
df.to_csv(results_csv_path, index=False)

# Confirm where the file went
print(f"DataFrame saved to: {results_csv_path}")

# Optional: show the first rows of what was saved
print("\nFirst few rows of the saved data:")
print(df.head())

# Optional: show the number of rows and columns
print(f"\nDataFrame shape: {df.shape}")

In [151]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [159]:
# Create directory for saving plots; exist_ok makes the cell safe to re-run
# without a separate existence check.
save_dir = os.path.join(BASE_DIR, 'analysis_plots')
os.makedirs(save_dir, exist_ok=True)

In [160]:
def plot_and_save_main_questions(df, save_path):
    """Save a horizontal bar chart of the "Yes" percentage of each main question.

    Args:
        df: Results frame; columns whose name starts with 'Main__' are plotted.
        save_path: Directory that receives 'main_questions.png'.
    """
    fig, ax = plt.subplots(figsize=(15, 12))

    main_cols = [name for name in df.columns if name.startswith('Main__')]
    yes_rates = df[main_cols].apply(lambda answers: (answers == 'Yes').mean() * 100)
    tick_labels = [name.replace('Main__', '') for name in main_cols]

    # Reverse both so the first question is drawn at the top of the chart
    yes_rates = yes_rates[::-1]
    tick_labels = tick_labels[::-1]
    positions = range(len(tick_labels))

    ax.barh(positions, yes_rates)
    for row, rate in enumerate(yes_rates):
        ax.text(rate + 1, row, f'{rate:.1f}%', va='center')

    ax.set_yticks(positions)
    ax.set_yticklabels(tick_labels, wrap=True)
    ax.set_xlabel('Percentage of "Yes" Responses')
    ax.set_title('Percentage of "Yes" Responses for Main Questions')
    ax.grid(axis='x', alpha=0.3)
    fig.tight_layout()

    # Save plot
    output_file = os.path.join(save_path, 'main_questions.png')
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close(fig)

In [161]:
def plot_and_save_subquestions(df, main_q, save_path):
    """Save a horizontal bar chart of the "Yes" percentage of each subquestion.

    Args:
        df: Results frame; columns whose name starts with ``'Sub__' + main_q``
            are plotted, labelled by the text after the last '__'.
        main_q: Main-question key used to select the columns and to name the
            output file.
        save_path: Directory that receives ``subquestions_<main_q>.png``.

    Returns:
        None. Nothing is plotted when no column matches ``main_q``.
    """
    sub_cols = [col for col in df.columns if col.startswith(f'Sub__{main_q}')]
    if not sub_cols:
        # The figure height is proportional to the number of bars, so an empty
        # selection would request a zero-height figure.
        print(f"No subquestion columns found for '{main_q}'; skipping plot")
        return None

    yes_percentages = df[sub_cols].apply(lambda x: (x == 'Yes').mean() * 100)
    labels = [col.split('__')[-1] for col in sub_cols]

    # Reverse order so the first subquestion is drawn at the top
    yes_percentages = yes_percentages[::-1]
    labels = labels[::-1]

    # Set consistent bar height
    bar_height = 0.8
    spacing = 1.5
    total_height = len(labels) * spacing

    plt.figure(figsize=(12, total_height))
    y_positions = np.arange(len(labels)) * spacing
    plt.barh(y_positions, yes_percentages, height=bar_height)

    for i, v in enumerate(yes_percentages):
        plt.text(v + 1, y_positions[i], f'{v:.1f}%', va='center')

    plt.yticks(y_positions, labels, wrap=True)
    plt.xlabel('Percentage of "Yes" Responses')
    plt.title(f'Subquestions Analysis: {main_q}')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()

    # Save plot
    plt.savefig(os.path.join(save_path, f'subquestions_{main_q}.png'),
                dpi=300,
                bbox_inches='tight')
    plt.close()
    return None

In [None]:
# Plot main questions
# NOTE(review): plot_main_questions is not defined in any visible cell (only
# plot_and_save_main_questions is), so this call presumably raises NameError on
# a fresh kernel. Confirm whether the defining cell was deleted.
print("Main Questions Analysis:")
plot_main_questions(df)

In [165]:
# Save main questions plot
print("Saving main questions plot...")
plot_and_save_main_questions(df, save_dir)

Saving main questions plot...


In [None]:
# Plot subquestions for each main category
# NOTE(review): plot_subquestions_ordered is not defined in any visible cell
# (only plot_and_save_subquestions is), so the final loop presumably raises
# NameError on a fresh kernel. Confirm whether the defining cell was deleted.
print("\nSubquestions Analysis by Category:")
# Collect each category once, in column order. The category is the second
# '__'-separated field of a 'Sub__' column name, cut at its first '_'.
main_categories = []
for col in df.columns:
    if col.startswith('Sub__'):
        category = col.split('__')[1].split('_')[0]
        if category not in main_categories:
            main_categories.append(category)

for category in main_categories:
    plot_subquestions_ordered(df, category)

In [None]:
# Save one bar chart per category of subquestions
print("\nSaving subquestions plots...")
# dict.fromkeys keeps the first occurrence of each category, in column order.
# The category is the second '__'-separated field of a 'Sub__' column name,
# cut at its first '_'.
main_categories = list(dict.fromkeys(
    col.split('__')[1].split('_')[0]
    for col in df.columns
    if col.startswith('Sub__')
))

for category in main_categories:
    print(f"Processing {category}...")
    plot_and_save_subquestions(df, category, save_dir)

In [162]:
def plot_and_save_summary_heatmap(df, save_path):
    """Save a paper-by-category heatmap of the percentage of "Yes" subquestion answers.

    Args:
        df: Results frame with 'Sub__…' answer columns and a 'title' column.
        save_path: Directory that receives 'summary_heatmap.png'.

    Returns:
        A DataFrame indexed by ``df['title']`` with one column per category,
        holding the percentage of that category's subquestions answered 'Yes'.
    """
    # Ordered, de-duplicated categories: second '__'-separated field of each
    # 'Sub__' column name, cut at its first '_'.
    main_categories = list(dict.fromkeys(
        name.split('__')[1].split('_')[0]
        for name in df.columns
        if name.startswith('Sub__')
    ))

    per_category = []
    for category in main_categories:
        category_cols = [name for name in df.columns if name.startswith(f'Sub__{category}')]
        per_category.append((df[category_cols] == 'Yes').mean(axis=1) * 100)

    summary_df = pd.DataFrame(per_category).T
    summary_df.columns = main_categories
    summary_df.index = df['title']

    fig, ax = plt.subplots(figsize=(15, 35))
    blue_shades = [
        '#ffffff', '#e6f3ff', '#bde0ff', '#94cdff', '#6abaff',
        '#41a7ff', '#1794ff', '#0077e6', '#005bb3', '#003d80',
    ]
    custom_cmap = sns.color_palette(blue_shades)

    sns.heatmap(
        summary_df,
        cmap=custom_cmap,
        cbar_kws={'label': 'Percentage of "Yes" Responses'},
        annot=True,
        fmt='.0f',
        square=False,
        center=50,
        ax=ax,
    )

    ax.set_title('Summary Heatmap: Percentage of Positive Responses by Category', pad=20)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.subplots_adjust(left=0.4, right=0.95, top=0.95, bottom=0.1)

    # Save plot
    output_file = os.path.join(save_path, 'summary_heatmap.png')
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close(fig)

    return summary_df

In [None]:
# NOTE(review): plot_summary_heatmap is not defined in any visible cell (only
# plot_and_save_summary_heatmap is), so this call presumably raises NameError
# on a fresh kernel. Confirm whether the defining cell was deleted.
print("\nSummary Heatmap:")
summary_df = plot_summary_heatmap(df)

In [None]:
# Save heatmap
print("\nSaving summary heatmap...")
summary_df = plot_and_save_summary_heatmap(df, save_dir)

print(f"\nAll plots saved in: {save_dir}")