---

# Import papers from Zotero, extract data, and save it to the literature database

___

## Proof of concept: summarize each paper

In [1]:
import json
import os
import re
import time
from pathlib import Path

import pandas as pd
import PyPDF2
import requests
from dotenv import load_dotenv

In [2]:
# Load environment variables (OPENAI_API_KEY is expected in the environment or a .env file)
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # The request headers are built as f"Bearer {OPENAI_API_KEY}", so a missing key
    # would otherwise be sent as the literal string "Bearer None".
    print("Warning: OPENAI_API_KEY is not set; OpenAI requests will fail until it is configured.")

# Configuration
# NOTE(review): ZOTERO_PDF_FOLDER is not referenced by the code visible in this notebook.
ZOTERO_PDF_FOLDER = r"D:\OneDrive\Apps\Zotero\Main"  # Update this to your Zotero PDF folder
# CSV that the functions below read and write; resolved relative to the working directory.
DATABASE_PATH = "literature_data/literature_database_v3.csv"
# Chat-completions endpoint used for every GPT request.
OPENAI_URL = "https://api.openai.com/v1/chat/completions"


In [3]:
# Load the literature database and AI instructions.
# Row 0 of the table holds the per-column instructions for the model rather than
# paper data; it is pulled out below as `ai_instructions`.
# NOTE(review): this absolute path is machine-specific and is separate from the
# relative DATABASE_PATH that the functions below read and write — confirm both
# refer to the same file.
# `.iloc[:,1:]` discards the first column of the CSV (presumably a saved index
# column — TODO confirm).
lit_db = pd.read_csv(r"C:\Users\Karl\Documents\_Python\CMU.49.007-Literature-Review\literature_data\literature_database_v3.csv").iloc[:,1:]
# Alternative source for the instructions, currently disabled:
# ai_instructions = pd.read_csv(r"C:\Users\Karl\Documents\_Python\CMU.49.007-Literature-Review\literature_data\ai_instructions.csv")
ai_instructions = lit_db.loc[0]  # Series of instructions, indexed by column name
lit_db  # last expression: display the loaded table

Unnamed: 0,Reference,Year,Location,"Charge Storage Capacity, mC/cm^2","Charge Injection Capacity, mC/cm^2","Transmittance, %","Resistivity, Ohm/square","Maximum Electrode Use, days",Electrode Material,Substrate\n,GABA Sensing? (Y/N),Substrate
0,"Add the Paper Reference, for example: Doe et a...",Supply the year of publication. Do So in YYYY ...,Specify the college or research institution in...,"Supply the charge storage capacity, CSC, in mC...","Supply the charge injection capacity, CIC, in ...",Supply the transmittance in percentage. If you...,"Add the sheet resistance of the device, in Ohm...",Enter the maximum usable days for the electrod...,"Supply the electrode material. Ex. Gold, PEDOT...","Supply the substrate material. Ex.PMDS, Paryle...","If GABA is sensed in the paper, enter Y. Else,...",
1,"Doe et al., 2024",2024,"Cambridge, MA",,,,,,Gold,,,
2,"Smith et al., 2023",2023,"New York, NY",,,,,,PEDOT:PSS,,,
3,"Johnson et al., 2022",2022,"London, UK",,,,,,ITO,,,
4,"Brown et al., 2021",2021,"Paris, France",,,,,,PMDS,,,
5,"Davis et al., 2020",2020,"Berlin, Germany",,,,,,Parylene C,,,
6,"Miller et al., 2019",2019,"Sydney, Australia",,,,,,SU-8,,,
7,"Wilson et al., 2018",2018,"Tokyo, Japan",,,,,,,,,
8,"Moore et al., 2017",2017,"Beijing, China",,,,,,,,,
9,"Taylor et al., 2016",2016,"Moscow, Russia",,,,,,,,,


In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, or None if it cannot be read.

    The text of each page is followed by a newline. Any exception raised while
    opening or parsing the file is printed and turned into a None result.
    """
    try:
        with open(pdf_path, 'rb') as pdf_handle:
            pages = PyPDF2.PdfReader(pdf_handle).pages
            # Join while the file is still open: pages are read from the handle.
            return "".join(page.extract_text() + "\n" for page in pages)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

def _strip_code_fences(content):
    """Return a model reply without a surrounding Markdown code fence, if it has one."""
    text = content.strip()
    if text.startswith("```"):
        # Drop the opening fence line (it may carry a language tag such as ```json).
        text = text.partition("\n")[2].rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def _post_chat_completion(payload, headers, max_retries=5, base_delay=1, timeout=120):
    """POST `payload` to OPENAI_URL and return the text of the first choice.

    HTTP 429 responses are retried with exponential backoff (base_delay * 2**attempt
    seconds). Any other HTTP error is raised by `raise_for_status`, and a RuntimeError
    is raised when every attempt was rate limited.
    """
    for attempt in range(max_retries):
        # A timeout keeps a stalled connection from blocking the notebook indefinitely.
        response = requests.post(OPENAI_URL, headers=headers, json=payload, timeout=timeout)
        if response.status_code == 429:  # Rate limit error
            delay = base_delay * (2 ** attempt)  # Exponential backoff
            print(f"Rate limited. Retrying in {delay} seconds...")
            time.sleep(delay)
            continue
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    raise RuntimeError(f"Still rate limited after {max_retries} attempts")


def analyze_paper_with_gpt(pdf_text):
    """Use GPT to analyze the paper and extract relevant information.

    The paper is first classified as a review or a research paper using its first
    5,000 characters; review papers are skipped. For research papers the first
    15,000 characters are sent together with a JSON template built from the
    columns of `lit_db` and the matching entries of `ai_instructions`.

    Args:
        pdf_text (str): Plain text extracted from the paper.

    Returns:
        dict | None: The parsed JSON object, keyed by column name (the part before
        the first comma, stripped of whitespace). None is returned when the paper
        is classified as a review, when a request fails, or when the reply is not
        a JSON object. Errors are printed rather than raised.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }

    # First check if it's a review paper
    review_check_prompt = """Determine if this is a review paper or a research paper about a specific device/material.
    If this is a review paper that does not present new experimental results about a specific device, respond with "REVIEW".
    If this is a research paper about a specific device or material with new experimental results, respond with "RESEARCH"."""

    review_check_data = {
        "model": "gpt-4",
        "messages": [
            {"role": "system", "content": review_check_prompt},
            {"role": "user", "content": pdf_text[:5000]}  # Use shorter text for initial check
        ],
        "temperature": 0.3,
        "max_tokens": 10
    }

    try:
        paper_type = _post_chat_completion(review_check_data, headers)
        # Tolerate quoting, trailing punctuation and case differences in the verdict.
        is_review = paper_type.strip().strip("\"'.").upper() == "REVIEW"
    except Exception as e:
        print(f"Error checking paper type: {e}")
        return None

    if is_review:
        print("Skipping review paper")
        return None

    # Build JSON template and instructions from database columns
    json_template = "{\n"
    instructions = "\n"
    seen_keys = set()
    for col in lit_db.columns:
        # Extract the base name and unit from column header ("Name, unit")
        name, has_unit, unit = col.partition(",")
        # update_database looks values up by the stripped base name, so the key
        # requested from the model must be stripped the same way.
        key = name.strip()
        if key in seen_keys:
            # Two headers that differ only by whitespace would ask for the same key twice.
            continue
        seen_keys.add(key)
        if has_unit:
            unit = unit.strip()
            json_template += f'        "{key}": "Value in {unit} if available, otherwise empty string",\n'
            instructions += f"For {key} ({unit}): {ai_instructions[col]}\n"
        else:
            json_template += f'        "{key}": "Value if available, otherwise empty string",\n'
            instructions += f"For {key}: {ai_instructions[col]}\n"
    json_template = json_template.rstrip(",\n") + "\n    }"

    prompt = f"""You are a research assistant analyzing academic papers about neural interfaces and electrodes. 
    Extract the following information from the paper and return it as a JSON object:
    {json_template}
    
    Please follow these specific instructions for each field:
    {instructions}
    
    Only include values that are explicitly mentioned in the paper. If a value is not mentioned, use an empty string.
    Ensure all numerical values include their units."""

    data = {
        "model": "gpt-4",
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": pdf_text[:15000]}  # Limit text length to avoid token limit
        ],
        "temperature": 0.3,
        "max_tokens": 1000  # Limit response length
    }

    try:
        content = _post_chat_completion(data, headers)
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        return None
    except Exception as e:
        print(f"Error calling GPT API: {e}")
        return None

    try:
        # The model may wrap its JSON in a Markdown fence; remove it before parsing.
        paper_data = json.loads(_strip_code_fences(content))
    except (json.JSONDecodeError, AttributeError, TypeError):
        print("Error: Could not parse GPT response as JSON")
        print("Response content:", content)
        return None

    if not isinstance(paper_data, dict):
        # Callers use .get() on the result, so anything but a JSON object is unusable.
        print("Error: GPT response is not a JSON object")
        print("Response content:", content)
        return None

    return paper_data

def clean_database_formatting():
    """Use GPT to standardize formatting across the database.

    Reads DATABASE_PATH, keeps the first row (the per-column instructions) as it is,
    sends the remaining rows to GPT in chunks of 20, and writes the result back to
    DATABASE_PATH with the original column order.

    A chunk is kept unchanged when its request fails, when every retry is rate
    limited, or when the reply is not a JSON array with the same number of rows and
    the same keys as the rows that were sent. This guarantees that a failed cleaning
    pass never removes rows or columns from the saved database.

    Returns:
        None. Errors are printed rather than raised.
    """
    chunk_size = 20  # Process fewer rows at a time to avoid token limits
    max_retries = 5
    base_delay = 1
    request_timeout = 120  # seconds; keeps a stalled request from hanging the notebook

    try:
        print("Starting database cleaning process...")

        # Read the database
        print("Reading database file...")
        df = pd.read_csv(DATABASE_PATH)
        print(f"Successfully loaded database with {len(df)} rows")

        if len(df) < 2:
            # Only the instruction row (or nothing at all) is present.
            print("No data rows to clean")
            return

        # Store the first row (instructions) separately
        first_row = df.iloc[0].copy()
        data_df = df.iloc[1:].copy()

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }

        # Create prompt using the same instructions from ai_instructions
        print("Preparing cleaning instructions...")
        prompt = """You are a data cleaning assistant. Review this database and standardize its formatting according to these rules:

        For each row in the database, ensure:
        1. All values follow the specified format
        2. Units are consistent
        3. Empty values are represented as empty strings
        4. No extra formatting or special characters

        Return the entire cleaned database as a JSON array of objects, where each object represents a row.
        Maintain the exact same column names and structure.

        Here are the specific rules for each column:
        """

        for col in lit_db.columns:
            prompt += f"\n{col}: {ai_instructions[col]}"

        print("Sending database to GPT for cleaning in chunks...")

        # Convert DataFrame to JSON-compatible records and chunk the data
        records = json.loads(data_df.to_json(orient='records', force_ascii=False))
        cleaned_records = []
        unchanged_chunks = 0

        for start in range(0, len(records), chunk_size):
            chunk = records[start:start + chunk_size]
            chunk_number = start // chunk_size + 1

            data = {
                "model": "gpt-4",
                "messages": [
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": json.dumps(chunk)}
                ],
                "temperature": 0.3,
                "max_tokens": 4000  # Increased token limit
            }

            cleaned_chunk = None  # stays None unless a valid reply is received

            # Exponential backoff retry logic (only rate limiting is retried)
            for attempt in range(max_retries):
                try:
                    response = requests.post(
                        OPENAI_URL, headers=headers, json=data, timeout=request_timeout
                    )
                    response.raise_for_status()
                    content = response.json()["choices"][0]["message"]["content"].strip()

                    # The model may wrap its JSON in a Markdown fence; remove it first.
                    if content.startswith("```"):
                        content = content.partition("\n")[2].rstrip()
                        if content.endswith("```"):
                            content = content[:-3]

                    candidate = json.loads(content)
                    matches_input = (
                        isinstance(candidate, list)
                        and len(candidate) == len(chunk)
                        and all(
                            isinstance(new_row, dict) and set(new_row) == set(old_row)
                            for new_row, old_row in zip(candidate, chunk)
                        )
                    )
                    if matches_input:
                        cleaned_chunk = candidate
                        print(f"Successfully cleaned chunk {chunk_number}")
                    else:
                        print(f"Error cleaning chunk {chunk_number}: response does not match the rows that were sent")
                    break

                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 429:  # Rate limit error
                        delay = base_delay * (2 ** attempt)
                        print(f"Rate limited. Retrying in {delay} seconds...")
                        time.sleep(delay)
                        continue
                    else:
                        print(f"HTTP Error cleaning chunk {chunk_number}: {e}")
                        break

                except Exception as e:
                    print(f"Error cleaning chunk {chunk_number}: {e}")
                    break

            if cleaned_chunk is None:
                # Never drop data: fall back to the rows exactly as they were read.
                print(f"Keeping original rows for chunk {chunk_number}")
                cleaned_chunk = chunk
                unchanged_chunks += 1

            cleaned_records.extend(cleaned_chunk)
            time.sleep(1)  # Add delay between chunks

        if unchanged_chunks:
            print(f"Finished with {unchanged_chunks} chunk(s) left unchanged")
        else:
            print("Successfully received all cleaned data")

        # Convert cleaned data back to DataFrame, preserving the original column order
        cleaned_df = pd.DataFrame(cleaned_records, columns=df.columns)

        # Combine with the first row
        final_df = pd.concat([pd.DataFrame([first_row]), cleaned_df], ignore_index=True)

        # Save the cleaned database
        print("Saving cleaned database...")
        final_df.to_csv(DATABASE_PATH, index=False)
        print("Database cleaning complete!")

    except Exception as e:
        print(f"Error during database cleaning: {e}")

def update_database(paper_data):
    """Update the literature database with new paper data.

    Appends one row to the CSV at DATABASE_PATH. When the file does not exist yet it
    is created from `lit_db`'s columns and seeded with the instruction row (row 0 of
    `lit_db`), because the other functions in this notebook treat the first row of
    the file as instructions and skip it.

    Args:
        paper_data (dict): Extracted values keyed by base column name, i.e. the
            column header up to the first comma with surrounding whitespace removed.
            The full column header is accepted as a fallback key. Missing keys are
            stored as empty strings.

    Returns:
        None. Errors are printed rather than raised.
    """
    try:
        # Read existing database
        if os.path.exists(DATABASE_PATH):
            df = pd.read_csv(DATABASE_PATH)
        else:
            # Create new database with correct columns, keeping the instruction row first
            df = lit_db.loc[[0]].reset_index(drop=True)
            # The target folder may not exist yet; saving would fail without it.
            os.makedirs(os.path.dirname(DATABASE_PATH) or ".", exist_ok=True)

        # Add new paper data
        new_row = {}
        for col in df.columns:
            # Extract base column name before comma if present
            base_col = col.split(",")[0].strip()
            new_row[col] = paper_data.get(base_col, paper_data.get(col, ""))

        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

        # Save updated database
        df.to_csv(DATABASE_PATH, index=False)
        print(f"Successfully updated database with paper: {paper_data.get('Reference', 'Unknown')}")

    except Exception as e:
        print(f"Error updating database: {e}")

def is_paper_in_database(pdf_text):
    """Check if paper is already in database by comparing first author and year.

    For every non-empty entry in the database's "Reference" column (the first row is
    skipped because it holds instructions), the first author and a four-digit year
    are extracted from the reference. The paper counts as already present when both
    occur as whole words in the first 30,000 characters of `pdf_text`
    (case-insensitive).

    Args:
        pdf_text (str): Plain text extracted from the PDF.

    Returns:
        bool: True if a matching reference is found; False otherwise, including when
        the database file does not exist or an error occurs (the error is printed).
    """
    try:
        # Search the leading part of the document, where title/authors usually appear
        searchable_text = pdf_text[:30000].lower()

        if not os.path.exists(DATABASE_PATH):
            return False

        # Read existing database and skip first row (instructions)
        df = pd.read_csv(DATABASE_PATH).iloc[1:]

        # Check each reference in database
        for ref in df['Reference'].dropna():
            ref = str(ref).lower()

            # Extract year: the first standalone four-digit number in the reference
            year_match = re.search(r"(?<!\d)\d{4}(?!\d)", ref)
            if year_match is None:
                continue
            year = year_match.group(0)

            # Get first author only: everything before " and ", " & ", "et al" or the
            # first comma. Splitting on the bare substring "and" would cut names such
            # as "Anderson" and leave an empty author that matches every document.
            first_author = re.split(r"\s+(?:and|&)\s+|\bet\s+al\b|,", ref, maxsplit=1)[0].strip()
            if len(first_author) < 2:
                # Empty or single-letter "authors" cannot identify a paper.
                continue

            # Match whole words only so that e.g. "li" does not match inside "lithium"
            author_pattern = r"(?<!\w)" + re.escape(first_author) + r"(?!\w)"
            year_pattern = r"(?<!\d)" + year + r"(?!\d)"
            if re.search(author_pattern, searchable_text) and re.search(year_pattern, searchable_text):
                return True

        return False

    except Exception as e:
        print(f"Error checking database for paper: {e}")
        return False

def process_pdf_folder(folder_paths):
    """Run the extraction pipeline over every PDF found under the given folders.

    Each folder is searched recursively for ``*.pdf`` files; folders that do not
    exist are reported and skipped. For every PDF the text is extracted, papers
    already in the database are skipped, the rest are analyzed with GPT and
    appended to the database. The database formatting pass runs once at the end.

    Args:
        folder_paths (list): List of folder paths containing PDFs to process
    """
    collected = []

    # Gather the PDFs of every folder that exists
    for raw_path in folder_paths:
        directory = Path(raw_path)
        if not directory.exists():
            print(f"Warning: Folder not found: {raw_path}")
            continue
        collected += list(directory.rglob("*.pdf"))

    print(f"Found {len(collected)} PDF files to process")

    for pdf_file in collected:
        file_name = pdf_file.name
        print(f"\nProcessing: {file_name}")

        # Step 1: text extraction (None or empty text means the file is unusable)
        extracted = extract_text_from_pdf(pdf_file)
        if not extracted:
            print(f"Skipping {file_name} due to text extraction error")
            continue

        # Step 2: duplicate check against the database
        if is_paper_in_database(extracted):
            print(f"Already in database: skipping {file_name}")
            continue

        # Step 3: GPT analysis (falsy result means review paper or failure)
        fields = analyze_paper_with_gpt(extracted)
        if not fields:
            print(f"Skipping {file_name}; continuing...")
            continue

        # Step 4: persist the new row, then pause before the next file
        update_database(fields)
        time.sleep(2)

    # Final formatting pass over the whole database
    print("\nCleaning database formatting...")
    clean_database_formatting()

def main():
    """Announce the configured Zotero folders and process every PDF inside them."""
    # Exact folders whose PDFs should be processed
    target_folders = [
        r"D:\OneDrive\Apps\Zotero\Main\CMU\79 Research Articles\79.002 Biosensors",
        r"D:\OneDrive\Apps\Zotero\Main\CMU\79 Research Articles\79.022 Transparent Electrodes",
        r"D:\OneDrive\Apps\Zotero\Main\CMU\79 Research Articles\79.008 Neuromodulation",
    ]

    print("Starting to process PDFs from the following folders:")
    # One "- <folder>" line per entry
    print(*(f"- {path}" for path in target_folders), sep="\n")

    process_pdf_folder(target_folders)
    print("\nProcessing complete!")

# Entry point: start the full pipeline when this code runs as the top-level
# module (executing this notebook cell triggered the run shown in the output below).
if __name__ == "__main__":
    main()

Starting to process PDFs from the following folders:
- D:\OneDrive\Apps\Zotero\Main\CMU\79 Research Articles\79.002 Biosensors
- D:\OneDrive\Apps\Zotero\Main\CMU\79 Research Articles\79.022 Transparent Electrodes
- D:\OneDrive\Apps\Zotero\Main\CMU\79 Research Articles\79.008 Neuromodulation
Found 88 PDF files to process

Processing: A and Cm - 2016 - Nano-Bioelectronics.pdf
Skipping review paper
Skipping A and Cm - 2016 - Nano-Bioelectronics.pdf; continuing...

Processing: Asif et al_2021_Pyrolytic carbon nanograss electrodes for electrochemical detection of dopamine.pdf


KeyboardInterrupt: 

In [5]:
# Run only the formatting pass over the saved database, without processing any PDFs.
clean_database_formatting()

Starting database cleaning process...
Reading database file...
Successfully loaded database with 54 rows
Preparing cleaning instructions...
Sending entire database to GPT for cleaning...
Error cleaning chunk 1: Expecting value: line 1 column 1 (char 0)


KeyboardInterrupt: 