In [3]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel. The version is pinned to the one this
# cell's recorded output shows being installed, to keep re-runs reproducible.
%pip install PyMuPDF==1.26.3

Collecting PyMuPDF
  Obtaining dependency information for PyMuPDF from https://files.pythonhosted.org/packages/4a/26/8c72973b8833a72785cedc3981eb59b8ac7075942718bbb7b69b352cdde4/pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.7 MB 960.0 kB/s eta 0:00:20
   ---------------------------------------- 0.2/18.7 MB 1.5 MB/s eta 0:00:13
    --------------------------------------- 0.4/18.7 MB 2.7 MB/s eta 0:00:07
   - -------------------------------------- 0.9/18.7 MB 4.7 MB/s eta 0:00:04
   --- ------------------------------------ 1.7/18.7 MB 7.6 MB/s eta 0:00:03
   ------ --------------------------------- 3.2/18.7 MB 11.3 MB/s eta 0:00:02
   ---------- ----------------------------- 5.1/18.7 MB 15.5 MB/s eta 0:00:01
   --------------- ----------

In [None]:
# `-y` answers pip's "Proceed (Y/n)?" prompt up front; a notebook cell cannot
# supply that input, so without it the cell blocks indefinitely.
# NOTE(review): presumably this removes a separate PyPI package named `fitz`
# that clashes with PyMuPDF's import name -- confirm this is the intent.
%pip uninstall -y fitz

In [None]:
import pdfplumber
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from pathlib import Path
from tqdm import tqdm
import json

# --- Setup ---
# PDFs are searched for (recursively) under PDF_FOLDER; one JSON result file
# per PDF is written into OUTPUT_FOLDER, which is created here if missing.
PDF_FOLDER = Path("files")  # Your folder of PDFs
OUTPUT_FOLDER = Path("scibert_output")
OUTPUT_FOLDER.mkdir(exist_ok=True)

# Load SciBERT QA pipeline
# NOTE(review): the recorded output of this cell warns that
# `qa_outputs.weight` / `qa_outputs.bias` are "newly initialized" for this
# checkpoint and that the model should be trained on a down-stream task.
# That means the question-answering head is untrained, so the answers and
# scores produced below are not meaningful until the model is fine-tuned or
# a QA-fine-tuned checkpoint is substituted for `model_name`.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# --- Your schema mapped to questions ---
# Each key is an output field name; its value is the natural-language question
# that process_pdf sends to the QA pipeline to fill that field. The keys are
# reused verbatim as keys of the "best_pages" and "answers" result dicts.
FIELD_QUESTIONS = {
    "efficiency_percent": "What is the power conversion efficiency?",
    "cell_area_cm2": "What is the cell area in square centimeters?",
    "short_circuit_current_a": "What is the short circuit current in amperes?",
    "short_circuit_current_density_ma_cm2": "What is the short circuit current density?",
    "open_circuit_voltage_v": "What is the open circuit voltage?",
    "fill_factor_percent": "What is the fill factor in percent?",
    "device_type": "What is the device type?",
    "absorber_material": "What absorber material is used?",
    "front_surface_morphology": "What is the front surface morphology?",
    "rear_surface_morphology": "What is the rear surface morphology?",
    "front_surface_passivation_material": "What is the front surface passivation material?",
    "rear_surface_passivation_material": "What is the rear surface passivation material?",
    "negative_metallization_material": "What is the negative metallization material?",
    "positive_metallization_material": "What is the positive metallization material?",
    "research_focus": "What is the main research focus of the paper?",
    "key_findings": "What are the key findings of the paper?"
}

# --- Process one PDF ---
def process_pdf(pdf_path):
    """Ask every question in FIELD_QUESTIONS against each page of one PDF.

    For each field, the answer with the highest pipeline score across all
    usable pages is kept, together with the page it came from.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF to read (its ``.name`` is
            copied into the result).

    Returns:
        dict with three keys:
            "pdf_name":   file name of the PDF,
            "best_pages": field -> 1-based page number of the best answer,
                          or None when no page produced a usable answer,
            "answers":    field -> best answer text, or "N/A" when no page
                          produced a usable answer.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Extract the text of each page exactly once. The page text does not
    # depend on the question, so re-extracting it for every field would
    # repeat the same PDF parsing work len(FIELD_QUESTIONS) times.
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):  # 1-based
            text = page.extract_text()
            # Skip blank / near-empty pages (fewer than 50 characters).
            if text and len(text) >= 50:
                page_texts.append((page_number, text))

    field_best_pages = {}
    field_best_answers = {}

    for field, question in FIELD_QUESTIONS.items():
        best_score = 0
        best_page = None
        best_answer = "N/A"

        for page_number, text in page_texts:
            try:
                # Keyword form is the documented call signature of the
                # question-answering pipeline.
                result = qa_pipeline(question=question, context=text)

                answer = result["answer"].strip()
                # Strictly-greater keeps the earliest page on score ties.
                if result["score"] > best_score and answer and answer.lower() != "n/a":
                    best_score = result["score"]
                    best_page = page_number
                    best_answer = result["answer"]
            except Exception as exc:
                # Best-effort: a failure on one page must not abort the
                # whole PDF, but it is reported instead of hidden.
                logger.warning(
                    "QA failed for field %r on page %d of %s: %s",
                    field, page_number, pdf_path, exc,
                )
                continue

        field_best_pages[field] = best_page
        field_best_answers[field] = best_answer

    return {
        "pdf_name": pdf_path.name,
        "best_pages": field_best_pages,
        "answers": field_best_answers
    }

# --- Main loop ---
def process_all_pdfs(skip_existing=False):
    """Run process_pdf on every PDF under PDF_FOLDER and save one JSON each.

    PDFs are discovered recursively and processed in sorted path order so
    repeated runs visit files in the same sequence. Each result is written
    to ``OUTPUT_FOLDER / "<pdf stem>_scibert_results.json"``.

    Args:
        skip_existing: when True, a PDF whose result file already exists is
            skipped, so an interrupted batch can be resumed without redoing
            finished files. Defaults to False (every PDF is processed and
            any existing result file is overwritten).
    """
    for pdf_file in sorted(PDF_FOLDER.rglob("*.pdf")):
        output_path = OUTPUT_FOLDER / (pdf_file.stem + "_scibert_results.json")

        if skip_existing and output_path.exists():
            print(f"Skipping {pdf_file.name} (result already exists)")
            continue

        print(f"Processing {pdf_file.name}")
        result = process_pdf(pdf_file)

        # Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII characters in
        # extracted answers readable in the file, independent of the
        # platform's default text encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"Saved: {output_path.name}\n")

# --- Run ---
# Start the batch only when this code is executed directly; importing it as a
# module defines the functions above without kicking off the processing.
if __name__ == "__main__":
    process_all_pdfs()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Found 94 PDFs to process.


Processing PDFs:   1%|          | 1/94 [18:03<27:59:11, 1083.35s/it]

Saved extracted data for Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell copy.pdf to extracted_results\Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell copy_extracted.json


Processing PDFs:   2%|▏         | 2/94 [27:39<20:04:00, 785.22s/it] 

Saved extracted data for Bullock et al. - 2014 - Molybdenum oxide MoOx A versatile hole contact for silicon solar cells  Applied Physics Letters  copy.pdf to extracted_results\Bullock et al. - 2014 - Molybdenum oxide MoOx A versatile hole contact for silicon solar cells  Applied Physics Letters  copy_extracted.json
