In [None]:
import os
import json
import re
import pandas as pd
from pathlib import Path
from queue import Queue
from threading import Thread, Lock
from collections import defaultdict
from pydantic import BaseModel, Field, field_validator
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import ChatOllama
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain_core.prompts import PromptTemplate

# ------------------ CONFIG ------------------

# Ollama model tag passed to ChatOllama.
MODEL_NAME = "gemma3:4b"
# NOTE(review): CHUNK_SIZE / CHUNK_OVERLAP are not referenced by the code
# visible in this cell — confirm whether chunking is still planned.
CHUNK_SIZE = 6000
CHUNK_OVERLAP = 200
# Number of worker threads started by run_extraction().
NUM_WORKERS = 2
# Folder searched recursively for *.pdf files.
INPUT_PDF_DIR = "PV1-Rhea"
# Destination of the exported table.
OUTPUT_CSV = "pv_extraction_results_structured_new.csv"

# Schema field name -> CSV column header. The insertion order of this dict is
# also the column order of the exported CSV.
COLUMN_MAP = {
    "title": "Title",
    "last_name": "Last Name",
    "year": "Year",
    "doi": "Digital Object Identifier (DOI)",
    "research_focus": "Research Focus",
    "key_findings": "Key Findings",
    "device_type": "Device Type",
    "absorber_material": "Absorber Material",
    "absorber_material_term_used": "Absorber Material Term Used",
    "absorber_dopant_material": "Absorber Dopant Material",
    "absorber_dopant_material_term_used": "Absorber Dopant Material Term Used",
    "absorber_dopant_polarity": "Absorber Dopant Polarity",
    "absorber_dopant_polarity_term_used": "Absorber Dopant Polarity Term Used",
    "front_surface_morphology": "Front Surface Morphology",
    "front_surface_morphology_term_used": "Front Surface Morphology Term Used",
    "rear_surface_morphology": "Rear Surface Morphology",
    "rear_surface_morphology_term_used": "Rear Surface Morphology Term Used",
    "front_surface_passivation_material": "Front Surface Passivation Material",
    "front_surface_passivation_material_term_used": "Front Surface Passivation Material Term Used",
    "rear_surface_passivation_material": "Rear Surface Passivation Material",
    "rear_surface_passivation_material_term_used": "Rear Surface Passivation Material Term Used",
    "negative_metallization_material": "Negative Metallization Material",
    "negative_metallization_material_term_used": "Negative Metallization Material Term Used",
    "positive_metallization_material": "Positive Metallization Material",
    "positive_metallization_material_term_used": "Positive Metallization Material Term Used",
    "efficiency_percent": "Efficiency (%)",
    "cell_area_cm2": "Cell Area (cm2)",
    "short_circuit_current_a": "Short-Circuit Current (A)",
    "short_circuit_current_density_ma_cm2": "Short-Circuit Current Density (mA/cm2)",
    "open_circuit_voltage_v": "Open-Circuit Voltage (V)",
    "fill_factor_percent": "Fill Factor (%)",
}

# Fields that come as a (value, "<value>_term_used") pair.
_TERM_TRACKED_FIELDS = (
    "absorber_material",
    "absorber_dopant_material",
    "absorber_dopant_polarity",
    "front_surface_morphology",
    "rear_surface_morphology",
    "front_surface_passivation_material",
    "rear_surface_passivation_material",
    "negative_metallization_material",
    "positive_metallization_material",
)

# Article section -> fields queried from that section. Sections are visited in
# this order, so an earlier section gets the first attempt at a shared field.
SECTION_FIELD_MAP = {
    "methods": [
        name
        for base in _TERM_TRACKED_FIELDS
        for name in (base, f"{base}_term_used")
    ] + ["device_type"],
    "results": [
        "efficiency_percent",
        "short_circuit_current_a",
        "short_circuit_current_density_ma_cm2",
        "open_circuit_voltage_v",
        "fill_factor_percent",
        "cell_area_cm2",
    ],
    "discussion": ["key_findings"],
    "conclusion": ["key_findings"],
    "abstract": ["research_focus", "title", "year", "last_name", "doi"],
    "introduction": ["research_focus"],
}

# ------------------ Schema ------------------

class PVArticleData(BaseModel):
    """One row of extracted data for a photovoltaic article.

    Every field is a string whose default is "N/A". Values supplied at
    construction time are coerced to strings by ``to_str``.
    """

    # Bibliographic metadata and summary fields.
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    # Device description; each "*_term_used" field accompanies the field of
    # the same name without the suffix.
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    # Reported performance figures, kept as text exactly as extracted.
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    @classmethod
    def to_str(cls, v):
        """Coerce any supplied value to a string before validation.

        ``None`` becomes "N/A". Lists and tuples are joined with "; " so a
        multi-valued answer is stored as readable text instead of a Python
        repr; an empty sequence becomes "N/A". Anything else goes through
        ``str``.
        """
        if v is None:
            return "N/A"
        if isinstance(v, (list, tuple)):
            return "; ".join(str(item) for item in v) or "N/A"
        return str(v)

# ------------------ Utils ------------------

def clean_response_data(response):
    """Turn a model reply into a dict, returning ``{}`` when that is not possible.

    Args:
        response: A dict (returned unchanged), a string, or a chat-message
            object whose text is available as ``.content``.

    Returns:
        The decoded JSON object, or ``{}`` if the reply has no text, is not
        valid JSON, or decodes to something other than a JSON object.
    """
    if isinstance(response, dict):
        return response

    # Chat models return a message object rather than a plain string; its
    # text lives in `.content`. Plain strings fall through unchanged.
    content = getattr(response, "content", response)
    if isinstance(content, list):
        # Content may be a list of strings / {"text": ...} blocks.
        content = "".join(
            part if isinstance(part, str)
            else str(part.get("text", "")) if isinstance(part, dict)
            else ""
            for part in content
        )
    if not isinstance(content, str):
        return {}

    cleaned = content.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()

    # Try the whole reply first, then the outermost {...} span, so that
    # surrounding chatter or a trailing "." does not discard a valid object.
    candidates = [cleaned]
    first, last = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= first < last:
        candidates.append(cleaned[first:last + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}

def split_by_headings(text):
    """Split article text into sections keyed by canonical heading name.

    A heading is a line holding only one of Abstract, Introduction,
    Method(s), Result(s), Discussion or Conclusion(s) (any case), optionally
    preceded by a number such as "2." / "2.1" or a dotted Roman numeral such
    as "II.". A heading on the very first line is recognised too.

    Args:
        text: Full article text.

    Returns:
        defaultdict(str) mapping "abstract", "introduction", "methods",
        "results", "discussion" and "conclusion" to the stripped text that
        follows each heading. Singular/plural variants share one key, and
        text from repeated headings is joined with a blank line. Headings
        with no text after them are omitted.
    """
    pattern = re.compile(
        r"(?:^|\n)\s*(?:(?:\d+(?:\.\d+)*\.?|[IVX]+\.)[ \t]+)?"
        r"(Abstract|Introduction|Methods?|Results?|Discussion|Conclusions?)\s*\n",
        re.IGNORECASE,
    )
    # Fold heading variants onto the keys that SECTION_FIELD_MAP looks up.
    canonical = {"method": "methods", "result": "results", "conclusions": "conclusion"}

    matches = list(pattern.finditer(text))
    sections = defaultdict(str)
    for i, match in enumerate(matches):
        heading = match.group(1).lower()
        section = canonical.get(heading, heading)
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        body = text[start:end].strip()
        if not body:
            continue
        previous = sections.get(section)
        # Append instead of overwrite so a repeated heading keeps earlier text.
        sections[section] = f"{previous}\n\n{body}" if previous else body
    return sections

# ------------------ LLM Setup ------------------

# Chat model client used by process_pdf_worker for every extraction call.
model = ChatOllama(model=MODEL_NAME)
# NOTE(review): raw_parser and parser are built here but are not referenced by
# any other code visible in this cell — confirm whether they are still needed.
raw_parser = PydanticOutputParser(pydantic_object=PVArticleData)
parser = OutputFixingParser.from_llm(llm=model, parser=raw_parser)

# Prompt that asks for exactly one field, answered as a one-key JSON object.
# Placeholders filled by process_pdf_worker: {field_name}, {section_name},
# {field_key}, {text}.
# NOTE(review): the JSON example in the template is followed by a literal ".";
# a model that echoes it would produce text that json.loads rejects — confirm
# whether the period is intended.
field_prompt = PromptTemplate.from_template("""\
You are extracting **only one field** from a scientific article on photovoltaic cells.

FIELD TO EXTRACT: {field_name}
SECTION TYPE: {section_name}

Use only the section below to extract your answer. If the field is not found, return "N/A".

Output only a valid JSON object like this:
{{"{field_key}": "..."}}.

SECTION TEXT:
{text}
""")

# ------------------ PDF Worker ------------------

def _has_value(value):
    """Return True when *value* is a real answer rather than None, blank or "N/A"."""
    return value is not None and str(value).strip().upper() not in {"", "N/A"}


def process_pdf_worker(queue: Queue, results: list, lock: Lock):
    """Consume PDF paths from *queue* and append one extracted row per PDF.

    For each PDF the text is split into sections, and every field listed in
    SECTION_FIELD_MAP is requested from the model using only its section.
    When a field is mapped to several sections, a later section is queried
    only if no earlier section produced a real (non-"N/A") value.

    Args:
        queue: Queue of ``pathlib.Path`` items; a ``None`` item tells the
            worker to stop.
        results: Shared list receiving one ``dict`` per processed PDF.
        lock: Guards appends to *results*.
    """
    while True:
        pdf_file = queue.get()
        if pdf_file is None:
            # Account for the sentinel too, so a later queue.join() cannot hang.
            queue.task_done()
            break

        try:
            print(f"📄 Processing: {pdf_file.name}")
            loader = PyPDFLoader(str(pdf_file))
            docs = loader.load()
            full_text = "\n\n".join(doc.page_content for doc in docs)
            sections = split_by_headings(full_text)

            field_data = {}

            for section_name, fields in SECTION_FIELD_MAP.items():
                section_text = sections.get(section_name.lower())
                if not section_text:
                    continue

                for field in fields:
                    # An earlier "N/A" must not block the fallback section.
                    if _has_value(field_data.get(field)):
                        continue

                    prompt = field_prompt.format(
                        text=section_text,
                        field_name=COLUMN_MAP.get(field, field),
                        field_key=field,
                        section_name=section_name
                    )
                    try:
                        response = model.invoke(prompt)
                        # Chat models return a message object; hand its text
                        # (not the object) to the JSON cleaner.
                        parsed = clean_response_data(getattr(response, "content", response))
                    except Exception as e:
                        print(f"⚠️ Error extracting {field}: {e}")
                        continue

                    if not (isinstance(parsed, dict) and field in parsed):
                        continue
                    # Keep a first "N/A" as a placeholder, but let a real value
                    # from a later section replace it.
                    if field not in field_data or _has_value(parsed[field]):
                        field_data[field] = parsed[field]

            if field_data:
                article = PVArticleData(**field_data)
                with lock:
                    results.append(article.model_dump())
                print(f"✅ Done: {pdf_file.name}")
            else:
                # Make the skipped file visible instead of dropping it silently.
                print(f"⚠️ No fields extracted from {pdf_file.name}")

        except Exception as e:
            print(f"❌ Failed to process {pdf_file.name}: {e}")
        finally:
            queue.task_done()

# ------------------ CSV Export ------------------

def save_results_to_csv(results, output_path):
    """Write extracted rows to a CSV whose columns follow COLUMN_MAP.

    Args:
        results: List of dicts keyed by schema field name.
        output_path: Destination path of the CSV file.

    Columns are renamed to their COLUMN_MAP headers and emitted in
    COLUMN_MAP order. A header missing from the rows becomes an empty
    column instead of raising KeyError, so empty or partial results still
    produce a well-formed file.
    """
    df = pd.DataFrame(results).rename(columns=COLUMN_MAP)
    # reindex (unlike df[...]) tolerates absent columns and fixes the order.
    df = df.reindex(columns=list(COLUMN_MAP.values()))
    df.to_csv(output_path, index=False)
    print(f"📁 Saved to {output_path}")

# ------------------ Main ------------------

def run_extraction():
    """Extract data from every PDF under INPUT_PDF_DIR and export it to OUTPUT_CSV.

    Starts NUM_WORKERS threads running process_pdf_worker, feeds them the
    PDF paths, waits for completion, then writes the collected rows. If no
    PDF is found, it reports that and returns without starting any worker.
    """
    pdf_folder = Path(INPUT_PDF_DIR)
    # Sort so files are enqueued in a reproducible order.
    pdf_files = sorted(pdf_folder.rglob("*.pdf"))
    if not pdf_files:
        # Distinguish "nothing to process" from "processing produced nothing".
        print(f"⚠️ No PDF files found under {pdf_folder}")
        return

    task_queue = Queue()
    results = []
    lock = Lock()

    threads = [
        Thread(target=process_pdf_worker, args=(task_queue, results, lock))
        for _ in range(NUM_WORKERS)
    ]
    for t in threads:
        t.start()

    for file in pdf_files:
        task_queue.put(file)

    # Wait until every PDF has been marked done before stopping the workers.
    task_queue.join()

    # One None sentinel per worker ends its loop.
    for _ in threads:
        task_queue.put(None)
    for t in threads:
        t.join()

    if results:
        save_results_to_csv(results, OUTPUT_CSV)
        print(f"\n✅ Extracted {len(results)} / {len(pdf_files)} PDFs.")
    else:
        print("⚠️ No data extracted.")

if __name__ == "__main__":
    run_extraction()


📄 Processing: Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf
📄 Processing: Feldmann - 2017 - Evaluation of TOPCon Technology on Large Area Solar Cells.pdf
📄 Processing: Kafle et al. - 2021 - TOPCon – Technology options for cost efficient industrial manufacturing.pdf
⚠️ Error extracting research_focus: POST predict: Post "http://127.0.0.1:51897/completion": read tcp 127.0.0.1:51939->127.0.0.1:51897: wsarecv: An existing connection was forcibly closed by the remote host. (status code: -1)
⚠️ Error extracting key_findings: POST predict: Post "http://127.0.0.1:51897/completion": read tcp 127.0.0.1:51900->127.0.0.1:51897: wsarecv: An existing connection was forcibly closed by the remote host. (status code: -1)
📄 Processing: Ma et al. - 2024 - Comparative analysis of radiation-induced effects on the performance of p-type PERC and TOPCon solar.pdf



In [None]:
import os
import json
import re
import pandas as pd
from pathlib import Path
from queue import Queue
from threading import Thread, Lock
from collections import defaultdict
from pydantic import BaseModel, Field, field_validator
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import ChatOllama
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain_core.prompts import PromptTemplate

# ------------------ CONFIG ------------------

# Ollama model tag passed to ChatOllama.
MODEL_NAME = "gemma3:4b"
# NOTE(review): CHUNK_SIZE / CHUNK_OVERLAP are not referenced by the code
# visible in this cell — confirm whether chunking is still planned.
CHUNK_SIZE = 6000
CHUNK_OVERLAP = 200
# Number of worker threads started by run_extraction().
NUM_WORKERS = 2
# Folder searched recursively for *.pdf files.
INPUT_PDF_DIR = "PV1-Rhea"
# Destination of the exported table.
OUTPUT_CSV = "pv_extraction_results_structured_new.csv"

# Schema field name -> CSV column header. The insertion order of this dict is
# also the column order of the exported CSV.
COLUMN_MAP = {
    "title": "Title",
    "last_name": "Last Name",
    "year": "Year",
    "doi": "Digital Object Identifier (DOI)",
    "research_focus": "Research Focus",
    "key_findings": "Key Findings",
    "device_type": "Device Type",
    "absorber_material": "Absorber Material",
    "absorber_material_term_used": "Absorber Material Term Used",
    "absorber_dopant_material": "Absorber Dopant Material",
    "absorber_dopant_material_term_used": "Absorber Dopant Material Term Used",
    "absorber_dopant_polarity": "Absorber Dopant Polarity",
    "absorber_dopant_polarity_term_used": "Absorber Dopant Polarity Term Used",
    "front_surface_morphology": "Front Surface Morphology",
    "front_surface_morphology_term_used": "Front Surface Morphology Term Used",
    "rear_surface_morphology": "Rear Surface Morphology",
    "rear_surface_morphology_term_used": "Rear Surface Morphology Term Used",
    "front_surface_passivation_material": "Front Surface Passivation Material",
    "front_surface_passivation_material_term_used": "Front Surface Passivation Material Term Used",
    "rear_surface_passivation_material": "Rear Surface Passivation Material",
    "rear_surface_passivation_material_term_used": "Rear Surface Passivation Material Term Used",
    "negative_metallization_material": "Negative Metallization Material",
    "negative_metallization_material_term_used": "Negative Metallization Material Term Used",
    "positive_metallization_material": "Positive Metallization Material",
    "positive_metallization_material_term_used": "Positive Metallization Material Term Used",
    "efficiency_percent": "Efficiency (%)",
    "cell_area_cm2": "Cell Area (cm2)",
    "short_circuit_current_a": "Short-Circuit Current (A)",
    "short_circuit_current_density_ma_cm2": "Short-Circuit Current Density (mA/cm2)",
    "open_circuit_voltage_v": "Open-Circuit Voltage (V)",
    "fill_factor_percent": "Fill Factor (%)",
}

# Fields that come as a (value, "<value>_term_used") pair.
_TERM_TRACKED_FIELDS = (
    "absorber_material",
    "absorber_dopant_material",
    "absorber_dopant_polarity",
    "front_surface_morphology",
    "rear_surface_morphology",
    "front_surface_passivation_material",
    "rear_surface_passivation_material",
    "negative_metallization_material",
    "positive_metallization_material",
)

# Article section -> fields queried from that section. Sections are visited in
# this order, so an earlier section gets the first attempt at a shared field.
SECTION_FIELD_MAP = {
    "methods": [
        name
        for base in _TERM_TRACKED_FIELDS
        for name in (base, f"{base}_term_used")
    ] + ["device_type"],
    "results": [
        "efficiency_percent",
        "short_circuit_current_a",
        "short_circuit_current_density_ma_cm2",
        "open_circuit_voltage_v",
        "fill_factor_percent",
        "cell_area_cm2",
    ],
    "discussion": ["key_findings"],
    "conclusion": ["key_findings"],
    "abstract": ["research_focus", "title", "year", "last_name", "doi"],
    "introduction": ["research_focus"],
}

# ------------------ Schema ------------------

class PVArticleData(BaseModel):
    """One row of extracted data for a photovoltaic article.

    Every field is a string whose default is "N/A". Values supplied at
    construction time are coerced to strings by ``to_str``.
    """

    # Bibliographic metadata and summary fields.
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    # Device description; each "*_term_used" field accompanies the field of
    # the same name without the suffix.
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    # Reported performance figures, kept as text exactly as extracted.
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    @classmethod
    def to_str(cls, v):
        """Coerce any supplied value to a string before validation.

        ``None`` becomes "N/A". Lists and tuples are joined with "; " so a
        multi-valued answer is stored as readable text instead of a Python
        repr; an empty sequence becomes "N/A". Anything else goes through
        ``str``.
        """
        if v is None:
            return "N/A"
        if isinstance(v, (list, tuple)):
            return "; ".join(str(item) for item in v) or "N/A"
        return str(v)

# ------------------ Utils ------------------

def clean_response_data(response):
    """Turn a model reply into a dict, returning ``{}`` when that is not possible.

    Args:
        response: A dict (returned unchanged), a string, or a chat-message
            object whose text is available as ``.content``.

    Returns:
        The decoded JSON object, or ``{}`` if the reply has no text, is not
        valid JSON, or decodes to something other than a JSON object.
    """
    if isinstance(response, dict):
        return response

    # Chat models return a message object rather than a plain string; its
    # text lives in `.content`. Plain strings fall through unchanged.
    content = getattr(response, "content", response)
    if isinstance(content, list):
        # Content may be a list of strings / {"text": ...} blocks.
        content = "".join(
            part if isinstance(part, str)
            else str(part.get("text", "")) if isinstance(part, dict)
            else ""
            for part in content
        )
    if not isinstance(content, str):
        return {}

    cleaned = content.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()

    # Try the whole reply first, then the outermost {...} span, so that
    # surrounding chatter or a trailing "." does not discard a valid object.
    candidates = [cleaned]
    first, last = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= first < last:
        candidates.append(cleaned[first:last + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}

def split_by_headings(text):
    """Split article text into sections keyed by canonical heading name.

    A heading is a line holding only one of Abstract, Introduction,
    Method(s), Result(s), Discussion or Conclusion(s) (any case), optionally
    preceded by a number such as "2." / "2.1" or a dotted Roman numeral such
    as "II.". A heading on the very first line is recognised too.

    Args:
        text: Full article text.

    Returns:
        defaultdict(str) mapping "abstract", "introduction", "methods",
        "results", "discussion" and "conclusion" to the stripped text that
        follows each heading. Singular/plural variants share one key, and
        text from repeated headings is joined with a blank line. Headings
        with no text after them are omitted.
    """
    pattern = re.compile(
        r"(?:^|\n)\s*(?:(?:\d+(?:\.\d+)*\.?|[IVX]+\.)[ \t]+)?"
        r"(Abstract|Introduction|Methods?|Results?|Discussion|Conclusions?)\s*\n",
        re.IGNORECASE,
    )
    # Fold heading variants onto the keys that SECTION_FIELD_MAP looks up.
    canonical = {"method": "methods", "result": "results", "conclusions": "conclusion"}

    matches = list(pattern.finditer(text))
    sections = defaultdict(str)
    for i, match in enumerate(matches):
        heading = match.group(1).lower()
        section = canonical.get(heading, heading)
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        body = text[start:end].strip()
        if not body:
            continue
        previous = sections.get(section)
        # Append instead of overwrite so a repeated heading keeps earlier text.
        sections[section] = f"{previous}\n\n{body}" if previous else body
    return sections

# ------------------ LLM Setup ------------------

# Chat model client used by process_pdf_worker for every extraction call;
# base_url is set explicitly in this cell.
model = ChatOllama(model=MODEL_NAME, base_url="http://localhost:11434")

# NOTE(review): raw_parser and parser are built here but are not referenced by
# any other code visible in this cell — confirm whether they are still needed.
raw_parser = PydanticOutputParser(pydantic_object=PVArticleData)
parser = OutputFixingParser.from_llm(llm=model, parser=raw_parser)

# Prompt that asks for exactly one field, answered as a one-key JSON object.
# Placeholders filled by process_pdf_worker: {field_name}, {section_name},
# {field_key}, {text}.
# NOTE(review): the JSON example in the template is followed by a literal ".";
# a model that echoes it would produce text that json.loads rejects — confirm
# whether the period is intended.
field_prompt = PromptTemplate.from_template("""\
You are extracting **only one field** from a scientific article on photovoltaic cells.

FIELD TO EXTRACT: {field_name}
SECTION TYPE: {section_name}

Use only the section below to extract your answer. If the field is not found, return "N/A".

Output only a valid JSON object like this:
{{"{field_key}": "..."}}.

SECTION TEXT:
{text}
""")

# ------------------ PDF Worker ------------------

def _has_value(value):
    """Return True when *value* is a real answer rather than None, blank or "N/A"."""
    return value is not None and str(value).strip().upper() not in {"", "N/A"}


def process_pdf_worker(queue: Queue, results: list, lock: Lock):
    """Consume PDF paths from *queue* and append one extracted row per PDF.

    For each PDF the text is split into sections, and every field listed in
    SECTION_FIELD_MAP is requested from the model using only its section.
    When a field is mapped to several sections, a later section is queried
    only if no earlier section produced a real (non-"N/A") value.

    Args:
        queue: Queue of ``pathlib.Path`` items; a ``None`` item tells the
            worker to stop.
        results: Shared list receiving one ``dict`` per processed PDF.
        lock: Guards appends to *results*.
    """
    while True:
        pdf_file = queue.get()
        if pdf_file is None:
            # Account for the sentinel too, so a later queue.join() cannot hang.
            queue.task_done()
            break

        try:
            print(f"📄 Processing: {pdf_file.name}")
            loader = PyPDFLoader(str(pdf_file))
            docs = loader.load()
            full_text = "\n\n".join(doc.page_content for doc in docs)
            sections = split_by_headings(full_text)

            field_data = {}

            for section_name, fields in SECTION_FIELD_MAP.items():
                section_text = sections.get(section_name.lower())
                if not section_text:
                    continue

                for field in fields:
                    # An earlier "N/A" must not block the fallback section.
                    if _has_value(field_data.get(field)):
                        continue

                    prompt = field_prompt.format(
                        text=section_text,
                        field_name=COLUMN_MAP.get(field, field),
                        field_key=field,
                        section_name=section_name
                    )
                    try:
                        response = model.invoke(prompt)
                        # Chat models return a message object; hand its text
                        # (not the object) to the JSON cleaner.
                        parsed = clean_response_data(getattr(response, "content", response))
                    except Exception as e:
                        print(f"⚠️ Error extracting {field}: {e}")
                        continue

                    if not (isinstance(parsed, dict) and field in parsed):
                        continue
                    # Keep a first "N/A" as a placeholder, but let a real value
                    # from a later section replace it.
                    if field not in field_data or _has_value(parsed[field]):
                        field_data[field] = parsed[field]

            if field_data:
                article = PVArticleData(**field_data)
                with lock:
                    results.append(article.model_dump())
                print(f"✅ Done: {pdf_file.name}")
            else:
                # Make the skipped file visible instead of dropping it silently.
                print(f"⚠️ No fields extracted from {pdf_file.name}")

        except Exception as e:
            print(f"❌ Failed to process {pdf_file.name}: {e}")
        finally:
            queue.task_done()

# ------------------ CSV Export ------------------

def save_results_to_csv(results, output_path):
    """Write extracted rows to a CSV whose columns follow COLUMN_MAP.

    Args:
        results: List of dicts keyed by schema field name.
        output_path: Destination path of the CSV file.

    Columns are renamed to their COLUMN_MAP headers and emitted in
    COLUMN_MAP order. A header missing from the rows becomes an empty
    column instead of raising KeyError, so empty or partial results still
    produce a well-formed file.
    """
    df = pd.DataFrame(results).rename(columns=COLUMN_MAP)
    # reindex (unlike df[...]) tolerates absent columns and fixes the order.
    df = df.reindex(columns=list(COLUMN_MAP.values()))
    df.to_csv(output_path, index=False)
    print(f"📁 Saved to {output_path}")

# ------------------ Main ------------------

def run_extraction():
    """Extract data from every PDF under INPUT_PDF_DIR and export it to OUTPUT_CSV.

    Starts NUM_WORKERS threads running process_pdf_worker, feeds them the
    PDF paths, waits for completion, then writes the collected rows. If no
    PDF is found, it reports that and returns without starting any worker.
    """
    pdf_folder = Path(INPUT_PDF_DIR)
    # Sort so files are enqueued in a reproducible order.
    pdf_files = sorted(pdf_folder.rglob("*.pdf"))
    if not pdf_files:
        # Distinguish "nothing to process" from "processing produced nothing".
        print(f"⚠️ No PDF files found under {pdf_folder}")
        return

    task_queue = Queue()
    results = []
    lock = Lock()

    threads = [
        Thread(target=process_pdf_worker, args=(task_queue, results, lock))
        for _ in range(NUM_WORKERS)
    ]
    for t in threads:
        t.start()

    for file in pdf_files:
        task_queue.put(file)

    # Wait until every PDF has been marked done before stopping the workers.
    task_queue.join()

    # One None sentinel per worker ends its loop.
    for _ in threads:
        task_queue.put(None)
    for t in threads:
        t.join()

    if results:
        save_results_to_csv(results, OUTPUT_CSV)
        print(f"\n✅ Extracted {len(results)} / {len(pdf_files)} PDFs.")
    else:
        print("⚠️ No data extracted.")

if __name__ == "__main__":
    run_extraction()


📄 Processing: Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf
📄 Processing: Feldmann - 2017 - Evaluation of TOPCon Technology on Large Area Solar Cells.pdf
📄 Processing: Kafle et al. - 2021 - TOPCon – Technology options for cost efficient industrial manufacturing.pdf
⚠️ Error extracting key_findings: POST predict: Post "http://127.0.0.1:52709/completion": read tcp 127.0.0.1:53805->127.0.0.1:52709: wsarecv: An existing connection was forcibly closed by the remote host. (status code: -1)
📄 Processing: Ma et al. - 2024 - Comparative analysis of radiation-induced effects on the performance of p-type PERC and TOPCon solar.pdf


In [None]:
# Upgrade the langchain_ollama package from within the notebook.
# NOTE(review): `!pip` runs whichever pip the shell resolves; consider `%pip`
# so the install targets this kernel's environment — confirm.
!pip install --upgrade langchain_ollama


In [1]:
from langchain_ollama import ChatOllama

# Minimal connectivity check: build the same client as the extraction cell,
# with an explicit base_url, and report which base_url it holds.
MODEL_NAME = "gemma3:4b"
model = ChatOllama(model=MODEL_NAME, base_url="http://localhost:11434")
print(f"Using Ollama base_url: {model.base_url}")

# Send a trivial prompt; print the reply, or the error message if the call
# raises.
# NOTE(review): the recorded output fails on a different local port than
# base_url, which presumably points at the Ollama server side rather than this
# client configuration — confirm against the Ollama server logs.
try:
    response = model.invoke("Say hello")
    print("Response:", response)
except Exception as e:
    print("Error:", e)


Using Ollama base_url: http://localhost:11434
Error: POST predict: Post "http://127.0.0.1:53856/completion": read tcp 127.0.0.1:54165->127.0.0.1:53856: wsarecv: An existing connection was forcibly closed by the remote host. (status code: -1)
