In [2]:
!pip install langchain langchain-core langchain-community langchain-ollama pydantic pandas


Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Obtaining dependency information for langsmith<0.4,>=0.1.17 from https://files.pythonhosted.org/packages/6a/f4/c206c0888f8a506404cb4f16ad89593bdc2f70cf00de26a1a0a7a76ad7a3/langsmith-0.3.45-py3-none-any.whl.metadata
  Downloading langsmith-0.3.45-py3-none-any.whl.metadata (15 kB)
Downloading langsmith-0.3.45-py3-none-any.whl (363 kB)
   ---------------------------------------- 0.0/363.0 kB ? eta -:--:--
   ---------------------------- ----------- 256.0/363.0 kB 5.2 MB/s eta 0:00:01
   ---------------------------------------- 363.0/363.0 kB 2.1 MB/s eta 0:00:00
Installing collected packages: langsmith
  Attempting uninstall: langsmith
    Found existing installation: langsmith 0.4.1
    Uninstalling langsmith-0.4.1:
      Successfully uninstalled langsmith-0.4.1
Successfully installed langsmith-0.3.45


In [None]:
import os
import json
import pandas as pd
from pathlib import Path
from pydantic import BaseModel, Field, field_validator
from typing import Union
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import ChatOllama
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from concurrent.futures import ThreadPoolExecutor, as_completed

class PVArticleData(BaseModel):
    """Flat record of the values extracted from one photovoltaic article.

    Every field is a string and defaults to "N/A", so a record can always be
    built even when the model returns only some (or none) of the keys.
    Fields ending in ``_term_used`` sit next to the field they qualify.
    """

    # --- Bibliographic information ---
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")

    # --- Free-text summary ---
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")

    # --- Device description ---
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")

    # --- Reported performance figures (kept as strings, exactly as extracted) ---
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    @classmethod
    def convert_to_string(cls, v):
        """Coerce any supplied value to ``str`` before field validation.

        Runs for every field ("*") in "before" mode, i.e. on the raw input.
        ``None`` becomes "N/A"; anything else (numbers, lists, ...) is passed
        through ``str()`` so the ``str``-typed fields never reject it.
        """
        return "N/A" if v is None else str(v)

def clean_response_data(response_data):
    """Normalize a chain/parser response into a plain dict of field values.

    Accepted inputs:
      * ``list``  - only the first element is considered (empty list -> ``{}``).
      * ``dict``  - returned unchanged.
      * ``str``   - optional Markdown code fences are stripped and the text is
        parsed as JSON; if that fails, a relaxed second attempt removes
        newlines/tabs and swaps single quotes for double quotes.
      * ``PVArticleData`` - converted with ``model_dump()``.

    Returns:
        dict: the extracted record, or ``{}`` when nothing usable was found.
        A string that decodes to something other than a JSON object (or a list
        whose first element is one) also yields ``{}``, so callers can rely on
        always receiving a dict.
    """

    def _as_record(parsed):
        # Unwrap a top-level JSON array; an empty array means "no data" rather
        # than an IndexError escaping to the caller.
        if isinstance(parsed, list):
            parsed = parsed[0] if parsed else {}
        # Scalars / null are not records - report them as "nothing extracted".
        return parsed if isinstance(parsed, dict) else {}

    if isinstance(response_data, list):
        response_data = response_data[0] if response_data else {}

    if isinstance(response_data, dict):
        return response_data

    if isinstance(response_data, str):
        print("🧪 Cleaning string response...")
        cleaned = (
            response_data.strip()
            .removeprefix("```json").removeprefix("```")
            .removesuffix("```").strip()
        )
        try:
            return _as_record(json.loads(cleaned))
        except json.JSONDecodeError:
            pass

        # Relaxed retry for almost-JSON output (e.g. a Python-style dict repr).
        # NOTE: the quote swap also rewrites apostrophes inside values, so this
        # is strictly a best-effort fallback.
        relaxed = cleaned.replace('\n', '').replace('\t', '').replace("'", '"')
        try:
            return _as_record(json.loads(relaxed))
        except Exception as e:
            print(f"❌ JSON decode error after cleaning: {e}")
            return {}

    # Checked after the plain-data types; the isinstance tests are mutually
    # exclusive, so the order does not affect the result.
    if isinstance(response_data, PVArticleData):
        return response_data.model_dump()

    print("⚠️ Unexpected response format:", type(response_data))
    return {}

# --- LLM backend -----------------------------------------------------------
# Chat model served through Ollama, selected by model tag.
model = ChatOllama(model="gemma3:4b")

# --- Output parsing --------------------------------------------------------
# raw_parser validates output against the PVArticleData schema; parser wraps it
# in an OutputFixingParser that is handed the same model.
raw_parser = PydanticOutputParser(pydantic_object=PVArticleData)
parser = OutputFixingParser.from_llm(parser=raw_parser, llm=model)

# --- Extraction prompt -----------------------------------------------------
# Two template variables: {format_instructions} (schema description) and
# {text} (the article chunk to analyse).
prompt = PromptTemplate.from_template(
    """\
You are extracting structured data from academic articles on photovoltaic cells.
Focus ONLY on the most efficient cell mentioned in each article.

Format your response as a single **valid JSON object** only — no markdown, no commentary, no extra text.

If a value is unavailable, use "N/A".

Your output must match this schema:
{format_instructions}

ARTICLE:
{text}
"""
)

# Pipeline: fill the prompt -> call the model -> parse into PVArticleData.
chain = prompt | model | parser

# --- Input discovery -------------------------------------------------------
# Every *.pdf anywhere below the input folder (recursive search).
pdf_folder = Path("PV1-Rhea")
pdf_files = [pdf_path for pdf_path in pdf_folder.rglob("*.pdf")]

def process_pdf(pdf_file):
    """Extract one article record from a single PDF.

    The PDF text is loaded, joined across pages, split into overlapping chunks
    and sent chunk by chunk through ``chain``. The first chunk that yields a
    record with at least one real value (something other than blank / "N/A")
    is returned; remaining chunks are not processed.

    Args:
        pdf_file: path of the PDF to process (converted with ``str()`` for the loader).

    Returns:
        dict | None: ``PVArticleData.model_dump()`` of the extracted record, or
        ``None`` when loading failed or no chunk produced any real value.
        Exceptions are reported via ``print`` and never propagate.
    """
    print(f"\n📄 Processing {pdf_file}...")
    try:
        loader = PyPDFLoader(str(pdf_file))
        docs = loader.load()
        full_text = "\n\n".join(doc.page_content for doc in docs)

        splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)
        chunks = splitter.split_text(full_text)

        # Loop-invariant: the schema description is the same for every chunk.
        format_instructions = parser.get_format_instructions()

        for chunk in chunks:
            try:
                response = chain.invoke({
                    "text": chunk,
                    "format_instructions": format_instructions
                })
                cleaned_data = clean_response_data(response)
                if not cleaned_data:
                    continue

                record = PVArticleData(**cleaned_data).model_dump()

                # A parsed model always dumps every field, so a plain truthiness
                # check cannot detect an empty extraction. If every value is
                # still the placeholder, this chunk told us nothing - try the
                # next one instead of reporting success.
                if all(str(value).strip() in ("", "N/A") for value in record.values()):
                    continue

                print(f"✅ Successfully extracted from {pdf_file}")
                return record
            except Exception as e:
                # Best-effort: a failing chunk must not abort the whole PDF.
                print(f"⛔ Error in chunk: {e}")

        print(f"⚠️ No valid data extracted from any chunk of {pdf_file}")
        return None

    except Exception as e:
        print(f"❌ Failed to process {pdf_file}: {e}")
        return None

# Run the extraction for all PDFs on a small thread pool and keep only the
# files that produced a record (process_pdf returns None on failure).
results = []
max_workers = 4

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(process_pdf, pdf_file) for pdf_file in pdf_files]
    # Results are collected in completion order, not in pdf_files order.
    for future in as_completed(futures):
        result = future.result()
        if result:
            results.append(result)

# Maps PVArticleData field names to the human-readable CSV column headers.
# The order of this mapping is also the column order of the output file.
column_map = {
    "title": "Title", "last_name": "Last Name", "year": "Year", "doi": "Digital Object Identifier (DOI)",
    "research_focus": "Research Focus", "key_findings": "Key Findings", "device_type": "Device Type",
    "absorber_material": "Absorber Material", "absorber_material_term_used": "Absorber Material Term Used",
    "absorber_dopant_material": "Absorber Dopant Material", "absorber_dopant_material_term_used": "Absorber Dopant Material Term Used",
    "absorber_dopant_polarity": "Absorber Dopant Polarity", "absorber_dopant_polarity_term_used": "Absorber Dopant Polarity Term Used",
    "front_surface_morphology": "Front Surface Morphology", "front_surface_morphology_term_used": "Front Surface Morphology Term Used",
    "rear_surface_morphology": "Rear Surface Morphology", "rear_surface_morphology_term_used": "Rear Surface Morphology Term Used",
    "front_surface_passivation_material": "Front Surface Passivation Material", "front_surface_passivation_material_term_used": "Front Surface Passivation Material Term Used",
    "rear_surface_passivation_material": "Rear Surface Passivation Material", "rear_surface_passivation_material_term_used": "Rear Surface Passivation Material Term Used",
    "negative_metallization_material": "Negative Metallization Material", "negative_metallization_material_term_used": "Negative Metallization Material Term Used",
    "positive_metallization_material": "Positive Metallization Material", "positive_metallization_material_term_used": "Positive Metallization Material Term Used",
    "efficiency_percent": "Efficiency (%)", "cell_area_cm2": "Cell Area (cm2)",
    "short_circuit_current_a": "Short-Circuit Current (A)", "short_circuit_current_density_ma_cm2": "Short-Circuit Current Density (mA/cm2)",
    "open_circuit_voltage_v": "Open-Circuit Voltage (V)", "fill_factor_percent": "Fill Factor (%)"
}

if not results:
    # No exit() here: inside a notebook that call asks IPython to shut the
    # kernel down, discarding all state. Skipping the export is enough.
    print("⚠️ No PDFs were successfully processed!")
else:
    # Rename to the display headers, then select them in column_map order.
    df = pd.DataFrame(results).rename(columns=column_map)
    df = df[list(column_map.values())]

    output_file = "pv_extraction_results_ollama2.csv"
    df.to_csv(output_file, index=False)
    print(f"\n Extracted the folder to this pdf ")
    print(f"\n✅ Saved extracted data to {output_file} ({len(results)} records)")
    print(f"📊 Processing summary: {len(results)}/{len(pdf_files)} PDFs processed successfully")



📄 Processing PV1-Rhea\files\1337\Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf...

📄 Processing PV1-Rhea\files\1341\Feldmann - 2017 - Evaluation of TOPCon Technology on Large Area Solar Cells.pdf...

📄 Processing PV1-Rhea\files\1344\Kafle et al. - 2021 - TOPCon – Technology options for cost efficient industrial manufacturing.pdf...

📄 Processing PV1-Rhea\files\1345\Ma et al. - 2024 - Comparative analysis of radiation-induced effects on the performance of p-type PERC and TOPCon solar.pdf...
✅ Successfully extracted from PV1-Rhea\files\1341\Feldmann - 2017 - Evaluation of TOPCon Technology on Large Area Solar Cells.pdf

📄 Processing PV1-Rhea\files\1347\Richter et al. - 2017 - n-Type Si solar cells with passivating electron contact Identifying sources for efficiency limitati.pdf...
✅ Successfully extracted from PV1-Rhea\files\1345\Ma et al. - 2024 - Comparative analysis of radiation-induced effects on the performance of p-type 