In [4]:
!langchain_google_genai

'langchain_google_genai' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
!pip install langchain-google-genai


Collecting langchain-google-genai
  Obtaining dependency information for langchain-google-genai from https://files.pythonhosted.org/packages/5e/70/0747358eca996f713f715e2bfc2d0805804f8f705af57381fbee91bb475a/langchain_google_genai-2.1.5-py3-none-any.whl.metadata
  Downloading langchain_google_genai-2.1.5-py3-none-any.whl.metadata (5.2 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Obtaining dependency information for filetype<2.0.0,>=1.2.0 from https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl.metadata
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Obtaining dependency information for google-ai-generativelanguage<0.7.0,>=0.6.18 from https://files.pythonhosted.org/packages/e5/77/ca2889903a2d93b3072a49056d48b3f55410219743e338a1d7f94dc6455e/google_ai_generativelanguage

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.4 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.18 which is incompatible.
langchain 0.3.25 requires langsmith<0.4,>=0.1.17, but you have langsmith 0.4.1 which is incompatible.
langchain-community 0.3.24 requires langsmith<0.4,>=0.1.125, but you have langsmith 0.4.1 which is incompatible.


In [1]:
import os
import pandas as pd
from pathlib import Path
from pydantic import BaseModel, Field, field_validator
from typing import Union, List, Any
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
import json

# --- Set Gemini API Key ---
os.environ["GOOGLE_API_KEY"] = "AIzaSyBT4CRnn0iq8LoqWUHnx-6qG6Xj-l6T7WM"

# --- Pydantic Schema ---
class PVArticleData(BaseModel):
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    def convert_to_string(cls, v):
        return "N/A" if v is None else str(v)

def clean_response_data(response_data):
    if isinstance(response_data, list):
        response_data = response_data[0] if response_data else {}
    if isinstance(response_data, PVArticleData):
        return response_data.model_dump()
    if isinstance(response_data, str):
        cleaned_string = response_data.strip().strip("`json").strip("`")
        try:
            response_data = json.loads(cleaned_string)
            if isinstance(response_data, list) and response_data:
                response_data = response_data[0]
        except json.JSONDecodeError as e:
            print(f"⚠️ JSON parsing error: {e}")
            return {}
    return response_data if isinstance(response_data, dict) else {}

# --- LangChain Setup ---
model = ChatGoogleGenerativeAI(model="models/gemini-1.5-flash")
parser = PydanticOutputParser(pydantic_object=PVArticleData)

prompt = PromptTemplate.from_template("""
You are extracting structured data from academic articles on photovoltaic cells. 
Focus only on the most efficient cell mentioned in each article.

IMPORTANT: 
- Return ONLY a single JSON object (not a list). 
- Do not wrap the response in square brackets.
- Do not use markdown formatting or code blocks (no ```json or ```).
- Return pure JSON only.

{text}

Extract and format according to this schema:
{format_instructions}
""")

chain = prompt | model | parser

# --- Recursively Find PDFs ---
pdf_folder = Path("PV1-Rhea")
pdf_files = list(pdf_folder.rglob("*.pdf"))  # Recursive search

results = []

for pdf_file in pdf_files:
    print(f"📄 Processing {pdf_file.relative_to(pdf_folder)}...")
    try:
        loader = PyPDFLoader(str(pdf_file))
        docs = loader.load()
        full_text = "\n\n".join([doc.page_content for doc in docs])

        response = chain.invoke({
            "text": full_text,
            "format_instructions": parser.get_format_instructions()
        })

        cleaned_data = clean_response_data(response)
        if not cleaned_data:
            print(f"⚠️  No valid data extracted from {pdf_file.name}")
            continue

        article_data = PVArticleData(**cleaned_data)
        results.append(article_data.model_dump())
        print(f"✅ Extracted from {pdf_file.name}")

    except Exception as e:
        if "cryptography" in str(e).lower():
            print(f"🔒 Skipping encrypted PDF: {pdf_file.name}")
        elif "validation error" in str(e).lower():
            print(f"❌ Validation error in {pdf_file.name}: {e}")
        else:
            print(f"❌ Failed to process {pdf_file.name}: {e}")

if not results:
    print("⚠️ No PDFs were successfully processed!")
    exit()

# --- Save to CSV ---
df = pd.DataFrame(results)

column_map = {
    "title": "Title",
    "last_name": "Last Name",
    "year": "Year",
    "doi": "Digital Object Identifier (DOI)",
    "research_focus": "Research Focus",
    "key_findings": "Key Findings",
    "device_type": "Device Type",
    "absorber_material": "Absorber Material",
    "absorber_material_term_used": "Absorber Material Term Used",
    "absorber_dopant_material": "Absorber Dopant Material",
    "absorber_dopant_material_term_used": "Absorber Dopant Material Term Used",
    "absorber_dopant_polarity": "Absorber Dopant Polarity",
    "absorber_dopant_polarity_term_used": "Absorber Dopant Polarity Term Used",
    "front_surface_morphology": "Front Surface Morphology",
    "front_surface_morphology_term_used": "Front Surface Morphology Term Used",
    "rear_surface_morphology": "Rear Surface Morphology",
    "rear_surface_morphology_term_used": "Rear Surface Morphology Term Used",
    "front_surface_passivation_material": "Front Surface Passivation Material",
    "front_surface_passivation_material_term_used": "Front Surface Passivation Material Term Used",
    "rear_surface_passivation_material": "Rear Surface Passivation Material",
    "rear_surface_passivation_material_term_used": "Rear Surface Passivation Material Term Used",
    "negative_metallization_material": "Negative Metallization Material",
    "negative_metallization_material_term_used": "Negative Metallization Material Term Used",
    "positive_metallization_material": "Positive Metallization Material",
    "positive_metallization_material_term_used": "Positive Metallization Material Term Used",
    "efficiency_percent": "Efficiency (%)",
    "cell_area_cm2": "Cell Area (cm2)",
    "short_circuit_current_a": "Short-Circuit Current (A)",
    "short_circuit_current_density_ma_cm2": "Short-Circuit Current Density (mA/cm2)",
    "open_circuit_voltage_v": "Open-Circuit Voltage (V)",
    "fill_factor_percent": "Fill Factor (%)"
}

df = df.rename(columns=column_map)
df = df[list(column_map.values())]

output_file = "pv_extraction_results.csv"
df.to_csv(output_file, index=False)

print(f"✅ Saved extracted data to {output_file} ({len(results)} records)")
print(f"📊 Summary: {len(results)}/{len(pdf_files)} PDFs processed successfully")


📄 Processing files\1337\Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf...
✅ Extracted from Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf
📄 Processing files\1341\Feldmann - 2017 - Evaluation of TOPCon Technology on Large Area Solar Cells.pdf...
✅ Extracted from Feldmann - 2017 - Evaluation of TOPCon Technology on Large Area Solar Cells.pdf
📄 Processing files\1344\Kafle et al. - 2021 - TOPCon – Technology options for cost efficient industrial manufacturing.pdf...
✅ Extracted from Kafle et al. - 2021 - TOPCon – Technology options for cost efficient industrial manufacturing.pdf
📄 Processing files\1345\Ma et al. - 2024 - Comparative analysis of radiation-induced effects on the performance of p-type PERC and TOPCon solar.pdf...
✅ Extracted from Ma et al. - 2024 - Comparative analysis of radiation-induced effects on the performance of p-type PERC and TOPCon solar.pdf
📄 Proc

In [None]:
import os
import pandas as pd
from pathlib import Path
from pydantic import BaseModel, Field, field_validator
from typing import Union
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json

class PVArticleData(BaseModel):
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    def convert_to_string(cls, v):
        return "N/A" if v is None else str(v)

def clean_response_data(response_data):
    if isinstance(response_data, list):
        response_data = response_data[0] if response_data else {}
    if isinstance(response_data, PVArticleData):
        return response_data.model_dump()
    if isinstance(response_data, str):
        print("🧪 Cleaning string response...")
        cleaned = response_data.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
        try:
            parsed = json.loads(cleaned)
            return parsed[0] if isinstance(parsed, list) else parsed
        except json.JSONDecodeError as e:
            print(f"❌ JSON decode error: {e}")
            return {}
    if isinstance(response_data, dict):
        return response_data
    print("⚠️ Unexpected response format:", type(response_data))
    return {}

model = ChatOllama(model="gemma3:4b")
parser = PydanticOutputParser(pydantic_object=PVArticleData)

prompt = PromptTemplate.from_template("""
You are extracting structured data from academic articles on photovoltaic cells.
Focus only on the most efficient cell mentioned in each article.

IMPORTANT:
- Return ALL fields from the schema.
- If any field is not available, set its value to \"N/A\".
- Return ONLY a single JSON object.
- Do not use markdown or code formatting.
- Do not wrap in square brackets or code blocks.

Article:
{text}

Format like:
{format_instructions}
""")

chain = prompt | model | parser

pdf_folder = Path("PV1-Rhea")
pdf_files = list(pdf_folder.rglob("*.pdf"))
results = []

for pdf_file in pdf_files:
    print(f"\n📄 Processing {pdf_file}...")
    try:
        loader = PyPDFLoader(str(pdf_file))
        docs = loader.load()
        full_text = "\n\n".join([doc.page_content for doc in docs])

        splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
        chunks = splitter.split_text(full_text)

        for chunk in chunks:
            try:
                response = chain.invoke({
                    "text": chunk,
                    "format_instructions": parser.get_format_instructions()
                })
                print("🔎 Raw response:", response)

                cleaned_data = clean_response_data(response)
                if cleaned_data:
                    article_data = PVArticleData(**cleaned_data)
                    results.append(article_data.model_dump())
                    print(f"✅ Successfully extracted from {pdf_file}")
                    break  # Use only first successful chunk
            except Exception as e:
                print(f"⛔ Error in chunk: {e}")

        else:
            print(f"⚠️ No valid data extracted from any chunk of {pdf_file}")

    except Exception as e:
        print(f"❌ Failed to process {pdf_file}: {e}")

if not results:
    print("⚠️ No PDFs were successfully processed!")
    exit()

df = pd.DataFrame(results)

column_map = {
    "title": "Title", "last_name": "Last Name", "year": "Year", "doi": "Digital Object Identifier (DOI)",
    "research_focus": "Research Focus", "key_findings": "Key Findings", "device_type": "Device Type",
    "absorber_material": "Absorber Material", "absorber_material_term_used": "Absorber Material Term Used",
    "absorber_dopant_material": "Absorber Dopant Material", "absorber_dopant_material_term_used": "Absorber Dopant Material Term Used",
    "absorber_dopant_polarity": "Absorber Dopant Polarity", "absorber_dopant_polarity_term_used": "Absorber Dopant Polarity Term Used",
    "front_surface_morphology": "Front Surface Morphology", "front_surface_morphology_term_used": "Front Surface Morphology Term Used",
    "rear_surface_morphology": "Rear Surface Morphology", "rear_surface_morphology_term_used": "Rear Surface Morphology Term Used",
    "front_surface_passivation_material": "Front Surface Passivation Material", "front_surface_passivation_material_term_used": "Front Surface Passivation Material Term Used",
    "rear_surface_passivation_material": "Rear Surface Passivation Material", "rear_surface_passivation_material_term_used": "Rear Surface Passivation Material Term Used",
    "negative_metallization_material": "Negative Metallization Material", "negative_metallization_material_term_used": "Negative Metallization Material Term Used",
    "positive_metallization_material": "Positive Metallization Material", "positive_metallization_material_term_used": "Positive Metallization Material Term Used",
    "efficiency_percent": "Efficiency (%)", "cell_area_cm2": "Cell Area (cm2)",
    "short_circuit_current_a": "Short-Circuit Current (A)", "short_circuit_current_density_ma_cm2": "Short-Circuit Current Density (mA/cm2)",
    "open_circuit_voltage_v": "Open-Circuit Voltage (V)", "fill_factor_percent": "Fill Factor (%)"
}

df = df.rename(columns=column_map)
df = df[list(column_map.values())]

output_file = "pv_extraction_results_ollama.csv"
df.to_csv(output_file, index=False)
print(f"\n✅ Saved extracted data to {output_file} ({len(results)} records)")
print(f"📊 Processing summary: {len(results)}/{len(pdf_files)} PDFs processed successfully")



📄 Processing PV1-Rhea\files\1337\Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf...
🔎 Raw response: title='N/A' last_name='N/A' year='N/A' doi='N/A' research_focus='N/A' key_findings='N/A' device_type='N/A' absorber_material='N/A' absorber_material_term_used='N/A' absorber_dopant_material='N/A' absorber_dopant_material_term_used='N/A' absorber_dopant_polarity='N/A' absorber_dopant_polarity_term_used='N/A' front_surface_morphology='N/A' front_surface_morphology_term_used='N/A' rear_surface_morphology='N/A' rear_surface_morphology_term_used='N/A' front_surface_passivation_material='N/A' front_surface_passivation_material_term_used='N/A' rear_surface_passivation_material='N/A' rear_surface_passivation_material_term_used='N/A' negative_metallization_material='N/A' negative_metallization_material_term_used='N/A' positive_metallization_material='N/A' positive_metallization_material_term_used='N/A' efficiency_percent='N/A' cell_area_

In [None]:
import os
import pandas as pd
from pathlib import Path
from pydantic import BaseModel, Field, field_validator
from typing import Union
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

class PVArticleData(BaseModel):
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    def convert_to_string(cls, v):
        return "N/A" if v is None else str(v)

def clean_response_data(response_data):
    if isinstance(response_data, list):
        response_data = response_data[0] if response_data else {}
    if isinstance(response_data, PVArticleData):
        return response_data.model_dump()
    if isinstance(response_data, str):
        print("🧪 Cleaning string response...")
        cleaned = response_data.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
        try:
            parsed = json.loads(cleaned)
            return parsed[0] if isinstance(parsed, list) else parsed
        except json.JSONDecodeError as e:
            print(f"❌ JSON decode error: {e}")
            return {}
    if isinstance(response_data, dict):
        return response_data
    print("⚠️ Unexpected response format:", type(response_data))
    return {}

model = ChatOllama(model="gemma3:4b")
parser = PydanticOutputParser(pydantic_object=PVArticleData)

prompt = PromptTemplate.from_template("""\
You are extracting structured data from academic articles on photovoltaic cells.
Focus only on the most efficient cell mentioned in each article.

IMPORTANT:
- Return ALL fields from the schema.
- If any field is not available, set its value to "N/A".
- Return ONLY a single JSON object.
- Do not use markdown or code formatting.
- Do not wrap in square brackets or code blocks.

Article:
{text}

Format like:
{format_instructions}
""")

chain = prompt | model | parser

pdf_folder = Path("PV1-Rhea")
pdf_files = list(pdf_folder.rglob("*.pdf"))

def process_pdf(pdf_file):
    print(f"\n📄 Processing {pdf_file}...")
    try:
        loader = PyPDFLoader(str(pdf_file))
        docs = loader.load()
        full_text = "\n\n".join([doc.page_content for doc in docs])

        # Larger chunk size to reduce number of calls
        splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)
        chunks = splitter.split_text(full_text)

        for chunk in chunks:
            try:
                response = chain.invoke({
                    "text": chunk,
                    "format_instructions": parser.get_format_instructions()
                })
                #print("🔎 Raw response:", response)  # Optional: comment out to reduce output

                cleaned_data = clean_response_data(response)
                if cleaned_data:
                    article_data = PVArticleData(**cleaned_data)
                    print(f"✅ Successfully extracted from {pdf_file}")
                    return article_data.model_dump()
            except Exception as e:
                print(f"⛔ Error in chunk: {e}")

        print(f"⚠️ No valid data extracted from any chunk of {pdf_file}")
        return None

    except Exception as e:
        print(f"❌ Failed to process {pdf_file}: {e}")
        return None

results = []

# Adjust max_workers depending on your CPU/RAM
max_workers = 4

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(process_pdf, pdf_file) for pdf_file in pdf_files]
    for future in as_completed(futures):
        result = future.result()
        if result:
            results.append(result)

if not results:
    print("⚠️ No PDFs were successfully processed!")
    exit()

df = pd.DataFrame(results)

column_map = {
    "title": "Title", "last_name": "Last Name", "year": "Year", "doi": "Digital Object Identifier (DOI)",
    "research_focus": "Research Focus", "key_findings": "Key Findings", "device_type": "Device Type",
    "absorber_material": "Absorber Material", "absorber_material_term_used": "Absorber Material Term Used",
    "absorber_dopant_material": "Absorber Dopant Material", "absorber_dopant_material_term_used": "Absorber Dopant Material Term Used",
    "absorber_dopant_polarity": "Absorber Dopant Polarity", "absorber_dopant_polarity_term_used": "Absorber Dopant Polarity Term Used",
    "front_surface_morphology": "Front Surface Morphology", "front_surface_morphology_term_used": "Front Surface Morphology Term Used",
    "rear_surface_morphology": "Rear Surface Morphology", "rear_surface_morphology_term_used": "Rear Surface Morphology Term Used",
    "front_surface_passivation_material": "Front Surface Passivation Material", "front_surface_passivation_material_term_used": "Front Surface Passivation Material Term Used",
    "rear_surface_passivation_material": "Rear Surface Passivation Material", "rear_surface_passivation_material_term_used": "Rear Surface Passivation Material Term Used",
    "negative_metallization_material": "Negative Metallization Material", "negative_metallization_material_term_used": "Negative Metallization Material Term Used",
    "positive_metallization_material": "Positive Metallization Material", "positive_metallization_material_term_used": "Positive Metallization Material Term Used",
    "efficiency_percent": "Efficiency (%)", "cell_area_cm2": "Cell Area (cm2)",
    "short_circuit_current_a": "Short-Circuit Current (A)", "short_circuit_current_density_ma_cm2": "Short-Circuit Current Density (mA/cm2)",
    "open_circuit_voltage_v": "Open-Circuit Voltage (V)", "fill_factor_percent": "Fill Factor (%)"
}

df = df.rename(columns=column_map)
df = df[list(column_map.values())]

output_file = "pv_extraction_results_ollama.csv"
df.to_csv(output_file, index=False)

print(f"\n✅ Saved extracted data to {output_file} ({len(results)} records)")
print(f"📊 Processing summary: {len(results)}/{len(pdf_files)} PDFs processed successfully")



📄 Processing PV1-Rhea\files\1337\Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf...

📄 Processing PV1-Rhea\files\1341\Feldmann - 2017 - Evaluation of TOPCon Technology on Large Area Solar Cells.pdf...

📄 Processing PV1-Rhea\files\1344\Kafle et al. - 2021 - TOPCon – Technology options for cost efficient industrial manufacturing.pdf...

📄 Processing PV1-Rhea\files\1345\Ma et al. - 2024 - Comparative analysis of radiation-induced effects on the performance of p-type PERC and TOPCon solar.pdf...
✅ Successfully extracted from PV1-Rhea\files\1337\Alansaryi and Alsharif - 2024 - The Effect of Water Vapor and Humidity on the Topcon Photovoltaic Cell.pdf

📄 Processing PV1-Rhea\files\1347\Richter et al. - 2017 - n-Type Si solar cells with passivating electron contact Identifying sources for efficiency limitati.pdf...
⛔ Error in chunk: Invalid json output: Here is the JSON object with all fields from the schema:

{
    "efficiency_percen

In [1]:
import os
import json
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_ollama import OllamaLLM

# --- SETTINGS ---
CHUNKS_FOLDER = "chunkTest"
LLM_MODEL = "llama3.2:1b"
OUTPUT_CSV = "pv_extraction_results_ollama.csv"

# --- SCHEMA FIELDS JSON template ---
SCHEMA_FIELDS = """
{
  "title": "N/A",
  "last_name": "N/A",
  "year": "N/A",
  "doi": "N/A",
  "research_focus": "N/A",
  "key_findings": "N/A",
  "device_type": "N/A",
  "absorber_material": "N/A",
  "absorber_material_term_used": "N/A",
  "absorber_dopant_material": "N/A",
  "absorber_dopant_material_term_used": "N/A",
  "absorber_dopant_polarity": "N/A",
  "absorber_dopant_polarity_term_used": "N/A",
  "front_surface_morphology": "N/A",
  "front_surface_morphology_term_used": "N/A",
  "rear_surface_morphology": "N/A",
  "rear_surface_morphology_term_used": "N/A",
  "front_surface_passivation_material": "N/A",
  "front_surface_passivation_material_term_used": "N/A",
  "rear_surface_passivation_material": "N/A",
  "rear_surface_passivation_material_term_used": "N/A",
  "negative_metallization_material": "N/A",
  "negative_metallization_material_term_used": "N/A",
  "positive_metallization_material": "N/A",
  "positive_metallization_material_term_used": "N/A",
  "efficiency_percent": "N/A",
  "cell_area_cm2": "N/A",
  "short_circuit_current_a": "N/A",
  "short_circuit_current_density_ma_cm2": "N/A",
  "open_circuit_voltage_v": "N/A",
  "fill_factor_percent": "N/A"
}
"""

# --- PROMPT TEMPLATE ---
PROMPT_TEMPLATE = PromptTemplate.from_template(
    """
You are a scientific research assistant.

Extract the following structured fields from the given academic article text (focused on photovoltaic cells). 
Only report the most efficient solar cell device described in the text.

Return ONLY a valid JSON object using this exact schema. Use "N/A" where data is not available.

Schema:
{schema_fields}

Text:
{text}
"""
)

# --- Load chunk file content ---
def load_chunk_text(filepath: str) -> str:
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read().strip()
    except UnicodeDecodeError:
        with open(filepath, "r", encoding="latin-1") as f:
            return f.read().strip()

# --- Call LLM to extract data from a chunk ---
def extract_data_from_chunk(llm: OllamaLLM, text: str) -> dict | None:
    prompt = PROMPT_TEMPLATE.format(text=text, schema_fields=SCHEMA_FIELDS)
    raw_response = llm.invoke(prompt).strip()

    try:
        data = json.loads(raw_response)
        if not isinstance(data, dict):
            print("⚠️ LLM returned JSON but not an object")
            return None
        return data
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"Raw LLM output was:\n{raw_response}")
        return None

# --- Save or overwrite row 1 of CSV ---
def save_or_update_result(new_record: dict, csv_path: str):
    schema_dict = json.loads(SCHEMA_FIELDS)
    schema_keys = list(schema_dict.keys())

    # Build cleaned record
    clean_record = {key: new_record.get(key, "N/A") for key in schema_keys}
    new_row_df = pd.DataFrame([clean_record])

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)

        # Ensure all schema columns exist
        for col in schema_keys:
            if col not in df.columns:
                df[col] = "N/A"

        # Modify row 1 (index 0)
        for col in schema_keys:
            df.at[0, col] = clean_record[col]
        print("🔄 Modified row 1 (index 0)")
    else:
        # Create new file with this row
        df = new_row_df
        print("➕ Created new CSV with row 1")

    df.to_csv(csv_path, index=False)
    print(f"✅ Saved CSV: {csv_path}")

# --- Main loop ---
if __name__ == "__main__":
    llm = OllamaLLM(model=LLM_MODEL)

    files_processed = 0
    files_skipped = 0

    for root, _, files in os.walk(CHUNKS_FOLDER):
        for file in sorted(files):
            if file.endswith(".txt"):
                filepath = os.path.join(root, file)
                print(f"\n📄 Processing file: {filepath}")

                text = load_chunk_text(filepath)
                if not text:
                    print("⚠️ File empty, skipping.")
                    files_skipped += 1
                    continue

                data = extract_data_from_chunk(llm, text)
                if data is None:
                    print("⚠️ Extraction failed for this chunk.")
                    files_skipped += 1
                    continue

                save_or_update_result(data, OUTPUT_CSV)
                files_processed += 1

    print(f"\n🎉 Finished processing. Files processed: {files_processed}, skipped: {files_skipped}")



📄 Processing file: chunkTest\Kafle et al. - 2021 - TOPCon – Technology options for cost efficient industrial manufacturing\chunk_000.txt
❌ JSON parsing error: Extra data: line 34 column 1 (char 1046)
Raw LLM output was:
{
  "title": "",
  "last_name": "",
  "year": "",
  "doi": "",
  "research_focus": "",
  "key_findings": "",
  "device_type": "",
  "absorber_material": "",
  "absorber_material_term_used": "",
  "absorber_dopant_material": "",
  "absorber_dopant_material_term_used": "",
  "absorber_dopant_polarity": "",
  "absorber_dopant_polarity_term_used": "",
  "front_surface_morphology": "",
  "front_surface_morphology_term_used": "",
  "rear_surface_morphology": "",
  "rear_surface_morphology_term_used": "",
  "front_surface_passivation_material": "",
  "front_surface_passivation_material_term_used": "",
  "rear_surface_passivation_material": "",
  "rear_surface_passivation_material_term_used": "",
  "negative_metallization_material": "",
  "negative_metallization_material_term_

ValueError: Incompatible indexer with Series

In [3]:
import os
import json
import pandas as pd
from typing import List
from pydantic import BaseModel, Field, field_validator, ValidationError
from langchain_core.documents import Document
from langchain_community.llms import Ollama
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# --- SETTINGS ---
PDF_FOLDER = "pdfs"  # Folder containing PDF files
LLM_MODEL = "llama3.2:1b"
OUTPUT_CSV = "pv_extraction_results_ollama.csv"
MAX_PROMPT_CHARS = 15000  # Avoid exceeding LLM token limit

# --- SCHEMA ---
class PVArticleData(BaseModel):
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    def convert_to_string(cls, v):
        return str(v) if v is not None else "N/A"

# --- PROMPT TEMPLATE ---
prompt_template = PromptTemplate.from_template("""
You are extracting structured data from academic articles on photovoltaic cells.
Focus only on the most efficient cell mentioned.

Return only a JSON object (no markdown or explanation). Do not wrap in triple backticks.

TEXT:
{text}

Extract and format according to this schema:
{format_instructions}
""")

# --- EXTRACT DATA FROM SINGLE PDF ---
def process_pdf(path: str) -> dict | None:
    try:
        loader = PyPDFLoader(path)
        docs = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        split_docs = splitter.split_documents(docs)
        full_text = "\n\n".join(doc.page_content for doc in split_docs)
        trimmed_text = full_text[:MAX_PROMPT_CHARS]

        llm = Ollama(model=LLM_MODEL)
        parser = PydanticOutputParser(pydantic_object=PVArticleData)
        prompt = prompt_template.format(
            text=trimmed_text,
            format_instructions=parser.get_format_instructions()
        )

        try:
            raw_output = llm.invoke(prompt).strip()
        except Exception as e:
            print(f"⚠️ LLM error for {path}: {e}")
            return None

        if not raw_output or raw_output.lower().startswith(("error", "none", "no data")):
            print(f"⚠️ Skipped {path}: No informative output.")
            return None

        try:
            parsed = json.loads(raw_output)
        except json.JSONDecodeError:
            print(f"⚠️ JSON decode error for {path}")
            print(f"🔍 Output: {raw_output[:1000]}...")
            return None

        try:
            record = PVArticleData(**parsed)
        except ValidationError as ve:
            print(f"⚠️ Validation error for {path}: {ve}")
            return None

        result = record.model_dump()
        result["source"] = path
        print(f"✅ Extracted data from {os.path.basename(path)}")
        return result

    except Exception as e:
        print(f"⚠️ Failed to process {path}: {e}")
        return None

# --- MAIN ---
if __name__ == "__main__":
    all_results = []

    for file in os.listdir(PDF_FOLDER):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(PDF_FOLDER, file)
            data = process_pdf(pdf_path)
            if data:
                all_results.append(data)

    if all_results:
        df_new = pd.DataFrame(all_results)
        if os.path.exists(OUTPUT_CSV):
            df_existing = pd.read_csv(OUTPUT_CSV)
            df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        else:
            df_combined = df_new

        df_combined.to_csv(OUTPUT_CSV, index=False)
        print(f"✅ Saved {len(all_results)} rows to {OUTPUT_CSV}")
    else:
        print("⚠️ No valid data extracted from any PDFs.")


⚠️ JSON decode error for pdfs\pdf1.pdf
🔍 Output: Here is the formatted JSON output for the provided schema:
```
{
  "properties": {
    "title": {
      "default": "",
      "title": "Moisture Ingress on Topcon Cell",
      "type": "string"
    },
    "last_name": {
      "default": "",
      "title": "Last Name",
      "type": "string"
    },
    "year": {
      "default": "",
      "title": "Year",
      "type": "string"
    },
    "doi": {
      "default": "",
      "title": "Doi",
      "type": "string"
    },
    "research_focus": {
      "default": "",
      "title": "Research Focus",
      "type": "string"
    },
    "key_findings": {
      "default": "",
      "title": "Key Findings",
      "type": "string"
    },
    "device_type": {
      "default": "",
      "title": "Device Type",
      "type": "string"
    },
    "absorber_material": {
      "default": "",
      "title": "Absorber Material",
      "type": "string"
    },
    "absorber_material_term_used": {
      "default"

In [3]:
!pdfplumber

'pdfplumber' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
import os
import json
import pdfplumber
import pandas as pd
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, field_validator, ValidationError

# --- SETTINGS ---
PDF_PATH = "pdf1.pdf"  # Replace with your PDF file path
OUTPUT_CSV = "pv_extraction_from_pdf.csv"
LLM_MODEL = "llama3.2:1b"
MAX_THREADS = 4

# --- SCHEMA DEFINITION ---
class PVArticleData(BaseModel):
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    def convert_to_string(cls, v):
        return str(v) if v is not None else "N/A"

# --- PROMPT ---
prompt_template = PromptTemplate.from_template("""
You are extracting structured data from academic articles on photovoltaic cells.
Focus only on the most efficient cell mentioned.

Return only a JSON object (no markdown or explanation). Do not wrap in triple backticks.

TEXT:
{text}

Extract and format according to this schema:
{format_instructions}
""")

# --- TEXT SPLITTER ---
def split_pdf_into_chunks(pdf_path: str) -> List[Document]:
    loader = PyPDFLoader(pdf_path)
    all_pages = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_documents(all_pages)

# --- LLM PROCESSING FUNCTION ---
def extract_data_from_chunk(doc: Document) -> dict | None:
    try:
        llm = Ollama(model=LLM_MODEL)
        parser = PydanticOutputParser(pydantic_object=PVArticleData)
        prompt = prompt_template.format(
            text=doc.page_content,
            format_instructions=parser.get_format_instructions()
        )
        raw_output = llm.invoke(prompt).strip()
        parsed = json.loads(raw_output)
        record = PVArticleData(**parsed)
        result = record.model_dump()
        result["source"] = doc.metadata.get("page", "N/A")
        print(f"✅ Processed chunk from page {result['source']}")
        return result
    except (json.JSONDecodeError, ValidationError, ValueError) as e:
        print(f"❌ Skipped chunk due to error: {e}")
        return None

# --- MAIN PIPELINE ---
if __name__ == "__main__":
    if not os.path.exists(PDF_PATH):
        print("❌ PDF file not found.")
        exit()

    chunks = split_pdf_into_chunks(PDF_PATH)
    print(f"🔍 Loaded and split PDF into {len(chunks)} chunks.")

    extracted_results = []
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        futures = [executor.submit(extract_data_from_chunk, doc) for doc in chunks]
        for future in as_completed(futures):
            result = future.result()
            if result:
                extracted_results.append(result)

    if extracted_results:
        df = pd.DataFrame(extracted_results)
        df.to_csv(OUTPUT_CSV, index=False)
        print(f"✅ Saved extracted data to {OUTPUT_CSV}")
    else:
        print("⚠️ No valid data extracted.")


🔍 Loaded and split PDF into 29 chunks.


  llm = Ollama(model=LLM_MODEL)


❌ Skipped chunk due to error: Extra data: line 3 column 1 (char 273)
✅ Processed chunk from page 0
✅ Processed chunk from page 0
❌ Skipped chunk due to error: Expecting ',' delimiter: line 35 column 39 (char 1666)
✅ Processed chunk from page 1
❌ Skipped chunk due to error: Expecting ',' delimiter: line 1 column 554 (char 553)
✅ Processed chunk from page 2
✅ Processed chunk from page 3
✅ Processed chunk from page 3
✅ Processed chunk from page 3
✅ Processed chunk from page 3
❌ Skipped chunk due to error: Expecting ',' delimiter: line 1 column 882 (char 881)
✅ Processed chunk from page 4
✅ Processed chunk from page 5
✅ Processed chunk from page 5
✅ Processed chunk from page 5
✅ Processed chunk from page 5
✅ Processed chunk from page 6
✅ Processed chunk from page 6
✅ Processed chunk from page 6
✅ Processed chunk from page 6
✅ Processed chunk from page 7
✅ Processed chunk from page 7
✅ Processed chunk from page 7
✅ Processed chunk from page 8
✅ Processed chunk from page 8
✅ Processed chunk 

In [1]:
import os
import json
import pandas as pd
from typing import List
from pydantic import BaseModel, Field, field_validator, ValidationError
from langchain_core.documents import Document
from langchain_community.llms import Ollama
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- SETTINGS ---
CHUNKS_FOLDER = "chunkTest"
LLM_MODEL = "llama3.2:1b"
OUTPUT_CSV = "pv_extraction_results_ollama.csv"
MAX_PROCESSES = 4  # Adjust to number of CPU cores or RAM limits

# --- Define Schema ---
class PVArticleData(BaseModel):
    title: str = Field("N/A")
    last_name: str = Field("N/A")
    year: str = Field("N/A")
    doi: str = Field("N/A")
    research_focus: str = Field("N/A")
    key_findings: str = Field("N/A")
    device_type: str = Field("N/A")
    absorber_material: str = Field("N/A")
    absorber_material_term_used: str = Field("N/A")
    absorber_dopant_material: str = Field("N/A")
    absorber_dopant_material_term_used: str = Field("N/A")
    absorber_dopant_polarity: str = Field("N/A")
    absorber_dopant_polarity_term_used: str = Field("N/A")
    front_surface_morphology: str = Field("N/A")
    front_surface_morphology_term_used: str = Field("N/A")
    rear_surface_morphology: str = Field("N/A")
    rear_surface_morphology_term_used: str = Field("N/A")
    front_surface_passivation_material: str = Field("N/A")
    front_surface_passivation_material_term_used: str = Field("N/A")
    rear_surface_passivation_material: str = Field("N/A")
    rear_surface_passivation_material_term_used: str = Field("N/A")
    negative_metallization_material: str = Field("N/A")
    negative_metallization_material_term_used: str = Field("N/A")
    positive_metallization_material: str = Field("N/A")
    positive_metallization_material_term_used: str = Field("N/A")
    efficiency_percent: str = Field("N/A")
    cell_area_cm2: str = Field("N/A")
    short_circuit_current_a: str = Field("N/A")
    short_circuit_current_density_ma_cm2: str = Field("N/A")
    open_circuit_voltage_v: str = Field("N/A")
    fill_factor_percent: str = Field("N/A")

    @field_validator("*", mode="before")
    def convert_to_string(cls, v):
        return str(v) if v is not None else "N/A"

# --- Prompt Template ---
prompt_template = PromptTemplate.from_template("""
You are extracting structured data from academic articles on photovoltaic cells.
Focus only on the most efficient cell mentioned.

Return only a JSON object (no markdown or explanation). Do not wrap in triple backticks.

TEXT:
{text}

Extract and format according to this schema:
{format_instructions}
""")

# --- Load .txt chunks as Documents ---
def read_file(path: str) -> Document | None:
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        try:
            with open(path, "r", encoding="latin-1") as f:
                content = f.read().strip()
        except:
            return None
    return Document(page_content=content, metadata={"source": path}) if content else None

def load_chunks() -> List[Document]:
    txt_paths = [
        os.path.join(root, fn)
        for root, _, files in os.walk(CHUNKS_FOLDER)
        for fn in files if fn.endswith(".txt")
    ]
    docs = []
    for path in txt_paths:
        doc = read_file(path)
        if doc:
            docs.append(doc)
    return docs

# --- LLM Worker Function (Run in subprocess) ---
def extract_data_from_doc(doc_text: str, source: str) -> dict | None:
    try:
        llm = Ollama(model=LLM_MODEL)
        parser = PydanticOutputParser(pydantic_object=PVArticleData)
        prompt = prompt_template.format(
            text=doc_text,
            format_instructions=parser.get_format_instructions()
        )
        raw_output = llm.invoke(prompt).strip()
        parsed = json.loads(raw_output)
        record = PVArticleData(**parsed)
        result = record.model_dump()
        result["source"] = source
        print(f"✅ Processed: {source}")
        return result
    except (json.JSONDecodeError, ValidationError, ValueError) as e:
        print(f"❌ Skipped {source} due to error: {e}")
        return None

# --- Main Pipeline ---
if __name__ == "__main__":
    all_documents = load_chunks()
    if not all_documents:
        print("⚠️ No documents found.")
        exit()

    tasks = [(doc.page_content, doc.metadata["source"]) for doc in all_documents]

    extracted_results = []
    with ProcessPoolExecutor(max_workers=MAX_PROCESSES) as executor:
        futures = [executor.submit(extract_data_from_doc, text, src) for text, src in tasks]
        for future in as_completed(futures):
            result = future.result()
            if result:
                extracted_results.append(result)

    if extracted_results:
        df_new = pd.DataFrame(extracted_results)
        if os.path.exists(OUTPUT_CSV):
            df_existing = pd.read_csv(OUTPUT_CSV)
            df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        else:
            df_combined = df_new
        df_combined.to_csv(OUTPUT_CSV, index=False)
        print(f"✅ Saved extracted data to {OUTPUT_CSV}")
    else:
        print("⚠️ No valid data extracted.")


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [1]:
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM

PDF_FOLDER = "pdfs"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
MAX_CHUNKS_PER_PDF = 5
MAX_WORKERS = 8

llm = OllamaLLM(model="llama3.2:1b")  # Shared LLM instance
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

def load_and_split_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    chunks = splitter.split_documents(documents)
    return chunks[:MAX_CHUNKS_PER_PDF] if MAX_CHUNKS_PER_PDF else chunks

def process_chunk_sync(content, filename, i):
    try:
        llm.invoke(content)
        print(f" {filename} | Chunk {i+1}")
    except Exception as e:
        print(f"Error on chunk {i+1} of {filename}: {e}")

async def process_pdf(filename):
    if not filename.endswith(".pdf"):
        return

    pdf_path = os.path.join(PDF_FOLDER, filename)
    try:
        chunks = await asyncio.get_event_loop().run_in_executor(executor, load_and_split_pdf, pdf_path)

        tasks = [
            asyncio.get_event_loop().run_in_executor(
                executor, process_chunk_sync, chunk.page_content, filename, i
            )
            for i, chunk in enumerate(chunks)
        ]
        await asyncio.gather(*tasks)

    except Exception as e:
        print(f" Failed to process {filename}: {e}")

async def main():
    pdf_files = [f for f in os.listdir(PDF_FOLDER) if f.endswith(".pdf")]
    tasks = [process_pdf(f) for f in pdf_files]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    import nest_asyncio                 
    nest_asyncio.apply()  # Allows nested use of asyncio.run()
    asyncio.get_event_loop().run_until_complete(main())



FileNotFoundError: [WinError 3] The system cannot find the path specified: 'pdfs'

In [2]:
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM

# Configuration
PDF_FOLDER = "pdfs"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
MAX_CHUNKS_PER_PDF = 5
MAX_WORKERS = os.cpu_count() or 8

# Shared objects
llm = OllamaLLM(model="llama3.2:1b")
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

# Purely synchronous processing for one PDF file
def process_pdf_sync(filename):
    if not filename.endswith(".pdf"):
        return

    pdf_path = os.path.join(PDF_FOLDER, filename)
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        chunks = splitter.split_documents(documents)
        for i, chunk in enumerate(chunks[:MAX_CHUNKS_PER_PDF]):
            try:
                llm.invoke(chunk.page_content)
                print(f" {filename} | Chunk {i+1}")
            except Exception as e:
                print(f" Error on chunk {i+1} of {filename}: {e}")
    except Exception as e:
        print(f" Failed to process {filename}: {e}")

# Async wrapper that delegates sync task to a thread
async def main():
    pdf_files = [f for f in os.listdir(PDF_FOLDER) if f.endswith(".pdf")]
    loop = asyncio.get_running_loop()
    tasks = [
        loop.run_in_executor(executor, process_pdf_sync, f)
        for f in pdf_files
    ]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()  # Allows nested use of asyncio.run()
    asyncio.get_event_loop().run_until_complete(main())



 JinkoSolar Eagle 54HM G6 Datasheet (420–440 W, N-Type TOPCon).pdf | Chunk 4
 Model_Based_Continuous_Improvement_of_Industrial_p.pdf | Chunk 4
 JinkoSolar Eagle 72 G6B Datasheet (570–590 W, N-Type Bifacial).pdf | Chunk 4
 1-s2.0-S1876610215008851-main.pdf | Chunk 5
 Adani ELAN SHINE TOPCon Datasheet (550–575 W, N-Type Bifacial).pdf | Chunk 5
 1-s2.0-S1876610215008206-main.pdf | Chunk 5
 Qcells Q.TRON BLK M-G2+ Series Datasheet (415–440 Wp, 2024).pdf | Chunk 1
 Intl J of Energy Research - 2021 - Gawusu - The dynamics of green supply chain management within the framework of renewable.pdf | Chunk 5
 Model_Based_Continuous_Improvement_of_Industrial_p.pdf | Chunk 5
 JinkoSolar Eagle 54HM G6 Datasheet (420–440 W, N-Type TOPCon).pdf | Chunk 5
 JinkoSolar Eagle 72 G6B Datasheet (570–590 W, N-Type Bifacial).pdf | Chunk 5
 Rayzon TOPCon Datasheet (570–590 W, N-Type Bifacial, 2024).pdf | Chunk 1
 Resistive_Power_Loss_Analysis_of_PV_Modules_Made_From_Halved_15.615.6_cm2_Silicon_PERC_Solar_Cells_Wi

In [None]:
import os
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

CHUNKS_FOLDER = "chunks"
VECTORSTORE_PATH = "vectorstore"

#  Recursively load .txt chunks from all subfolders
def load_chunks():
    print(" Recursively loading .txt chunks...")
    docs = []
    for root, _, files in os.walk(CHUNKS_FOLDER):
        for filename in files:
            if filename.endswith(".txt"):
                path = os.path.join(root, filename)
                try:
                    with open(path, "r", encoding="utf-8") as f:
                        content = f.read().strip()
                except UnicodeDecodeError:
                    try:
                        with open(path, "r", encoding="latin-1") as f:
                            content = f.read().strip()
                        print(f" Non-UTF8 file read with latin-1: {os.path.relpath(path, CHUNKS_FOLDER)}")
                    except Exception as e:
                        print(f" Skipping unreadable file: {os.path.relpath(path, CHUNKS_FOLDER)} - {e}")
                        continue

                if content:
                    docs.append(Document(
                        page_content=content,
                        metadata={"source": os.path.relpath(path, CHUNKS_FOLDER)}
                    ))
                else:
                    print(f" Skipped empty file: {os.path.relpath(path, CHUNKS_FOLDER)}")
    print(f" Loaded {len(docs)} non-empty documents.")
    return docs


#  Embed and prepare vectorstore
def prepare_vectorstore(documents):
    embedder = OllamaEmbeddings(model="llama3.2:1b")
    
    if os.path.exists(VECTORSTORE_PATH):
        print(" Loading existing vectorstore...")
        return FAISS.load_local(VECTORSTORE_PATH, embedder)

    print(" Creating new FAISS vectorstore...")
    if not documents:
        raise ValueError(" No documents to index. Check your chunks folder.")
    vs = FAISS.from_documents(documents, embedder)
    vs.save_local(VECTORSTORE_PATH)
    return vs

#  Create the RAG QA chain
def create_qa_chain(vectorstore):
    llm = OllamaLLM(model="llama3.2:1b")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

#  Main loop
if __name__ == "__main__":
    documents = load_chunks()
    vectorstore = prepare_vectorstore(documents)
    qa_chain = create_qa_chain(vectorstore)

    print("\n RAG is ready. Ask anything (type 'exit' to quit):")
    while True:
        query = input("> ")
        if query.lower() == "exit":
            break
        result = qa_chain.run(query)
        print(f" {result}")


 Recursively loading .txt chunks...
 Non-UTF8 file read with latin-1: 1-s2.0-S0038092X16303383-main\chunk_009.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_052.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_059.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815001415-main\chunk_061.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024815003244-main\chunk_055.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024816000313-main\chunk_065.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S0927024816300071-main\chunk_072.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007183-main\chunk_038.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_030.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_031.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_033.txt
 Non-UTF8 file read with latin-1: 1-s2.0-S1876610215007420-main\chunk_036.txt
 Non-UTF8 file read with lat

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

CHUNKS_FOLDER = "chunks"
VECTORSTORE_PATH = "vectorstore"

# 🧹 Load a single file
def load_file(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        try:
            with open(path, "r", encoding="latin-1") as f:
                content = f.read().strip()
            print(f" Non-UTF8 file read with latin-1: {os.path.relpath(path, CHUNKS_FOLDER)}")
        except Exception as e:
            print(f" Could not read file: {path} - {e}")
            return None
    if not content:
        print(f" Skipped empty file: {os.path.relpath(path, CHUNKS_FOLDER)}")
        return None
    return Document(page_content=content, metadata={"source": os.path.relpath(path, CHUNKS_FOLDER)})

#  Load all documents concurrently from chunks/
def load_chunks():
    print(" Loading .txt files from 'chunks/' recursively...")
    paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(CHUNKS_FOLDER)
        for file in files if file.endswith(".txt")
    ]
    with ThreadPoolExecutor() as executor:
        docs = list(executor.map(load_file, paths))
    documents = [doc for doc in docs if doc]
    print(f" Loaded {len(documents)} documents.")
    return documents

#  Prepare or load FAISS vectorstore
def prepare_vectorstore(documents):
    embedder = OllamaEmbeddings(model="nomic-embed-text")

    if os.path.exists(VECTORSTORE_PATH):
        print(" Loading existing FAISS vectorstore...")
        return FAISS.load_local(VECTORSTORE_PATH, embedder)

    print(" Building new FAISS vectorstore...")
    if not documents:
        raise ValueError(" No documents found in chunks/.")
    vectorstore = FAISS.from_documents(documents, embedder)
    vectorstore.save_local(VECTORSTORE_PATH)
    return vectorstore

#  Create RAG chain
def create_qa_chain(vectorstore):
    llm = Ollama(model="llama3.2:1b")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

#  Main
if __name__ == "__main__":
    docs = load_chunks()
    vectordb = prepare_vectorstore(docs)
    qa = create_qa_chain(vectordb)



In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain_community.llms import Ollama

# Load embedding model and FAISS vectorstore
embedder = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = FAISS.load_local("vectorstore", embedder, allow_dangerous_deserialization=True)

# Create retriever from vectorstore
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Load Ollama LLM
llm = Ollama(model="llama3.2:1b")

# Load QA chain that only uses provided sources (no external info)
qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff")  # "stuff" uses only retrieved context

# Define a function that queries only the FAISS data
def query_faiss_only(question):
    docs = retriever.get_relevant_documents(question)
    result = qa_chain({"input_documents": docs, "question": question})
    return result

# Example usage
query = """
Act like a researcher assistant. Summarize what you know about 'dopedsilicon' 
and return the response in JSON format with the keys: "definition", "applications", and "source".
"""
response = query_faiss_only(query)

# Output the result and source documents
print(response["output_text"])
